public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 15:24 [RFC][PATCH] UBC: user resource beancounters Kirill Korotaev
@ 2006-08-16 15:37 ` Kirill Korotaev
  2006-08-16 16:58   ` Alan Cox
                     ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Kirill Korotaev @ 2006-08-16 15:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

Core functionality and interfaces of UBC:
find/create beancounter, initialization,
charge/uncharge of resource, core objects' declarations.

Basic structures:
  ubparm           - resource description
  user_beancounter - set of resources, id, lock

Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
Signed-Off-By: Kirill Korotaev <dev@sw.ru>

---
 include/ub/beancounter.h |  157 ++++++++++++++++++
 init/main.c              |    4
 kernel/Makefile          |    1
 kernel/ub/Makefile       |    7
 kernel/ub/beancounter.c  |  398 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 567 insertions(+)

--- /dev/null	2006-07-18 14:52:43.075228448 +0400
+++ ./include/ub/beancounter.h	2006-08-10 14:58:27.000000000 +0400
@@ -0,0 +1,157 @@
+/*
+ *  include/ub/beancounter.h
+ *
+ *  Copyright (C) 2006 OpenVZ. SWsoft Inc
+ *
+ */
+
+#ifndef _LINUX_BEANCOUNTER_H
+#define _LINUX_BEANCOUNTER_H
+
+/*
+ *	Resource list.
+ */
+
+#define UB_RESOURCES	0
+
+struct ubparm {
+	/*
+	 * A barrier over which resource allocations are failed gracefully.
+	 * e.g. if the amount of consumed memory is over the barrier further
+	 * sbrk() or mmap() calls fail, the existing processes are not killed.
+	 */
+	unsigned long	barrier;
+	/* hard resource limit */
+	unsigned long	limit;
+	/* consumed resources */
+	unsigned long	held;
+	/* maximum amount of consumed resources through the last period */
+	unsigned long	maxheld;
+	/* minimum amount of consumed resources through the last period */
+	unsigned long	minheld;
+	/* count of failed charges */
+	unsigned long	failcnt;
+};
+
+/*
+ * Kernel internal part.
+ */
+
+#ifdef __KERNEL__
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+/*
+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
+ */
+#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
+
+
+/*
+ *	Resource management structures
+ * Serialization issues:
+ *   beancounter list management is protected via ub_hash_lock
+ *   task pointers are set only for current task and only once
+ *   refcount is managed atomically
+ *   value and limit comparison and change are protected by per-ub spinlock
+ */
+
+struct user_beancounter
+{
+	atomic_t		ub_refcount;
+	spinlock_t		ub_lock;
+	uid_t			ub_uid;
+	struct hlist_node	hash;
+
+	struct user_beancounter	*parent;
+	void			*private_data;
+
+	/* resources statistics and settings */
+	struct ubparm		ub_parms[UB_RESOURCES];
+};
+
+enum severity { UB_BARRIER, UB_LIMIT, UB_FORCE };
+
+/* Flags passed to beancounter_findcreate() */
+#define UB_LOOKUP_SUB		0x01 /* Lookup subbeancounter */
+#define UB_ALLOC		0x02 /* May allocate new one */
+#define UB_ALLOC_ATOMIC		0x04 /* Allocate with GFP_ATOMIC */
+
+#define UB_HASH_SIZE		256
+
+#ifdef CONFIG_USER_RESOURCE
+extern struct hlist_head ub_hash[];
+extern spinlock_t ub_hash_lock;
+
+static inline void ub_adjust_held_minmax(struct user_beancounter *ub,
+		int resource)
+{
+	if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
+		ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
+	if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
+		ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
+}
+
+void ub_print_resource_warning(struct user_beancounter *ub, int res,
+		char *str, unsigned long val, unsigned long held);
+void ub_print_uid(struct user_beancounter *ub, char *str, int size);
+
+int  __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum severity strict);
+void charge_beancounter_notop(struct user_beancounter *ub,
+		int resource, unsigned long val);
+int  charge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val, enum severity strict);
+
+void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val);
+void uncharge_beancounter_notop(struct user_beancounter *ub,
+		int resource, unsigned long val);
+void uncharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val);
+
+struct user_beancounter *beancounter_findcreate(uid_t uid,
+		struct user_beancounter *parent, int flags);
+
+static inline struct user_beancounter *get_beancounter(
+		struct user_beancounter *ub)
+{
+	atomic_inc(&ub->ub_refcount);
+	return ub;
+}
+
+void __put_beancounter(struct user_beancounter *ub);
+static inline void put_beancounter(struct user_beancounter *ub)
+{
+	__put_beancounter(ub);
+}
+
+void ub_init_early(void);
+void ub_init_late(void);
+void ub_init_proc(void);
+
+extern struct user_beancounter ub0;
+extern const char *ub_rnames[];
+
+#else /* CONFIG_USER_RESOURCE */
+
+#define beancounter_findcreate(id, p, f)		(NULL)
+#define get_beancounter(ub)				(NULL)
+#define put_beancounter(ub)				do { } while (0)
+#define __charge_beancounter_locked(ub, r, v, s)	(0)
+#define charge_beancounter(ub, r, v, s)			(0)
+#define charge_beancounter_notop(ub, r, v)		do { } while (0)
+#define __uncharge_beancounter_locked(ub, r, v)		do { } while (0)
+#define uncharge_beancounter(ub, r, v)			do { } while (0)
+#define uncharge_beancounter_notop(ub, r, v)		do { } while (0)
+#define ub_init_early()					do { } while (0)
+#define ub_init_late()					do { } while (0)
+#define ub_init_proc()					do { } while (0)
+
+#endif /* CONFIG_USER_RESOURCE */
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_BEANCOUNTER_H */
--- ./init/main.c.ubcore	2006-08-10 14:55:47.000000000 +0400
+++ ./init/main.c	2006-08-10 14:57:01.000000000 +0400
@@ -52,6 +52,8 @@
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
 
+#include <ub/beancounter.h>
+
 #include <asm/io.h>
 #include <asm/bugs.h>
 #include <asm/setup.h>
@@ -470,6 +472,7 @@ asmlinkage void __init start_kernel(void
 	early_boot_irqs_off();
 	early_init_irq_lock_class();
 
+	ub_init_early();
 /*
  * Interrupts are still disabled. Do necessary setups, then
  * enable them
@@ -563,6 +566,7 @@ asmlinkage void __init start_kernel(void
 #endif
 	fork_init(num_physpages);
 	proc_caches_init();
+	ub_init_late();
 	buffer_init();
 	unnamed_dev_init();
 	key_init();
--- ./kernel/Makefile.ubcore	2006-08-10 14:55:47.000000000 +0400
+++ ./kernel/Makefile	2006-08-10 14:57:01.000000000 +0400
@@ -12,6 +12,7 @@ obj-y     = sched.o fork.o exec_domain.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
+obj-y += ub/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_LOCKDEP) += lockdep.o
 ifeq ($(CONFIG_PROC_FS),y)
--- /dev/null	2006-07-18 14:52:43.075228448 +0400
+++ ./kernel/ub/Makefile	2006-08-10 14:57:01.000000000 +0400
@@ -0,0 +1,7 @@
+#
+# User resources part (UBC)
+#
+# Copyright (C) 2006 OpenVZ. SWsoft Inc
+#
+
+obj-$(CONFIG_USER_RESOURCE) += beancounter.o
--- /dev/null	2006-07-18 14:52:43.075228448 +0400
+++ ./kernel/ub/beancounter.c	2006-08-10 15:09:34.000000000 +0400
@@ -0,0 +1,398 @@
+/*
+ *  kernel/ub/beancounter.c
+ *
+ *  Copyright (C) 2006 OpenVZ. SWsoft Inc
+ *  Original code by (C) 1998      Alan Cox
+ *                       1998-2000 Andrey Savochkin <saw@saw.sw.com.sg>
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <ub/beancounter.h>
+
+static kmem_cache_t *ub_cachep;
+static struct user_beancounter default_beancounter;
+static struct user_beancounter default_subbeancounter;
+
+static void init_beancounter_struct(struct user_beancounter *ub, uid_t id);
+
+struct user_beancounter ub0;
+
+const char *ub_rnames[] = {
+};
+
+#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
+#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
+
+struct hlist_head ub_hash[UB_HASH_SIZE];
+spinlock_t ub_hash_lock;
+
+EXPORT_SYMBOL(ub_hash);
+EXPORT_SYMBOL(ub_hash_lock);
+
+/*
+ *	Per user resource beancounting. Resources are tied to their luid.
+ *	The resource structure itself is tagged both to the process and
+ *	the charging resources (a socket doesn't want to have to search for
+ *	things at irq time for example). Reference counters keep things in
+ *	hand.
+ *
+ *	The case where a user creates resource, kills all his processes and
+ *	then starts new ones is correctly handled this way. The refcounters
+ *	will mean the old entry is still around with resource tied to it.
+ */
+
+struct user_beancounter *beancounter_findcreate(uid_t uid,
+		struct user_beancounter *p, int mask)
+{
+	struct user_beancounter *new_ub, *ub, *tmpl_ub;
+	unsigned long flags;
+	struct hlist_head *slot;
+	struct hlist_node *pos;
+
+	if (mask & UB_LOOKUP_SUB) {
+		WARN_ON(p == NULL);
+		tmpl_ub = &default_subbeancounter;
+		slot = &ub_hash[ub_subhash_fun(p, uid)];
+	} else {
+		WARN_ON(p != NULL);
+		tmpl_ub = &default_beancounter;
+		slot = &ub_hash[ub_hash_fun(uid)];
+	}
+	new_ub = NULL;
+
+retry:
+	spin_lock_irqsave(&ub_hash_lock, flags);
+	hlist_for_each_entry (ub, pos, slot, hash)
+		if (ub->ub_uid == uid && ub->parent == p)
+			break;
+
+	if (pos != NULL) {
+		get_beancounter(ub);
+		spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+		if (new_ub != NULL) {
+			put_beancounter(new_ub->parent);
+			kmem_cache_free(ub_cachep, new_ub);
+		}
+		return ub;
+	}
+
+	if (!(mask & UB_ALLOC))
+		goto out_unlock;
+
+	if (new_ub != NULL)
+		goto out_install;
+
+	if (mask & UB_ALLOC_ATOMIC) {
+		new_ub = kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
+		if (new_ub == NULL)
+			goto out_unlock;
+
+		memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
+		init_beancounter_struct(new_ub, uid);
+		if (p)
+			new_ub->parent = get_beancounter(p);
+		goto out_install;
+	}
+
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+	new_ub = kmem_cache_alloc(ub_cachep, GFP_KERNEL);
+	if (new_ub == NULL)
+		goto out;
+
+	memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
+	init_beancounter_struct(new_ub, uid);
+	if (p)
+		new_ub->parent = get_beancounter(p);
+	goto retry;
+
+out_install:
+	hlist_add_head(&new_ub->hash, slot);
+out_unlock:
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+out:
+	return new_ub;
+}
+
+EXPORT_SYMBOL(beancounter_findcreate);
+
+void ub_print_uid(struct user_beancounter *ub, char *str, int size)
+{
+	if (ub->parent != NULL)
+		snprintf(str, size, "%u.%u", ub->parent->ub_uid, ub->ub_uid);
+	else
+		snprintf(str, size, "%u", ub->ub_uid);
+}
+
+EXPORT_SYMBOL(ub_print_uid);
+
+void ub_print_resource_warning(struct user_beancounter *ub, int res,
+		char *str, unsigned long val, unsigned long held)
+{
+	char uid[64];
+
+	ub_print_uid(ub, uid, sizeof(uid));
+	printk(KERN_WARNING "UB %s %s warning: %s "
+			"(held %lu, fails %lu, val %lu)\n",
+			uid, ub_rnames[res], str,
+			(res < UB_RESOURCES ? ub->ub_parms[res].held : held),
+			(res < UB_RESOURCES ? ub->ub_parms[res].failcnt : 0),
+			val);
+}
+
+EXPORT_SYMBOL(ub_print_resource_warning);
+
+static inline void verify_held(struct user_beancounter *ub)
+{
+	int i;
+
+	for (i = 0; i < UB_RESOURCES; i++)
+		if (ub->ub_parms[i].held != 0)
+			ub_print_resource_warning(ub, i,
+					"resource is held on put", 0, 0);
+}
+
+void __put_beancounter(struct user_beancounter *ub)
+{
+	unsigned long flags;
+	struct user_beancounter *parent;
+
+again:
+	parent = ub->parent;
+	/* equevalent to atomic_dec_and_lock_irqsave() */
+	local_irq_save(flags);
+	if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
+		if (unlikely(atomic_read(&ub->ub_refcount) < 0))
+			printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
+					"luid=%d, ref=%d\n",
+					ub, ub->ub_uid,
+					atomic_read(&ub->ub_refcount));
+		local_irq_restore(flags);
+		return;
+	}
+
+	if (unlikely(ub == &ub0)) {
+		printk(KERN_ERR "Trying to put ub0\n");
+		spin_unlock_irqrestore(&ub_hash_lock, flags);
+		return;
+	}
+
+	verify_held(ub);
+	hlist_del(&ub->hash);
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+	kmem_cache_free(ub_cachep, ub);
+
+	ub = parent;
+	if (ub != NULL)
+		goto again;
+}
+
+EXPORT_SYMBOL(__put_beancounter);
+
+/*
+ *	Generic resource charging stuff
+ */
+
+int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum severity strict)
+{
+	/*
+	 * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition
+	 * at the moment is possible so an overflow is impossible.  
+	 */
+	ub->ub_parms[resource].held += val;
+
+	switch (strict) {
+		case UB_BARRIER:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].barrier)
+				break;
+			/* fallthrough */
+		case UB_LIMIT:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].limit)
+				break;
+			/* fallthrough */
+		case UB_FORCE:
+			ub_adjust_held_minmax(ub, resource);
+			return 0;
+		default:
+			BUG();
+	}
+
+	ub->ub_parms[resource].failcnt++;
+	ub->ub_parms[resource].held -= val;
+	return -ENOMEM;
+}
+
+int charge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val, enum severity strict)
+{
+	int retval;
+	struct user_beancounter *p, *q;
+	unsigned long flags;
+
+	retval = -EINVAL;
+	BUG_ON(val > UB_MAXVALUE);
+
+	local_irq_save(flags);
+	for (p = ub; p != NULL; p = p->parent) {
+		spin_lock(&p->ub_lock);
+		retval = __charge_beancounter_locked(p, resource, val, strict);
+		spin_unlock(&p->ub_lock);
+		if (retval)
+			goto unroll;
+	}
+out_restore:
+	local_irq_restore(flags);
+	return retval;
+
+unroll:
+	for (q = ub; q != p; q = q->parent) {
+		spin_lock(&q->ub_lock);
+		__uncharge_beancounter_locked(q, resource, val);
+		spin_unlock(&q->ub_lock);
+	}
+	goto out_restore;
+}
+
+EXPORT_SYMBOL(charge_beancounter);
+
+void charge_beancounter_notop(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct user_beancounter *p;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	for (p = ub; p->parent != NULL; p = p->parent) {
+		spin_lock(&p->ub_lock);
+		__charge_beancounter_locked(p, resource, val, UB_FORCE);
+		spin_unlock(&p->ub_lock);
+	}
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(charge_beancounter_notop);
+
+void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	if (unlikely(ub->ub_parms[resource].held < val)) {
+		ub_print_resource_warning(ub, resource,
+				"uncharging too much", val, 0);
+		val = ub->ub_parms[resource].held;
+	}
+	ub->ub_parms[resource].held -= val;
+	ub_adjust_held_minmax(ub, resource);
+}
+
+void uncharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	unsigned long flags;
+	struct user_beancounter *p;
+
+	for (p = ub; p != NULL; p = p->parent) {
+		spin_lock_irqsave(&p->ub_lock, flags);
+		__uncharge_beancounter_locked(p, resource, val);
+		spin_unlock_irqrestore(&p->ub_lock, flags);
+	}
+}
+
+EXPORT_SYMBOL(uncharge_beancounter);
+
+void uncharge_beancounter_notop(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct user_beancounter *p;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	for (p = ub; p->parent != NULL; p = p->parent) {
+		spin_lock(&p->ub_lock);
+		__uncharge_beancounter_locked(p, resource, val);
+		spin_unlock(&p->ub_lock);
+	}
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(uncharge_beancounter_notop);
+
+/*
+ *	Initialization
+ *
+ *	struct user_beancounter contains
+ *	 - limits and other configuration settings
+ *	 - structural fields: lists, spinlocks and so on.
+ *
+ *	Before these parts are initialized, the structure should be memset
+ *	to 0 or copied from a known clean structure.  That takes care of a lot
+ *	of fields not initialized explicitly.
+ */
+
+static void init_beancounter_struct(struct user_beancounter *ub, uid_t id)
+{
+	atomic_set(&ub->ub_refcount, 1);
+	spin_lock_init(&ub->ub_lock);
+	ub->ub_uid = id;
+}
+
+static void init_beancounter_nolimits(struct user_beancounter *ub)
+{
+	int k;
+
+	for (k = 0; k < UB_RESOURCES; k++) {
+		ub->ub_parms[k].limit = UB_MAXVALUE;
+		ub->ub_parms[k].barrier = UB_MAXVALUE;
+	}
+}
+
+static void init_beancounter_syslimits(struct user_beancounter *ub)
+{
+	int k;
+
+	for (k = 0; k < UB_RESOURCES; k++)
+		ub->ub_parms[k].barrier = ub->ub_parms[k].limit;
+}
+
+void __init ub_init_early(void)
+{
+	struct user_beancounter *ub;
+	struct hlist_head *slot;
+
+	ub = &ub0;
+
+	memset(ub, 0, sizeof(*ub));
+	init_beancounter_nolimits(ub);
+	init_beancounter_struct(ub, 0);
+
+	spin_lock_init(&ub_hash_lock);
+	slot = &ub_hash[ub_hash_fun(ub->ub_uid)];
+	hlist_add_head(&ub->hash, slot);
+}
+
+void __init ub_init_late(void)
+{
+	struct user_beancounter *ub;
+
+	ub_cachep = kmem_cache_create("user_beancounters",
+			sizeof(struct user_beancounter),
+			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (ub_cachep == NULL)
+		panic("Can't create ubc caches\n");
+
+	ub = &default_beancounter;
+	memset(ub, 0, sizeof(default_beancounter));
+	init_beancounter_syslimits(ub);
+	init_beancounter_struct(ub, 0);
+
+	ub = &default_subbeancounter;
+	memset(ub, 0, sizeof(default_subbeancounter));
+	init_beancounter_nolimits(ub);
+	init_beancounter_struct(ub, 0);
+}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 15:37 ` [RFC][PATCH 2/7] UBC: core (structures, API) Kirill Korotaev
@ 2006-08-16 16:58   ` Alan Cox
  2006-08-17 11:42     ` Kirill Korotaev
  2006-08-16 17:15   ` Greg KH
  2006-08-16 18:11   ` Rohit Seth
  2 siblings, 1 reply; 17+ messages in thread
From: Alan Cox @ 2006-08-16 16:58 UTC (permalink / raw)
  To: Kirill Korotaev
  Cc: Andrew Morton, Linux Kernel Mailing List, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

Ar Mer, 2006-08-16 am 19:37 +0400, ysgrifennodd Kirill Korotaev:
> + * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
> + */
> +#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
> +

Whats wrong with using the kernels LONG_MAX ?


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 15:37 ` [RFC][PATCH 2/7] UBC: core (structures, API) Kirill Korotaev
  2006-08-16 16:58   ` Alan Cox
@ 2006-08-16 17:15   ` Greg KH
  2006-08-17 11:45     ` Kirill Korotaev
  2006-08-16 18:11   ` Rohit Seth
  2 siblings, 1 reply; 17+ messages in thread
From: Greg KH @ 2006-08-16 17:15 UTC (permalink / raw)
  To: Kirill Korotaev
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

On Wed, Aug 16, 2006 at 07:37:26PM +0400, Kirill Korotaev wrote:
> +struct user_beancounter
> +{
> +	atomic_t		ub_refcount;

Why not use a struct kref here instead of rolling your own reference
counting logic?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 15:37 ` [RFC][PATCH 2/7] UBC: core (structures, API) Kirill Korotaev
  2006-08-16 16:58   ` Alan Cox
  2006-08-16 17:15   ` Greg KH
@ 2006-08-16 18:11   ` Rohit Seth
  2006-08-16 18:18     ` Andrew Morton
  2006-08-17 11:53     ` Kirill Korotaev
  2 siblings, 2 replies; 17+ messages in thread
From: Rohit Seth @ 2006-08-16 18:11 UTC (permalink / raw)
  To: Kirill Korotaev
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

On Wed, 2006-08-16 at 19:37 +0400, Kirill Korotaev wrote:
> Core functionality and interfaces of UBC:
> find/create beancounter, initialization,
> charge/uncharge of resource, core objects' declarations.
> 
> Basic structures:
>   ubparm           - resource description
>   user_beancounter - set of resources, id, lock
> 
> Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
> Signed-Off-By: Kirill Korotaev <dev@sw.ru>
> 
> ---
>  include/ub/beancounter.h |  157 ++++++++++++++++++
>  init/main.c              |    4
>  kernel/Makefile          |    1
>  kernel/ub/Makefile       |    7
>  kernel/ub/beancounter.c  |  398 +++++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 567 insertions(+)
> 
> --- /dev/null	2006-07-18 14:52:43.075228448 +0400
> +++ ./include/ub/beancounter.h	2006-08-10 14:58:27.000000000 +0400
> @@ -0,0 +1,157 @@
> +/*
> + *  include/ub/beancounter.h
> + *
> + *  Copyright (C) 2006 OpenVZ. SWsoft Inc
> + *
> + */
> +
> +#ifndef _LINUX_BEANCOUNTER_H
> +#define _LINUX_BEANCOUNTER_H
> +
> +/*
> + *	Resource list.
> + */
> +
> +#define UB_RESOURCES	0
> +
> +struct ubparm {
> +	/*
> +	 * A barrier over which resource allocations are failed gracefully.
> +	 * e.g. if the amount of consumed memory is over the barrier further
> +	 * sbrk() or mmap() calls fail, the existing processes are not killed.
> +	 */
> +	unsigned long	barrier;
> +	/* hard resource limit */
> +	unsigned long	limit;
> +	/* consumed resources */
> +	unsigned long	held;
> +	/* maximum amount of consumed resources through the last period */
> +	unsigned long	maxheld;
> +	/* minimum amount of consumed resources through the last period */
> +	unsigned long	minheld;
> +	/* count of failed charges */
> +	unsigned long	failcnt;
> +};

What is the difference between barrier and limit. They both sound like
hard limits.  No?

> +
> +/*
> + * Kernel internal part.
> + */
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/config.h>
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <asm/atomic.h>
> +
> +/*
> + * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
> + */
> +#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
> +
> +
> +/*
> + *	Resource management structures
> + * Serialization issues:
> + *   beancounter list management is protected via ub_hash_lock
> + *   task pointers are set only for current task and only once
> + *   refcount is managed atomically
> + *   value and limit comparison and change are protected by per-ub spinlock
> + */
> +
> +struct user_beancounter
> +{
> +	atomic_t		ub_refcount;
> +	spinlock_t		ub_lock;
> +	uid_t			ub_uid;

Why uid?  Will it be possible to club processes belonging to different
users to same bean counter.

> +	struct hlist_node	hash;
> +
> +	struct user_beancounter	*parent;
> +	void			*private_data;
> +

What are the above two fields used for?

> +	/* resources statistics and settings */
> +	struct ubparm		ub_parms[UB_RESOURCES];
> +};
> +

I presume UB_RESOURCES value is going to change as different resources
start getting tracked.

I think something like configfs should be used for user interface.  It
automatically presents the right interfaces to user land (based on
kernel implementation).  And you wouldn't need any changes in glibc etc.


-rohit


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 18:11   ` Rohit Seth
@ 2006-08-16 18:18     ` Andrew Morton
  2006-08-17 11:54       ` Kirill Korotaev
  2006-08-17 11:53     ` Kirill Korotaev
  1 sibling, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2006-08-16 18:18 UTC (permalink / raw)
  To: rohitseth
  Cc: Kirill Korotaev, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

On Wed, 16 Aug 2006 11:11:08 -0700
Rohit Seth <rohitseth@google.com> wrote:

> > +struct user_beancounter
> > +{
> > +	atomic_t		ub_refcount;
> > +	spinlock_t		ub_lock;
> > +	uid_t			ub_uid;
> 
> Why uid?  Will it be possible to club processes belonging to different
> users to same bean counter.

hm.  I'd have expected to see a `struct user_struct *' here, not a uid_t.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 16:58   ` Alan Cox
@ 2006-08-17 11:42     ` Kirill Korotaev
  0 siblings, 0 replies; 17+ messages in thread
From: Kirill Korotaev @ 2006-08-17 11:42 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrew Morton, Linux Kernel Mailing List, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

> Ar Mer, 2006-08-16 am 19:37 +0400, ysgrifennodd Kirill Korotaev:
> 
>>+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
>>+ */
>>+#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
>>+
> 
> 
> Whats wrong with using the kernels LONG_MAX ?
just historical code line which introduces UB_MAXVALUE independant of
cross-compiler/headers etc.

Will replace it.

Kirill


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 17:15   ` Greg KH
@ 2006-08-17 11:45     ` Kirill Korotaev
  2006-08-17 12:14       ` Greg KH
  0 siblings, 1 reply; 17+ messages in thread
From: Kirill Korotaev @ 2006-08-17 11:45 UTC (permalink / raw)
  To: Greg KH
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

>>+struct user_beancounter
>>+{
>>+	atomic_t		ub_refcount;
> 
> 
> Why not use a struct kref here instead of rolling your own reference
> counting logic?

We need more complex decrement/locking scheme than krefs
provide. e.g. in __put_beancounter() we need
atomic_dec_and_lock_irqsave() semantics for performance optimizations.

Kirill


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 18:11   ` Rohit Seth
  2006-08-16 18:18     ` Andrew Morton
@ 2006-08-17 11:53     ` Kirill Korotaev
  2006-08-17 16:55       ` Rohit Seth
  2006-08-18  5:31       ` Andrew Morton
  1 sibling, 2 replies; 17+ messages in thread
From: Kirill Korotaev @ 2006-08-17 11:53 UTC (permalink / raw)
  To: rohitseth
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

Rohit Seth wrote:
> On Wed, 2006-08-16 at 19:37 +0400, Kirill Korotaev wrote:
> 
>>Core functionality and interfaces of UBC:
>>find/create beancounter, initialization,
>>charge/uncharge of resource, core objects' declarations.
>>
>>Basic structures:
>>  ubparm           - resource description
>>  user_beancounter - set of resources, id, lock
>>
>>Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
>>Signed-Off-By: Kirill Korotaev <dev@sw.ru>
>>
>>---
>> include/ub/beancounter.h |  157 ++++++++++++++++++
>> init/main.c              |    4
>> kernel/Makefile          |    1
>> kernel/ub/Makefile       |    7
>> kernel/ub/beancounter.c  |  398 +++++++++++++++++++++++++++++++++++++++++++++++
>> 5 files changed, 567 insertions(+)
>>
>>--- /dev/null	2006-07-18 14:52:43.075228448 +0400
>>+++ ./include/ub/beancounter.h	2006-08-10 14:58:27.000000000 +0400
>>@@ -0,0 +1,157 @@
>>+/*
>>+ *  include/ub/beancounter.h
>>+ *
>>+ *  Copyright (C) 2006 OpenVZ. SWsoft Inc
>>+ *
>>+ */
>>+
>>+#ifndef _LINUX_BEANCOUNTER_H
>>+#define _LINUX_BEANCOUNTER_H
>>+
>>+/*
>>+ *	Resource list.
>>+ */
>>+
>>+#define UB_RESOURCES	0
>>+
>>+struct ubparm {
>>+	/*
>>+	 * A barrier over which resource allocations are failed gracefully.
>>+	 * e.g. if the amount of consumed memory is over the barrier further
>>+	 * sbrk() or mmap() calls fail, the existing processes are not killed.
>>+	 */
>>+	unsigned long	barrier;
>>+	/* hard resource limit */
>>+	unsigned long	limit;
>>+	/* consumed resources */
>>+	unsigned long	held;
>>+	/* maximum amount of consumed resources through the last period */
>>+	unsigned long	maxheld;
>>+	/* minimum amount of consumed resources through the last period */
>>+	unsigned long	minheld;
>>+	/* count of failed charges */
>>+	unsigned long	failcnt;
>>+};
> 
> 
> What is the difference between barrier and limit. They both sound like
> hard limits.  No?
check __charge_beancounter_locked and severity.
It provides some kind of soft and hard limits.

>>+
>>+/*
>>+ * Kernel internal part.
>>+ */
>>+
>>+#ifdef __KERNEL__
>>+
>>+#include <linux/config.h>
>>+#include <linux/spinlock.h>
>>+#include <linux/list.h>
>>+#include <asm/atomic.h>
>>+
>>+/*
>>+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
>>+ */
>>+#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
>>+
>>+
>>+/*
>>+ *	Resource management structures
>>+ * Serialization issues:
>>+ *   beancounter list management is protected via ub_hash_lock
>>+ *   task pointers are set only for current task and only once
>>+ *   refcount is managed atomically
>>+ *   value and limit comparison and change are protected by per-ub spinlock
>>+ */
>>+
>>+struct user_beancounter
>>+{
>>+	atomic_t		ub_refcount;
>>+	spinlock_t		ub_lock;
>>+	uid_t			ub_uid;
> 
> 
> Why uid?  Will it be possible to club processes belonging to different
> users to same bean counter.
oh, its a misname. Should be ub_id. it is ID of user_beancounter
and has nothing to do with user id.

>>+	struct hlist_node	hash;
>>+
>>+	struct user_beancounter	*parent;
>>+	void			*private_data;
>>+
> 
> 
> What are the above two fields used for?
the first one is for hierarchical UBs,
see beancounter_findcreate with UB_LOOKUP_SUB.
private_data is probably not used yet :)

>>+	/* resources statistics and settings */
>>+	struct ubparm		ub_parms[UB_RESOURCES];
>>+};
>>+
> 
> 
> I presume UB_RESOURCES value is going to change as different resources
> start getting tracked.
what's wrong with it?

> I think something like configfs should be used for user interface.  It
> automatically presents the right interfaces to user land (based on
> kernel implementation).  And you wouldn't need any changes in glibc etc.
1. UBC doesn't require glibc modificatins.
2. if you think a bit more about it, adding UB parameters doesn't
   require user space changes as well.
3. it is possible to add any kind of interface for UBC. but do you like the idea
   to grep 200(containers)x20(parameters) files for getting current usages?
   Do you like the idea to convert numbers to strings and back w/o
   thinking of data types?

Thanks,
Kirill


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-16 18:18     ` Andrew Morton
@ 2006-08-17 11:54       ` Kirill Korotaev
  0 siblings, 0 replies; 17+ messages in thread
From: Kirill Korotaev @ 2006-08-17 11:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: rohitseth, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

Andrew Morton wrote:
> On Wed, 16 Aug 2006 11:11:08 -0700
> Rohit Seth <rohitseth@google.com> wrote:
> 
> 
>>>+struct user_beancounter
>>>+{
>>>+	atomic_t		ub_refcount;
>>>+	spinlock_t		ub_lock;
>>>+	uid_t			ub_uid;
>>
>>Why uid?  Will it be possible to club processes belonging to different
>>users to same bean counter.
> 
> 
> hm.  I'd have expected to see a `struct user_struct *' here, not a uid_t.

Sorry, misused name. should be ub_id. not related to user_struct or user.

Thanks,
Kirill

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-17 11:45     ` Kirill Korotaev
@ 2006-08-17 12:14       ` Greg KH
  0 siblings, 0 replies; 17+ messages in thread
From: Greg KH @ 2006-08-17 12:14 UTC (permalink / raw)
  To: Kirill Korotaev
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

On Thu, Aug 17, 2006 at 03:45:56PM +0400, Kirill Korotaev wrote:
> >>+struct user_beancounter
> >>+{
> >>+	atomic_t		ub_refcount;
> >
> >
> >Why not use a struct kref here instead of rolling your own reference
> >counting logic?
> 
> We need more complex decrement/locking scheme than krefs
> provide. e.g. in __put_beancounter() we need
> atomic_dec_and_lock_irqsave() semantics for performance optimizations.

Ah, ok, missed that.  Nevermind then :)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-17 18:35 [RFC][PATCH 2/7] UBC: core (structures, API) Oleg Nesterov
@ 2006-08-17 14:29 ` Kirill Korotaev
  0 siblings, 0 replies; 17+ messages in thread
From: Kirill Korotaev @ 2006-08-17 14:29 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Pavel Emelianov, Alan Cox, Andrew Morton, linux-kernel

Oleg,

>>+struct user_beancounter *beancounter_findcreate(uid_t uid,
>>+		struct user_beancounter *p, int mask)
>>+{
>>+	struct user_beancounter *new_ub, *ub, *tmpl_ub;
>>+	unsigned long flags;
>>+	struct hlist_head *slot;
>>+	struct hlist_node *pos;
>>+
>>+	if (mask & UB_LOOKUP_SUB) {
>>+		WARN_ON(p == NULL);
>>+		tmpl_ub = &default_subbeancounter;
>>+		slot = &ub_hash[ub_subhash_fun(p, uid)];
>>+	} else {
>>+		WARN_ON(p != NULL);
>>+		tmpl_ub = &default_beancounter;
>>+		slot = &ub_hash[ub_hash_fun(uid)];
>>+	}
>>+	new_ub = NULL;
>>+
>>+retry:
>>+	spin_lock_irqsave(&ub_hash_lock, flags);
>>+	hlist_for_each_entry (ub, pos, slot, hash)
>>+		if (ub->ub_uid == uid && ub->parent == p)
>>+			break;
>>+
>>+	if (pos != NULL) {
>>+		get_beancounter(ub);
>>+		spin_unlock_irqrestore(&ub_hash_lock, flags);
>>+
>>+		if (new_ub != NULL) {
>>+			put_beancounter(new_ub->parent);
> 
> 					^^^^^^^^^^^^^^
> 
> Stupid question: why ->parent can't be NULL ? (without UB_LOOKUP_SUB).
oh, good catch. We removed the check for NULL in put_beancounter() for cleanup,
but didn't notice this place. Thanks!

>>+	if (mask & UB_ALLOC_ATOMIC) {
>>+		new_ub = kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
>>+		if (new_ub == NULL)
>>+			goto out_unlock;
>>+
>>+		memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
>>+		init_beancounter_struct(new_ub, uid);
>>+		if (p)
>>+			new_ub->parent = get_beancounter(p);
>>+		goto out_install;
>>+	}
>>+
>>+	spin_unlock_irqrestore(&ub_hash_lock, flags);
>>+
>>+	new_ub = kmem_cache_alloc(ub_cachep, GFP_KERNEL);
> 
> 
> beancounter_findcreate() is not time critical, yes? Isn't it better
> to kill 'if (mask & UB_ALLOC_ATOMIC) { ... }' and just do
> 
> 	new_ub = kmem_cache_alloc(ub_cachep,
> 		UB_ALLOC_ATOMIC ? GFP_ATOMIC : GFP_KERNEL);
> 
> after spin_unlock(&ub_hash_lock) ?
ok, will do. Thanks for comments!

Kirill

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-17 11:53     ` Kirill Korotaev
@ 2006-08-17 16:55       ` Rohit Seth
  2006-08-18 11:14         ` Kirill Korotaev
  2006-08-18  5:31       ` Andrew Morton
  1 sibling, 1 reply; 17+ messages in thread
From: Rohit Seth @ 2006-08-17 16:55 UTC (permalink / raw)
  To: Kirill Korotaev
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

On Thu, 2006-08-17 at 15:53 +0400, Kirill Korotaev wrote:
> Rohit Seth wrote:
> > On Wed, 2006-08-16 at 19:37 +0400, Kirill Korotaev wrote:
> > 
> >>Core functionality and interfaces of UBC:
> >>find/create beancounter, initialization,
> >>charge/uncharge of resource, core objects' declarations.
> >>
> >>Basic structures:
> >>  ubparm           - resource description
> >>  user_beancounter - set of resources, id, lock
> >>
> >>Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
> >>Signed-Off-By: Kirill Korotaev <dev@sw.ru>
> >>
> >>---
> >> include/ub/beancounter.h |  157 ++++++++++++++++++
> >> init/main.c              |    4
> >> kernel/Makefile          |    1
> >> kernel/ub/Makefile       |    7
> >> kernel/ub/beancounter.c  |  398 +++++++++++++++++++++++++++++++++++++++++++++++
> >> 5 files changed, 567 insertions(+)
> >>
> >>--- /dev/null	2006-07-18 14:52:43.075228448 +0400
> >>+++ ./include/ub/beancounter.h	2006-08-10 14:58:27.000000000 +0400
> >>@@ -0,0 +1,157 @@
> >>+/*
> >>+ *  include/ub/beancounter.h
> >>+ *
> >>+ *  Copyright (C) 2006 OpenVZ. SWsoft Inc
> >>+ *
> >>+ */
> >>+
> >>+#ifndef _LINUX_BEANCOUNTER_H
> >>+#define _LINUX_BEANCOUNTER_H
> >>+
> >>+/*
> >>+ *	Resource list.
> >>+ */
> >>+
> >>+#define UB_RESOURCES	0
> >>+
> >>+struct ubparm {
> >>+	/*
> >>+	 * A barrier over which resource allocations are failed gracefully.
> >>+	 * e.g. if the amount of consumed memory is over the barrier further
> >>+	 * sbrk() or mmap() calls fail, the existing processes are not killed.
> >>+	 */
> >>+	unsigned long	barrier;
> >>+	/* hard resource limit */
> >>+	unsigned long	limit;
> >>+	/* consumed resources */
> >>+	unsigned long	held;
> >>+	/* maximum amount of consumed resources through the last period */
> >>+	unsigned long	maxheld;
> >>+	/* minimum amount of consumed resources through the last period */
> >>+	unsigned long	minheld;
> >>+	/* count of failed charges */
> >>+	unsigned long	failcnt;
> >>+};
> > 
> > 
> > What is the difference between barrier and limit. They both sound like
> > hard limits.  No?
> check __charge_beancounter_locked and severity.
> It provides some kind of soft and hard limits.
> 

Would be easier to just rename them as soft and hard limits...

> >>+
> >>+/*
> >>+ * Kernel internal part.
> >>+ */
> >>+
> >>+#ifdef __KERNEL__
> >>+
> >>+#include <linux/config.h>
> >>+#include <linux/spinlock.h>
> >>+#include <linux/list.h>
> >>+#include <asm/atomic.h>
> >>+
> >>+/*
> >>+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
> >>+ */
> >>+	/* resources statistics and settings */
> >>+	struct ubparm		ub_parms[UB_RESOURCES];
> >>+};
> >>+
> > 
> > 
> > I presume UB_RESOURCES value is going to change as different resources
> > start getting tracked.
> what's wrong with it?
> 

...just that user land will need to be some how informed about that.

> > I think something like configfs should be used for user interface.  It
> > automatically presents the right interfaces to user land (based on
> > kernel implementation).  And you wouldn't need any changes in glibc etc.
> 1. UBC doesn't require glibc modificatins.

You are right not for setting the limits.  But for adding any new
functionality related to containers....as in you just added a new system
call to get the limits.

> 2. if you think a bit more about it, adding UB parameters doesn't
>    require user space changes as well.
> 3. it is possible to add any kind of interface for UBC. but do you like the idea
>    to grep 200(containers)x20(parameters) files for getting current usages?

How are you doing it currently and how much more efficient it is in
comparison to configfs?

>    Do you like the idea to convert numbers to strings and back w/o
>    thinking of data types?

IMO, setting up limits and containers (themselves) is not a common
operation.    I wouldn't be too worried about loosing those few extra
cycles in setting them up.

-rohit


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
@ 2006-08-17 18:35 Oleg Nesterov
  2006-08-17 14:29 ` Kirill Korotaev
  0 siblings, 1 reply; 17+ messages in thread
From: Oleg Nesterov @ 2006-08-17 18:35 UTC (permalink / raw)
  To: Kirill Korotaev; +Cc: Pavel Emelianov, Alan Cox, Andrew Morton, linux-kernel

Kirill Korotaev wrote:
>
> +struct user_beancounter *beancounter_findcreate(uid_t uid,
> +		struct user_beancounter *p, int mask)
> +{
> +	struct user_beancounter *new_ub, *ub, *tmpl_ub;
> +	unsigned long flags;
> +	struct hlist_head *slot;
> +	struct hlist_node *pos;
> +
> +	if (mask & UB_LOOKUP_SUB) {
> +		WARN_ON(p == NULL);
> +		tmpl_ub = &default_subbeancounter;
> +		slot = &ub_hash[ub_subhash_fun(p, uid)];
> +	} else {
> +		WARN_ON(p != NULL);
> +		tmpl_ub = &default_beancounter;
> +		slot = &ub_hash[ub_hash_fun(uid)];
> +	}
> +	new_ub = NULL;
> +
> +retry:
> +	spin_lock_irqsave(&ub_hash_lock, flags);
> +	hlist_for_each_entry (ub, pos, slot, hash)
> +		if (ub->ub_uid == uid && ub->parent == p)
> +			break;
> +
> +	if (pos != NULL) {
> +		get_beancounter(ub);
> +		spin_unlock_irqrestore(&ub_hash_lock, flags);
> +
> +		if (new_ub != NULL) {
> +			put_beancounter(new_ub->parent);
					^^^^^^^^^^^^^^

Stupid question: why ->parent can't be NULL ? (without UB_LOOKUP_SUB).

> +	if (mask & UB_ALLOC_ATOMIC) {
> +		new_ub = kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
> +		if (new_ub == NULL)
> +			goto out_unlock;
> +
> +		memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
> +		init_beancounter_struct(new_ub, uid);
> +		if (p)
> +			new_ub->parent = get_beancounter(p);
> +		goto out_install;
> +	}
> +
> +	spin_unlock_irqrestore(&ub_hash_lock, flags);
> +
> +	new_ub = kmem_cache_alloc(ub_cachep, GFP_KERNEL);

beancounter_findcreate() is not time critical, yes? Isn't it better
to kill 'if (mask & UB_ALLOC_ATOMIC) { ... }' and just do

	new_ub = kmem_cache_alloc(ub_cachep,
		UB_ALLOC_ATOMIC ? GFP_ATOMIC : GFP_KERNEL);

after spin_unlock(&ub_hash_lock) ?

Oleg.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-17 11:53     ` Kirill Korotaev
  2006-08-17 16:55       ` Rohit Seth
@ 2006-08-18  5:31       ` Andrew Morton
  2006-08-18 15:59         ` Alan Cox
  1 sibling, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2006-08-18  5:31 UTC (permalink / raw)
  To: Kirill Korotaev
  Cc: rohitseth, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

On Thu, 17 Aug 2006 15:53:40 +0400
Kirill Korotaev <dev@sw.ru> wrote:

> >>+struct user_beancounter
> >>+{
> >>+	atomic_t		ub_refcount;
> >>+	spinlock_t		ub_lock;
> >>+	uid_t			ub_uid;
> > 
> > 
> > Why uid?  Will it be possible to club processes belonging to different
> > users to same bean counter.
> oh, its a misname. Should be ub_id. it is ID of user_beancounter
> and has nothing to do with user id.

But it uses a uid_t.  That's more than a misnaming?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-17 16:55       ` Rohit Seth
@ 2006-08-18 11:14         ` Kirill Korotaev
  2006-08-18 17:51           ` Rohit Seth
  0 siblings, 1 reply; 17+ messages in thread
From: Kirill Korotaev @ 2006-08-18 11:14 UTC (permalink / raw)
  To: rohitseth
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

Rohit Seth wrote:
> On Thu, 2006-08-17 at 15:53 +0400, Kirill Korotaev wrote:
> 
>>Rohit Seth wrote:
>>
>>>On Wed, 2006-08-16 at 19:37 +0400, Kirill Korotaev wrote:
>>>
>>>
>>>>Core functionality and interfaces of UBC:
>>>>find/create beancounter, initialization,
>>>>charge/uncharge of resource, core objects' declarations.
>>>>
>>>>Basic structures:
>>>> ubparm           - resource description
>>>> user_beancounter - set of resources, id, lock
>>>>
>>>>Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
>>>>Signed-Off-By: Kirill Korotaev <dev@sw.ru>
>>>>
>>>>---
>>>>include/ub/beancounter.h |  157 ++++++++++++++++++
>>>>init/main.c              |    4
>>>>kernel/Makefile          |    1
>>>>kernel/ub/Makefile       |    7
>>>>kernel/ub/beancounter.c  |  398 +++++++++++++++++++++++++++++++++++++++++++++++
>>>>5 files changed, 567 insertions(+)
>>>>
>>>>--- /dev/null	2006-07-18 14:52:43.075228448 +0400
>>>>+++ ./include/ub/beancounter.h	2006-08-10 14:58:27.000000000 +0400
>>>>@@ -0,0 +1,157 @@
>>>>+/*
>>>>+ *  include/ub/beancounter.h
>>>>+ *
>>>>+ *  Copyright (C) 2006 OpenVZ. SWsoft Inc
>>>>+ *
>>>>+ */
>>>>+
>>>>+#ifndef _LINUX_BEANCOUNTER_H
>>>>+#define _LINUX_BEANCOUNTER_H
>>>>+
>>>>+/*
>>>>+ *	Resource list.
>>>>+ */
>>>>+
>>>>+#define UB_RESOURCES	0
>>>>+
>>>>+struct ubparm {
>>>>+	/*
>>>>+	 * A barrier over which resource allocations are failed gracefully.
>>>>+	 * e.g. if the amount of consumed memory is over the barrier further
>>>>+	 * sbrk() or mmap() calls fail, the existing processes are not killed.
>>>>+	 */
>>>>+	unsigned long	barrier;
>>>>+	/* hard resource limit */
>>>>+	unsigned long	limit;
>>>>+	/* consumed resources */
>>>>+	unsigned long	held;
>>>>+	/* maximum amount of consumed resources through the last period */
>>>>+	unsigned long	maxheld;
>>>>+	/* minimum amount of consumed resources through the last period */
>>>>+	unsigned long	minheld;
>>>>+	/* count of failed charges */
>>>>+	unsigned long	failcnt;
>>>>+};
>>>
>>>
>>>What is the difference between barrier and limit. They both sound like
>>>hard limits.  No?
>>
>>check __charge_beancounter_locked and severity.
>>It provides some kind of soft and hard limits.
>>
> 
> 
> Would be easier to just rename them as soft and hard limits...
> 
> 
>>>>+
>>>>+/*
>>>>+ * Kernel internal part.
>>>>+ */
>>>>+
>>>>+#ifdef __KERNEL__
>>>>+
>>>>+#include <linux/config.h>
>>>>+#include <linux/spinlock.h>
>>>>+#include <linux/list.h>
>>>>+#include <asm/atomic.h>
>>>>+
>>>>+/*
>>>>+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
>>>>+ */
>>>>+	/* resources statistics and settings */
>>>>+	struct ubparm		ub_parms[UB_RESOURCES];
>>>>+};
>>>>+
>>>
>>>
>>>I presume UB_RESOURCES value is going to change as different resources
>>>start getting tracked.
>>
>>what's wrong with it?
>>
> 
> 
> ...just that user land will need to be some how informed about that.
the same way user space knows that system call is (not) implemented.
(include unistd.h :))) )

>>>I think something like configfs should be used for user interface.  It
>>>automatically presents the right interfaces to user land (based on
>>>kernel implementation).  And you wouldn't need any changes in glibc etc.
>>
>>1. UBC doesn't require glibc modificatins.
> 
> 
> You are right not for setting the limits.  But for adding any new
> functionality related to containers....as in you just added a new system
> call to get the limits.
Do you state that glibc describes _all_ the existing system calls with some wrappers?

>>2. if you think a bit more about it, adding UB parameters doesn't
>>   require user space changes as well.
>>3. it is possible to add any kind of interface for UBC. but do you like the idea
>>   to grep 200(containers)x20(parameters) files for getting current usages?
> 
> 
> How are you doing it currently and how much more efficient it is in
> comparison to configfs?
currently it is done with a single file read.
you can grep it, sum up resources or do what ever you want from bash.
what is important! you can check whether container hits its limits
with a single command, while with configs you would have to look through
20 files...

IMHO it is convinient to have a text file representing the whole information state
and system call for applications.

>>   Do you like the idea to convert numbers to strings and back w/o
>>   thinking of data types?
> 
> 
> IMO, setting up limits and containers (themselves) is not a common
> operation.    I wouldn't be too worried about loosing those few extra
> cycles in setting them up.
it is not the question of performance...

Kirill


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-18  5:31       ` Andrew Morton
@ 2006-08-18 15:59         ` Alan Cox
  0 siblings, 0 replies; 17+ messages in thread
From: Alan Cox @ 2006-08-18 15:59 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Kirill Korotaev, rohitseth, Linux Kernel Mailing List,
	Ingo Molnar, Christoph Hellwig, Pavel Emelianov, Andrey Savochkin,
	devel, Rik van Riel, hugh, ckrm-tech, Andi Kleen

Ar Iau, 2006-08-17 am 22:31 -0700, ysgrifennodd Andrew Morton:
> > oh, its a misname. Should be ub_id. it is ID of user_beancounter
> > and has nothing to do with user id.
> 
> But it uses a uid_t.  That's more than a misnaming?

A container id in UBC is an luid which is a type of uid, and uid_t. That
follows setluid() in other operating system environments.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 2/7] UBC: core (structures, API)
  2006-08-18 11:14         ` Kirill Korotaev
@ 2006-08-18 17:51           ` Rohit Seth
  0 siblings, 0 replies; 17+ messages in thread
From: Rohit Seth @ 2006-08-18 17:51 UTC (permalink / raw)
  To: Kirill Korotaev
  Cc: Andrew Morton, Linux Kernel Mailing List, Alan Cox, Ingo Molnar,
	Christoph Hellwig, Pavel Emelianov, Andrey Savochkin, devel,
	Rik van Riel, hugh, ckrm-tech, Andi Kleen

On Fri, 2006-08-18 at 15:14 +0400, Kirill Korotaev wrote:

> >>2. if you think a bit more about it, adding UB parameters doesn't
> >>   require user space changes as well.
> >>3. it is possible to add any kind of interface for UBC. but do you like the idea
> >>   to grep 200(containers)x20(parameters) files for getting current usages?
> > 
> > 
> > How are you doing it currently and how much more efficient it is in
> > comparison to configfs?
> currently it is done with a single file read.
> you can grep it, sum up resources or do what ever you want from bash.
> what is important! you can check whether container hits its limits
> with a single command, while with configs you would have to look through
> 20 files...
> 

I think configfs provides all the required functionality that you
listed. You can define the attributes in a such a away that it prints
all the information that you need in one single read operation (I think
the limit is PAGE_SIZE....which is kind of sad).

I've just started playing with configfs for a container implementation
that I'm trying to get a better idea of details.
 
> IMHO it is convinient to have a text file representing the whole information state
> and system call for applications.
> 

There should be an easy interface for shell to be able to do the needful
as well, for example, set the limits.


-rohit


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2006-08-18 17:52 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-08-17 18:35 [RFC][PATCH 2/7] UBC: core (structures, API) Oleg Nesterov
2006-08-17 14:29 ` Kirill Korotaev
  -- strict thread matches above, loose matches on Subject: below --
2006-08-16 15:24 [RFC][PATCH] UBC: user resource beancounters Kirill Korotaev
2006-08-16 15:37 ` [RFC][PATCH 2/7] UBC: core (structures, API) Kirill Korotaev
2006-08-16 16:58   ` Alan Cox
2006-08-17 11:42     ` Kirill Korotaev
2006-08-16 17:15   ` Greg KH
2006-08-17 11:45     ` Kirill Korotaev
2006-08-17 12:14       ` Greg KH
2006-08-16 18:11   ` Rohit Seth
2006-08-16 18:18     ` Andrew Morton
2006-08-17 11:54       ` Kirill Korotaev
2006-08-17 11:53     ` Kirill Korotaev
2006-08-17 16:55       ` Rohit Seth
2006-08-18 11:14         ` Kirill Korotaev
2006-08-18 17:51           ` Rohit Seth
2006-08-18  5:31       ` Andrew Morton
2006-08-18 15:59         ` Alan Cox

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox