* [PATCH 1/5] slab: Tossing bits around
2011-10-06 16:22 [PATCH 0/5] Slab objects identifiers Pavel Emelyanov
@ 2011-10-06 16:23 ` Pavel Emelyanov
2011-10-06 16:23 ` [PATCH 2/5] slab_id: Generic slab ID infrastructure Pavel Emelyanov
` (5 subsequent siblings)
6 siblings, 0 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-06 16:23 UTC (permalink / raw)
To: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm
Cc: Glauber Costa, Cyrill Gorcunov, Andrew Morton
This is the preparation patch that just moves the sl[au]b code
around, making the further patching simpler.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
---
mm/slab.c | 28 ++++++++++++++++++----------
mm/slub.c | 26 +++++++++++++++-----------
2 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index 6d90a09..81a2063 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -538,7 +538,7 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
* reciprocal_divide(offset, cache->reciprocal_buffer_size)
*/
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
- const struct slab *slab, void *obj)
+ const struct slab *slab, const void *obj)
{
u32 offset = (obj - slab->s_mem);
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
@@ -2178,6 +2178,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
return 0;
}
+static inline size_t __slab_size(int nr_objs, unsigned long flags)
+{
+ size_t ret;
+
+ ret = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t);
+
+ return ret;
+}
+
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -2406,8 +2415,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep = NULL;
goto oops;
}
- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
- + sizeof(struct slab), align);
+
+ slab_size = ALIGN(__slab_size(cachep->num, flags), align);
/*
* If the slab has been placed off-slab, and we have enough space then
@@ -2420,8 +2429,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
- slab_size =
- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
+ slab_size = __slab_size(cachep->num, flags);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
@@ -2690,6 +2698,11 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_destroy);
+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+{
+ return (kmem_bufctl_t *) (slabp + 1);
+}
+
/*
* Get the memory for a slab management obj.
* For a slab cache when the slab descriptor is off-slab, slab descriptors
@@ -2733,11 +2746,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
return slabp;
}
-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
-{
- return (kmem_bufctl_t *) (slabp + 1);
-}
-
static void cache_init_objs(struct kmem_cache *cachep,
struct slab *slabp)
{
diff --git a/mm/slub.c b/mm/slub.c
index 7c54fe8..ab9d6fc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1414,6 +1414,18 @@ static void setup_object(struct kmem_cache *s, struct page *page,
s->ctor(object);
}
+#define need_reserve_slab_rcu \
+ (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
+
+static inline void *slab_reserved_space(struct kmem_cache *s, struct page *page, int size)
+{
+ int order = compound_order(page);
+ int offset = (PAGE_SIZE << order) - s->reserved;
+
+ VM_BUG_ON(s->reserved < size);
+ return page_address(page) + offset;
+}
+
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
@@ -1481,9 +1493,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__free_pages(page, order);
}
-#define need_reserve_slab_rcu \
- (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
-
static void rcu_free_slab(struct rcu_head *h)
{
struct page *page;
@@ -1501,18 +1510,13 @@ static void free_slab(struct kmem_cache *s, struct page *page)
if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
struct rcu_head *head;
- if (need_reserve_slab_rcu) {
- int order = compound_order(page);
- int offset = (PAGE_SIZE << order) - s->reserved;
-
- VM_BUG_ON(s->reserved != sizeof(*head));
- head = page_address(page) + offset;
- } else {
+ if (need_reserve_slab_rcu)
+ head = slab_reserved_space(s, page, sizeof(struct rcu_head));
+ else
/*
* RCU free overloads the RCU head over the LRU
*/
head = (void *)&page->lru;
- }
call_rcu(head, rcu_free_slab);
} else
--
1.5.5.6
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 2/5] slab_id: Generic slab ID infrastructure
2011-10-06 16:22 [PATCH 0/5] Slab objects identifiers Pavel Emelyanov
2011-10-06 16:23 ` [PATCH 1/5] slab: Tossing bits around Pavel Emelyanov
@ 2011-10-06 16:23 ` Pavel Emelyanov
2011-10-07 8:27 ` Glauber Costa
2011-10-06 16:23 ` [PATCH 3/5] slab_id: Slab support for IDs Pavel Emelyanov
` (4 subsequent siblings)
6 siblings, 1 reply; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-06 16:23 UTC (permalink / raw)
To: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm
Cc: Glauber Costa, Cyrill Gorcunov, Andrew Morton
The idea of how to generate an ID for an arbitrary slab object is simple:
- The ID is 128 bits
- The upper 64 bits are slab ID
- The lower 64 bits are the object index within a slab (yes, it's too many,
but it is done for simplicity - not to deal with 96-bit numbers)
- The slab ID is the 48-bit per-cpu monotonic counter mixed with 16-bit
cpuid. Even if incremented 1M times per second, the first part
will stay unique for 200+ years. The cpuid is required to make values
picked on two cpus differ.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
---
include/linux/slab.h | 17 +++++++++++++++++
init/Kconfig | 9 +++++++++
mm/Makefile | 1 +
mm/slab_obj_ids.c | 25 +++++++++++++++++++++++++
4 files changed, 52 insertions(+), 0 deletions(-)
create mode 100644 mm/slab_obj_ids.c
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 573c809..ae9c735 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -23,6 +23,7 @@
#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */
#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */
+#define SLAB_WANT_OBJIDS 0x00080000UL /* Want GENERIC_OBJECT_IDS-friendly slabs */
/*
* SLAB_DESTROY_BY_RCU - **WARNING** READ THIS!
*
@@ -162,6 +163,22 @@ void kfree(const void *);
void kzfree(const void *);
size_t ksize(const void *);
+#ifdef CONFIG_SLAB_OBJECT_IDS
+void __slab_pick_id(u64 *s_id);
+static inline void __slab_get_id(u64 *id, u64 s_id, u64 o_id)
+{
+ id[0] = o_id;
+ id[1] = s_id;
+}
+
+void k_object_id(const void *, u64 *id);
+#else
+static inline void k_object_id(const void *x, u64 *id)
+{
+ id[0] = id[1] = 0;
+}
+#endif
+
/*
* Allocator specific definitions. These are mainly used to establish optimized
* ways to convert kmalloc() calls to kmem_cache_alloc() invocations by
diff --git a/init/Kconfig b/init/Kconfig
index d627783..4c1c0e6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1200,6 +1200,15 @@ config SLUB_DEBUG
SLUB sysfs support. /sys/slab will not exist and there will be
no support for cache validation etc.
+config SLAB_OBJECT_IDS
+ default y
+ bool "Enable slab kernel object ID infrastructure"
+ depends on !SLOB
+ help
+ This option provides an infrastructure for calculating ID-s of
+ slab/slub objects. These ID-s are not based on the object location
+ in memory and thus can be shown to the userspace.
+
config COMPAT_BRK
bool "Disable heap randomization"
default y
diff --git a/mm/Makefile b/mm/Makefile
index 836e416..fb65080 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,3 +50,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_SLAB_OBJECT_IDS) += slab_obj_ids.o
diff --git a/mm/slab_obj_ids.c b/mm/slab_obj_ids.c
new file mode 100644
index 0000000..87d1693
--- /dev/null
+++ b/mm/slab_obj_ids.c
@@ -0,0 +1,25 @@
+#include <linux/percpu.h>
+
+#define SLUB_ID_CPU_SHIFT 16
+static DEFINE_PER_CPU(u64, slub_ids);
+
+void __slab_pick_id(u64 *s_id)
+{
+ int cpu;
+ u64 id;
+
+ /*
+ * The idea behind this all is very simple:
+ *
+ * The ID is the 48-bit per-cpu monotonic counter mixed with 16-bit cpuid.
+ * Even if being incremented 1M times per second the first part will stay
+ * uniqe for 200+ years. The cpuid is required to make values picked on
+ * two cpus differ.
+ */
+
+ cpu = get_cpu();
+ id = ++per_cpu(slub_ids, cpu);
+ WARN_ON_ONCE(id >> (64 - SLUB_ID_CPU_SHIFT) != 0);
+ *s_id = (id << SLUB_ID_CPU_SHIFT) | cpu;
+ put_cpu();
+}
--
1.5.5.6
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 18+ messages in thread* Re: [PATCH 2/5] slab_id: Generic slab ID infrastructure
2011-10-06 16:23 ` [PATCH 2/5] slab_id: Generic slab ID infrastructure Pavel Emelyanov
@ 2011-10-07 8:27 ` Glauber Costa
2011-10-07 8:31 ` Pavel Emelyanov
0 siblings, 1 reply; 18+ messages in thread
From: Glauber Costa @ 2011-10-07 8:27 UTC (permalink / raw)
To: Pavel Emelyanov
Cc: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm,
Cyrill Gorcunov, Andrew Morton, devel, Greg Thelen,
Suleiman Souhlal
Hi Pavel,
On 10/06/2011 08:23 PM, Pavel Emelyanov wrote:
> The idea of how to generate and ID for an arbitrary slab object is simple:
>
> - The ID is 128 bits
> - The upper 64 bits are slab ID
> - The lower 64 bits are object index withing a slab (yes, it's too many,
> but is done for simplicity - not to deal with 96-bit numbers)
> - The slab ID is the 48-bit per-cpu monotonic counter mixed with 16-bit
> cpuid. Even if being incremented 1M times per second the first part
> will stay uniqe for 200+ years. The cpuid is required to make values
> picked on two cpus differ.
So why can't we just use tighter numbers, and leave some reserved fields
instead ?
Having ids in the objects of the slab may prove useful in the future for
other uses as well.
For instance, concurrent to that, we're trying to figure out ways to
have per-cgroup pages/objects accounted in the memory controller.
The most up2date proposals create an entire kmem cache for each cgroup,
thus trivially guaranteeing uniqueness. It, however, leads to fragmentation.
Having the objects be IDed, with the cgroup being part of this id, could
help us achieve the same goal with less fragmentation.
>
> Signed-off-by: Pavel Emelyanov<xemul@parallels.com>
>
> ---
> include/linux/slab.h | 17 +++++++++++++++++
> init/Kconfig | 9 +++++++++
> mm/Makefile | 1 +
> mm/slab_obj_ids.c | 25 +++++++++++++++++++++++++
> 4 files changed, 52 insertions(+), 0 deletions(-)
> create mode 100644 mm/slab_obj_ids.c
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 573c809..ae9c735 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -23,6 +23,7 @@
> #define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */
> #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
> #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */
> +#define SLAB_WANT_OBJIDS 0x00080000UL /* Want GENERIC_OBJECT_IDS-friendly slabs */
> /*
> * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS!
> *
> @@ -162,6 +163,22 @@ void kfree(const void *);
> void kzfree(const void *);
> size_t ksize(const void *);
>
> +#ifdef CONFIG_SLAB_OBJECT_IDS
> +void __slab_pick_id(u64 *s_id);
> +static inline void __slab_get_id(u64 *id, u64 s_id, u64 o_id)
> +{
> + id[0] = o_id;
> + id[1] = s_id;
> +}
> +
> +void k_object_id(const void *, u64 *id);
> +#else
> +static inline void k_object_id(const void *x, u64 *id)
> +{
> + id[0] = id[1] = 0;
> +}
> +#endif
> +
> /*
> * Allocator specific definitions. These are mainly used to establish optimized
> * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by
> diff --git a/init/Kconfig b/init/Kconfig
> index d627783..4c1c0e6 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1200,6 +1200,15 @@ config SLUB_DEBUG
> SLUB sysfs support. /sys/slab will not exist and there will be
> no support for cache validation etc.
>
> +config SLAB_OBJECT_IDS
> + default y
> + bool "Enable slab kernel object ID infrastructure"
> + depends on !SLOB
> + help
> + This option provides an infrastructure for calculating ID-s of
> + slab/slub objects. These ID-s are not based on the object location
> + in memory and thus can be shown to the userspace.
> +
> config COMPAT_BRK
> bool "Disable heap randomization"
> default y
> diff --git a/mm/Makefile b/mm/Makefile
> index 836e416..fb65080 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -50,3 +50,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
> obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
> obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
> obj-$(CONFIG_CLEANCACHE) += cleancache.o
> +obj-$(CONFIG_SLAB_OBJECT_IDS) += slab_obj_ids.o
> diff --git a/mm/slab_obj_ids.c b/mm/slab_obj_ids.c
> new file mode 100644
> index 0000000..87d1693
> --- /dev/null
> +++ b/mm/slab_obj_ids.c
> @@ -0,0 +1,25 @@
> +#include<linux/percpu.h>
> +
> +#define SLUB_ID_CPU_SHIFT 16
> +static DEFINE_PER_CPU(u64, slub_ids);
> +
> +void __slab_pick_id(u64 *s_id)
> +{
> + int cpu;
> + u64 id;
> +
> + /*
> + * The idea behind this all is very simple:
> + *
> + * The ID is the 48-bit per-cpu monotonic counter mixed with 16-bit cpuid.
> + * Even if being incremented 1M times per second the first part will stay
> + * uniqe for 200+ years. The cpuid is required to make values picked on
> + * two cpus differ.
> + */
> +
> + cpu = get_cpu();
> + id = ++per_cpu(slub_ids, cpu);
> + WARN_ON_ONCE(id>> (64 - SLUB_ID_CPU_SHIFT) != 0);
> + *s_id = (id<< SLUB_ID_CPU_SHIFT) | cpu;
> + put_cpu();
> +}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 2/5] slab_id: Generic slab ID infrastructure
2011-10-07 8:27 ` Glauber Costa
@ 2011-10-07 8:31 ` Pavel Emelyanov
0 siblings, 0 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-07 8:31 UTC (permalink / raw)
To: Glauber Costa
Cc: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm@kvack.org,
Cyrill Gorcunov, Andrew Morton, devel@openvz.org, Greg Thelen,
Suleiman Souhlal
On 10/07/2011 12:27 PM, Glauber Costa wrote:
> Hi Pavel,
>
> On 10/06/2011 08:23 PM, Pavel Emelyanov wrote:
>> The idea of how to generate and ID for an arbitrary slab object is simple:
>>
>> - The ID is 128 bits
>> - The upper 64 bits are slab ID
>> - The lower 64 bits are object index withing a slab (yes, it's too many,
>> but is done for simplicity - not to deal with 96-bit numbers)
>> - The slab ID is the 48-bit per-cpu monotonic counter mixed with 16-bit
>> cpuid. Even if being incremented 1M times per second the first part
>> will stay uniqe for 200+ years. The cpuid is required to make values
>> picked on two cpus differ.
>
> So why can't we just use tighter numbers, and leave some reserved fields
> instead ?
Well, we have to save the ID on the slab, and for a 64-bit kernel we can already
use the 64-bit mapping field. For 32-bit kernels a 32-bit value is not enough, as
it can wrap around in several days (like 32-bit jiffies do).
> Having ids in the objects of the slab may prove useful in the future for
> other uses as well.
>
> For instance, concurrent to that, we're trying to figure out ways to
> have per-cgroup pages/objects accounted in the memory controller.
>
> The most up2date proposals create an entire kmem cache for each cgroup,
> thus trivially guaranteeing uniqueness. It however, leads to fragmentation.
> Having the objects to be IDed and being cgroup part of this id, could
> help us achieve the same goal with less fragmentation.
That's a good point! I can extend the patches to provide the space reservation
infrastructure for slabs.
Thanks,
Pavel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 3/5] slab_id: Slab support for IDs
2011-10-06 16:22 [PATCH 0/5] Slab objects identifiers Pavel Emelyanov
2011-10-06 16:23 ` [PATCH 1/5] slab: Tossing bits around Pavel Emelyanov
2011-10-06 16:23 ` [PATCH 2/5] slab_id: Generic slab ID infrastructure Pavel Emelyanov
@ 2011-10-06 16:23 ` Pavel Emelyanov
2011-10-06 16:24 ` [PATCH 4/5] slab_id: Slub " Pavel Emelyanov
` (3 subsequent siblings)
6 siblings, 0 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-06 16:23 UTC (permalink / raw)
To: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm
Cc: Glauber Costa, Cyrill Gorcunov, Andrew Morton
Just place the slab ID generation into proper places of slab.c
The slab ID value is stored right after the bufctl array.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
---
mm/slab.c | 38 ++++++++++++++++++++++++++++++++++++++
1 files changed, 38 insertions(+), 0 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index 81a2063..f87eb25 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2183,6 +2183,10 @@ static inline size_t __slab_size(int nr_objs, unsigned long flags)
size_t ret;
ret = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t);
+#ifdef CONFIG_SLAB_OBJECT_IDS
+ if (flags & SLAB_WANT_OBJIDS)
+ ret += sizeof(u64);
+#endif
return ret;
}
@@ -2703,6 +2707,39 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
return (kmem_bufctl_t *) (slabp + 1);
}
+#ifdef CONFIG_SLAB_OBJECT_IDS
+static void slab_pick_id(struct kmem_cache *c, struct slab *s)
+{
+ if (c->flags & SLAB_WANT_OBJIDS)
+ __slab_pick_id((u64 *)(slab_bufctl(s) + c->num));
+}
+
+void k_object_id(const void *x, u64 *id)
+{
+ struct page *p;
+ struct kmem_cache *c;
+ struct slab *s;
+
+ id[0] = id[1] = 0;
+
+ if (x == NULL)
+ return;
+
+ p = virt_to_head_page(x);
+ c = page_get_cache(p);
+ if (!(c->flags & SLAB_WANT_OBJIDS))
+ return;
+
+ s = page_get_slab(p);
+ __slab_get_id(id, *(u64 *)(slab_bufctl(s) + c->num),
+ obj_to_index(c, s, x));
+}
+#else
+static inline void slab_pick_id(struct kmem_cache *c, struct slab *s)
+{
+}
+#endif
+
/*
* Get the memory for a slab management obj.
* For a slab cache when the slab descriptor is off-slab, slab descriptors
@@ -2743,6 +2780,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
slabp->s_mem = objp + colour_off;
slabp->nodeid = nodeid;
slabp->free = 0;
+ slab_pick_id(cachep, slabp);
return slabp;
}
--
1.5.5.6
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 4/5] slab_id: Slub support for IDs
2011-10-06 16:22 [PATCH 0/5] Slab objects identifiers Pavel Emelyanov
` (2 preceding siblings ...)
2011-10-06 16:23 ` [PATCH 3/5] slab_id: Slab support for IDs Pavel Emelyanov
@ 2011-10-06 16:24 ` Pavel Emelyanov
2011-10-06 16:24 ` [PATCH 5/5] slab_id: Show the task's mm ID in proc Pavel Emelyanov
` (2 subsequent siblings)
6 siblings, 0 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-06 16:24 UTC (permalink / raw)
To: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm
Cc: Glauber Costa, Cyrill Gorcunov, Andrew Morton
Just place the slab ID generation in proper places of slub code.
The slub ID value is stored on the page->mapping field for 64-bit
kernel and at the end of the page itself for 32-bit ones. It's
stored on the same place where the slab rcu would be stored (the
need_reserve_slab_rcu functionality).
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
---
mm/slub.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 71 insertions(+), 0 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index ab9d6fc..398877a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1426,6 +1426,69 @@ static inline void *slab_reserved_space(struct kmem_cache *s, struct page *page,
return page_address(page) + offset;
}
+#ifdef CONFIG_SLAB_OBJECT_IDS
+#define need_reserve_slab_id \
+ (sizeof(((struct page *)NULL)->mapping) < sizeof(u64))
+
+static inline u64 *slub_id_location(struct kmem_cache *s, struct page *page)
+{
+ if (!(s->flags & SLAB_WANT_OBJIDS))
+ return NULL;
+
+ if (need_reserve_slab_id)
+ return slab_reserved_space(s, page, sizeof(u64));
+ else
+ return (u64 *)&page->mapping;
+}
+
+static void slub_pick_id(struct kmem_cache *s, struct page *page)
+{
+ u64 *s_id;
+
+ s_id = slub_id_location(s, page);
+ if (s_id != NULL)
+ __slab_pick_id(s_id);
+}
+
+static void slub_put_id(struct kmem_cache *s, struct page *p)
+{
+ /* Make buddy allocator freeing checks happy */
+ if ((!need_reserve_slab_id) && (s->flags & SLAB_WANT_OBJIDS))
+ p->mapping = NULL;
+}
+
+void k_object_id(const void *x, u64 *id)
+{
+ struct page *page;
+ u64 *s_id;
+
+ id[0] = id[1] = 0;
+
+ if (x == NULL)
+ return;
+
+ page = virt_to_head_page(x);
+ if (unlikely(!PageSlab(page)))
+ return;
+
+ s_id = slub_id_location(page->slab, page);
+ if (s_id == NULL)
+ return;
+
+ __slab_get_id(id, *s_id,
+ slab_index((void *)x, page->slab, page_address(page)));
+}
+#else
+#define need_reserve_slab_id 0
+static inline void slub_pick_id(struct page *page)
+{
+}
+
+static inline void slub_put_id(struct kmem_cache *s, struct page *p)
+{
+}
+#endif
+
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
@@ -1461,6 +1524,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
page->freelist = start;
page->inuse = 0;
page->frozen = 1;
+ slub_pick_id(s, page);
out:
return page;
}
@@ -1470,6 +1534,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
+ slub_put_id(s, page);
if (kmem_cache_debug(s)) {
void *p;
@@ -2889,6 +2954,12 @@ static int kmem_cache_open(struct kmem_cache *s,
if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
s->reserved = sizeof(struct rcu_head);
+ if (need_reserve_slab_id && (s->flags & SLAB_WANT_OBJIDS))
+ /*
+ * The id is required for alive objects only, thus it's
+ * safe to put this in the same place with the rcu head
+ */
+ s->reserved = max_t(int, s->reserved, sizeof(u64));
if (!calculate_sizes(s, -1))
goto error;
--
1.5.5.6
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 5/5] slab_id: Show the task's mm ID in proc
2011-10-06 16:22 [PATCH 0/5] Slab objects identifiers Pavel Emelyanov
` (3 preceding siblings ...)
2011-10-06 16:24 ` [PATCH 4/5] slab_id: Slub " Pavel Emelyanov
@ 2011-10-06 16:24 ` Pavel Emelyanov
2011-10-07 17:03 ` [PATCH 0/5] Slab objects identifiers Christoph Lameter
2011-10-10 18:59 ` Matt Helsley
6 siblings, 0 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-06 16:24 UTC (permalink / raw)
To: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm
Cc: Glauber Costa, Cyrill Gorcunov, Andrew Morton
This is just an example of how to use the slab IDs infrastructure.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
---
fs/proc/array.c | 17 +++++++++++++++++
fs/proc/base.c | 6 ++++++
fs/proc/internal.h | 2 ++
kernel/fork.c | 2 +-
4 files changed, 26 insertions(+), 1 deletions(-)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3a1dafd..77eb2ba 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -357,6 +357,23 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
+#ifdef CONFIG_SLAB_OBJECT_IDS
+int proc_pid_objects(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ u64 id[2];
+
+ task_lock(task);
+
+ k_object_id(task->mm, id);
+ seq_printf(m, "mm: %016Lx%016Lx\n", id[0], id[1]);
+
+ task_unlock(task);
+
+ return 0;
+}
+#endif
+
static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5eb0206..4ffc31c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2792,6 +2792,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#endif
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
+#ifdef CONFIG_SLAB_OBJECT_IDS
+ ONE("objects", S_IRUGO, proc_pid_objects),
+#endif
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUGO, proc_pid_personality),
INF("limits", S_IRUGO, proc_pid_limits),
@@ -3141,6 +3144,9 @@ static const struct pid_entry tid_base_stuff[] = {
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
+#ifdef CONFIG_SLAB_OBJECT_IDS
+ ONE("objects", S_IRUGO, proc_pid_objects),
+#endif
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUGO, proc_pid_personality),
INF("limits", S_IRUGO, proc_pid_limits),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7838e5c..ac19d98 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -49,6 +49,8 @@ extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
+extern int proc_pid_objects(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task);
extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e6b6f4..853e96e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1597,7 +1597,7 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_WANT_OBJIDS, NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
nsproxy_cache_init();
--
1.5.5.6
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 18+ messages in thread* Re: [PATCH 0/5] Slab objects identifiers
2011-10-06 16:22 [PATCH 0/5] Slab objects identifiers Pavel Emelyanov
` (4 preceding siblings ...)
2011-10-06 16:24 ` [PATCH 5/5] slab_id: Show the task's mm ID in proc Pavel Emelyanov
@ 2011-10-07 17:03 ` Christoph Lameter
2011-10-10 10:20 ` Pavel Emelyanov
2011-10-10 18:59 ` Matt Helsley
6 siblings, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2011-10-07 17:03 UTC (permalink / raw)
To: Pavel Emelyanov
Cc: Pekka Enberg, Matt Mackall, linux-mm, Glauber Costa,
Cyrill Gorcunov, Andrew Morton
On Thu, 6 Oct 2011, Pavel Emelyanov wrote:
> While doing the checkpoint-restore in the userspace we need to determine
whether various kernel objects (like mm_struct-s or file_struct-s) are shared
> between tasks and restore this state.
>
> The 2nd step can for now be solved by using respective CLONE_XXX flags and
> the unshare syscall, while there's currently no ways for solving the 1st one.
>
> One of the ways for checking whether two tasks share e.g. an mm_struct is to
> provide some mm_struct ID of a task to its proc file. The best from the
> performance point of view ID is the object address in the kernel, but showing
> them to the userspace is not good for performance reasons. Thus the ID should
> not be calculated based on the object address.
If two tasks share an mm_struct then the mm_struct pointer (task->mm) will
point to the same address. Objects are already uniquely identified by
their address. If you store the physical address with the object content
when transferring then you can verify that they share the mm_struct.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 0/5] Slab objects identifiers
2011-10-07 17:03 ` [PATCH 0/5] Slab objects identifiers Christoph Lameter
@ 2011-10-10 10:20 ` Pavel Emelyanov
2011-10-10 15:24 ` Christoph Lameter
2011-10-13 7:12 ` Pekka Enberg
0 siblings, 2 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-10 10:20 UTC (permalink / raw)
To: Christoph Lameter
Cc: Pekka Enberg, Matt Mackall, linux-mm@kvack.org, Glauber Costa,
Cyrill Gorcunov, Andrew Morton
On 10/07/2011 09:03 PM, Christoph Lameter wrote:
> On Thu, 6 Oct 2011, Pavel Emelyanov wrote:
>
>> While doing the checkpoint-restore in the userspace we need to determine
>> whether various kernel objects (like mm_struct-s of file_struct-s) are shared
>> between tasks and restore this state.
>>
>> The 2nd step can for now be solved by using respective CLONE_XXX flags and
>> the unshare syscall, while there's currently no ways for solving the 1st one.
>>
>> One of the ways for checking whether two tasks share e.g. an mm_struct is to
>> provide some mm_struct ID of a task to its proc file. The best from the
>> performance point of view ID is the object address in the kernel, but showing
>> them to the userspace is not good for performance reasons. Thus the ID should
>> not be calculated based on the object address.
>
> If two tasks share an mm_struct then the mm_struct pointer (task->mm) will
> point to the same address. Objects are already uniquely identified by
> their address.
Yes of course, but ...
> If you store the physical address with the object content
> when transferring then you can verify that they share the mm_struct.
... are we all OK with showing kernel addresses to the userspace? I thought the %pK
format was invented specially to handle such leaks.
If we are, then (as I said in the first letter) we should just show them and forget
this set. If we're not - we should invent something more straightforward, and this
set is an attempt at doing this.
Thanks,
Pavel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 0/5] Slab objects identifiers
2011-10-10 10:20 ` Pavel Emelyanov
@ 2011-10-10 15:24 ` Christoph Lameter
2011-10-10 15:46 ` Pavel Emelyanov
2011-10-13 7:12 ` Pekka Enberg
1 sibling, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2011-10-10 15:24 UTC (permalink / raw)
To: Pavel Emelyanov
Cc: Pekka Enberg, Matt Mackall, linux-mm@kvack.org, Glauber Costa,
Cyrill Gorcunov, Andrew Morton
On Mon, 10 Oct 2011, Pavel Emelyanov wrote:
> > If you store the physical address with the object content
> > when transferring then you can verify that they share the mm_struct.
>
> ... are we all OK with showing kernel addresses to the userspace? I thought the %pK
> format was invented specially to handle such leaks.
Not in general but I think for process migration we are fine with the
handling of the addresses. Doesn't process migration require root anyway?
Adding additional metadata to each slab or object is certainly not
acceptable because it slows down operations for everyone.
> If we are, then (as I said in the first letter) we should just show them and forget
> this set. If we're not - we should invent smth more straightforward and this set is
> an attempt for doing this.
Well one solution is to show the addresses only to members of a specific
group or only to root.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 0/5] Slab objects identifiers
2011-10-10 15:24 ` Christoph Lameter
@ 2011-10-10 15:46 ` Pavel Emelyanov
0 siblings, 0 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-10 15:46 UTC (permalink / raw)
To: Christoph Lameter
Cc: Pekka Enberg, Matt Mackall, linux-mm@kvack.org, Glauber Costa,
Cyrill Gorcunov, Andrew Morton
On 10/10/2011 07:24 PM, Christoph Lameter wrote:
> On Mon, 10 Oct 2011, Pavel Emelyanov wrote:
>
>>> If you store the physical address with the object content
>>> when transferring then you can verify that they share the mm_struct.
>>
>> ... are we all OK with showing kernel addresses to the userspace? I thought the %pK
>> format was invented specially to handle such leaks.
>
> Not in general but I think for process migration we are fine with the
> handling of the addresses. Doesn't process migration require root anyway?
Well, currently the most strict requirement for checkpoint part is the
ptrace_may_access() one, which is not necessarily root. And I'd prefer not
restricting it further without need.
> Adding additional metadata to each slab or object is certainly not
> acceptable because it slows down operations for everyone.
Please note, that this ID generation thing is not required for EVERY kmem cache
in the system, only for a couple of them. On 64 bit kernels this ID is stored on
the struct page itself, thus not making object density worse; on 32 bit kernels
the caches I need to mark with this flag already have gaps between objects and
thus do not make density worse either.
Besides, the slab ID generation is a single percpu counter and is called in the
places where percpu areas are already hot in caches, thus no noticeable penalty
should be seen.
Besides, in OpenVZ we modify slub code to store on the page with objects an array
of pointers (size == number of objects) and no performance tests show any degradation
due to this.
If I misunderstood your concern, please elaborate.
>> If we are, then (as I said in the first letter) we should just show them and forget
>> this set. If we're not - we should invent smth more straightforward and this set is
>> an attempt for doing this.
>
> Well one solution is to show the addresses only to members of a specific
> group or only to root.
I agree with that. Need to sort out the issues above.
Thanks,
Pavel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 0/5] Slab objects identifiers
2011-10-10 10:20 ` Pavel Emelyanov
2011-10-10 15:24 ` Christoph Lameter
@ 2011-10-13 7:12 ` Pekka Enberg
2011-10-13 11:25 ` Pavel Emelyanov
1 sibling, 1 reply; 18+ messages in thread
From: Pekka Enberg @ 2011-10-13 7:12 UTC (permalink / raw)
To: Pavel Emelyanov
Cc: Christoph Lameter, Matt Mackall, linux-mm@kvack.org,
Glauber Costa, Cyrill Gorcunov, Andrew Morton
On Mon, Oct 10, 2011 at 1:20 PM, Pavel Emelyanov <xemul@parallels.com> wrote:
>> If two tasks share an mm_struct then the mm_struct pointer (task->mm) will
>> point to the same address. Objects are already uniquely identified by
>> their address.
>
> Yes of course, but ...
>
>> If you store the physical address with the object content
>> when transferring then you can verify that they share the mm_struct.
>
> ... are we all OK with showing kernel addresses to the userspace? I thought the %pK
> format was invented specially to handle such leaks.
I don't think it's worth it to try to hide kernel addresses for
checkpoint/restart.
> If we are, then (as I said in the first letter) we should just show them and forget
> this set. If we're not - we should invent smth more straightforward and this set is
> an attempt for doing this.
Does this ID thing need to happen in the slab layer?
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 0/5] Slab objects identifiers
2011-10-13 7:12 ` Pekka Enberg
@ 2011-10-13 11:25 ` Pavel Emelyanov
2011-10-13 12:12 ` Pekka Enberg
0 siblings, 1 reply; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-13 11:25 UTC (permalink / raw)
To: Pekka Enberg
Cc: Christoph Lameter, Matt Mackall, linux-mm@kvack.org,
Glauber Costa, Cyrill Gorcunov, Andrew Morton
On 10/13/2011 11:12 AM, Pekka Enberg wrote:
> On Mon, Oct 10, 2011 at 1:20 PM, Pavel Emelyanov <xemul@parallels.com> wrote:
>>> If two tasks share an mm_struct then the mm_struct pointer (task->mm) will
>>> point to the same address. Objects are already uniquely identified by
>>> their address.
>>
>> Yes of course, but ...
>>
>>> If you store the physical address with the object content
>>> when transferring then you can verify that they share the mm_struct.
>>
>> ... are we all OK with showing kernel addresses to the userspace? I thought the %pK
>> format was invented specially to handle such leaks.
>
> I don't think it's worth it to try to hide kernel addresses for
> checkpoint/restart.
We don't need to know anything about kernel internals for checkpoint-restart, that's
why I said, that abstract identifiers are just fine.
>> If we are, then (as I said in the first letter) we should just show them and forget
>> this set. If we're not - we should invent smth more straightforward and this set is
>> an attempt for doing this.
>
> Does this ID thing need to happen in the slab layer?
Not necessarily, of course, but if we're going to show some identifier of an object
we have 2 choices - either we generate this ID independently (with e.g. IDA), but
this is slow, or we use some knowledge of an object as a bunch of bytes in memory.
These slab IDs thing is an attempt to implement the 2nd approach.
The question I'm trying to answer with this is - do task A and task B have their mm
shared or not? Showing an ID answers one. Maybe there exists another way, but I haven't
invented it yet and decided to send this set out for discussion (the "release early"
idiom). If slab maintainers say "no, we don't accept this at all ever", then of course
I'll have to think further, but if the concept is suitable, but needs some refinement -
let's do it.
> Pekka
> .
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 0/5] Slab objects identifiers
2011-10-13 11:25 ` Pavel Emelyanov
@ 2011-10-13 12:12 ` Pekka Enberg
2011-10-13 12:23 ` Pavel Emelyanov
0 siblings, 1 reply; 18+ messages in thread
From: Pekka Enberg @ 2011-10-13 12:12 UTC (permalink / raw)
To: Pavel Emelyanov
Cc: Christoph Lameter, Matt Mackall, linux-mm@kvack.org,
Glauber Costa, Cyrill Gorcunov, Andrew Morton
On Thu, Oct 13, 2011 at 2:25 PM, Pavel Emelyanov <xemul@parallels.com> wrote:
>>> ... are we all OK with showing kernel addresses to the userspace? I thought the %pK
>>> format was invented specially to handle such leaks.
>>
>> I don't think it's worth it to try to hide kernel addresses for
>> checkpoint/restart.
>
> We don't need to know anything about kernel internals for checkpoint-restart, that's
> why I said, that abstract identifiers are just fine.
OK.
>>> If we are, then (as I said in the first letter) we should just show them and forget
>>> this set. If we're not - we should invent smth more straightforward and this set is
>>> an attempt for doing this.
>>
>> Does this ID thing need to happen in the slab layer?
>
> Not necessarily, of course, but if we're going to show some identifier of an object
> we have 2 choices - either we generate this ID independently (with e.g. IDA), but
> this is slow, or we use some knowledge of an object as a bunch of bytes in memory.
> These slab IDs thing is an attempt to implement the 2nd approach.
Why is the first approach slow? I fully agree that unique IDs are probably the
way to go here but why don't you just add a new member to struct mm_struct and
initialize it in mm_alloc() and mm_dup()?
> The question I'm trying to answer with this is - do task A and task B have their mm
> shared or not? Showing an ID answers one. Maybe there exists another way, but I haven't
> invented it yet and decided to send this set out for discussion (the "release early"
> idiom). If slab maintainers say "no, we don't accept this at all ever", then of course
> I'll have to think further, but if the concept is suitable, but needs some refinement -
> let's do it.
Oh, I much appreciate that you sent this early. I'm not completely against doing
this in the slab layer but I need much more convincing. I expect most distros to
enable checkpoint/restart so this ID mechanism is going to be default
on for slab.
Pekka
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 0/5] Slab objects identifiers
2011-10-13 12:12 ` Pekka Enberg
@ 2011-10-13 12:23 ` Pavel Emelyanov
0 siblings, 0 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-13 12:23 UTC (permalink / raw)
To: Pekka Enberg
Cc: Christoph Lameter, Matt Mackall, linux-mm@kvack.org,
Glauber Costa, Cyrill Gorcunov, Andrew Morton
>>> Does this ID thing need to happen in the slab layer?
>>
>> Not necessarily, of course, but if we're going to show some identifier of an object
>> we have 2 choices - either we generate this ID independently (with e.g. IDA), but
>> this is slow, or we use some knowledge of an object as a bunch of bytes in memory.
>> These slab IDs thing is an attempt to implement the 2nd approach.
>
> Why is the first approach slow? I fully agree that unique IDs are probably the
> way to go here but why don't you just add a new member to struct mm_struct and
> initialize it in mm_alloc() and mm_dup()?
For several reasons:
1. I will need the same for fdtable, fs, files (the struct file can be shared by two
different fdtables), namespaces, etc and try to make it more generic.
2. IDA allocation for mm will slow down fork() (for other objects - other operations).
3. My trick with increasing percpu will require 64-bit field on each object which is
probably acceptable for mm_struct, but is critical for struct file and fs_struct, as
they are already quite small.
>> The question I'm trying to answer with this is - do task A and task B have their mm
>> shared or not? Showing an ID answers one. Maybe there exists another way, but I haven't
>> invented it yet and decided to send this set out for discussion (the "release early"
>> idiom). If slab maintainers say "no, we don't accept this at all ever", then of course
>> I'll have to think further, but if the concept is suitable, but needs some refinement -
>> let's do it.
>
> Oh, I much appreciate that you sent this early. I'm not completely against doing
> this in the slab layer but I need much more convincing. I expect most distros to
> enable checkpoint/restart so this ID mechanism is going to be default
> on for slab.
>
> Pekka
> .
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 0/5] Slab objects identifiers
2011-10-06 16:22 [PATCH 0/5] Slab objects identifiers Pavel Emelyanov
` (5 preceding siblings ...)
2011-10-07 17:03 ` [PATCH 0/5] Slab objects identifiers Christoph Lameter
@ 2011-10-10 18:59 ` Matt Helsley
2011-10-11 7:50 ` Pavel Emelyanov
6 siblings, 1 reply; 18+ messages in thread
From: Matt Helsley @ 2011-10-10 18:59 UTC (permalink / raw)
To: Pavel Emelyanov
Cc: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm,
Glauber Costa, Cyrill Gorcunov, Andrew Morton
On Thu, Oct 06, 2011 at 08:22:17PM +0400, Pavel Emelyanov wrote:
> Hi.
>
>
> While doing the checkpoint-restore in the userspace we need to determine
> whether various kernel objects (like mm_struct-s or file_struct-s) are shared
> between tasks and restore this state.
> The 2nd step can for now be solved by using respective CLONE_XXX flags and
> the unshare syscall, while there are currently no ways for solving the 1st one.
>
> One of the ways for checking whether two tasks share e.g. an mm_struct is to
> provide some mm_struct ID of a task to its proc file. The best from the
> performance point of view ID is the object address in the kernel, but showing
> them to the userspace is not good for security reasons. Thus the ID should
> not be calculated based on the object address.
>
> The proposal is to have the ID for slab objects as the mixture of two things -
> the number of an object on the slub and the ID of a slab, which is calculated
> simply by getting a monotonic 64 bit number at the slab allocation time which
> gives us 200+ years of stable work (see comment in the patch #2) :)
This just strikes me as the wrong approach. Userspace should not need to know
the structures the kernel is using to implement the sharing that's possible
with the clone flags. The userspace interface should be framed such that the
kernel is not exporting the relationship between these structures so much as
the relationship between the tasks which those structures reflect.
Perhaps you could write the interface so that it shows the clone flags
one would use to re-create the child task from the parent instead of
trying to output these ids.
Also, putting this in slab seems like a poor choice -- isn't instrumenting
the allocator rather invasive? Especially when we're talking about a
handful of structs in comparison to everything else the allocators
handle?
Cheers,
-Matt Helsley
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 0/5] Slab objects identifiers
2011-10-10 18:59 ` Matt Helsley
@ 2011-10-11 7:50 ` Pavel Emelyanov
0 siblings, 0 replies; 18+ messages in thread
From: Pavel Emelyanov @ 2011-10-11 7:50 UTC (permalink / raw)
To: Matt Helsley
Cc: Christoph Lameter, Pekka Enberg, Matt Mackall, linux-mm@kvack.org,
Glauber Costa, Cyrill Gorcunov, Andrew Morton
On 10/10/2011 10:59 PM, Matt Helsley wrote:
> On Thu, Oct 06, 2011 at 08:22:17PM +0400, Pavel Emelyanov wrote:
Matt, thanks for the reply! Please, see my comments below.
>> Hi.
>>
>>
>> While doing the checkpoint-restore in the userspace we need to determine
>> whether various kernel objects (like mm_struct-s or file_struct-s) are shared
>> between tasks and restore this state.
>> The 2nd step can for now be solved by using respective CLONE_XXX flags and
>> the unshare syscall, while there are currently no ways for solving the 1st one.
>>
>> One of the ways for checking whether two tasks share e.g. an mm_struct is to
>> provide some mm_struct ID of a task to its proc file. The best from the
>> performance point of view ID is the object address in the kernel, but showing
>> them to the userspace is not good for security reasons. Thus the ID should
>> not be calculated based on the object address.
>>
>> The proposal is to have the ID for slab objects as the mixture of two things -
>> the number of an object on the slub and the ID of a slab, which is calculated
>> simply by getting a monotonic 64 bit number at the slab allocation time which
>> gives us 200+ years of stable work (see comment in the patch #2) :)
>
> This just strikes me as the wrong approach. Userspace should not need to know
> the structures the kernel is using to implement the sharing that's possible
> with the clone flags. The userspace interface should be framed such that the
> kernel is not exporting the relationship between these structures so much as
> the relationship between the tasks which those structures reflect.
> Perhaps you could write the interface so that it shows the clone flags
> one would use to re-create the child task from the parent instead of
> trying to output these ids.
Well, another API would also work for us, I just propose this one as one of the
approaches.
Your proposal with showing CLONE flags sounds very promising, but how can it handle
the case when a task shares its e.g. mm_struct with some other task which is not its
parent? Like if we create the chain of 3 tasks all with the shared mm_struct and then
the middle one calls execve unsharing one (I saw MySQL doing this). Besides "reparenting
to init" and the unshare syscall may create more interesting objects sharing mosaic and
thus we need the API which is as generic as "do these two tasks share an mm?".
Looking a little bit forward, if the same API can answer a question "does this *group*
of tasks sharing one mm_struct share it with someone else?" this would also be very
helpful.
> Also, putting this in slab seems like a poor choice -- isn't instrumenting
> the allocator rather invasive?
Well, the payload in my patches is not intrusive - it just adds the code, not tosses
the existing one.
> Especialy when we're talking about a handful of structs in comparison to everything
> else the allocators handle?
I did this functionality so that it doesn't affect those kmem caches that we don't need
to provide us the IDs at all.
> Cheers,
> -Matt Helsley
> .
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 18+ messages in thread