* [RFC 1/2]swap: add a simple buddy allocator
@ 2012-10-22 2:30 Shaohua Li
2012-10-22 21:10 ` Andrew Morton
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Shaohua Li @ 2012-10-22 2:30 UTC (permalink / raw)
To: linux-mm; +Cc: akpm, hughd, riel
I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30%
CPU time (when cluster is hard to find), which becomes a bottleneck.
scan_swap_map() scans a byte array to search a 256 page cluster, which is very
slow.
Here I introduced a simple buddy allocator. Since we only care about 256 pages
cluster, we can just use a counter to implement the buddy allocator. Every 256
pages use one int to store the counter, so searching cluster is very efficient.
With this, scan_swap_map() overhead disappears.
This might help low end SD card swap too. Because if the cluster is aligned, SD
firmware can do flash erase more efficiently.
The downside is the cluster must be aligned to 256 pages, which will reduce the
chance to find a cluster.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
include/linux/swap.h | 1
mm/swapfile.c | 74 +++++++++++++++++++++++++++++++++++++++------------
2 files changed, 58 insertions(+), 17 deletions(-)
Index: linux/include/linux/swap.h
===================================================================
--- linux.orig/include/linux/swap.h 2012-10-22 09:20:40.802165198 +0800
+++ linux/include/linux/swap.h 2012-10-22 09:20:50.462043746 +0800
@@ -185,6 +185,7 @@ struct swap_info_struct {
signed char next; /* next type on the swap list */
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
+ unsigned int *swap_cluster_count; /* cluster counter */
unsigned int lowest_bit; /* index of first free in swap_map */
unsigned int highest_bit; /* index of last free in swap_map */
unsigned int pages; /* total of usable pages of swap */
Index: linux/mm/swapfile.c
===================================================================
--- linux.orig/mm/swapfile.c 2012-10-22 09:20:40.794165269 +0800
+++ linux/mm/swapfile.c 2012-10-22 09:21:34.317493506 +0800
@@ -182,6 +182,22 @@ static int wait_for_discard(void *word)
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256
+static inline void inc_swap_cluster_count(unsigned int *swap_cluster_count,
+ unsigned long page_nr)
+{
+ swap_cluster_count += page_nr/SWAPFILE_CLUSTER;
+ VM_BUG_ON(*swap_cluster_count >= SWAPFILE_CLUSTER);
+ (*swap_cluster_count)++;
+}
+
+static inline void dec_swap_cluster_count(unsigned int *swap_cluster_count,
+ unsigned long page_nr)
+{
+ swap_cluster_count += page_nr/SWAPFILE_CLUSTER;
+ VM_BUG_ON(*swap_cluster_count == 0);
+ (*swap_cluster_count)--;
+}
+
static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned char usage)
{
@@ -206,6 +222,8 @@ static unsigned long scan_swap_map(struc
scan_base = offset = si->cluster_next;
if (unlikely(!si->cluster_nr--)) {
+ unsigned long base;
+
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
si->cluster_nr = SWAPFILE_CLUSTER - 1;
goto checks;
@@ -235,15 +253,14 @@ static unsigned long scan_swap_map(struc
*/
if (!(si->flags & SWP_SOLIDSTATE))
scan_base = offset = si->lowest_bit;
- last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
- /* Locate the first empty (unaligned) cluster */
- for (; last_in_cluster <= si->highest_bit; offset++) {
- if (si->swap_map[offset])
- last_in_cluster = offset + SWAPFILE_CLUSTER;
- else if (offset == last_in_cluster) {
+ /* Locate the first empty (aligned) cluster */
+ for (base = round_up(offset, SWAPFILE_CLUSTER);
+ base <= si->highest_bit; base += SWAPFILE_CLUSTER) {
+ if (!si->swap_cluster_count[base/SWAPFILE_CLUSTER]) {
+ offset = base;
+ last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
spin_lock(&swap_lock);
- offset -= SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
found_free_cluster = 1;
@@ -256,15 +273,14 @@ static unsigned long scan_swap_map(struc
}
offset = si->lowest_bit;
- last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
- /* Locate the first empty (unaligned) cluster */
- for (; last_in_cluster < scan_base; offset++) {
- if (si->swap_map[offset])
- last_in_cluster = offset + SWAPFILE_CLUSTER;
- else if (offset == last_in_cluster) {
+ /* Locate the first empty (aligned) cluster */
+ for (base = round_up(offset, SWAPFILE_CLUSTER);
+ base < scan_base; base += SWAPFILE_CLUSTER) {
+ if (!si->swap_cluster_count[base/SWAPFILE_CLUSTER]) {
+ offset = base;
+ last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
spin_lock(&swap_lock);
- offset -= SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
found_free_cluster = 1;
@@ -315,6 +331,7 @@ checks:
si->highest_bit = 0;
}
si->swap_map[offset] = usage;
+ inc_swap_cluster_count(si->swap_cluster_count, offset);
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;
@@ -549,6 +566,7 @@ static unsigned char swap_entry_free(str
/* free if no reference */
if (!usage) {
+ dec_swap_cluster_count(p->swap_cluster_count, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -1445,6 +1463,7 @@ static int setup_swap_extents(struct swa
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
+ unsigned int *swap_cluster_count,
unsigned long *frontswap_map)
{
int i, prev;
@@ -1455,6 +1474,7 @@ static void enable_swap_info(struct swap
else
p->prio = --least_priority;
p->swap_map = swap_map;
+ p->swap_cluster_count = swap_cluster_count;
frontswap_map_set(p, frontswap_map);
p->flags |= SWP_WRITEOK;
nr_swap_pages += p->pages;
@@ -1480,6 +1500,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
{
struct swap_info_struct *p = NULL;
unsigned char *swap_map;
+ unsigned int *swap_cluster_count;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
@@ -1556,7 +1577,8 @@ SYSCALL_DEFINE1(swapoff, const char __us
* sys_swapoff for this swap_info_struct at this point.
*/
/* re-insert swap space back into swap_list */
- enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
+ enable_swap_info(p, p->prio, p->swap_map, p->swap_cluster_count,
+ frontswap_map_get(p));
goto out_dput;
}
@@ -1581,11 +1603,14 @@ SYSCALL_DEFINE1(swapoff, const char __us
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
+ swap_cluster_count = p->swap_cluster_count;
+ p->swap_cluster_count = NULL;
p->flags = 0;
frontswap_invalidate_area(type);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ vfree(swap_cluster_count);
vfree(frontswap_map_get(p));
/* Destroy swap account informatin */
swap_cgroup_swapoff(type);
@@ -1896,6 +1921,7 @@ static unsigned long read_swap_header(st
static int setup_swap_map_and_extents(struct swap_info_struct *p,
union swap_header *swap_header,
unsigned char *swap_map,
+ unsigned int *swap_cluster_count,
unsigned long maxpages,
sector_t *span)
{
@@ -1912,11 +1938,16 @@ static int setup_swap_map_and_extents(st
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
nr_good_pages--;
+ inc_swap_cluster_count(swap_cluster_count, page_nr);
}
}
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+ inc_swap_cluster_count(swap_cluster_count, i);
+
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
+ inc_swap_cluster_count(swap_cluster_count, 0);
p->max = maxpages;
p->pages = nr_good_pages;
nr_extents = setup_swap_extents(p, span);
@@ -1946,6 +1977,7 @@ SYSCALL_DEFINE2(swapon, const char __use
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
+ unsigned int *swap_cluster_count = NULL;
unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
@@ -2020,12 +2052,19 @@ SYSCALL_DEFINE2(swapon, const char __use
goto bad_swap;
}
+ swap_cluster_count = vzalloc(DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER) *
+ sizeof(int));
+ if (!swap_cluster_count) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
goto bad_swap;
nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
- maxpages, &span);
+ swap_cluster_count, maxpages, &span);
if (unlikely(nr_extents < 0)) {
error = nr_extents;
goto bad_swap;
@@ -2048,7 +2087,7 @@ SYSCALL_DEFINE2(swapon, const char __use
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map, frontswap_map);
+ enable_swap_info(p, prio, swap_map, swap_cluster_count, frontswap_map);
printk(KERN_INFO "Adding %uk swap on %s. "
"Priority:%d extents:%d across:%lluk %s%s%s\n",
@@ -2078,6 +2117,7 @@ bad_swap:
p->flags = 0;
spin_unlock(&swap_lock);
vfree(swap_map);
+ vfree(swap_cluster_count);
if (swap_file) {
if (inode && S_ISREG(inode->i_mode)) {
mutex_unlock(&inode->i_mutex);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC 1/2]swap: add a simple buddy allocator
2012-10-22 2:30 [RFC 1/2]swap: add a simple buddy allocator Shaohua Li
@ 2012-10-22 21:10 ` Andrew Morton
2012-10-25 0:20 ` Minchan Kim
2012-10-25 11:35 ` Ni zhan Chen
2 siblings, 0 replies; 5+ messages in thread
From: Andrew Morton @ 2012-10-22 21:10 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-mm, hughd, riel
On Mon, 22 Oct 2012 10:30:51 +0800
Shaohua Li <shli@kernel.org> wrote:
> I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30%
> CPU time (when cluster is hard to find), which becomes a bottleneck.
> scan_swap_map() scans a byte array to search a 256 page cluster, which is very
> slow.
>
> Here I introduced a simple buddy allocator. Since we only care about 256 pages
> cluster, we can just use a counter to implement the buddy allocator. Every 256
> pages use one int to store the counter, so searching cluster is very efficient.
> With this, scan_swap_map() overhead disappears.
>
> This might help low end SD card swap too. Because if the cluster is aligned, SD
> firmware can do flash erase more efficiently.
>
> The downside is the cluster must be aligned to 256 pages, which will reduce the
> chance to find a cluster.
>
hm. How serious is this downside?
>
> ...
>
> @@ -2020,12 +2052,19 @@ SYSCALL_DEFINE2(swapon, const char __use
> goto bad_swap;
> }
>
> + swap_cluster_count = vzalloc(DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER) *
> + sizeof(int));
[ Actually sizeof(unsigned int). Or, probably safer, sizeof(*swap_cluster_count)]
How large is this allocation? swap-size-in-bytes/256k, methinks. So
64kbytes for a 16G swap partition? That sounds acceptable. Something
like lib/flex_array.c could be used here perhaps, although that would
involve memory allocations at awkward times.
> + if (!swap_cluster_count) {
> + error = -ENOMEM;
> + goto bad_swap;
> + }
I shall await Hugh review on this patchset ;)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC 1/2]swap: add a simple buddy allocator
2012-10-22 2:30 [RFC 1/2]swap: add a simple buddy allocator Shaohua Li
2012-10-22 21:10 ` Andrew Morton
@ 2012-10-25 0:20 ` Minchan Kim
2012-10-26 2:29 ` Shaohua Li
2012-10-25 11:35 ` Ni zhan Chen
2 siblings, 1 reply; 5+ messages in thread
From: Minchan Kim @ 2012-10-25 0:20 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-mm, akpm, hughd, riel
Hi Shaohua,
Your idea does make sense to me.
Below just a few nitpick.
On Mon, Oct 22, 2012 at 10:30:51AM +0800, Shaohua Li wrote:
> I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30%
> CPU time (when cluster is hard to find), which becomes a bottleneck.
> scan_swap_map() scans a byte array to search a 256 page cluster, which is very
> slow.
>
> Here I introduced a simple buddy allocator. Since we only care about 256 pages
> cluster, we can just use a counter to implement the buddy allocator. Every 256
> pages use one int to store the counter, so searching cluster is very efficient.
> With this, scan_swap_map() overhead disappears.
>
> This might help low end SD card swap too. Because if the cluster is aligned, SD
> firmware can do flash erase more efficiently.
Indeed.
>
> The downside is the cluster must be aligned to 256 pages, which will reduce the
> chance to find a cluster.
It would not be good for rotation devices. Can't we make it only for discardable
devices?
>
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
> include/linux/swap.h | 1
> mm/swapfile.c | 74 +++++++++++++++++++++++++++++++++++++++------------
> 2 files changed, 58 insertions(+), 17 deletions(-)
>
> Index: linux/include/linux/swap.h
> ===================================================================
> --- linux.orig/include/linux/swap.h 2012-10-22 09:20:40.802165198 +0800
> +++ linux/include/linux/swap.h 2012-10-22 09:20:50.462043746 +0800
> @@ -185,6 +185,7 @@ struct swap_info_struct {
> signed char next; /* next type on the swap list */
> unsigned int max; /* extent of the swap_map */
> unsigned char *swap_map; /* vmalloc'ed array of usage counts */
> + unsigned int *swap_cluster_count; /* cluster counter */
swap_cluster_count is rather awkward.
How about this?
unsigned int *inuse_pages_in_cluster; /* used pages in a cluster */
> unsigned int lowest_bit; /* index of first free in swap_map */
> unsigned int highest_bit; /* index of last free in swap_map */
> unsigned int pages; /* total of usable pages of swap */
> Index: linux/mm/swapfile.c
> ===================================================================
> --- linux.orig/mm/swapfile.c 2012-10-22 09:20:40.794165269 +0800
> +++ linux/mm/swapfile.c 2012-10-22 09:21:34.317493506 +0800
> @@ -182,6 +182,22 @@ static int wait_for_discard(void *word)
> #define SWAPFILE_CLUSTER 256
> #define LATENCY_LIMIT 256
>
> +static inline void inc_swap_cluster_count(unsigned int *swap_cluster_count,
> + unsigned long page_nr)
> +{
> + swap_cluster_count += page_nr/SWAPFILE_CLUSTER;
> + VM_BUG_ON(*swap_cluster_count >= SWAPFILE_CLUSTER);
> + (*swap_cluster_count)++;
> +}
How about this?
static inline void inc_swap_map(struct swap_info_struct *si, unsigned long offset)
{
int cluster_idx = offset / SWAPFILE_CLUSTER;
si->inuse_pages_in_cluster[cluster_idx]++;
}
> +
> +static inline void dec_swap_cluster_count(unsigned int *swap_cluster_count,
> + unsigned long page_nr)
> +{
> + swap_cluster_count += page_nr/SWAPFILE_CLUSTER;
> + VM_BUG_ON(*swap_cluster_count == 0);
> + (*swap_cluster_count)--;
> +}
static inline void dec_swap_map(struct swap_info_struct *si, unsigned long offset)
{
int cluster_idx = offset / SWAPFILE_CLUSTER;
si->inuse_pages_in_cluster[cluster_idx]--;
}
> +
> static unsigned long scan_swap_map(struct swap_info_struct *si,
> unsigned char usage)
> {
> @@ -206,6 +222,8 @@ static unsigned long scan_swap_map(struc
> scan_base = offset = si->cluster_next;
>
> if (unlikely(!si->cluster_nr--)) {
> + unsigned long base;
> +
> if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
> si->cluster_nr = SWAPFILE_CLUSTER - 1;
> goto checks;
> @@ -235,15 +253,14 @@ static unsigned long scan_swap_map(struc
> */
> if (!(si->flags & SWP_SOLIDSTATE))
> scan_base = offset = si->lowest_bit;
> - last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
>
> - /* Locate the first empty (unaligned) cluster */
> - for (; last_in_cluster <= si->highest_bit; offset++) {
> - if (si->swap_map[offset])
> - last_in_cluster = offset + SWAPFILE_CLUSTER;
> - else if (offset == last_in_cluster) {
> + /* Locate the first empty (aligned) cluster */
> + for (base = round_up(offset, SWAPFILE_CLUSTER);
> + base <= si->highest_bit; base += SWAPFILE_CLUSTER) {
> + if (!si->swap_cluster_count[base/SWAPFILE_CLUSTER]) {
> + offset = base;
> + last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
> spin_lock(&swap_lock);
> - offset -= SWAPFILE_CLUSTER - 1;
> si->cluster_next = offset;
> si->cluster_nr = SWAPFILE_CLUSTER - 1;
> found_free_cluster = 1;
> @@ -256,15 +273,14 @@ static unsigned long scan_swap_map(struc
> }
>
> offset = si->lowest_bit;
> - last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
>
> - /* Locate the first empty (unaligned) cluster */
> - for (; last_in_cluster < scan_base; offset++) {
> - if (si->swap_map[offset])
> - last_in_cluster = offset + SWAPFILE_CLUSTER;
> - else if (offset == last_in_cluster) {
> + /* Locate the first empty (aligned) cluster */
> + for (base = round_up(offset, SWAPFILE_CLUSTER);
> + base < scan_base; base += SWAPFILE_CLUSTER) {
> + if (!si->swap_cluster_count[base/SWAPFILE_CLUSTER]) {
> + offset = base;
> + last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
> spin_lock(&swap_lock);
> - offset -= SWAPFILE_CLUSTER - 1;
> si->cluster_next = offset;
> si->cluster_nr = SWAPFILE_CLUSTER - 1;
> found_free_cluster = 1;
> @@ -315,6 +331,7 @@ checks:
> si->highest_bit = 0;
> }
> si->swap_map[offset] = usage;
> + inc_swap_cluster_count(si->swap_cluster_count, offset);
> si->cluster_next = offset + 1;
> si->flags -= SWP_SCANNING;
>
> @@ -549,6 +566,7 @@ static unsigned char swap_entry_free(str
>
> /* free if no reference */
> if (!usage) {
> + dec_swap_cluster_count(p->swap_cluster_count, offset);
> if (offset < p->lowest_bit)
> p->lowest_bit = offset;
> if (offset > p->highest_bit)
> @@ -1445,6 +1463,7 @@ static int setup_swap_extents(struct swa
>
> static void enable_swap_info(struct swap_info_struct *p, int prio,
> unsigned char *swap_map,
> + unsigned int *swap_cluster_count,
> unsigned long *frontswap_map)
> {
> int i, prev;
> @@ -1455,6 +1474,7 @@ static void enable_swap_info(struct swap
> else
> p->prio = --least_priority;
> p->swap_map = swap_map;
> + p->swap_cluster_count = swap_cluster_count;
> frontswap_map_set(p, frontswap_map);
> p->flags |= SWP_WRITEOK;
> nr_swap_pages += p->pages;
> @@ -1480,6 +1500,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
> {
> struct swap_info_struct *p = NULL;
> unsigned char *swap_map;
> + unsigned int *swap_cluster_count;
> struct file *swap_file, *victim;
> struct address_space *mapping;
> struct inode *inode;
> @@ -1556,7 +1577,8 @@ SYSCALL_DEFINE1(swapoff, const char __us
> * sys_swapoff for this swap_info_struct at this point.
> */
> /* re-insert swap space back into swap_list */
> - enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
> + enable_swap_info(p, p->prio, p->swap_map, p->swap_cluster_count,
> + frontswap_map_get(p));
> goto out_dput;
> }
>
> @@ -1581,11 +1603,14 @@ SYSCALL_DEFINE1(swapoff, const char __us
> p->max = 0;
> swap_map = p->swap_map;
> p->swap_map = NULL;
> + swap_cluster_count = p->swap_cluster_count;
> + p->swap_cluster_count = NULL;
> p->flags = 0;
> frontswap_invalidate_area(type);
> spin_unlock(&swap_lock);
> mutex_unlock(&swapon_mutex);
> vfree(swap_map);
> + vfree(swap_cluster_count);
> vfree(frontswap_map_get(p));
> /* Destroy swap account informatin */
> swap_cgroup_swapoff(type);
> @@ -1896,6 +1921,7 @@ static unsigned long read_swap_header(st
> static int setup_swap_map_and_extents(struct swap_info_struct *p,
> union swap_header *swap_header,
> unsigned char *swap_map,
> + unsigned int *swap_cluster_count,
> unsigned long maxpages,
> sector_t *span)
> {
> @@ -1912,11 +1938,16 @@ static int setup_swap_map_and_extents(st
> if (page_nr < maxpages) {
> swap_map[page_nr] = SWAP_MAP_BAD;
> nr_good_pages--;
> + inc_swap_cluster_count(swap_cluster_count, page_nr);
> }
> }
>
> + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
> + inc_swap_cluster_count(swap_cluster_count, i);
> +
> if (nr_good_pages) {
> swap_map[0] = SWAP_MAP_BAD;
> + inc_swap_cluster_count(swap_cluster_count, 0);
> p->max = maxpages;
> p->pages = nr_good_pages;
> nr_extents = setup_swap_extents(p, span);
> @@ -1946,6 +1977,7 @@ SYSCALL_DEFINE2(swapon, const char __use
> sector_t span;
> unsigned long maxpages;
> unsigned char *swap_map = NULL;
> + unsigned int *swap_cluster_count = NULL;
> unsigned long *frontswap_map = NULL;
> struct page *page = NULL;
> struct inode *inode = NULL;
> @@ -2020,12 +2052,19 @@ SYSCALL_DEFINE2(swapon, const char __use
> goto bad_swap;
> }
>
> + swap_cluster_count = vzalloc(DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER) *
> + sizeof(int));
> + if (!swap_cluster_count) {
> + error = -ENOMEM;
> + goto bad_swap;
> + }
> +
> error = swap_cgroup_swapon(p->type, maxpages);
> if (error)
> goto bad_swap;
>
> nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
> - maxpages, &span);
> + swap_cluster_count, maxpages, &span);
> if (unlikely(nr_extents < 0)) {
> error = nr_extents;
> goto bad_swap;
> @@ -2048,7 +2087,7 @@ SYSCALL_DEFINE2(swapon, const char __use
> if (swap_flags & SWAP_FLAG_PREFER)
> prio =
> (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
> - enable_swap_info(p, prio, swap_map, frontswap_map);
> + enable_swap_info(p, prio, swap_map, swap_cluster_count, frontswap_map);
>
> printk(KERN_INFO "Adding %uk swap on %s. "
> "Priority:%d extents:%d across:%lluk %s%s%s\n",
> @@ -2078,6 +2117,7 @@ bad_swap:
> p->flags = 0;
> spin_unlock(&swap_lock);
> vfree(swap_map);
> + vfree(swap_cluster_count);
> if (swap_file) {
> if (inode && S_ISREG(inode->i_mode)) {
> mutex_unlock(&inode->i_mutex);
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
Kind regards,
Minchan Kim
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC 1/2]swap: add a simple buddy allocator
2012-10-22 2:30 [RFC 1/2]swap: add a simple buddy allocator Shaohua Li
2012-10-22 21:10 ` Andrew Morton
2012-10-25 0:20 ` Minchan Kim
@ 2012-10-25 11:35 ` Ni zhan Chen
2 siblings, 0 replies; 5+ messages in thread
From: Ni zhan Chen @ 2012-10-25 11:35 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-mm, akpm, hughd, riel
On 10/22/2012 10:30 AM, Shaohua Li wrote:
> I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30%
> CPU time (when cluster is hard to find), which becomes a bottleneck.
> scan_swap_map() scans a byte array to search a 256 page cluster, which is very
> slow.
>
> Here I introduced a simple buddy allocator. Since we only care about 256 pages
> cluster, we can just use a counter to implement the buddy allocator. Every 256
> pages use one int to store the counter, so searching cluster is very efficient.
> With this, scan_swap_map() overhead disappears.
>
> This might help low end SD card swap too. Because if the cluster is aligned, SD
> firmware can do flash erase more efficiently.
Good idea!
>
> The downside is the cluster must be aligned to 256 pages, which will reduce the
> chance to find a cluster.
>
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
> include/linux/swap.h | 1
> mm/swapfile.c | 74 +++++++++++++++++++++++++++++++++++++++------------
> 2 files changed, 58 insertions(+), 17 deletions(-)
>
> Index: linux/include/linux/swap.h
> ===================================================================
> --- linux.orig/include/linux/swap.h 2012-10-22 09:20:40.802165198 +0800
> +++ linux/include/linux/swap.h 2012-10-22 09:20:50.462043746 +0800
> @@ -185,6 +185,7 @@ struct swap_info_struct {
> signed char next; /* next type on the swap list */
> unsigned int max; /* extent of the swap_map */
> unsigned char *swap_map; /* vmalloc'ed array of usage counts */
> + unsigned int *swap_cluster_count; /* cluster counter */
> unsigned int lowest_bit; /* index of first free in swap_map */
> unsigned int highest_bit; /* index of last free in swap_map */
> unsigned int pages; /* total of usable pages of swap */
> Index: linux/mm/swapfile.c
> ===================================================================
> --- linux.orig/mm/swapfile.c 2012-10-22 09:20:40.794165269 +0800
> +++ linux/mm/swapfile.c 2012-10-22 09:21:34.317493506 +0800
> @@ -182,6 +182,22 @@ static int wait_for_discard(void *word)
> #define SWAPFILE_CLUSTER 256
> #define LATENCY_LIMIT 256
>
> +static inline void inc_swap_cluster_count(unsigned int *swap_cluster_count,
> + unsigned long page_nr)
> +{
> + swap_cluster_count += page_nr/SWAPFILE_CLUSTER;
> + VM_BUG_ON(*swap_cluster_count >= SWAPFILE_CLUSTER);
> + (*swap_cluster_count)++;
> +}
> +
> +static inline void dec_swap_cluster_count(unsigned int *swap_cluster_count,
> + unsigned long page_nr)
> +{
> + swap_cluster_count += page_nr/SWAPFILE_CLUSTER;
> + VM_BUG_ON(*swap_cluster_count == 0);
> + (*swap_cluster_count)--;
> +}
> +
> static unsigned long scan_swap_map(struct swap_info_struct *si,
> unsigned char usage)
> {
> @@ -206,6 +222,8 @@ static unsigned long scan_swap_map(struc
> scan_base = offset = si->cluster_next;
>
> if (unlikely(!si->cluster_nr--)) {
> + unsigned long base;
> +
> if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
> si->cluster_nr = SWAPFILE_CLUSTER - 1;
> goto checks;
> @@ -235,15 +253,14 @@ static unsigned long scan_swap_map(struc
> */
> if (!(si->flags & SWP_SOLIDSTATE))
> scan_base = offset = si->lowest_bit;
> - last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
>
> - /* Locate the first empty (unaligned) cluster */
> - for (; last_in_cluster <= si->highest_bit; offset++) {
> - if (si->swap_map[offset])
> - last_in_cluster = offset + SWAPFILE_CLUSTER;
> - else if (offset == last_in_cluster) {
> + /* Locate the first empty (aligned) cluster */
> + for (base = round_up(offset, SWAPFILE_CLUSTER);
> + base <= si->highest_bit; base += SWAPFILE_CLUSTER) {
> + if (!si->swap_cluster_count[base/SWAPFILE_CLUSTER]) {
> + offset = base;
> + last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
> spin_lock(&swap_lock);
> - offset -= SWAPFILE_CLUSTER - 1;
> si->cluster_next = offset;
> si->cluster_nr = SWAPFILE_CLUSTER - 1;
> found_free_cluster = 1;
> @@ -256,15 +273,14 @@ static unsigned long scan_swap_map(struc
> }
>
> offset = si->lowest_bit;
> - last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
>
> - /* Locate the first empty (unaligned) cluster */
> - for (; last_in_cluster < scan_base; offset++) {
> - if (si->swap_map[offset])
> - last_in_cluster = offset + SWAPFILE_CLUSTER;
> - else if (offset == last_in_cluster) {
> + /* Locate the first empty (aligned) cluster */
> + for (base = round_up(offset, SWAPFILE_CLUSTER);
> + base < scan_base; base += SWAPFILE_CLUSTER) {
> + if (!si->swap_cluster_count[base/SWAPFILE_CLUSTER]) {
> + offset = base;
> + last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
> spin_lock(&swap_lock);
> - offset -= SWAPFILE_CLUSTER - 1;
> si->cluster_next = offset;
> si->cluster_nr = SWAPFILE_CLUSTER - 1;
> found_free_cluster = 1;
> @@ -315,6 +331,7 @@ checks:
> si->highest_bit = 0;
> }
> si->swap_map[offset] = usage;
> + inc_swap_cluster_count(si->swap_cluster_count, offset);
> si->cluster_next = offset + 1;
> si->flags -= SWP_SCANNING;
>
> @@ -549,6 +566,7 @@ static unsigned char swap_entry_free(str
>
> /* free if no reference */
> if (!usage) {
> + dec_swap_cluster_count(p->swap_cluster_count, offset);
> if (offset < p->lowest_bit)
> p->lowest_bit = offset;
> if (offset > p->highest_bit)
> @@ -1445,6 +1463,7 @@ static int setup_swap_extents(struct swa
>
> static void enable_swap_info(struct swap_info_struct *p, int prio,
> unsigned char *swap_map,
> + unsigned int *swap_cluster_count,
> unsigned long *frontswap_map)
> {
> int i, prev;
> @@ -1455,6 +1474,7 @@ static void enable_swap_info(struct swap
> else
> p->prio = --least_priority;
> p->swap_map = swap_map;
> + p->swap_cluster_count = swap_cluster_count;
> frontswap_map_set(p, frontswap_map);
> p->flags |= SWP_WRITEOK;
> nr_swap_pages += p->pages;
> @@ -1480,6 +1500,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
> {
> struct swap_info_struct *p = NULL;
> unsigned char *swap_map;
> + unsigned int *swap_cluster_count;
> struct file *swap_file, *victim;
> struct address_space *mapping;
> struct inode *inode;
> @@ -1556,7 +1577,8 @@ SYSCALL_DEFINE1(swapoff, const char __us
> * sys_swapoff for this swap_info_struct at this point.
> */
> /* re-insert swap space back into swap_list */
> - enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
> + enable_swap_info(p, p->prio, p->swap_map, p->swap_cluster_count,
> + frontswap_map_get(p));
> goto out_dput;
> }
>
> @@ -1581,11 +1603,14 @@ SYSCALL_DEFINE1(swapoff, const char __us
> p->max = 0;
> swap_map = p->swap_map;
> p->swap_map = NULL;
> + swap_cluster_count = p->swap_cluster_count;
> + p->swap_cluster_count = NULL;
> p->flags = 0;
> frontswap_invalidate_area(type);
> spin_unlock(&swap_lock);
> mutex_unlock(&swapon_mutex);
> vfree(swap_map);
> + vfree(swap_cluster_count);
> vfree(frontswap_map_get(p));
> /* Destroy swap account informatin */
> swap_cgroup_swapoff(type);
> @@ -1896,6 +1921,7 @@ static unsigned long read_swap_header(st
> static int setup_swap_map_and_extents(struct swap_info_struct *p,
> union swap_header *swap_header,
> unsigned char *swap_map,
> + unsigned int *swap_cluster_count,
> unsigned long maxpages,
> sector_t *span)
> {
> @@ -1912,11 +1938,16 @@ static int setup_swap_map_and_extents(st
> if (page_nr < maxpages) {
> swap_map[page_nr] = SWAP_MAP_BAD;
> nr_good_pages--;
> + inc_swap_cluster_count(swap_cluster_count, page_nr);
> }
> }
>
> + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
> + inc_swap_cluster_count(swap_cluster_count, i);
> +
> if (nr_good_pages) {
> swap_map[0] = SWAP_MAP_BAD;
> + inc_swap_cluster_count(swap_cluster_count, 0);
> p->max = maxpages;
> p->pages = nr_good_pages;
> nr_extents = setup_swap_extents(p, span);
> @@ -1946,6 +1977,7 @@ SYSCALL_DEFINE2(swapon, const char __use
> sector_t span;
> unsigned long maxpages;
> unsigned char *swap_map = NULL;
> + unsigned int *swap_cluster_count = NULL;
> unsigned long *frontswap_map = NULL;
> struct page *page = NULL;
> struct inode *inode = NULL;
> @@ -2020,12 +2052,19 @@ SYSCALL_DEFINE2(swapon, const char __use
> goto bad_swap;
> }
>
> + swap_cluster_count = vzalloc(DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER) *
> + sizeof(int));
> + if (!swap_cluster_count) {
> + error = -ENOMEM;
> + goto bad_swap;
> + }
> +
> error = swap_cgroup_swapon(p->type, maxpages);
> if (error)
> goto bad_swap;
>
> nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
> - maxpages, &span);
> + swap_cluster_count, maxpages, &span);
> if (unlikely(nr_extents < 0)) {
> error = nr_extents;
> goto bad_swap;
> @@ -2048,7 +2087,7 @@ SYSCALL_DEFINE2(swapon, const char __use
> if (swap_flags & SWAP_FLAG_PREFER)
> prio =
> (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
> - enable_swap_info(p, prio, swap_map, frontswap_map);
> + enable_swap_info(p, prio, swap_map, swap_cluster_count, frontswap_map);
>
> printk(KERN_INFO "Adding %uk swap on %s. "
> "Priority:%d extents:%d across:%lluk %s%s%s\n",
> @@ -2078,6 +2117,7 @@ bad_swap:
> p->flags = 0;
> spin_unlock(&swap_lock);
> vfree(swap_map);
> + vfree(swap_cluster_count);
> if (swap_file) {
> if (inode && S_ISREG(inode->i_mode)) {
> mutex_unlock(&inode->i_mutex);
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC 1/2]swap: add a simple buddy allocator
2012-10-25 0:20 ` Minchan Kim
@ 2012-10-26 2:29 ` Shaohua Li
0 siblings, 0 replies; 5+ messages in thread
From: Shaohua Li @ 2012-10-26 2:29 UTC (permalink / raw)
To: Minchan Kim; +Cc: linux-mm, akpm, hughd, riel
On Thu, Oct 25, 2012 at 09:20:37AM +0900, Minchan Kim wrote:
> Hi Shaohua,
>
> Your idea does make sense to me.
> Below just a few nitpick.
Thanks for your time.
> On Mon, Oct 22, 2012 at 10:30:51AM +0800, Shaohua Li wrote:
> > I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30%
> > CPU time (when cluster is hard to find), which becomes a bottleneck.
> > scan_swap_map() scans a byte array to search a 256 page cluster, which is very
> > slow.
> >
> > Here I introduced a simple buddy allocator. Since we only care about 256 pages
> > cluster, we can just use a counter to implement the buddy allocator. Every 256
> > pages use one int to store the counter, so searching cluster is very efficient.
> > With this, scan_swap_map() overhead disappears.
> >
> > This might help low end SD card swap too. Because if the cluster is aligned, SD
> > firmware can do flash erase more efficiently.
>
> Indeed.
>
> >
> > The downside is the cluster must be aligned to 256 pages, which will reduce the
> > chance to find a cluster.
>
> It would not be good for rotation devices. Can't we make it only for discardable
> devices?
This is the biggest concern. Andrew raised it too. If most of the swap space isn't
full, this isn't a problem. Otherwise, it might be. But I didn't find a way to
evaluate how much the chance of finding a cluster is reduced. I can try if anybody
has an ideal workload in mind. If the chance can't be evaluated, I'll choose to only
enable it for discardable devices.
Other comments make sense, I'll update in next post.
Thanks,
Shaohua
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2012-10-26 2:29 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-10-22 2:30 [RFC 1/2]swap: add a simple buddy allocator Shaohua Li
2012-10-22 21:10 ` Andrew Morton
2012-10-25 0:20 ` Minchan Kim
2012-10-26 2:29 ` Shaohua Li
2012-10-25 11:35 ` Ni zhan Chen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).