* [PATCH] makedumpfile: keep dumpfile pages in a cache
@ 2012-08-28 17:49 Petr Tesarik
2012-09-03 3:42 ` Atsushi Kumagai
0 siblings, 1 reply; 7+ messages in thread
From: Petr Tesarik @ 2012-08-28 17:49 UTC (permalink / raw)
To: kexec; +Cc: Petr Tesarik
Add a simple cache for pages read from the dumpfile.
This is a big win if we read consecutive data from one page, e.g.
page descriptors, or even page table entries.
Note that makedumpfile now always reads a complete page. This was already
the case with kdump-compressed and sadump formats, but makedumpfile was
throwing most of the data away. For the kdump-compressed case, we may
actually save a lot of decompression, too.
I tried to keep the cache small to minimize memory footprint, but it should
be big enough to hold all pages to do 4-level paging plus some data. This
is needed e.g. for vmalloc areas or Xen page frame table data, which are not
contiguous in physical memory.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
Makefile | 4 -
cache.c | 119
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
cache.h | 26 ++++++++++++
makedumpfile.c | 113 +++++++++++++++++++++++++++++++-----------------------
sadump_info.c | 24 +++--------
sadump_info.h | 2
6 files changed, 222 insertions(+), 66 deletions(-)
--- a/Makefile
+++ b/Makefile
@@ -40,8 +40,8 @@ CFLAGS_ARCH += -m32
endif
SRC = makedumpfile.c makedumpfile.h diskdump_mod.h sadump_mod.h
sadump_info.h
-SRC_PART = print_info.c dwarf_info.c elf_info.c erase_info.c sadump_info.c
-OBJ_PART = print_info.o dwarf_info.o elf_info.o erase_info.o sadump_info.o
+SRC_PART = print_info.c dwarf_info.c elf_info.c erase_info.c sadump_info.c
cache.c
+OBJ_PART = print_info.o dwarf_info.o elf_info.o erase_info.o sadump_info.o
cache.o
SRC_ARCH = arch/arm.c arch/x86.c arch/x86_64.c arch/ia64.c arch/ppc64.c
arch/s390x.c arch/ppc.c
OBJ_ARCH = arch/arm.o arch/x86.o arch/x86_64.o arch/ia64.o arch/ppc64.o
arch/s390x.o arch/ppc.o
--- /dev/null
+++ b/cache.c
@@ -0,0 +1,119 @@
+/*
+ * cache.c
+ *
+ * Created by: Petr Tesarik <ptesarik@suse.cz>
+ *
+ * Copyright (c) 2012 SUSE LINUX Products GmbH, Nuernberg, Germany.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "makedumpfile.h"
+#include "cache.h"
+
+/*
+ * One cached page.  Entries are chained into a doubly-linked list
+ * (anchored by a struct cache) through next and prev.
+ */
+struct cache_entry {
+	unsigned long long paddr;	/* key the page was allocated for */
+	void *bufptr;			/* buffer holding the page data */
+	struct cache_entry *next;	/* neighbour towards the list tail */
+	struct cache_entry *prev;	/* neighbour towards the list head */
+};
+
+/*
+ * Anchor of a doubly-linked list of cache entries.  New entries are
+ * inserted at head, so tail is the entry that was inserted longest ago.
+ */
+struct cache {
+	struct cache_entry *head;
+	struct cache_entry *tail;
+};
+
+/*
+ * Number of cache entries: enough for the four page-table levels of a
+ * 4-level page walk plus four data pages.
+ */
+#define CACHE_SIZE 8
+
+/* Backing storage for every entry, and how many are not yet handed out. */
+static struct cache_entry pool[CACHE_SIZE];
+static int avail = CACHE_SIZE;
+
+/* Entries that cache_search() may return, most recently used first. */
+static struct cache used;
+/* Entries reserved by cache_alloc() but not yet committed by cache_add(). */
+static struct cache pending;
+
+/*
+ * Insert entry at the head of cache.  The first entry put on an empty
+ * list also becomes its tail.
+ */
+static void
+add_entry(struct cache *cache, struct cache_entry *entry)
+{
+	struct cache_entry *old_head = cache->head;
+
+	entry->prev = NULL;
+	entry->next = old_head;
+
+	if (old_head)
+		old_head->prev = entry;
+	cache->head = entry;
+
+	if (cache->tail == NULL)
+		cache->tail = entry;
+}
+
+/*
+ * Unlink entry from cache, fixing up the list's head and tail when the
+ * entry sat at either end.  The entry's own next/prev are left as-is;
+ * the caller is expected to pass an entry that is on this list.
+ */
+static void
+remove_entry(struct cache *cache, struct cache_entry *entry)
+{
+	struct cache_entry *after = entry->next;
+	struct cache_entry *before = entry->prev;
+
+	if (after)
+		after->prev = before;
+	else
+		cache->tail = before;
+
+	if (before)
+		before->next = after;
+	else
+		cache->head = after;
+}
+
+/*
+ * Look up the committed cache entry for paddr.
+ *
+ * On a hit the entry is moved to the front of the used list (keeping the
+ * list in most-recently-used order) and its page buffer is returned.
+ * Returns NULL on a miss.
+ */
+void *
+cache_search(unsigned long long paddr)
+{
+	struct cache_entry *hit = used.head;
+
+	while (hit && hit->paddr != paddr)
+		hit = hit->next;
+
+	if (!hit)
+		return NULL; /* cache miss */
+
+	if (hit != used.head) {
+		remove_entry(&used, hit);
+		add_entry(&used, hit);
+	}
+
+	return hit->bufptr;
+}
+
+/*
+ * Reserve a cache entry for the page at paddr and return its buffer for
+ * the caller to fill.
+ *
+ * The entry is parked on the pending list, which cache_search() does not
+ * scan; the caller publishes it with cache_add(paddr) once the buffer
+ * holds valid data.  Returns NULL if no entry can be obtained.
+ *
+ * Entries are taken, in order of preference, from:
+ *   1. the unused part of the pool (a page buffer is malloc'ed for it),
+ *   2. the tail of the used list, i.e. the least recently used page,
+ *   3. the tail of the pending list, i.e. the oldest reservation.
+ *
+ * Source 3 exists because a caller whose read fails never calls
+ * cache_add(), which strands the reserved entry on the pending list.
+ * Without it, repeated read failures eventually leave the pool drained
+ * and the used list empty, after which every call would return NULL.
+ * NOTE(review): this assumes a reservation that old has been abandoned,
+ * i.e. that callers do not nest allocations -- confirm against callers.
+ */
+void *
+cache_alloc(unsigned long long paddr)
+{
+	struct cache_entry *entry = NULL;
+
+	if (avail) {
+		void *bufptr = malloc(info->page_size);
+		if (bufptr) {
+			entry = &pool[--avail];
+			entry->bufptr = bufptr;
+		}
+	}
+
+	/* Pool exhausted (or malloc failed): evict the LRU page. */
+	if (!entry && used.tail) {
+		entry = used.tail;
+		remove_entry(&used, entry);
+	}
+
+	/* Nothing committed either: reclaim an abandoned reservation. */
+	if (!entry && pending.tail) {
+		entry = pending.tail;
+		remove_entry(&pending, entry);
+	}
+
+	if (!entry)
+		return NULL;
+
+	entry->paddr = paddr;
+	add_entry(&pending, entry);
+
+	return entry->bufptr;
+}
+
+/*
+ * Commit the entry reserved for paddr: move the first matching entry
+ * from the pending list to the front of the used list, where
+ * cache_search() can find it.  Does nothing if no pending entry matches.
+ */
+void
+cache_add(unsigned long long paddr)
+{
+	struct cache_entry *cur = pending.head;
+
+	while (cur && cur->paddr != paddr)
+		cur = cur->next;
+
+	if (!cur)
+		return;
+
+	remove_entry(&pending, cur);
+	add_entry(&used, cur);
+}
--- /dev/null
+++ b/cache.h
@@ -0,0 +1,26 @@
+/*
+ * cache.h
+ *
+ * Written by: Petr Tesarik <ptesarik@suse.cz>
+ *
+ * Copyright (c) 2012 SUSE LINUX Products GmbH, Nuernberg, Germany.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _CACHE_H
+#define _CACHE_H
+
+/*
+ * Return the buffer of the committed cache entry for paddr, or NULL on a
+ * cache miss.
+ */
+void *cache_search(unsigned long long paddr);
+/*
+ * Reserve an entry for paddr and return its buffer for the caller to
+ * fill, or NULL if no entry can be obtained.  The entry is not visible to
+ * cache_search() until cache_add() is called for the same paddr.
+ */
+void *cache_alloc(unsigned long long paddr);
+/* Commit the entry reserved for paddr so that cache_search() finds it. */
+void cache_add(unsigned long long paddr);
+
+#endif /* _CACHE_H */
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -19,6 +19,7 @@
#include "elf_info.h"
#include "erase_info.h"
#include "sadump_info.h"
+#include "cache.h"
#include <stddef.h>
#include <sys/time.h>
@@ -222,83 +223,103 @@ read_page_desc(unsigned long long paddr,
return TRUE;
}
-int
-readpmem_kdump_compressed(unsigned long long paddr, void *bufptr, size_t
size)
+/*
+ * Read info->page_size bytes of dump memory starting at physical address
+ * paddr into bufptr.
+ *
+ * The address is translated to a file offset with paddr_to_offset(); a
+ * zero offset is treated as "cannot be translated".  Returns TRUE on
+ * success and FALSE, after logging, if the translation, the seek or the
+ * read fails.
+ */
+static int
+readpage_elf(unsigned long long paddr, void *bufptr)
+{
+	off_t offset;
+
+	offset = paddr_to_offset(paddr);
+	if (!offset) {
+		ERRMSG("Can't convert a physical address(%llx) to offset.\n",
+		    paddr);
+		return FALSE;
+	}
+
+	if (lseek(info->fd_memory, offset, SEEK_SET) == (off_t)-1) {
+		ERRMSG("Can't seek the dump memory(%s). (offset: %llx) %s\n",
+		    info->name_memory, (unsigned long long)offset, strerror(errno));
+		return FALSE;
+	}
+
+	if (read(info->fd_memory, bufptr, info->page_size) != info->page_size) {
+		ERRMSG("Can't read the dump memory(%s). %s\n",
+		    info->name_memory, strerror(errno));
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Read the page at physical address paddr from the dump file into bufptr.
+ *
+ * The page descriptor supplies the file offset, the stored size and the
+ * flags.  Pages flagged as zlib- or LZO-compressed are read into a local
+ * buffer and decompressed into bufptr; the decompressed length must be
+ * exactly info->page_size.  Other pages are read directly into bufptr.
+ *
+ * Returns TRUE on success and FALSE (after logging) otherwise.
+ *
+ * NOTE(review): pd.size is not checked against info->page_size in this
+ * function before read() stores that many bytes in buf or bufptr --
+ * confirm that read_page_desc() guarantees the bound.
+ * NOTE(review): when the LZO flag is set but the LZO branch is compiled
+ * out or info->flag_lzo_support is unset, no branch fills bufptr and TRUE
+ * is still returned -- confirm whether that can occur.
+ */
+static int
+readpage_kdump_compressed(unsigned long long paddr, void *bufptr)
{
page_desc_t pd;
- char buf[info->page_size];
- char buf2[info->page_size];
+ char buf[info->page_size], *rdbuf;
int ret;
- unsigned long retlen, page_offset;
-
- page_offset = paddr % info->page_size;
+ unsigned long retlen;
if (!is_dumpable(info->bitmap_memory, paddr_to_pfn(paddr))) {
ERRMSG("pfn(%llx) is excluded from %s.\n",
paddr_to_pfn(paddr), info->name_memory);
- goto error;
+ return FALSE;
}
if (!read_page_desc(paddr, &pd)) {
ERRMSG("Can't read page_desc: %llx\n", paddr);
- goto error;
+ return FALSE;
}
if (lseek(info->fd_memory, pd.offset, SEEK_SET) < 0) {
ERRMSG("Can't seek %s. %s\n",
info->name_memory, strerror(errno));
- goto error;
+ return FALSE;
}
/*
* Read page data
*/
- if (read(info->fd_memory, buf, pd.size) != pd.size) {
+ /* Pages without a compression flag go straight into the caller's buffer. */
+ rdbuf = pd.flags & (DUMP_DH_COMPRESSED_ZLIB | DUMP_DH_COMPRESSED_LZO)
+ ? buf : bufptr;
+ if (read(info->fd_memory, rdbuf, pd.size) != pd.size) {
ERRMSG("Can't read %s. %s\n",
info->name_memory, strerror(errno));
- goto error;
+ return FALSE;
}
if (pd.flags & DUMP_DH_COMPRESSED_ZLIB) {
retlen = info->page_size;
- ret = uncompress((unsigned char *)buf2, &retlen,
+ ret = uncompress((unsigned char *)bufptr, &retlen,
(unsigned char *)buf, pd.size);
if ((ret != Z_OK) || (retlen != info->page_size)) {
ERRMSG("Uncompress failed: %d\n", ret);
- goto error;
+ return FALSE;
}
- memcpy(bufptr, buf2 + page_offset, size);
#ifdef USELZO
} else if (info->flag_lzo_support
&& (pd.flags & DUMP_DH_COMPRESSED_LZO)) {
retlen = info->page_size;
ret = lzo1x_decompress_safe((unsigned char *)buf, pd.size,
- (unsigned char *)buf2, &retlen,
+ (unsigned char *)bufptr, &retlen,
LZO1X_MEM_DECOMPRESS);
if ((ret != LZO_E_OK) || (retlen != info->page_size)) {
ERRMSG("Uncompress failed: %d\n", ret);
- goto error;
+ return FALSE;
}
- memcpy(bufptr, buf2 + page_offset, size);
#endif
- } else
- memcpy(bufptr, buf + page_offset, size);
+ }
- return size;
-error:
- ERRMSG("type_addr: %d, addr:%llx, size:%zd\n", PADDR, paddr, size);
- return FALSE;
+ return TRUE;
}
int
readmem(int type_addr, unsigned long long addr, void *bufptr, size_t size)
{
size_t read_size, next_size;
- off_t offset = 0;
unsigned long long next_addr;
unsigned long long paddr, maddr = NOT_PADDR;
+ unsigned long long pgaddr;
+ void *pgbuf;
char *next_ptr;
- const off_t failed = (off_t)-1;
switch (type_addr) {
case VADDR:
@@ -358,31 +379,29 @@ readmem(int type_addr, unsigned long lon
goto error;
}
- if (info->flag_refiltering)
- return readpmem_kdump_compressed(paddr, bufptr, read_size);
-
- if (info->flag_sadump)
- return readpmem_sadump(paddr, bufptr, read_size);
-
- if (!(offset = paddr_to_offset(paddr))) {
- ERRMSG("Can't convert a physical address(%llx) to offset.\n",
- paddr);
- goto error;
- }
-
- if (lseek(info->fd_memory, offset, SEEK_SET) == failed) {
- ERRMSG("Can't seek the dump memory(%s). (offset: %llx) %s\n",
- info->name_memory, (unsigned long long)offset, strerror(errno));
- goto error;
- }
+ pgaddr = PAGEBASE(paddr);
+ pgbuf = cache_search(pgaddr);
+ if (!pgbuf) {
+ pgbuf = cache_alloc(pgaddr);
+ if (!pgbuf)
+ goto error;
- if (read(info->fd_memory, bufptr, read_size) != read_size) {
- ERRMSG("Can't read the dump memory(%s). %s\n",
- info->name_memory, strerror(errno));
- goto error;
+ if (info->flag_refiltering) {
+ if (!readpage_kdump_compressed(pgaddr, pgbuf))
+ goto error;
+ } else if (info->flag_sadump) {
+ if (!readpage_sadump(pgaddr, pgbuf))
+ goto error;
+ } else {
+ if (!readpage_elf(pgaddr, pgbuf))
+ goto error;
+ }
+ cache_add(pgaddr);
}
+ memcpy(bufptr, pgbuf + PAGEOFFSET(paddr), read_size);
return size;
+
error:
ERRMSG("type_addr: %d, addr:%llx, size:%zd\n", type_addr, addr, size);
return FALSE;
--- a/sadump_info.c
+++ b/sadump_info.c
@@ -949,11 +949,10 @@ failed:
#endif /* __x86_64__ */
int
-readpmem_sadump(unsigned long long paddr, void *bufptr, size_t size)
+readpage_sadump(unsigned long long paddr, void *bufptr)
{
unsigned long long pfn, block, whole_offset, perdisk_offset;
ulong page_offset;
- char buf[info->page_size];
int fd_memory;
if (si->kdump_backed_up &&
@@ -965,12 +964,12 @@ readpmem_sadump(unsigned long long paddr
page_offset = paddr % info->page_size;
if (pfn >= si->sh_memory->max_mapnr)
- goto error;
+ return FALSE;
if (!is_dumpable(info->bitmap_memory, pfn)) {
ERRMSG("pfn(%llx) is excluded from %s.\n", pfn,
info->name_memory);
- goto error;
+ return FALSE;
}
block = pfn_to_block(pfn);
@@ -980,7 +979,7 @@ readpmem_sadump(unsigned long long paddr
int diskid;
if (!lookup_diskset(whole_offset, &diskid, &perdisk_offset))
- goto error;
+ return FALSE;
fd_memory = si->diskset_info[diskid].fd_memory;
perdisk_offset += si->diskset_info[diskid].data_offset;
@@ -992,19 +991,12 @@ readpmem_sadump(unsigned long long paddr
}
if (lseek(fd_memory, perdisk_offset, SEEK_SET) < 0)
- goto error;
+ return FALSE;
- if (read(fd_memory, buf, sizeof(buf)) != sizeof(buf))
- goto error;
+ if (read(fd_memory, bufptr, info->page_size) != info->page_size)
+ return FALSE;
- memcpy(bufptr, buf + page_offset, size);
-
- return size;
-
-error:
- DEBUG_MSG("type_addr: %d, addr:%llx, size:%zd\n", PADDR, paddr, size);
-
- return FALSE;
+ return TRUE;
}
int
--- a/sadump_info.h
+++ b/sadump_info.h
@@ -43,7 +43,7 @@ int sadump_initialize_bitmap_memory(void
int sadump_num_online_cpus(void);
int sadump_set_timestamp(struct timeval *ts);
unsigned long long sadump_get_max_mapnr(void);
-int readpmem_sadump(unsigned long long paddr, void *bufptr, size_t size);
+int readpage_sadump(unsigned long long paddr, void *bufptr);
int sadump_check_debug_info(void);
int sadump_generate_vmcoreinfo_from_vmlinux(size_t *vmcoreinfo_size);
int sadump_generate_elf_note_from_dumpfile(void);
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] makedumpfile: keep dumpfile pages in a cache
2012-08-28 17:49 [PATCH] makedumpfile: keep dumpfile pages in a cache Petr Tesarik
@ 2012-09-03 3:42 ` Atsushi Kumagai
2012-09-03 7:04 ` Petr Tesarik
0 siblings, 1 reply; 7+ messages in thread
From: Atsushi Kumagai @ 2012-09-03 3:42 UTC (permalink / raw)
To: ptesarik; +Cc: kexec
Hello Petr,
On Tue, 28 Aug 2012 19:49:49 +0200
Petr Tesarik <ptesarik@suse.cz> wrote:
> Add a simple cache for pages read from the dumpfile.
>
> This is a big win if we read consecutive data from one page, e.g.
> page descriptors, or even page table entries.
>
> Note that makedumpfile now always reads a complete page. This was already
> the case with kdump-compressed and sadump formats, but makedumpfile was
> throwing most of the data away. For the kdump-compressed case, we may
> actually save a lot of decompression, too.
>
> I tried to keep the cache small to minimize memory footprint, but it should
> be big enough to hold all pages to do 4-level paging plus some data. This
> is needed e.g. for vmalloc areas or Xen page frame table data, which are not
> contiguous in physical memory.
>
> Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
It's interesting to me. I want to know how performance will be improved
with this patch, so do you have speed measurements ?
Thanks
Atsushi Kumagai
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] makedumpfile: keep dumpfile pages in a cache
2012-09-03 3:42 ` Atsushi Kumagai
@ 2012-09-03 7:04 ` Petr Tesarik
2012-09-06 15:50 ` Petr Tesarik
0 siblings, 1 reply; 7+ messages in thread
From: Petr Tesarik @ 2012-09-03 7:04 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: kexec
Dne Po 3. září 2012 05:42:33 Atsushi Kumagai napsal(a):
> Hello Petr,
>
> On Tue, 28 Aug 2012 19:49:49 +0200
>
> Petr Tesarik <ptesarik@suse.cz> wrote:
> > Add a simple cache for pages read from the dumpfile.
> >
> > This is a big win if we read consecutive data from one page, e.g.
> > page descriptors, or even page table entries.
> >
> > Note that makedumpfile now always reads a complete page. This was already
> > the case with kdump-compressed and sadump formats, but makedumpfile was
> > throwing most of the data away. For the kdump-compressed case, we may
> > actually save a lot of decompression, too.
> >
> > I tried to keep the cache small to minimize memory footprint, but it
> > should be big enough to hold all pages to do 4-level paging plus some
> > data. This is needed e.g. for vmalloc areas or Xen page frame table
> > data, which are not contiguous in physical memory.
> >
> > Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
>
> It's interesting to me. I want to know how performance will be improved
> with this patch, so do you have speed measurements ?
Not really. I only measured the hit/miss ratio, and with filtering Xen domU
and dump level 0, I got the following on a small system (2G RAM):
cache hit: 1818880 cache miss: 1873
The improvement isn't much for non-Xen case, because the hits are mostly due
to virtual-to-physical translations, and most of Linux data is stored at
virtual addresses that can be resolved by adding/subtracting a fixed offset.
Of course, you will also win only the syscall overhead, because Linux keeps
the data in the kernel pagecache anyway. I'll measure the times for you on a
reasonably large system (~256G) and send the results here.
Regards,
Petr Tesarik
SUSE Linux
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] makedumpfile: keep dumpfile pages in a cache
2012-09-03 7:04 ` Petr Tesarik
@ 2012-09-06 15:50 ` Petr Tesarik
2012-11-14 3:47 ` Atsushi Kumagai
0 siblings, 1 reply; 7+ messages in thread
From: Petr Tesarik @ 2012-09-06 15:50 UTC (permalink / raw)
To: kexec; +Cc: Atsushi Kumagai
Dne Po 3. září 2012 09:04:03 Petr Tesarik napsal(a):
> Dne Po 3. září 2012 05:42:33 Atsushi Kumagai napsal(a):
> > Hello Petr,
> >
> > On Tue, 28 Aug 2012 19:49:49 +0200
> >
> > Petr Tesarik <ptesarik@suse.cz> wrote:
> > > Add a simple cache for pages read from the dumpfile.
> > >
> > > This is a big win if we read consecutive data from one page, e.g.
> > > page descriptors, or even page table entries.
> > >
> > > Note that makedumpfile now always reads a complete page. This was
> > > already the case with kdump-compressed and sadump formats, but
> > > makedumpfile was throwing most of the data away. For the
> > > kdump-compressed case, we may actually save a lot of decompression,
> > > too.
> > >
> > > I tried to keep the cache small to minimize memory footprint, but it
> > > should be big enough to hold all pages to do 4-level paging plus some
> > > data. This is needed e.g. for vmalloc areas or Xen page frame table
> > > data, which are not contiguous in physical memory.
> > >
> > > Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
> >
> > It's interesting to me. I want to know how performance will be improved
> > with this patch, so do you have speed measurements ?
>
> Not really. I only measured the hit/miss ratio, and with filtering Xen domU
> and dump level 0, I got the following on a small system (2G RAM):
>
> cache hit: 1818880 cache miss: 1873
>
> The improvement isn't much for non-Xen case, because the hits are mostly
> due to virtual-to-physical translations, and most of Linux data is stored
> at virtual addresses that can be resolved by adding/subtracting a fixed
> offset.
>
> Of course, you will also win only the syscall overhead, because Linux keeps
> the data in the kernel pagecache anyway. I'll measure the times for you on
> a reasonably large system (~256G) and send the results here.
I couldn't get a medium-sized system for testing, so I performed some
measurements on a 64G system. I ran makedumpfile repeatedly from the kdump
environment. First run was used to cache target filesystem metadata, and the
cache was not dropped between runs to minimize effects of the target
filesystem. I ran it against /proc/vmcore, i.e. the input file was always
resident, nothing to skew the results.
I tried with a kdump file with no compression (to get gzip/LZO out of the
picture) and an ELF file. For the Xen case I only did the ELF file, because
kdump is not available.
First I ran it on bare metal. There was a slight improvement for -d31:
kdump no cache:
6.32user 55.20system 1:15.60elapsed 81%CPU (0avgtext+0avgdata
4800maxresident)k
2080inputs+5714296outputs (2major+342minor)pagefaults 0swaps
kdump with cache:
6.02user 24.58system 0:46.51elapsed 65%CPU (0avgtext+0avgdata
4912maxresident)k
1864inputs+5714288outputs (2major+350minor)pagefaults 0swaps
ELF no cache:
7.58user 74.25system 1:59.52elapsed 68%CPU (0avgtext+0avgdata
4800maxresident)k
728inputs+9288824outputs (1major+342minor)pagefaults 0swaps
ELF with cache:
7.43user 44.21system 1:17.41elapsed 66%CPU (0avgtext+0avgdata
4896maxresident)k
728inputs+9288792outputs (1major+349minor)pagefaults 0swaps
To sum it up, I can see an improvement of approx. 50% in system time. The
increase in memory consumption is a bit more than I would expect (why do I see
~100k for a cache of 12k?), but acceptable nevertheless. I can see a slight
increase in user time (approx. 25%) for the kdump case, which could be
attributed to the cache overhead. I don't have any explanation for the
decreased user time for the ELF case, but it's consistent.
I also tried running makedumpfile with -d1. This results in long sequential
reads, so it's the worst case for a simple LRU-policy cache. The results are
too unstable to make a reliable measurement, but there seems to be a slight
performance hit. It is certainly less than 5% total time.
I think there are two reasons for that:
1. We're copying file data twice for each page (once from the kernel page
cache to the process space, and once from the internal cache to the
destination).
2. Instead of reusing the same data location, we're rotating 8 different pages
(or even up to twice as much if the allocated space is neither continuous nor
page-aligned). This stresses both the CPU's L1 d-cache and the TLB a tiny
bit more. Note that in the /proc/vmcore case, the kernel sequentially maps all
physical memory of the crashed system, so every cache page may be evicted
before we get to using it again. This could explain why I observe an increase
in system time despite making fewer system calls.
There's a lot of things I could do to regain the old performance, if anybody
is concerned about the slight performance regression for this worst case. Just
let me know.
Second, I ran with the Xen hypervisor. Since dump levels greater than 1 don't
work, I ran with '-E -X -d1'. Even though this includes the inefficient page
walk described above, the improvement was immense.
no cache:
95.33user 657.18system 13:08.40elapsed 95%CPU (0avgtext+0avgdata
5440maxresident)k
704inputs+6563856outputs (1major+388minor)pagefaults 0swaps
with cache:
61.14user 110.15system 3:24.24elapsed 83%CPU (0avgtext+0avgdata
5584maxresident)k
2360inputs+6563872outputs (2major+396minor)pagefaults 0swaps
In short, almost 80% shorter total time.
Petr Tesarik
SUSE Linux
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] makedumpfile: keep dumpfile pages in a cache
2012-09-06 15:50 ` Petr Tesarik
@ 2012-11-14 3:47 ` Atsushi Kumagai
[not found] ` <2267600.lHCcsG40Ue@azariah.suse.cz>
0 siblings, 1 reply; 7+ messages in thread
From: Atsushi Kumagai @ 2012-11-14 3:47 UTC (permalink / raw)
To: ptesarik; +Cc: kexec
Hello Petr,
On Thu, 6 Sep 2012 17:50:52 +0200
Petr Tesarik <ptesarik@suse.cz> wrote:
> Dne Po 3. září 2012 09:04:03 Petr Tesarik napsal(a):
> > Dne Po 3. září 2012 05:42:33 Atsushi Kumagai napsal(a):
> > > Hello Petr,
> > >
> > > On Tue, 28 Aug 2012 19:49:49 +0200
> > >
> > > Petr Tesarik <ptesarik@suse.cz> wrote:
> > > > Add a simple cache for pages read from the dumpfile.
> > > >
> > > > This is a big win if we read consecutive data from one page, e.g.
> > > > page descriptors, or even page table entries.
> > > >
> > > > Note that makedumpfile now always reads a complete page. This was
> > > > already the case with kdump-compressed and sadump formats, but
> > > > makedumpfile was throwing most of the data away. For the
> > > > kdump-compressed case, we may actually save a lot of decompression,
> > > > too.
> > > >
> > > > I tried to keep the cache small to minimize memory footprint, but it
> > > > should be big enough to hold all pages to do 4-level paging plus some
> > > > data. This is needed e.g. for vmalloc areas or Xen page frame table
> > > > data, which are not contiguous in physical memory.
> > > >
> > > > Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
Sorry for the late reply.
According to your measurement, it looks good on performance.
However, I found the issue below in v1.5.1-beta and confirmed with git bisect
that this patch causes it (but I haven't found the root cause yet).
result on kernel 3.4:
$ makedumpfile --non-cyclic vmcore dumpfile
Copying data : [ 62 %]
readpage_elf: Can't convert a physical address(a0000) to offset.
readmem: type_addr: 1, addr:1000a0000, size:4096
read_pfn: Can't get the page data.
makedumpfile Failed.
$
This seems to be a critical issue for all users, so I will postpone merging this patch
until this issue is solved.
Thanks
Atsushi Kumagai
> > >
> > > It's interesting to me. I want to know how performance will be improved
> > > with this patch, so do you have speed measurements ?
> >
> > Not really. I only measured the hit/miss ratio, and with filtering Xen domU
> > and dump level 0, I got the following on a small system (2G RAM):
> >
> > cache hit: 1818880 cache miss: 1873
> >
> > The improvement isn't much for non-Xen case, because the hits are mostly
> > due to virtual-to-physical translations, and most of Linux data is stored
> > at virtual addresses that can be resolved by adding/subtracting a fixed
> > offset.
> >
> > Of course, you will also win only the syscall overhead, because Linux keeps
> > the data in the kernel pagecache anyway. I'll measure the times for you on
> > a reasonably large system (~256G) and send the results here.
>
> I couldn't get a medium-sized system for testing, so I performed some
> measurements on a 64G system. I ran makedumpfile repeatedly from the kdump
> environment. First run was used to cache target filesystem metadata, and the
> cache was not dropped between runs to minimize effects of the target
> filesystem. I ran it against /proc/vmcore, i.e. the input file was always
> resident, nothing to skew the results.
>
> I tried with a kdump file with no compression (to get gzip/LZO out of the
> picture) and an ELF file. For the Xen case I only did the ELF file, because
> kdump is not available.
>
> First I ran it on bare metal. There was a slight improvement for -d31:
>
> kdump no cache:
> 6.32user 55.20system 1:15.60elapsed 81%CPU (0avgtext+0avgdata
> 4800maxresident)k
> 2080inputs+5714296outputs (2major+342minor)pagefaults 0swaps
>
> kdump with cache:
> 6.02user 24.58system 0:46.51elapsed 65%CPU (0avgtext+0avgdata
> 4912maxresident)k
> 1864inputs+5714288outputs (2major+350minor)pagefaults 0swaps
>
> ELF no cache:
> 7.58user 74.25system 1:59.52elapsed 68%CPU (0avgtext+0avgdata
> 4800maxresident)k
> 728inputs+9288824outputs (1major+342minor)pagefaults 0swaps
>
> ELF with cache:
> 7.43user 44.21system 1:17.41elapsed 66%CPU (0avgtext+0avgdata
> 4896maxresident)k
> 728inputs+9288792outputs (1major+349minor)pagefaults 0swaps
>
> To sum it up, I can see an improvement of approx. 50% in system time. The
> increase in memory consumption is a bit more than I would expect (why do I see
> ~100k for a cache of 12k?), but acceptable nevertheless. I can see a slight
> increase in user time (approx. 25%) for the kdump case, which could be
> attributed to the cache overhead. I don't have any explanation for the
> decreased user time for the ELF case, but it's consistent.
>
> I also tried running makedumpfile with -d1. This results in long sequential
> reads, so it's the worst case for a simple LRU-policy cache. The results are
> too unstable to make a reliable measurement, but there seems to be a slight
> performance hit. It is certainly less than 5% total time.
>
> I think there are two reasons for that:
>
> 1. We're copying file data twice for each page (once from the kernel page
> cache to the process space, and once from the internal cache to the
> destination).
> 2. Instead of reusing the same data location, we're rotating 8 different pages
> (or even up to twice as much if the allocated space is neither continuous nor
> page-aligned). This stresses both the CPU's L1 d-cache and the TLB a tiny
> bit more. Note that in the /proc/vmcore case, the kernel sequentially maps all
> physical memory of the crashed system, so every cache page may be evicted
> before we get to using it again. This could explain why I observe an increase
> in system time despite making fewer system calls.
>
> There's a lot of things I could do to regain the old performance, if anybody
> is concerned about the slight performance regression for this worst case. Just
> let me know.
>
> Second, I ran with the Xen hypervisor. Since dump levels greater than 1 don't
> work, I ran with '-E -X -d1'. Even though this includes the inefficient page
> walk described above, the improvement was immense.
>
> no cache:
> 95.33user 657.18system 13:08.40elapsed 95%CPU (0avgtext+0avgdata
> 5440maxresident)k
> 704inputs+6563856outputs (1major+388minor)pagefaults 0swaps
>
> with cache:
> 61.14user 110.15system 3:24.24elapsed 83%CPU (0avgtext+0avgdata
> 5584maxresident)k
> 2360inputs+6563872outputs (2major+396minor)pagefaults 0swaps
>
> In short, almost 80% shorter total time.
>
> Petr Tesarik
> SUSE Linux
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] makedumpfile: keep dumpfile pages in a cache
[not found] ` <20130110094851.61168ff4486308c27aa567f6@mxc.nes.nec.co.jp>
@ 2013-02-06 7:01 ` Atsushi Kumagai
2013-02-13 12:18 ` Petr Tesarik
0 siblings, 1 reply; 7+ messages in thread
From: Atsushi Kumagai @ 2013-02-06 7:01 UTC (permalink / raw)
To: ptesarik; +Cc: kexec
Hello Petr,
On Thu, 10 Jan 2013 09:48:51 +0900
Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> wrote:
> Hello Petr,
>
> On Wed, 19 Dec 2012 16:01:25 +0100
> Petr Tesarik <ptesarik@suse.cz> wrote:
>
> > V Mon, 19 Nov 2012 17:40:44 +0900
> > Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> napsáno:
> >
> > > Hello Petr,
> > >
> > > On Wed, 14 Nov 2012 15:42:12 +0100
> > > Petr Tesarik <ptesarik@suse.cz> wrote:
> > >
> > > > > Sorry for the late reply.
> > > > > According to your measurement, it looks good on performance.
> > > > >
> > > > > > However, I found the issue below in v1.5.1-beta and confirmed
> > > > > > with git bisect that this patch causes it (but I haven't found
> > > > > > the root cause yet).
> > > > >
> > > > > result on kernel 3.4:
> > > > > $ makedumpfile --non-cyclic vmcore dumpfile
> > > > > Copying data : [ 62 %]
> > > > > readpage_elf: Can't convert a physical address(a0000) to \
> > > > > offset. readmem: type_addr: 1, addr:1000a0000, size:4096
> > > > > read_pfn: Can't get the page data.
> > > > >
> > > > > makedumpfile Failed.
> > > > > $
> > > > >
> > > > > > This seems to be a critical issue for all users, so I will postpone merging
> > > > > this patch until this issue is solved.
I found the cause of this issue.
In the log above, readmem() tries to read 0x1000a0000 (which is correct),
but readpage_elf() tries to read 0xa0000.
This is because your code applies the PAGEBASE macro to the address before calling readpage_elf().
#define PAGEBASE(X) (((unsigned long)(X)) & ~(PAGESIZE() - 1))
On 32-bit systems, unsigned long is only 32 bits wide, so 0x1000a0000 is truncated
to 0xa0000 by the cast, and that is what readpage_elf() receives.
This is a problem in the PAGEBASE macro; there is no problem in your code.
So, I'll merge your patch just as it is, and merge the patch below.
Thanks
Atsushi Kumagai
------------------------------------------------------------------
[PATCH] Fix PAGEOFFSET and PAGEBASE macros for i386 PAE.
An i386 PAE system has 36-bit physical addresses, but PAGEOFFSET and
PAGEBASE cast their argument to "unsigned long" (32 bits).
As a result, they return an invalid address on i386 PAE systems.
Signed-off-by: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
---
makedumpfile.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/makedumpfile.h b/makedumpfile.h
index 98cd528..6026fa2 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -151,8 +151,8 @@ isAnon(unsigned long mapping)
#define PAGESIZE() (info->page_size)
#define PAGESHIFT() (info->page_shift)
-#define PAGEOFFSET(X) (((unsigned long)(X)) & (PAGESIZE() - 1))
-#define PAGEBASE(X) (((unsigned long)(X)) & ~(PAGESIZE() - 1))
+#define PAGEOFFSET(X) (((unsigned long long)(X)) & (PAGESIZE() - 1))
+#define PAGEBASE(X) (((unsigned long long)(X)) & ~(PAGESIZE() - 1))
/*
* for SPARSEMEM
--
1.8.0.2
> > > >
> > > > Understood. However, I haven't run into this situation, but I'd
> > > > like to help.
> > >
> > > Thanks in advance.
> > >
> > > > Which architecture is this?
> > > > Could you possibly share the vmcore file with me?
> > >
> > > I tested on i386 with kernel-3.4.8.
> > >
> > > I have no way to send the vmcore to you, so I attach the .config file
> > > instead of it.
> >
> > I have finally compiled and installed the kernel. I was able to save an
> > ELF dump file. However, makedumpfile fails like this:
> >
> > ptesarik@nathan:~/makedumpfile> ./makedumpfile --non-cyclic vmcore
> > dumpfile
> > __read_disk_dump_header: Can't seek a file(vmcore). Invalid argument
> > read_device: Can't seek a file(vmcore). Invalid argument
> > check_elf_format: Can't seek vmcore. Invalid argument
> >
> > makedumpfile Failed.
> >
> > Any ideas?
>
> Hmmm...the errors are shown with "lseek(fd, 0x0, SEEK_SET)" and vmcore
> might be broken for some reason, but I'm not sure...
> I assume that the vmcore can't be opened with crash either, right ?
>
> Anyway, it seems difficult to reproduce this issue in your environment.
> So, I'll take on the investigation of this issue; please give me more time.
>
>
> Thanks
> Atsushi Kumagai
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] makedumpfile: keep dumpfile pages in a cache
2013-02-06 7:01 ` Atsushi Kumagai
@ 2013-02-13 12:18 ` Petr Tesarik
0 siblings, 0 replies; 7+ messages in thread
From: Petr Tesarik @ 2013-02-13 12:18 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: kexec
On Wed, 6 Feb 2013 16:01:08 +0900
Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> wrote:
> Hello Petr,
>
> On Thu, 10 Jan 2013 09:48:51 +0900
> Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> wrote:
>
> > Hello Petr,
> >
> > On Wed, 19 Dec 2012 16:01:25 +0100
> > Petr Tesarik <ptesarik@suse.cz> wrote:
> >
> > > V Mon, 19 Nov 2012 17:40:44 +0900
> > > Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> napsáno:
> > >
> > > > Hello Petr,
> > > >
> > > > On Wed, 14 Nov 2012 15:42:12 +0100
> > > > Petr Tesarik <ptesarik@suse.cz> wrote:
> > > >
> > > > > > Sorry for the late reply.
> > > > > > According to your measurement, it looks good on performance.
> > > > > >
> > > > > > However, I found the issue below in v1.5.1-beta and made sure
> > > > > > that this patch causes it by git bisect (but I don't find the
> > > > > > true cause yet).
> > > > > >
> > > > > > result on kernel 3.4:
> > > > > > $ makedumpfile --non-cyclic vmcore dumpfile
> > > > > > Copying data : [ 62 %]
> > > > > > readpage_elf: Can't convert a physical address(a0000) to \
> > > > > > offset. readmem: type_addr: 1, addr:1000a0000, size:4096
> > > > > > read_pfn: Can't get the page data.
> > > > > >
> > > > > > makedumpfile Failed.
> > > > > > $
> > > > > >
> > > > > > It seems critical issue for all users, so I will postpone merging
> > > > > > this patch until this issue is solved.
>
> I found the cause of this issue.
>
> In the log above, readmem() try to read 0x1000a0000 (and it's correct),
> but readpage_elf() try to read 0xa0000.
> This is because your code uses PAGEBASE macro before readpage_elf().
>
> #define PAGEBASE(X) (((unsigned long)(X)) & ~(PAGESIZE() - 1))
>
> On 32-bit systems, unsigned long is only 32 bits wide, so 0x1000a0000 is
> truncated to 0xa0000 and readpage_elf() receives the truncated address.
>
> It's PAGEBASE macro's issue, there is no problem in your code.
> So, I'll merge your patch just as it is, and merge the patch below.
Oh, this is great news! Thank you for the work, and sorry for my late
reply.
Petr Tesarik
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2013-02-13 12:19 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-08-28 17:49 [PATCH] makedumpfile: keep dumpfile pages in a cache Petr Tesarik
2012-09-03 3:42 ` Atsushi Kumagai
2012-09-03 7:04 ` Petr Tesarik
2012-09-06 15:50 ` Petr Tesarik
2012-11-14 3:47 ` Atsushi Kumagai
[not found] ` <2267600.lHCcsG40Ue@azariah.suse.cz>
[not found] ` <20121119174044.3144d3df02b62128d1d2bfe2@mxc.nes.nec.co.jp>
[not found] ` <20121219160125.036d5de8@azariah>
[not found] ` <20130110094851.61168ff4486308c27aa567f6@mxc.nes.nec.co.jp>
2013-02-06 7:01 ` Atsushi Kumagai
2013-02-13 12:18 ` Petr Tesarik
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.