* [PATCH RFC 00/11] makedumpfile: parallel processing
@ 2015-06-05 7:56 Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel Zhou Wenjian
` (12 more replies)
0 siblings, 13 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:56 UTC (permalink / raw)
To: kexec
This patch set implements parallel processing by means of multiple threads.
With this patch set, it is possible to use multiple threads to read
and compress pages. This parallel processing will save time.
This feature only supports creating dumpfile in kdump-compressed format from
vmcore in kdump-compressed format or elf format. Currently, sadump and
xen kdump are not supported.
Qiao Nuohan (11):
Add readpage_kdump_compressed_parallel
Add mappage_elf_parallel
Add readpage_elf_parallel
Add read_pfn_parallel
Add function to initial bitmap for parallel use
Add filter_data_buffer_parallel
Add write_kdump_pages_parallel to allow parallel process
Add write_kdump_pages_parallel_cyclic to allow parallel process in
cyclic_mode
Initial and free data used for parallel process
Make makedumpfile available to read and compress pages parallelly
Add usage and manual about multiple threads process
Makefile | 2 +
erase_info.c | 29 +-
erase_info.h | 2 +
makedumpfile.8 | 24 +
makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
makedumpfile.h | 79 +++
print_info.c | 16 +
7 files changed, 1652 insertions(+), 5 deletions(-)
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
@ 2015-06-05 7:56 ` Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 02/11] Add mappage_elf_parallel Zhou Wenjian
` (11 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:56 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
readpage_kdump_compressed_parallel is used to enable reading pages from
vmcore in kdump-compressed format in parallel. fd_memory and bitmap_memory
should be initialized and offered to each thread individually to avoid
conflict.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 137 insertions(+), 0 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index 32f5459..10b6738 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -252,6 +252,20 @@ pfn_to_pos(mdf_pfn_t pfn)
return desc_pos;
}
+unsigned long
+pfn_to_pos_parallel(mdf_pfn_t pfn, struct dump_bitmap* bitmap_memory_parallel)
+{
+ unsigned long desc_pos;
+ mdf_pfn_t i;
+
+ desc_pos = info->valid_pages[pfn / BITMAP_SECT_LEN];
+ for (i = round(pfn, BITMAP_SECT_LEN); i < pfn; i++)
+ if (is_dumpable(bitmap_memory_parallel, i))
+ desc_pos++;
+
+ return desc_pos;
+}
+
int
read_page_desc(unsigned long long paddr, page_desc_t *pd)
{
@@ -294,6 +308,50 @@ read_page_desc(unsigned long long paddr, page_desc_t *pd)
return TRUE;
}
+int
+read_page_desc_parallel(int fd_memory, unsigned long long paddr,
+ page_desc_t *pd,
+ struct dump_bitmap* bitmap_memory_parallel)
+{
+ struct disk_dump_header *dh;
+ unsigned long desc_pos;
+ mdf_pfn_t pfn;
+ off_t offset;
+
+ /*
+ * Find page descriptor
+ */
+ dh = info->dh_memory;
+ offset
+ = (DISKDUMP_HEADER_BLOCKS + dh->sub_hdr_size + dh->bitmap_blocks)
+ * dh->block_size;
+ pfn = paddr_to_pfn(paddr);
+ desc_pos = pfn_to_pos_parallel(pfn, bitmap_memory_parallel);
+ offset += (off_t)desc_pos * sizeof(page_desc_t);
+ if (lseek(fd_memory, offset, SEEK_SET) < 0) {
+ ERRMSG("Can't seek %s. %s\n",
+ info->name_memory, strerror(errno));
+ return FALSE;
+ }
+
+ /*
+ * Read page descriptor
+ */
+ if (read(fd_memory, pd, sizeof(*pd)) != sizeof(*pd)) {
+ ERRMSG("Can't read %s. %s\n",
+ info->name_memory, strerror(errno));
+ return FALSE;
+ }
+
+ /*
+ * Sanity check
+ */
+ if (pd->size > dh->block_size)
+ return FALSE;
+
+ return TRUE;
+}
+
static void
unmap_cache(struct cache_entry *entry)
{
@@ -590,6 +648,85 @@ readpage_kdump_compressed(unsigned long long paddr, void *bufptr)
return TRUE;
}
+static int
+readpage_kdump_compressed_parallel(int fd_memory, unsigned long long paddr,
+ void *bufptr,
+ struct dump_bitmap* bitmap_memory_parallel)
+{
+ page_desc_t pd;
+ char buf[info->page_size], *rdbuf;
+ int ret;
+ unsigned long retlen;
+
+ if (!is_dumpable(bitmap_memory_parallel, paddr_to_pfn(paddr))) {
+ ERRMSG("pfn(%llx) is excluded from %s.\n",
+ paddr_to_pfn(paddr), info->name_memory);
+ return FALSE;
+ }
+
+ if (!read_page_desc_parallel(fd_memory, paddr, &pd,
+ bitmap_memory_parallel)) {
+ ERRMSG("Can't read page_desc: %llx\n", paddr);
+ return FALSE;
+ }
+
+ if (lseek(fd_memory, pd.offset, SEEK_SET) < 0) {
+ ERRMSG("Can't seek %s. %s\n",
+ info->name_memory, strerror(errno));
+ return FALSE;
+ }
+
+ /*
+ * Read page data
+ */
+ rdbuf = pd.flags & (DUMP_DH_COMPRESSED_ZLIB | DUMP_DH_COMPRESSED_LZO |
+ DUMP_DH_COMPRESSED_SNAPPY) ? buf : bufptr;
+ if (read(fd_memory, rdbuf, pd.size) != pd.size) {
+ ERRMSG("Can't read %s. %s\n",
+ info->name_memory, strerror(errno));
+ return FALSE;
+ }
+
+ if (pd.flags & DUMP_DH_COMPRESSED_ZLIB) {
+ retlen = info->page_size;
+ ret = uncompress((unsigned char *)bufptr, &retlen,
+ (unsigned char *)buf, pd.size);
+ if ((ret != Z_OK) || (retlen != info->page_size)) {
+ ERRMSG("Uncompress failed: %d\n", ret);
+ return FALSE;
+ }
+#ifdef USELZO
+ } else if (info->flag_lzo_support
+ && (pd.flags & DUMP_DH_COMPRESSED_LZO)) {
+ retlen = info->page_size;
+ ret = lzo1x_decompress_safe((unsigned char *)buf, pd.size,
+ (unsigned char *)bufptr, &retlen,
+ LZO1X_MEM_DECOMPRESS);
+ if ((ret != LZO_E_OK) || (retlen != info->page_size)) {
+ ERRMSG("Uncompress failed: %d\n", ret);
+ return FALSE;
+ }
+#endif
+#ifdef USESNAPPY
+ } else if ((pd.flags & DUMP_DH_COMPRESSED_SNAPPY)) {
+
+ ret = snappy_uncompressed_length(buf, pd.size, (size_t *)&retlen);
+ if (ret != SNAPPY_OK) {
+ ERRMSG("Uncompress failed: %d\n", ret);
+ return FALSE;
+ }
+
+ ret = snappy_uncompress(buf, pd.size, bufptr, (size_t *)&retlen);
+ if ((ret != SNAPPY_OK) || (retlen != info->page_size)) {
+ ERRMSG("Uncompress failed: %d\n", ret);
+ return FALSE;
+ }
+#endif
+ }
+
+ return TRUE;
+}
+
int
readmem(int type_addr, unsigned long long addr, void *bufptr, size_t size)
{
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 02/11] Add mappage_elf_parallel
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel Zhou Wenjian
@ 2015-06-05 7:56 ` Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 03/11] Add readpage_elf_parallel Zhou Wenjian
` (10 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:56 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
mappage_elf_parallel is used to enable mmapping the elf format to memory
in parallel. A later patch will use the mmapped memory to get the data
of each page. fd_memory and mmap_cache should be initialized and offered
to each thread individually to avoid conflict.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
makedumpfile.h | 14 ++++++++
2 files changed, 111 insertions(+), 0 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index 10b6738..0f71ce7 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -395,6 +395,46 @@ update_mmap_range(off_t offset, int initial) {
}
static int
+update_mmap_range_parallel(int fd_memory, off_t offset,
+ struct mmap_cache *mmap_cache)
+{
+ off_t start_offset, end_offset;
+ off_t map_size;
+ off_t max_offset = get_max_file_offset();
+ off_t pt_load_end = offset_to_pt_load_end(offset);
+
+ /*
+ * mmap_buf must be cleaned
+ */
+ if (mmap_cache->mmap_buf != MAP_FAILED)
+ munmap(mmap_cache->mmap_buf, mmap_cache->mmap_end_offset
+ - mmap_cache->mmap_start_offset);
+
+ /*
+ * offset for mmap() must be page aligned.
+ */
+ start_offset = roundup(offset, info->page_size);
+ end_offset = MIN(max_offset, round(pt_load_end, info->page_size));
+
+ if (!pt_load_end || (end_offset - start_offset) <= 0)
+ return FALSE;
+
+ map_size = MIN(end_offset - start_offset, info->mmap_region_size);
+
+ mmap_cache->mmap_buf = mmap(NULL, map_size, PROT_READ, MAP_PRIVATE,
+ fd_memory, start_offset);
+
+ if (mmap_cache->mmap_buf == MAP_FAILED) {
+ return FALSE;
+ }
+
+ mmap_cache->mmap_start_offset = start_offset;
+ mmap_cache->mmap_end_offset = start_offset + map_size;
+
+ return TRUE;
+}
+
+static int
is_mapped_with_mmap(off_t offset) {
if (info->flag_usemmap == MMAP_ENABLE
@@ -405,6 +445,15 @@ is_mapped_with_mmap(off_t offset) {
return FALSE;
}
+static int
+is_mapped_with_mmap_parallel(off_t offset, struct mmap_cache *mmap_cache) {
+ if (offset >= mmap_cache->mmap_start_offset
+ && offset < mmap_cache->mmap_end_offset)
+ return TRUE;
+ else
+ return FALSE;
+}
+
int
initialize_mmap(void) {
unsigned long long phys_start;
@@ -459,6 +508,54 @@ mappage_elf(unsigned long long paddr)
return info->mmap_buf + (offset - info->mmap_start_offset);
}
+static char *
+mappage_elf_parallel(int fd_memory, unsigned long long paddr,
+ struct mmap_cache *mmap_cache)
+{
+ off_t offset, offset2;
+ int flag_usemmap;
+
+ pthread_rwlock_rdlock(&info->usemmap_rwlock);
+ flag_usemmap = info->flag_usemmap;
+ pthread_rwlock_unlock(&info->usemmap_rwlock);
+ if (flag_usemmap != MMAP_ENABLE)
+ return NULL;
+
+ offset = paddr_to_offset(paddr);
+ if (!offset || page_is_fractional(offset))
+ return NULL;
+
+ offset2 = paddr_to_offset(paddr + info->page_size - 1);
+ if (!offset2)
+ return NULL;
+
+ if (offset2 - offset != info->page_size - 1)
+ return NULL;
+
+ if (!is_mapped_with_mmap_parallel(offset, mmap_cache) &&
+ !update_mmap_range_parallel(fd_memory, offset, mmap_cache)) {
+ ERRMSG("Can't read the dump memory(%s) with mmap().\n",
+ info->name_memory);
+
+ ERRMSG("This kernel might have some problems about mmap().\n");
+ ERRMSG("read() will be used instead of mmap() from now.\n");
+
+ /*
+ * Fall back to read().
+ */
+ pthread_rwlock_wrlock(&info->usemmap_rwlock);
+ info->flag_usemmap = MMAP_DISABLE;
+ pthread_rwlock_unlock(&info->usemmap_rwlock);
+ return NULL;
+ }
+
+ if (offset < mmap_cache->mmap_start_offset ||
+ offset + info->page_size > mmap_cache->mmap_end_offset)
+ return NULL;
+
+ return mmap_cache->mmap_buf + (offset - mmap_cache->mmap_start_offset);
+}
+
static int
read_from_vmcore(off_t offset, void *bufptr, unsigned long size)
{
diff --git a/makedumpfile.h b/makedumpfile.h
index d2fadbd..939850f 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -42,6 +42,7 @@
#include "dwarf_info.h"
#include "diskdump_mod.h"
#include "sadump_mod.h"
+#include <pthread.h>
/*
* Result of command
@@ -913,6 +914,15 @@ typedef unsigned long int ulong;
typedef unsigned long long int ulonglong;
/*
+ * for parallel process
+ */
+struct mmap_cache {
+ char *mmap_buf;
+ off_t mmap_start_offset;
+ off_t mmap_end_offset;
+};
+
+/*
* makedumpfile header
* For re-arranging the dump data on different architecture, all the
* variables are defined by 64bits. The size of signature is aligned
@@ -1177,6 +1187,10 @@ struct DumpInfo {
* for cyclic_splitting mode, setup splitblock_size
*/
long long splitblock_size;
+ /*
+ * for parallel process
+ */
+ pthread_rwlock_t usemmap_rwlock;
};
extern struct DumpInfo *info;
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 03/11] Add readpage_elf_parallel
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 02/11] Add mappage_elf_parallel Zhou Wenjian
@ 2015-06-05 7:56 ` Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 04/11] Add read_pfn_parallel Zhou Wenjian
` (9 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:56 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
readpage_elf_parallel is used to enable reading pages from the elf format
in parallel. fd_memory should be initialized and offered to each thread
individually to avoid conflict.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 98 insertions(+), 0 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index 0f71ce7..9f12865 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -576,6 +576,27 @@ read_from_vmcore(off_t offset, void *bufptr, unsigned long size)
return TRUE;
}
+static int
+read_from_vmcore_parallel(int fd_memory, off_t offset, void *bufptr,
+ unsigned long size)
+{
+ const off_t failed = (off_t)-1;
+
+ if (lseek(fd_memory, offset, SEEK_SET) == failed) {
+ ERRMSG("Can't seek the dump memory(%s). (offset: %llx) %s\n",
+ info->name_memory, (unsigned long long)offset, strerror(errno));
+ return FALSE;
+ }
+
+ if (read(fd_memory, bufptr, size) != size) {
+ ERRMSG("Can't read the dump memory(%s). %s\n",
+ info->name_memory, strerror(errno));
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
/*
* This function is specific for reading page from ELF.
*
@@ -670,6 +691,83 @@ readpage_elf(unsigned long long paddr, void *bufptr)
}
static int
+readpage_elf_parallel(int fd_memory, unsigned long long paddr, void *bufptr)
+{
+ off_t offset1, offset2;
+ size_t size1, size2;
+ unsigned long long phys_start, phys_end, frac_head = 0;
+
+ offset1 = paddr_to_offset(paddr);
+ offset2 = paddr_to_offset(paddr + info->page_size);
+ phys_start = paddr;
+ phys_end = paddr + info->page_size;
+
+ /*
+ * Check the case phys_start isn't aligned by page size like below:
+ *
+ * phys_start
+ * = 0x40ffda7000
+ * |<-- frac_head -->|------------- PT_LOAD -------------
+ * ----+-----------------------+---------------------+----
+ * | pfn:N | pfn:N+1 | ...
+ * ----+-----------------------+---------------------+----
+ * |
+ * pfn_to_paddr(pfn:N) # page size = 16k
+ * = 0x40ffda4000
+ */
+ if (!offset1) {
+ phys_start = page_head_to_phys_start(paddr);
+ offset1 = paddr_to_offset(phys_start);
+ frac_head = phys_start - paddr;
+ memset(bufptr, 0, frac_head);
+ }
+
+ /*
+ * Check the case phys_end isn't aligned by page size like the
+ * phys_start's case.
+ */
+ if (!offset2) {
+ phys_end = page_head_to_phys_end(paddr);
+ offset2 = paddr_to_offset(phys_end);
+ memset(bufptr + (phys_end - paddr), 0, info->page_size
+ - (phys_end - paddr));
+ }
+
+ /*
+ * Check the separated page on different PT_LOAD segments.
+ */
+ if (offset1 + (phys_end - phys_start) == offset2) {
+ size1 = phys_end - phys_start;
+ } else {
+ for (size1 = 1; size1 < info->page_size - frac_head; size1++) {
+ offset2 = paddr_to_offset(phys_start + size1);
+ if (offset1 + size1 != offset2)
+ break;
+ }
+ }
+
+ if(!read_from_vmcore_parallel(fd_memory, offset1, bufptr + frac_head,
+ size1)) {
+ ERRMSG("Can't read the dump memory(%s).\n",
+ info->name_memory);
+ return FALSE;
+ }
+
+ if (size1 + frac_head != info->page_size) {
+ size2 = phys_end - (phys_start + size1);
+
+ if(!read_from_vmcore_parallel(fd_memory, offset2,
+ bufptr + frac_head + size1, size2)) {
+ ERRMSG("Can't read the dump memory(%s).\n",
+ info->name_memory);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+static int
readpage_kdump_compressed(unsigned long long paddr, void *bufptr)
{
page_desc_t pd;
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 04/11] Add read_pfn_parallel
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (2 preceding siblings ...)
2015-06-05 7:56 ` [PATCH RFC 03/11] Add readpage_elf_parallel Zhou Wenjian
@ 2015-06-05 7:56 ` Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 05/11] Add function to initial bitmap for parallel use Zhou Wenjian
` (8 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:56 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
read_pfn_parallel is used to enable reading pages from vmcore in parallel.
The currently supported formats are kdump-compressed and elf; mmapped
elf format is also supported.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
Makefile | 2 ++
makedumpfile.c | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 36 insertions(+), 0 deletions(-)
diff --git a/Makefile b/Makefile
index 2d2b1b7..0b10312 100644
--- a/Makefile
+++ b/Makefile
@@ -66,6 +66,8 @@ LIBS := -lsnappy $(LIBS)
CFLAGS += -DUSESNAPPY
endif
+LIBS := -lpthread $(LIBS)
+
all: makedumpfile
$(OBJ_PART): $(SRC_PART)
diff --git a/makedumpfile.c b/makedumpfile.c
index 9f12865..8a69321 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -6667,6 +6667,40 @@ read_pfn(mdf_pfn_t pfn, unsigned char *buf)
}
int
+read_pfn_parallel(int fd_memory, mdf_pfn_t pfn, unsigned char *buf,
+ struct dump_bitmap* bitmap_memory_parallel,
+ struct mmap_cache *mmap_cache)
+{
+ unsigned long long paddr;
+ unsigned long long pgaddr;
+
+ paddr = pfn_to_paddr(pfn);
+
+ pgaddr = PAGEBASE(paddr);
+
+ if (info->flag_refiltering) {
+ if (!readpage_kdump_compressed_parallel(fd_memory, pgaddr, buf,
+ bitmap_memory_parallel)) {
+ ERRMSG("Can't get the page data.\n");
+ return FALSE;
+ }
+ } else {
+ char *mapbuf = mappage_elf_parallel(fd_memory, pgaddr,
+ mmap_cache);
+ if (mapbuf) {
+ memcpy(buf, mapbuf, info->page_size);
+ } else {
+ if (!readpage_elf_parallel(fd_memory, pgaddr, buf)) {
+ ERRMSG("Can't get the page data.\n");
+ return FALSE;
+ }
+ }
+ }
+
+ return TRUE;
+}
+
+int
get_loads_dumpfile_cyclic(void)
{
int i, phnum, num_new_load = 0;
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 05/11] Add function to initial bitmap for parallel use
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (3 preceding siblings ...)
2015-06-05 7:56 ` [PATCH RFC 04/11] Add read_pfn_parallel Zhou Wenjian
@ 2015-06-05 7:56 ` Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 06/11] Add filter_data_buffer_parallel Zhou Wenjian
` (7 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:56 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
initialize_bitmap_memory_parallel and initialize_2nd_bitmap_parallel
are used by parallel processing to avoid conflict on the bitmap.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.c | 20 ++++++++++++++++++++
makedumpfile.h | 18 ++++++++++++++++++
2 files changed, 38 insertions(+), 0 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index 8a69321..05859a3 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -3398,6 +3398,16 @@ initialize_bitmap_memory(void)
return TRUE;
}
+void
+initialize_bitmap_memory_parallel(struct dump_bitmap *bitmap, int thread_num)
+{
+ bitmap->fd = FD_BITMAP_MEMORY_PARALLEL(thread_num);
+ bitmap->file_name = info->name_memory;
+ bitmap->no_block = -1;
+ memset(bitmap->buf, 0, BUFSIZE_BITMAP);
+ bitmap->offset = info->bitmap_memory->offset;
+}
+
int
calibrate_machdep_info(void)
{
@@ -3713,6 +3723,16 @@ initialize_2nd_bitmap(struct dump_bitmap *bitmap)
bitmap->offset = info->len_bitmap / 2;
}
+void
+initialize_2nd_bitmap_parallel(struct dump_bitmap *bitmap, int thread_num)
+{
+ bitmap->fd = FD_BITMAP_PARALLEL(thread_num);
+ bitmap->file_name = info->name_bitmap;
+ bitmap->no_block = -1;
+ memset(bitmap->buf, 0, BUFSIZE_BITMAP);
+ bitmap->offset = info->len_bitmap / 2;
+}
+
int
set_bitmap(struct dump_bitmap *bitmap, mdf_pfn_t pfn, int val)
{
diff --git a/makedumpfile.h b/makedumpfile.h
index 939850f..b1ff561 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -429,6 +429,11 @@ do { \
#define SPLITTING_SIZE_EI(i) info->splitting_info[i].size_eraseinfo
/*
+ * Macro for getting parallel info.
+ */
+#define FD_BITMAP_MEMORY_PARALLEL(i) info->parallel_info[i].fd_bitmap_memory
+#define FD_BITMAP_PARALLEL(i) info->parallel_info[i].fd_bitmap
+/*
* kernel version
*
* NOTE: the format of kernel_version is as follows
@@ -957,6 +962,18 @@ struct splitting_info {
unsigned long size_eraseinfo;
} splitting_info_t;
+struct parallel_info {
+ int fd_memory;
+ int fd_bitmap_memory;
+ int fd_bitmap;
+ unsigned char *buf;
+ unsigned char *buf_out;
+ struct mmap_cache *mmap_cache;
+#ifdef USELZO
+ lzo_bytep wrkmem;
+#endif
+} parallel_info_t;
+
struct ppc64_vmemmap {
unsigned long phys;
unsigned long virt;
@@ -1093,6 +1110,7 @@ struct DumpInfo {
char *name_dumpfile;
int num_dumpfile;
struct splitting_info *splitting_info;
+ struct parallel_info *parallel_info;
/*
* bitmap info:
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 06/11] Add filter_data_buffer_parallel
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (4 preceding siblings ...)
2015-06-05 7:56 ` [PATCH RFC 05/11] Add function to initial bitmap for parallel use Zhou Wenjian
@ 2015-06-05 7:57 ` Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 07/11] Add write_kdump_pages_parallel to allow parallel process Zhou Wenjian
` (6 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:57 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
filter_data_buffer_parallel is used to enable filtering the buffer
in parallel.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
erase_info.c | 29 ++++++++++++++++++++++++++++-
erase_info.h | 2 ++
2 files changed, 30 insertions(+), 1 deletions(-)
diff --git a/erase_info.c b/erase_info.c
index e0e0f71..0b253d7 100644
--- a/erase_info.c
+++ b/erase_info.c
@@ -2328,7 +2328,6 @@ extract_filter_info(unsigned long long start_paddr,
return TRUE;
}
-
/*
* External functions.
*/
@@ -2413,6 +2412,34 @@ filter_data_buffer(unsigned char *buf, unsigned long long paddr,
}
}
+/*
+ * Filter buffer if the physical address is in filter_info.
+ */
+void
+filter_data_buffer_parallel(unsigned char *buf, unsigned long long paddr,
+ size_t size, pthread_mutex_t *mutex)
+{
+ struct filter_info fl_info;
+ unsigned char *buf_ptr;
+ int found = FALSE;
+
+ while (TRUE) {
+ pthread_mutex_lock(mutex);
+ found = extract_filter_info(paddr, paddr + size, &fl_info);
+ pthread_mutex_unlock(mutex);
+
+ if (found) {
+ buf_ptr = buf + (fl_info.paddr - paddr);
+ if (fl_info.nullify)
+ memset(buf_ptr, 0, fl_info.size);
+ else
+ memset(buf_ptr, fl_info.erase_ch, fl_info.size);
+ } else {
+ break;
+ }
+ }
+}
+
unsigned long
get_size_eraseinfo(void)
{
diff --git a/erase_info.h b/erase_info.h
index 4d4957e..b363a40 100644
--- a/erase_info.h
+++ b/erase_info.h
@@ -60,6 +60,8 @@ extern unsigned long num_erase_info;
int gather_filter_info(void);
void clear_filter_info(void);
void filter_data_buffer(unsigned char *buf, unsigned long long paddr, size_t size);
+void filter_data_buffer_parallel(unsigned char *buf, unsigned long long paddr,
+ size_t size, pthread_mutex_t *mutex);
unsigned long get_size_eraseinfo(void);
int update_filter_info_raw(unsigned long long, int, int);
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 07/11] Add write_kdump_pages_parallel to allow parallel process
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (5 preceding siblings ...)
2015-06-05 7:57 ` [PATCH RFC 06/11] Add filter_data_buffer_parallel Zhou Wenjian
@ 2015-06-05 7:57 ` Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 08/11] Add write_kdump_pages_parallel_cyclic to allow parallel process in cyclic_mode Zhou Wenjian
` (5 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:57 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
Use several threads to read and compress pages and one thread to write
the produced pages into dumpfile. The produced pages will be stored in
a buffer, then the consumer thread will get pages from this buffer.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.c | 450 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
makedumpfile.h | 44 ++++++
2 files changed, 494 insertions(+), 0 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index 05859a3..bce6dc3 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -235,6 +235,31 @@ is_in_same_page(unsigned long vaddr1, unsigned long vaddr2)
return FALSE;
}
+static inline unsigned long
+calculate_len_buf_out(long page_size)
+{
+ unsigned long len_buf_out_zlib, len_buf_out_lzo, len_buf_out_snappy;
+ unsigned long len_buf_out;
+
+ len_buf_out_zlib = len_buf_out_lzo = len_buf_out_snappy = 0;
+
+#ifdef USELZO
+ len_buf_out_lzo = page_size + page_size / 16 + 64 + 3;
+#endif
+
+#ifdef USESNAPPY
+ len_buf_out_snappy = snappy_max_compressed_length(page_size);
+#endif
+
+ len_buf_out_zlib = compressBound(page_size);
+
+ len_buf_out = MAX(len_buf_out_zlib,
+ MAX(len_buf_out_lzo,
+ len_buf_out_snappy));
+
+ return len_buf_out;
+}
+
#define BITMAP_SECT_LEN 4096
static inline int is_dumpable(struct dump_bitmap *, mdf_pfn_t);
static inline int is_dumpable_cyclic(char *bitmap, mdf_pfn_t, struct cycle *cycle);
@@ -7016,6 +7041,431 @@ write_elf_pages_cyclic(struct cache_data *cd_header, struct cache_data *cd_page)
return TRUE;
}
+void *
+kdump_thread_function(void *arg) {
+ void *retval = PTHREAD_FAIL;
+ struct thread_args *kdump_thread_args = (struct thread_args *)arg;
+ struct page_data *page_data_buf = kdump_thread_args->page_data_buf;
+ int page_data_num = kdump_thread_args->page_data_num;
+ mdf_pfn_t pfn;
+ mdf_pfn_t consumed_pfn;
+ int index;
+ int found;
+ int fd_memory = 0;
+ struct dump_bitmap bitmap_parallel, bitmap_memory_parallel;
+ unsigned char *buf = NULL, *buf_out = NULL;
+ struct mmap_cache *mmap_cache =
+ MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
+ unsigned long size_out;
+#ifdef USELZO
+ lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
+#endif
+#ifdef USESNAPPY
+ unsigned long len_buf_out_snappy =
+ snappy_max_compressed_length(info->page_size);
+#endif
+
+ buf = BUF_PARALLEL(kdump_thread_args->thread_num);
+ buf_out = BUF_OUT_PARALLEL(kdump_thread_args->thread_num);
+
+ fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->thread_num);
+
+ initialize_2nd_bitmap_parallel(&bitmap_parallel, kdump_thread_args->thread_num);
+
+ if (info->flag_refiltering) {
+ initialize_bitmap_memory_parallel(&bitmap_memory_parallel,
+ kdump_thread_args->thread_num);
+ }
+
+ while (1) {
+ /* get next pfn */
+ pthread_mutex_lock(&info->current_pfn_mutex);
+ pfn = info->current_pfn;
+ info->current_pfn++;
+ pthread_mutex_unlock(&info->current_pfn_mutex);
+
+ if (pfn >= kdump_thread_args->end_pfn)
+ break;
+
+ index = -1;
+ found = FALSE;
+
+ while (found == FALSE) {
+ /*
+ * need a cancellation point here
+ */
+ sleep(0);
+
+ index = (index + 1) % page_data_num;
+
+ if (pthread_mutex_trylock(&page_data_buf[index].mutex) != 0)
+ continue;
+
+ if (page_data_buf[index].ready != 0)
+ goto unlock;
+
+ pthread_mutex_lock(&info->consumed_pfn_mutex);
+ if ((long)page_data_buf[index].pfn >
+ (long)info->consumed_pfn)
+ info->consumed_pfn = page_data_buf[index].pfn;
+ consumed_pfn = info->consumed_pfn;
+ pthread_mutex_unlock(&info->consumed_pfn_mutex);
+
+ /*
+ * leave space for slow producer
+ */
+ if ((long)pfn - (long)consumed_pfn > page_data_num)
+ goto unlock;
+
+ found = TRUE;
+
+ page_data_buf[index].pfn = pfn;
+ page_data_buf[index].ready = 1;
+
+ if (!is_dumpable(&bitmap_parallel, pfn)) {
+ page_data_buf[index].dumpable = FALSE;
+ goto unlock;
+ }
+
+ page_data_buf[index].dumpable = TRUE;
+
+ if (!read_pfn_parallel(fd_memory, pfn, buf,
+ &bitmap_memory_parallel,
+ mmap_cache))
+ goto fail;
+
+ filter_data_buffer_parallel(buf, pfn_to_paddr(pfn),
+ info->page_size,
+ &info->filter_mutex);
+
+ if ((info->dump_level & DL_EXCLUDE_ZERO)
+ && is_zero_page(buf, info->page_size)) {
+ page_data_buf[index].zero = TRUE;
+ goto unlock;
+ }
+
+ page_data_buf[index].zero = FALSE;
+
+ /*
+ * Compress the page data.
+ */
+ size_out = kdump_thread_args->len_buf_out;
+ if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
+ && ((size_out = kdump_thread_args->len_buf_out),
+ compress2(buf_out, &size_out, buf,
+ info->page_size,
+ Z_BEST_SPEED) == Z_OK)
+ && (size_out < info->page_size)) {
+ page_data_buf[index].flags =
+ DUMP_DH_COMPRESSED_ZLIB;
+ page_data_buf[index].size = size_out;
+ memcpy(page_data_buf[index].buf, buf_out,
+ size_out);
+#ifdef USELZO
+ } else if (info->flag_lzo_support
+ && (info->flag_compress
+ & DUMP_DH_COMPRESSED_LZO)
+ && ((size_out = info->page_size),
+ lzo1x_1_compress(buf, info->page_size,
+ buf_out, &size_out,
+ wrkmem) == LZO_E_OK)
+ && (size_out < info->page_size)) {
+ page_data_buf[index].flags =
+ DUMP_DH_COMPRESSED_LZO;
+ page_data_buf[index].size = size_out;
+ memcpy(page_data_buf[index].buf, buf_out,
+ size_out);
+#endif
+#ifdef USESNAPPY
+ } else if ((info->flag_compress
+ & DUMP_DH_COMPRESSED_SNAPPY)
+ && ((size_out = len_buf_out_snappy),
+ snappy_compress((char *)buf,
+ info->page_size,
+ (char *)buf_out,
+ (size_t *)&size_out)
+ == SNAPPY_OK)
+ && (size_out < info->page_size)) {
+ page_data_buf[index].flags =
+ DUMP_DH_COMPRESSED_SNAPPY;
+ page_data_buf[index].size = size_out;
+ memcpy(page_data_buf[index].buf, buf_out,
+ size_out);
+#endif
+ } else {
+ page_data_buf[index].flags = 0;
+ page_data_buf[index].size = info->page_size;
+ memcpy(page_data_buf[index].buf, buf,
+ info->page_size);
+ }
+unlock:
+ pthread_mutex_unlock(&page_data_buf[index].mutex);
+ }
+ }
+
+ retval = NULL;
+
+fail:
+ if (bitmap_memory_parallel.fd > 0)
+ close(bitmap_memory_parallel.fd);
+
+ pthread_exit(retval);
+}
+
+int
+write_kdump_pages_parallel(struct cache_data *cd_header,
+ struct cache_data *cd_page)
+{
+ int ret = FALSE;
+ int res;
+ unsigned long len_buf_out;
+ mdf_pfn_t per, num_dumpable;
+ mdf_pfn_t start_pfn, end_pfn;
+ struct disk_dump_header *dh = info->dump_header;
+ struct page_desc pd, pd_zero;
+ off_t offset_data = 0;
+ struct timeval tv_start;
+ struct timeval last, new;
+ unsigned char buf[info->page_size];
+ unsigned long long consuming_pfn;
+ pthread_t **threads = NULL;
+ struct thread_args *kdump_thread_args = NULL;
+ void *thread_result;
+ int page_data_num;
+ struct page_data *page_data_buf = NULL;
+ int index;
+ int i;
+
+ if (info->flag_elf_dumpfile)
+ return ret;
+
+ res = pthread_mutex_init(&info->current_pfn_mutex, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize current_pfn_mutex. %s\n",
+ strerror(res));
+ goto out;
+ }
+
+ res = pthread_mutex_init(&info->consumed_pfn_mutex, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize consumed_pfn_mutex. %s\n",
+ strerror(res));
+ goto out;
+ }
+
+ res = pthread_mutex_init(&info->filter_mutex, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize filter_mutex. %s\n", strerror(res));
+ goto out;
+ }
+
+ res = pthread_rwlock_init(&info->usemmap_rwlock, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize usemmap_rwlock. %s\n", strerror(res));
+ goto out;
+ }
+
+ len_buf_out = calculate_len_buf_out(info->page_size);
+
+ num_dumpable = get_num_dumpable();
+ per = num_dumpable / 10000;
+ per = per ? per : 1;
+
+ /*
+ * Calculate the offset of the page data.
+ */
+ cd_header->offset
+ = (DISKDUMP_HEADER_BLOCKS + dh->sub_hdr_size + dh->bitmap_blocks)
+ * dh->block_size;
+ cd_page->offset = cd_header->offset + sizeof(page_desc_t)*num_dumpable;
+ offset_data = cd_page->offset;
+
+ /*
+ * Write the data of zero-filled page.
+ */
+ gettimeofday(&tv_start, NULL);
+ if (info->dump_level & DL_EXCLUDE_ZERO) {
+ pd_zero.size = info->page_size;
+ pd_zero.flags = 0;
+ pd_zero.offset = offset_data;
+ pd_zero.page_flags = 0;
+ memset(buf, 0, pd_zero.size);
+ if (!write_cache(cd_page, buf, pd_zero.size))
+ goto out;
+ offset_data += pd_zero.size;
+ }
+
+ start_pfn = 0;
+ end_pfn = info->max_mapnr;
+
+ info->current_pfn = start_pfn;
+ info->consumed_pfn = start_pfn - 1;
+
+ threads = info->threads;
+ kdump_thread_args = info->kdump_thread_args;
+
+ page_data_num = info->num_buffers;
+ page_data_buf = info->page_data_buf;
+
+ for (i = 0; i < page_data_num; i++) {
+ /*
+ * producer will use pfn in page_data_buf to decide the
+ * consumed pfn
+ */
+ page_data_buf[i].pfn = start_pfn - 1;
+ page_data_buf[i].ready = 0;
+ res = pthread_mutex_init(&page_data_buf[i].mutex, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize mutex of page_data_buf. %s\n",
+ strerror(res));
+ goto out;
+ }
+ }
+
+ for (i = 0; i < info->num_threads; i++) {
+ kdump_thread_args[i].thread_num = i;
+ kdump_thread_args[i].len_buf_out = len_buf_out;
+ kdump_thread_args[i].start_pfn = start_pfn;
+ kdump_thread_args[i].end_pfn = end_pfn;
+ kdump_thread_args[i].page_data_num = page_data_num;
+ kdump_thread_args[i].page_data_buf = page_data_buf;
+
+ res = pthread_create(threads[i], NULL,
+ kdump_thread_function,
+ (void *)&kdump_thread_args[i]);
+ if (res != 0) {
+ ERRMSG("Can't create thread %d. %s\n",
+ i, strerror(res));
+ goto out;
+ }
+ }
+
+ consuming_pfn = start_pfn;
+ index = -1;
+
+ gettimeofday(&last, NULL);
+
+ while (consuming_pfn < end_pfn) {
+ index = (index + 1) % page_data_num;
+
+ gettimeofday(&new, NULL);
+ if (new.tv_sec - last.tv_sec > WAIT_TIME) {
+ ERRMSG("Can't get data of pfn %llx.\n", consuming_pfn);
+ goto out;
+ }
+
+ /*
+ * check pfn first without mutex locked to reduce the time
+ * trying to lock the mutex
+ */
+ if (page_data_buf[index].pfn != consuming_pfn)
+ continue;
+
+ pthread_mutex_lock(&page_data_buf[index].mutex);
+
+ /* check whether the found one is ready to be consumed */
+ if (page_data_buf[index].pfn != consuming_pfn ||
+ page_data_buf[index].ready != 1) {
+ goto unlock;
+ }
+
+ if ((num_dumped % per) == 0)
+ print_progress(PROGRESS_COPY, num_dumped, num_dumpable);
+
+ /* next pfn is found, refresh last here */
+ last = new;
+ consuming_pfn++;
+ page_data_buf[index].ready = 0;
+
+ if (page_data_buf[index].dumpable == FALSE)
+ goto unlock;
+
+ num_dumped++;
+
+ if (page_data_buf[index].zero == TRUE) {
+ if (!write_cache(cd_header, &pd_zero, sizeof(page_desc_t)))
+ goto out;
+ pfn_zero++;
+ } else {
+ pd.flags = page_data_buf[index].flags;
+ pd.size = page_data_buf[index].size;
+ pd.page_flags = 0;
+ pd.offset = offset_data;
+ offset_data += pd.size;
+ /*
+ * Write the page header.
+ */
+ if (!write_cache(cd_header, &pd, sizeof(page_desc_t)))
+ goto out;
+ /*
+ * Write the page data.
+ */
+ if (!write_cache(cd_page, page_data_buf[index].buf, pd.size))
+ goto out;
+ }
+unlock:
+ pthread_mutex_unlock(&page_data_buf[index].mutex);
+ }
+
+ /*
+ * Write the remainder.
+ */
+ if (!write_cache_bufsz(cd_page))
+ goto out;
+ if (!write_cache_bufsz(cd_header))
+ goto out;
+
+ ret = TRUE;
+ /*
+ * print [100 %]
+ */
+ print_progress(PROGRESS_COPY, num_dumped, num_dumpable);
+ print_execution_time(PROGRESS_COPY, &tv_start);
+ PROGRESS_MSG("\n");
+
+out:
+ if (threads != NULL) {
+ for (i = 0; i < info->num_threads; i++) {
+ if (threads[i] != NULL) {
+ res = pthread_cancel(*threads[i]);
+ if (res != 0 && res != ESRCH)
+ ERRMSG("Can't cancel thread %d. %s\n",
+ i, strerror(res));
+ }
+ }
+
+ for (i = 0; i < info->num_threads; i++) {
+ if (threads[i] != NULL) {
+ res = pthread_join(*threads[i], &thread_result);
+ if (res != 0)
+ ERRMSG("Can't join with thread %d. %s\n",
+ i, strerror(res));
+
+ if (thread_result == PTHREAD_CANCELED)
+ DEBUG_MSG("Thread %d is cancelled.\n", i);
+ else if (thread_result == PTHREAD_FAIL)
+ DEBUG_MSG("Thread %d fails.\n", i);
+ else
+ DEBUG_MSG("Thread %d finishes.\n", i);
+
+ }
+ }
+ }
+
+ if (page_data_buf != NULL) {
+ for (i = 0; i < page_data_num; i++) {
+ pthread_mutex_destroy(&page_data_buf[i].mutex);
+ }
+ }
+
+ pthread_rwlock_destroy(&info->usemmap_rwlock);
+ pthread_mutex_destroy(&info->filter_mutex);
+ pthread_mutex_destroy(&info->consumed_pfn_mutex);
+ pthread_mutex_destroy(&info->current_pfn_mutex);
+
+ return ret;
+}
+
int
write_kdump_pages(struct cache_data *cd_header, struct cache_data *cd_page)
{
diff --git a/makedumpfile.h b/makedumpfile.h
index b1ff561..bca3d56 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -431,8 +431,15 @@ do { \
/*
* Macro for getting parallel info.
*/
+#define FD_MEMORY_PARALLEL(i) info->parallel_info[i].fd_memory
#define FD_BITMAP_MEMORY_PARALLEL(i) info->parallel_info[i].fd_bitmap_memory
#define FD_BITMAP_PARALLEL(i) info->parallel_info[i].fd_bitmap
+#define BUF_PARALLEL(i) info->parallel_info[i].buf
+#define BUF_OUT_PARALLEL(i) info->parallel_info[i].buf_out
+#define MMAP_CACHE_PARALLEL(i) info->parallel_info[i].mmap_cache
+#ifdef USELZO
+#define WRKMEM_PARALLEL(i) info->parallel_info[i].wrkmem
+#endif
/*
* kernel version
*
@@ -921,12 +928,39 @@ typedef unsigned long long int ulonglong;
/*
* for parallel process
*/
+
+#define WAIT_TIME (60 * 10)
+#define PTHREAD_FAIL ((void *)-2)
+
struct mmap_cache {
char *mmap_buf;
off_t mmap_start_offset;
off_t mmap_end_offset;
};
+struct page_data
+{
+ mdf_pfn_t pfn;
+ int dumpable;
+ int zero;
+ unsigned int flags;
+ long size;
+ unsigned char *buf;
+ pthread_mutex_t mutex;
+ /*
+ * whether the page_data is ready to be consumed
+ */
+ int ready;
+};
+
+struct thread_args {
+ int thread_num;
+ unsigned long len_buf_out;
+ mdf_pfn_t start_pfn, end_pfn;
+ int page_data_num;
+ struct page_data *page_data_buf;
+};
+
/*
* makedumpfile header
* For re-arranging the dump data on different architecture, all the
@@ -1208,7 +1242,17 @@ struct DumpInfo {
/*
* for parallel process
*/
+ int num_threads;
+ int num_buffers;
+ pthread_t **threads;
+ struct thread_args *kdump_thread_args;
+ struct page_data *page_data_buf;
pthread_rwlock_t usemmap_rwlock;
+ mdf_pfn_t current_pfn;
+ pthread_mutex_t current_pfn_mutex;
+ mdf_pfn_t consumed_pfn;
+ pthread_mutex_t consumed_pfn_mutex;
+ pthread_mutex_t filter_mutex;
};
extern struct DumpInfo *info;
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 08/11] Add write_kdump_pages_parallel_cyclic to allow parallel process in cyclic_mode
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (6 preceding siblings ...)
2015-06-05 7:57 ` [PATCH RFC 07/11] Add write_kdump_pages_parallel to allow parallel process Zhou Wenjian
@ 2015-06-05 7:57 ` Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 09/11] Initial and free data used for parallel process Zhou Wenjian
` (4 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:57 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
Use several threads to read and compress pages and one thread to write
the produced pages into dumpfile. This patch is used for cyclic mode.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.c | 390 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 390 insertions(+), 0 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index bce6dc3..86426d8 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -7672,6 +7672,396 @@ out:
return ret;
}
+void *
+kdump_thread_function_cyclic(void *arg) {
+ void *retval = PTHREAD_FAIL;
+ struct thread_args *kdump_thread_args = (struct thread_args *)arg;
+ struct page_data *page_data_buf = kdump_thread_args->page_data_buf;
+ int page_data_num = kdump_thread_args->page_data_num;
+ mdf_pfn_t pfn;
+ mdf_pfn_t consumed_pfn;
+ int index;
+ int found;
+ int fd_memory = 0;
+ struct dump_bitmap bitmap_memory_parallel;
+ unsigned char *buf = NULL, *buf_out = NULL;
+ struct mmap_cache *mmap_cache =
+ MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
+ unsigned long size_out;
+#ifdef USELZO
+ lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
+#endif
+#ifdef USESNAPPY
+ unsigned long len_buf_out_snappy =
+ snappy_max_compressed_length(info->page_size);
+#endif
+
+ buf = BUF_PARALLEL(kdump_thread_args->thread_num);
+ buf_out = BUF_OUT_PARALLEL(kdump_thread_args->thread_num);
+
+ fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->thread_num);
+
+ if (info->flag_refiltering) {
+ initialize_bitmap_memory_parallel(&bitmap_memory_parallel,
+ kdump_thread_args->thread_num);
+ }
+
+ while (1) {
+ /* get next pfn */
+ pthread_mutex_lock(&info->current_pfn_mutex);
+ pfn = info->current_pfn;
+ info->current_pfn++;
+ pthread_mutex_unlock(&info->current_pfn_mutex);
+
+ if (pfn >= kdump_thread_args->end_pfn)
+ break;
+
+ index = -1;
+ found = FALSE;
+
+ while (found == FALSE) {
+ /*
+ * need a cancellation point here
+ */
+ sleep(0);
+
+ index = (index + 1) % page_data_num;
+
+ if (pthread_mutex_trylock(&page_data_buf[index].mutex) != 0)
+ continue;
+
+ if (page_data_buf[index].ready != 0)
+ goto unlock;
+
+ pthread_mutex_lock(&info->consumed_pfn_mutex);
+ if ((long)page_data_buf[index].pfn >
+ (long)info->consumed_pfn)
+ info->consumed_pfn = page_data_buf[index].pfn;
+ consumed_pfn = info->consumed_pfn;
+ pthread_mutex_unlock(&info->consumed_pfn_mutex);
+
+ /*
+ * leave space for slow producer
+ */
+ if ((long)pfn - (long)consumed_pfn > page_data_num)
+ goto unlock;
+
+ found = TRUE;
+
+ page_data_buf[index].pfn = pfn;
+ page_data_buf[index].ready = 1;
+
+ if (!is_on(info->partial_bitmap2,
+ pfn - kdump_thread_args->start_pfn)) {
+ page_data_buf[index].dumpable = FALSE;
+ goto unlock;
+ }
+
+ page_data_buf[index].dumpable = TRUE;
+
+ if (!read_pfn_parallel(fd_memory, pfn, buf,
+ &bitmap_memory_parallel,
+ mmap_cache))
+ goto fail;
+
+ filter_data_buffer_parallel(buf, pfn_to_paddr(pfn),
+ info->page_size,
+ &info->filter_mutex);
+
+ if ((info->dump_level & DL_EXCLUDE_ZERO)
+ && is_zero_page(buf, info->page_size)) {
+ page_data_buf[index].zero = TRUE;
+ goto unlock;
+ }
+
+ page_data_buf[index].zero = FALSE;
+
+ /*
+ * Compress the page data.
+ */
+ size_out = kdump_thread_args->len_buf_out;
+ if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
+ && ((size_out = kdump_thread_args->len_buf_out),
+ compress2(buf_out, &size_out, buf,
+ info->page_size,
+ Z_BEST_SPEED) == Z_OK)
+ && (size_out < info->page_size)) {
+ page_data_buf[index].flags =
+ DUMP_DH_COMPRESSED_ZLIB;
+ page_data_buf[index].size = size_out;
+ memcpy(page_data_buf[index].buf, buf_out, size_out);
+#ifdef USELZO
+ } else if (info->flag_lzo_support
+ && (info->flag_compress
+ & DUMP_DH_COMPRESSED_LZO)
+ && ((size_out = info->page_size),
+ lzo1x_1_compress(buf, info->page_size,
+ buf_out, &size_out,
+ wrkmem) == LZO_E_OK)
+ && (size_out < info->page_size)) {
+ page_data_buf[index].flags =
+ DUMP_DH_COMPRESSED_LZO;
+ page_data_buf[index].size = size_out;
+ memcpy(page_data_buf[index].buf, buf_out, size_out);
+#endif
+#ifdef USESNAPPY
+ } else if ((info->flag_compress
+ & DUMP_DH_COMPRESSED_SNAPPY)
+ && ((size_out = len_buf_out_snappy),
+ snappy_compress((char *)buf,
+ info->page_size,
+ (char *)buf_out,
+ (size_t *)&size_out)
+ == SNAPPY_OK)
+ && (size_out < info->page_size)) {
+ page_data_buf[index].flags =
+ DUMP_DH_COMPRESSED_SNAPPY;
+ page_data_buf[index].size = size_out;
+ memcpy(page_data_buf[index].buf, buf_out, size_out);
+#endif
+ } else {
+ page_data_buf[index].flags = 0;
+ page_data_buf[index].size = info->page_size;
+ memcpy(page_data_buf[index].buf, buf, info->page_size);
+ }
+unlock:
+ pthread_mutex_unlock(&page_data_buf[index].mutex);
+
+ }
+ }
+
+ retval = NULL;
+
+fail:
+ if (bitmap_memory_parallel.fd > 0)
+ close(bitmap_memory_parallel.fd);
+
+ pthread_exit(retval);
+}
+
+int
+write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
+ struct cache_data *cd_page,
+ struct page_desc *pd_zero,
+ off_t *offset_data, struct cycle *cycle)
+{
+ int ret = FALSE;
+ int res;
+ unsigned long len_buf_out;
+ mdf_pfn_t per;
+ mdf_pfn_t start_pfn, end_pfn;
+ struct page_desc pd;
+ struct timeval tv_start;
+ struct timeval last, new;
+ unsigned long long consuming_pfn;
+ pthread_t **threads = NULL;
+ struct thread_args *kdump_thread_args = NULL;
+ void *thread_result;
+ int page_data_num;
+ struct page_data *page_data_buf = NULL;
+ int i;
+ int index;
+
+ if (info->flag_elf_dumpfile)
+ return FALSE;
+
+ res = pthread_mutex_init(&info->current_pfn_mutex, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize current_pfn_mutex. %s\n",
+ strerror(res));
+ goto out;
+ }
+
+ res = pthread_mutex_init(&info->consumed_pfn_mutex, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize consumed_pfn_mutex. %s\n",
+ strerror(res));
+ goto out;
+ }
+
+ res = pthread_mutex_init(&info->filter_mutex, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize filter_mutex. %s\n", strerror(res));
+ goto out;
+ }
+
+ res = pthread_rwlock_init(&info->usemmap_rwlock, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize usemmap_rwlock. %s\n", strerror(res));
+ goto out;
+ }
+
+ len_buf_out = calculate_len_buf_out(info->page_size);
+
+ per = info->num_dumpable / 10000;
+ per = per ? per : 1;
+
+ gettimeofday(&tv_start, NULL);
+
+ start_pfn = cycle->start_pfn;
+ end_pfn = cycle->end_pfn;
+
+ info->current_pfn = start_pfn;
+ info->consumed_pfn = start_pfn - 1;
+
+ threads = info->threads;
+ kdump_thread_args = info->kdump_thread_args;
+
+ page_data_num = info->num_buffers;
+ page_data_buf = info->page_data_buf;
+
+ for (i = 0; i < page_data_num; i++) {
+ /*
+ * producer will use pfn in page_data_buf to decide the
+ * consumed pfn
+ */
+ page_data_buf[i].pfn = start_pfn - 1;
+ page_data_buf[i].ready = 0;
+ res = pthread_mutex_init(&page_data_buf[i].mutex, NULL);
+ if (res != 0) {
+ ERRMSG("Can't initialize mutex of page_data_buf. %s\n",
+ strerror(res));
+ goto out;
+ }
+ }
+
+ for (i = 0; i < info->num_threads; i++) {
+ kdump_thread_args[i].thread_num = i;
+ kdump_thread_args[i].len_buf_out = len_buf_out;
+ kdump_thread_args[i].start_pfn = start_pfn;
+ kdump_thread_args[i].end_pfn = end_pfn;
+ kdump_thread_args[i].page_data_num = page_data_num;
+ kdump_thread_args[i].page_data_buf = page_data_buf;
+
+ res = pthread_create(threads[i], NULL,
+ kdump_thread_function_cyclic,
+ (void *)&kdump_thread_args[i]);
+ if (res != 0) {
+ ERRMSG("Can't create thread %d. %s\n",
+ i, strerror(res));
+ goto out;
+ }
+ }
+
+ consuming_pfn = start_pfn;
+ index = -1;
+
+ gettimeofday(&last, NULL);
+
+ while (consuming_pfn < end_pfn) {
+ index = (index + 1) % page_data_num;
+
+ gettimeofday(&new, NULL);
+ if (new.tv_sec - last.tv_sec > WAIT_TIME) {
+ ERRMSG("Can't get data of pfn %llx.\n", consuming_pfn);
+ goto out;
+ }
+
+ /*
+ * check pfn first without mutex locked to reduce the time
+ * trying to lock the mutex
+ */
+ if (page_data_buf[index].pfn != consuming_pfn)
+ continue;
+
+ pthread_mutex_lock(&page_data_buf[index].mutex);
+
+ /* check whether the found one is ready to be consumed */
+ if (page_data_buf[index].pfn != consuming_pfn ||
+ page_data_buf[index].ready != 1) {
+ goto unlock;
+ }
+
+ if ((num_dumped % per) == 0)
+ print_progress(PROGRESS_COPY, num_dumped, info->num_dumpable);
+
+ /* next pfn is found, refresh last here */
+ last = new;
+ consuming_pfn++;
+ page_data_buf[index].ready = 0;
+
+ if (page_data_buf[index].dumpable == FALSE)
+ goto unlock;
+
+ num_dumped++;
+
+ if (page_data_buf[index].zero == TRUE) {
+ if (!write_cache(cd_header, pd_zero, sizeof(page_desc_t)))
+ goto out;
+ pfn_zero++;
+ } else {
+ pd.flags = page_data_buf[index].flags;
+ pd.size = page_data_buf[index].size;
+ pd.page_flags = 0;
+ pd.offset = *offset_data;
+ *offset_data += pd.size;
+ /*
+ * Write the page header.
+ */
+ if (!write_cache(cd_header, &pd, sizeof(page_desc_t)))
+ goto out;
+ /*
+ * Write the page data.
+ */
+ if (!write_cache(cd_page, page_data_buf[index].buf, pd.size))
+ goto out;
+
+ }
+unlock:
+ pthread_mutex_unlock(&page_data_buf[index].mutex);
+ }
+
+ ret = TRUE;
+ /*
+ * print [100 %]
+ */
+ print_progress(PROGRESS_COPY, num_dumped, info->num_dumpable);
+ print_execution_time(PROGRESS_COPY, &tv_start);
+ PROGRESS_MSG("\n");
+
+out:
+ if (threads != NULL) {
+ for (i = 0; i < info->num_threads; i++) {
+ if (threads[i] != NULL) {
+ res = pthread_cancel(*threads[i]);
+ if (res != 0 && res != ESRCH)
+ ERRMSG("Can't cancel thread %d. %s\n",
+ i, strerror(res));
+ }
+ }
+
+ for (i = 0; i < info->num_threads; i++) {
+ if (threads[i] != NULL) {
+ res = pthread_join(*threads[i], &thread_result);
+ if (res != 0)
+ ERRMSG("Can't join with thread %d. %s\n",
+ i, strerror(res));
+
+ if (thread_result == PTHREAD_CANCELED)
+ DEBUG_MSG("Thread %d is cancelled.\n", i);
+ else if (thread_result == PTHREAD_FAIL)
+ DEBUG_MSG("Thread %d fails.\n", i);
+ else
+ DEBUG_MSG("Thread %d finishes.\n", i);
+
+ }
+ }
+ }
+
+ if (page_data_buf != NULL) {
+ for (i = 0; i < page_data_num; i++) {
+ pthread_mutex_destroy(&page_data_buf[i].mutex);
+ }
+ }
+
+ pthread_rwlock_destroy(&info->usemmap_rwlock);
+ pthread_mutex_destroy(&info->filter_mutex);
+ pthread_mutex_destroy(&info->consumed_pfn_mutex);
+ pthread_mutex_destroy(&info->current_pfn_mutex);
+
+ return ret;
+}
+
int
write_kdump_pages_cyclic(struct cache_data *cd_header, struct cache_data *cd_page,
struct page_desc *pd_zero, off_t *offset_data, struct cycle *cycle)
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 09/11] Initial and free data used for parallel process
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (7 preceding siblings ...)
2015-06-05 7:57 ` [PATCH RFC 08/11] Add write_kdump_pages_parallel_cyclic to allow parallel process in cyclic_mode Zhou Wenjian
@ 2015-06-05 7:57 ` Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 10/11] Make makedumpfile available to read and compress pages parallelly Zhou Wenjian
` (3 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:57 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
This patch initializes and frees the data used for the parallel process;
the memory limit is taken into account in this function.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
makedumpfile.h | 1 +
2 files changed, 203 insertions(+), 0 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index 86426d8..04a6c45 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -1426,6 +1426,23 @@ open_dump_bitmap(void)
SPLITTING_FD_BITMAP(i) = fd;
}
}
+
+ if (info->num_threads) {
+ /*
+ * Reserve bitmap file descriptors for creating dumpfiles
+ * in parallel, because the bitmap file will be unlinked just after
+ * this and it is not possible to open it again later.
+ */
+ for (i = 0; i < info->num_threads; i++) {
+ if ((fd = open(info->name_bitmap, O_RDONLY)) < 0) {
+ ERRMSG("Can't open the bitmap file(%s). %s\n",
+ info->name_bitmap, strerror(errno));
+ return FALSE;
+ }
+ FD_BITMAP_PARALLEL(i) = fd;
+ }
+ }
+
unlink(info->name_bitmap);
return TRUE;
@@ -3446,6 +3463,191 @@ calibrate_machdep_info(void)
}
int
+initial_for_parallel()
+{
+ unsigned long len_buf_out;
+ unsigned long page_data_buf_size;
+ unsigned long limit_size;
+ int page_data_num;
+ int i;
+
+ len_buf_out = calculate_len_buf_out(info->page_size);
+
+ /*
+ * allocate memory for threads
+ */
+ if ((info->threads = malloc(sizeof(pthread_t *) * info->num_threads))
+ == NULL) {
+ MSG("Can't allocate memory for threads. %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+ memset(info->threads, 0, sizeof(pthread_t *) * info->num_threads);
+
+ if ((info->kdump_thread_args =
+ malloc(sizeof(struct thread_args) * info->num_threads))
+ == NULL) {
+ MSG("Can't allocate memory for arguments of threads. %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+ memset(info->kdump_thread_args, 0, sizeof(struct thread_args) * info->num_threads);
+
+ for (i = 0; i < info->num_threads; i++) {
+ if ((info->threads[i] = malloc(sizeof(pthread_t))) == NULL) {
+ MSG("Can't allocate memory for thread %d. %s\n",
+ i, strerror(errno));
+ return FALSE;
+ }
+
+ if ((BUF_PARALLEL(i) = malloc(info->page_size)) == NULL) {
+ MSG("Can't allocate memory for the memory buffer. %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+
+ if ((BUF_OUT_PARALLEL(i) = malloc(len_buf_out)) == NULL) {
+ MSG("Can't allocate memory for the compression buffer. %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+
+ if ((MMAP_CACHE_PARALLEL(i) = malloc(sizeof(struct mmap_cache))) == NULL) {
+ MSG("Can't allocate memory for mmap_cache. %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+
+ /*
+ * initial for mmap_cache
+ */
+ MMAP_CACHE_PARALLEL(i)->mmap_buf = MAP_FAILED;
+ MMAP_CACHE_PARALLEL(i)->mmap_start_offset = 0;
+ MMAP_CACHE_PARALLEL(i)->mmap_end_offset = 0;
+
+#ifdef USELZO
+ if ((WRKMEM_PARALLEL(i) = malloc(LZO1X_1_MEM_COMPRESS)) == NULL) {
+ MSG("Can't allocate memory for the working memory. %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+#endif
+ }
+
+ /*
+ * get a safe number of page_data
+ */
+ page_data_buf_size = MAX(len_buf_out, info->page_size);
+
+ limit_size = (get_free_memory_size()
+ - MAP_REGION * info->num_threads) * 0.6;
+
+ page_data_num = limit_size / page_data_buf_size;
+
+ if (info->num_buffers != 0)
+ info->num_buffers = MIN(info->num_buffers, page_data_num);
+ else
+ info->num_buffers = MIN(PAGE_DATA_NUM, page_data_num);
+
+ DEBUG_MSG("Number of struct page_data for produce/consume: %d\n",
+ info->num_buffers);
+
+ /*
+ * allocate memory for page_data
+ */
+ if ((info->page_data_buf = malloc(sizeof(struct page_data) * info->num_buffers))
+ == NULL) {
+ MSG("Can't allocate memory for page_data_buf. %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+ memset(info->page_data_buf, 0, sizeof(struct page_data) * info->num_buffers);
+
+ for (i = 0; i < info->num_buffers; i++) {
+ if ((info->page_data_buf[i].buf = malloc(page_data_buf_size)) == NULL) {
+ MSG("Can't allocate memory for buf of page_data_buf. %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+ }
+
+ /*
+ * initial fd_memory for threads
+ */
+ for (i = 0; i < info->num_threads; i++) {
+ if ((FD_MEMORY_PARALLEL(i) = open(info->name_memory, O_RDONLY))
+ < 0) {
+ ERRMSG("Can't open the dump memory(%s). %s\n",
+ info->name_memory, strerror(errno));
+ return FALSE;
+ }
+
+ if ((FD_BITMAP_MEMORY_PARALLEL(i) =
+ open(info->name_memory, O_RDONLY)) < 0) {
+ ERRMSG("Can't open the dump memory(%s). %s\n",
+ info->name_memory, strerror(errno));
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+void
+free_for_parallel()
+{
+ int i;
+
+ if (info->threads != NULL) {
+ for (i = 0; i < info->num_threads; i++) {
+ if (info->threads[i] != NULL)
+ free(info->threads[i]);
+
+ if (BUF_PARALLEL(i) != NULL)
+ free(BUF_PARALLEL(i));
+
+ if (BUF_OUT_PARALLEL(i) != NULL)
+ free(BUF_OUT_PARALLEL(i));
+
+ if (MMAP_CACHE_PARALLEL(i) != NULL) {
+ if (MMAP_CACHE_PARALLEL(i)->mmap_buf !=
+ MAP_FAILED)
+ munmap(MMAP_CACHE_PARALLEL(i)->mmap_buf,
+ MMAP_CACHE_PARALLEL(i)->mmap_end_offset
+ - MMAP_CACHE_PARALLEL(i)->mmap_start_offset);
+
+ free(MMAP_CACHE_PARALLEL(i));
+ }
+#ifdef USELZO
+ if (WRKMEM_PARALLEL(i) != NULL)
+ free(WRKMEM_PARALLEL(i));
+#endif
+
+ }
+ free(info->threads);
+ }
+
+ if (info->kdump_thread_args != NULL)
+ free(info->kdump_thread_args);
+
+ if (info->page_data_buf != NULL) {
+ for (i = 0; i < info->num_buffers; i++) {
+ if (info->page_data_buf[i].buf != NULL)
+ free(info->page_data_buf[i].buf);
+ }
+ free(info->page_data_buf);
+ }
+
+ for (i = 0; i < info->num_threads; i++) {
+ if (FD_MEMORY_PARALLEL(i) > 0)
+ close(FD_MEMORY_PARALLEL(i));
+
+ if (FD_BITMAP_MEMORY_PARALLEL(i) > 0)
+ close(FD_BITMAP_MEMORY_PARALLEL(i));
+ }
+}
+
+int
initial(void)
{
off_t offset;
diff --git a/makedumpfile.h b/makedumpfile.h
index bca3d56..67c2a38 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -929,6 +929,7 @@ typedef unsigned long long int ulonglong;
* for parallel process
*/
+#define PAGE_DATA_NUM (50)
#define WAIT_TIME (60 * 10)
#define PTHREAD_FAIL ((void *)-2)
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 10/11] Make makedumpfile available to read and compress pages parallelly
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (8 preceding siblings ...)
2015-06-05 7:57 ` [PATCH RFC 09/11] Initial and free data used for parallel process Zhou Wenjian
@ 2015-06-05 7:57 ` Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 11/11] Add usage and manual about multiple threads process Zhou Wenjian
` (2 subsequent siblings)
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:57 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
With this patch applied, multiple threads can be used to read
and compress pages. This parallel processing saves time.
Currently, sadump and xen kdump are not supported.
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
makedumpfile.h | 2 +
2 files changed, 75 insertions(+), 4 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index 04a6c45..bb931c3 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -3846,6 +3846,27 @@ out:
DEBUG_MSG("Buffer size for the cyclic mode: %ld\n", info->bufsize_cyclic);
}
+ if (info->num_threads) {
+ if (is_xen_memory()) {
+ MSG("'--num-threads' option is disabled,\n");
+ MSG("because %s is Xen's memory core image.\n",
+ info->name_memory);
+ return FALSE;
+ }
+
+ if (info->flag_sadump) {
+ MSG("'--num-threads' option is disabled,\n");
+ MSG("because %s is sadump %s format.\n",
+ info->name_memory, sadump_format_type_name());
+ return FALSE;
+ }
+
+ if (!initial_for_parallel()) {
+ MSG("Failed to initialize for parallel processing.\n");
+ return FALSE;
+ }
+ }
+
if (!is_xen_memory() && !cache_init())
return FALSE;
@@ -8823,9 +8844,16 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d
if (!write_kdump_bitmap2_cyclic(&cycle))
return FALSE;
- if (!write_kdump_pages_cyclic(cd_header, cd_page, &pd_zero,
+ if (info->num_threads) {
+ if (!write_kdump_pages_parallel_cyclic(cd_header,
+ cd_page, &pd_zero,
+ &offset_data, &cycle))
+ return FALSE;
+ } else {
+ if (!write_kdump_pages_cyclic(cd_header, cd_page, &pd_zero,
&offset_data, &cycle))
- return FALSE;
+ return FALSE;
+ }
}
free_bitmap2_buffer_cyclic();
@@ -9832,8 +9860,13 @@ writeout_dumpfile(void)
goto out;
if (!write_kdump_bitmap())
goto out;
- if (!write_kdump_pages(&cd_header, &cd_page))
- goto out;
+ if (info->num_threads) {
+ if (!write_kdump_pages_parallel(&cd_header, &cd_page))
+ goto out;
+ } else {
+ if (!write_kdump_pages(&cd_header, &cd_page))
+ goto out;
+ }
if (!write_kdump_eraseinfo(&cd_page))
goto out;
}
@@ -10847,6 +10880,18 @@ check_param_for_creating_dumpfile(int argc, char *argv[])
if (info->flag_sadump_diskset && !sadump_is_supported_arch())
return FALSE;
+ if (info->num_threads) {
+ if (info->flag_split) {
+ MSG("--num-threads cannot be used with --split.\n");
+ return FALSE;
+ }
+
+ if (info->flag_elf_dumpfile) {
+ MSG("--num-threads cannot be used with ELF format.\n");
+ return FALSE;
+ }
+ }
+
if ((argc == optind + 2) && !info->flag_flatten
&& !info->flag_split
&& !info->flag_sadump_diskset) {
@@ -10911,6 +10956,18 @@ check_param_for_creating_dumpfile(int argc, char *argv[])
} else
return FALSE;
+ if (info->num_threads) {
+ if ((info->parallel_info =
+ malloc(sizeof(parallel_info_t) * info->num_threads))
+ == NULL) {
+ MSG("Can't allocate memory for parallel_info.\n");
+ return FALSE;
+ }
+
+ memset(info->parallel_info, 0, sizeof(parallel_info_t)
+ * info->num_threads);
+ }
+
return TRUE;
}
@@ -11223,6 +11280,8 @@ static struct option longopts[] = {
{"non-mmap", no_argument, NULL, OPT_NON_MMAP},
{"mem-usage", no_argument, NULL, OPT_MEM_USAGE},
{"splitblock-size", required_argument, NULL, OPT_SPLITBLOCK_SIZE},
+ {"num-threads", required_argument, NULL, OPT_NUM_THREADS},
+ {"num-buffers", required_argument, NULL, OPT_NUM_BUFFERS},
{0, 0, 0, 0}
};
@@ -11366,6 +11425,12 @@ main(int argc, char *argv[])
case OPT_SPLITBLOCK_SIZE:
info->splitblock_size = atoi(optarg);
break;
+ case OPT_NUM_THREADS:
+ info->num_threads = atoi(optarg);
+ break;
+ case OPT_NUM_BUFFERS:
+ info->num_buffers = atoi(optarg);
+ break;
case '?':
MSG("Commandline parameter is invalid.\n");
MSG("Try `makedumpfile --help' for more information.\n");
@@ -11509,6 +11574,8 @@ out:
else if (!info->flag_mem_usage)
MSG("makedumpfile Completed.\n");
+ free_for_parallel();
+
if (info) {
if (info->dh_memory)
free(info->dh_memory);
@@ -11536,6 +11603,8 @@ out:
free(info->p2m_mfn_frame_list);
if (info->page_buf != NULL)
free(info->page_buf);
+ if (info->parallel_info != NULL)
+ free(info->parallel_info);
free(info);
if (splitblock) {
diff --git a/makedumpfile.h b/makedumpfile.h
index 67c2a38..42a8ee3 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -1974,6 +1974,8 @@ struct elf_prstatus {
#define OPT_NON_MMAP OPT_START+13
#define OPT_MEM_USAGE OPT_START+14
#define OPT_SPLITBLOCK_SIZE OPT_START+15
+#define OPT_NUM_THREADS OPT_START+16
+#define OPT_NUM_BUFFERS OPT_START+17
/*
* Function Prototype.
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [PATCH RFC 11/11] Add usage and manual about multiple threads process
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (9 preceding siblings ...)
2015-06-05 7:57 ` [PATCH RFC 10/11] Make makedumpfile available to read and compress pages parallelly Zhou Wenjian
@ 2015-06-05 7:57 ` Zhou Wenjian
2015-06-08 3:55 ` [PATCH RFC 00/11] makedumpfile: parallel processing "Zhou, Wenjian/周文剑"
2015-06-10 6:06 ` Atsushi Kumagai
12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05 7:57 UTC (permalink / raw)
To: kexec; +Cc: Qiao Nuohan
From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
makedumpfile.8 | 24 ++++++++++++++++++++++++
print_info.c | 16 ++++++++++++++++
2 files changed, 40 insertions(+), 0 deletions(-)
diff --git a/makedumpfile.8 b/makedumpfile.8
index 9752671..5a760c8 100644
--- a/makedumpfile.8
+++ b/makedumpfile.8
@@ -12,6 +12,8 @@ makedumpfile \- make a small dumpfile of kdump
.br
\fBmakedumpfile\fR \-\-split [\fIOPTION\fR] [\-x \fIVMLINUX\fR|\-i \fIVMCOREINFO\fR] \fIVMCORE\fR \fIDUMPFILE1\fR \fIDUMPFILE2\fR [\fIDUMPFILE3\fR ..]
.br
+\fBmakedumpfile\fR [\fIOPTION\fR] [\-x \fIVMLINUX\fR|\-i \fIVMCOREINFO\fR] \-\-num\-threads \fITHREADNUM\fR [\-\-num\-buffers \fIBUFNUM\fR] \fIVMCORE\fR \fIDUMPFILE\fR
+.br
\fBmakedumpfile\fR \-\-reassemble \fIDUMPFILE1\fR \fIDUMPFILE2\fR [\fIDUMPFILE3\fR ..] \fIDUMPFILE\fR
.br
\fBmakedumpfile\fR \-g \fIVMCOREINFO\fR \-x \fIVMLINUX\fR
@@ -371,6 +373,28 @@ the kdump\-compressed format.
# makedumpfile \-\-split \-d 31 \-x vmlinux /proc/vmcore dumpfile1 dumpfile2
.TP
+\fB\-\-num\-threads\fR \fITHREADNUM\fR
+Using multiple threads to read and compress data of each page in parallel.
+This will reduce the time needed for saving \fIDUMPFILE\fR.
+This feature only supports creating \fIDUMPFILE\fR in kdump\-compressed
+format from \fIVMCORE\fR in kdump\-compressed format or elf format.
+.br
+.B Example:
+.br
+# makedumpfile \-d 31 \-\-num\-threads 4 /proc/vmcore dumpfile
+
+.TP
+\fB\-\-num\-buffers\fR \fIBUFNUM\fR
+This option is used for multiple threads process, please check \-\-num\-threads
+option. Multiple threads process will need buffers to store generated page
+data by threads temporarily, and this option is used to specify the number
+of pages that can be stored.
+.br
+.B Example:
+.br
+# makedumpfile \-d 31 \-\-num\-threads 4 \-\-num\-buffers 30 /proc/vmcore dumpfile
+
+.TP
\fB\-\-reassemble\fR
Reassemble multiple \fIDUMPFILE\fRs, which are created by \-\-split option,
into one \fIDUMPFILE\fR. dumpfile1 and dumpfile2 are reassembled into dumpfile
diff --git a/print_info.c b/print_info.c
index 9215e0f..a830ee2 100644
--- a/print_info.c
+++ b/print_info.c
@@ -76,6 +76,10 @@ print_usage(void)
MSG(" # makedumpfile --split [OPTION] [-x VMLINUX|-i VMCOREINFO] VMCORE DUMPFILE1\n");
MSG(" DUMPFILE2 [DUMPFILE3 ..]\n");
MSG("\n");
+ MSG(" Using multiple threads to create DUMPFILE in parallel:\n");
+ MSG(" # makedumpfile [OPTION] [-x VMLINUX|-i VMCOREINFO] --num-threads THREADNUM\n");
+ MSG(" [--num-buffers BUFNUM] VMCORE DUMPFILE\n");
+ MSG("\n");
MSG(" Reassemble multiple DUMPFILEs:\n");
MSG(" # makedumpfile --reassemble DUMPFILE1 DUMPFILE2 [DUMPFILE3 ..] DUMPFILE\n");
MSG("\n");
@@ -184,6 +188,18 @@ print_usage(void)
MSG(" by the number of DUMPFILEs.\n");
MSG(" This feature supports only the kdump-compressed format.\n");
MSG("\n");
+ MSG(" [--num-threads THREADNUM]:\n");
+ MSG(" Using multiple threads to read and compress data of each page in parallel.\n");
+ MSG(" This will reduce the time needed for saving DUMPFILE.\n");
+ MSG(" This feature only supports creating DUMPFILE in kdump-compressed format from\n");
+ MSG(" VMCORE in kdump-compressed format or elf format.\n");
+ MSG("\n");
+ MSG(" [--num-buffers BUFNUM]:\n");
+ MSG(" This option is used for multiple threads process, please check --num-threads\n");
+ MSG(" option. Multiple threads process will need buffers to store generated page\n");
+ MSG(" data by threads temporarily, and this option is used to specify the number\n");
+ MSG(" of pages that can be stored.\n");
+ MSG("\n");
MSG(" [--reassemble]:\n");
MSG(" Reassemble multiple DUMPFILEs, which are created by --split option,\n");
MSG(" into one DUMPFILE. dumpfile1 and dumpfile2 are reassembled into dumpfile.\n");
--
1.7.1
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (10 preceding siblings ...)
2015-06-05 7:57 ` [PATCH RFC 11/11] Add usage and manual about multiple threads process Zhou Wenjian
@ 2015-06-08 3:55 ` "Zhou, Wenjian/周文剑"
2015-12-01 8:39 ` Chao Fan
2015-06-10 6:06 ` Atsushi Kumagai
12 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-06-08 3:55 UTC (permalink / raw)
To: kexec
hello all,
I tested this patch set on two machines and the following is the benchmark.
These tables show the time that makedumpfile spends; the unit is seconds.
"core-data" in the tables means the content of the vmcore.
For example:
core-data's value is 256. It means that in the vmcore, 256 * 8 bits of each page
are set to 1.
"-l" in the table means producing lzo format vmcore
"-c" in the table means producing kdump-compressed format vmcore
###################################machine with 128G memory
************ makedumpfile -d 0 ******************
core-data 256 1280
threads_num
-l
0 758 881
8 932 1014
16 973 1085
-c
0 3994 4071
8 966 1007
16 1053 1192
************ makedumpfile -d 3 ******************
core-data 256 1280
threads_num
-l
0 764 847
8 948 1058
16 943 1069
-c
0 4021 4050
8 949 1029
16 1051 1190
************ makedumpfile -d 31 ******************
core-data 256 1280
threads_num
-l
0 4 4
8 639 610
16 680 680
-c
0 14 13
8 607 610
16 631 662
###################################machine with 24G memory
************ makedumpfile -d 0 ******************
core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3584 3840 4096
threads_num
-l
0 15 140 186 196 196 196 196 197 197 197 195 195 195 195 186 131 15
4 9 136 189 204 204 202 201 200 201 200 200 202 204 203 189 136 9
8 11 131 193 198 198 202 206 205 206 205 205 202 198 197 193 132 11
12 18 137 194 202 203 197 201 203 204 202 201 196 202 202 194 136 17
-c
0 80 786 967 1031 874 849 700 608 652 603 764 768 873 1031 1016 776 80
4 82 262 315 321 296 256 255 220 218 221 241 268 303 320 319 259 84
8 58 148 174 189 179 189 196 198 199 198 196 190 178 174 170 145 57
12 56 112 131 157 170 189 200 204 204 203 199 191 170 157 132 111 59
************ makedumpfile -d 1 ******************
core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3584 3840 4096
threads_num
-l
0 16 134 194 204 204 205 205 206 205 207 204 203 204 204 193 134 15
4 9 132 193 197 196 198 199 200 200 200 199 197 196 197 192 132 9
8 12 135 189 202 204 200 197 196 197 195 196 199 203 202 189 136 12
12 16 130 190 200 200 205 202 201 200 201 202 205 199 200 189 131 17
-c
0 77 775 1009 1032 872 853 699 606 643 602 758 765 870 1026 1014 774 78
4 80 262 316 322 332 257 247 217 223 218 288 256 322 322 315 258 81
8 56 146 173 176 170 184 198 205 207 203 198 185 169 180 169 149 56
12 56 110 133 152 175 185 194 202 202 202 193 184 176 152 135 114 56
************ makedumpfile -d 7 ******************
core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3584 3840 4096
threads_num
-l
0 16 138 188 197 197 197 197 197 197 198 196 197 197 197 189 137 16
4 10 131 187 202 205 203 202 202 203 203 201 203 204 201 187 131 8
8 11 135 191 199 197 201 203 205 206 204 203 200 197 199 192 134 11
12 18 134 195 201 203 197 199 202 202 201 199 196 203 201 197 134 19
-c
0 77 770 1011 1032 871 841 698 621 645 601 763 765 870 1025 1014 773 78
4 81 263 311 320 319 255 240 216 242 214 240 257 300 319 314 255 80
8 57 157 176 172 174 191 196 199 199 199 195 191 173 171 167 146 57
12 55 111 136 156 170 188 203 204 204 203 201 186 168 156 136 112 56
************ makedumpfile -d 31 ******************
core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3584 3840 4096
threads_num
-l
0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
4 7 8 8 8 8 8 8 8 8 8 8 8 8 8 7 8 8
8 11 11 11 10 11 11 11 11 11 11 10 11 11 11 11 11 11
12 14 13 14 13 13 15 15 13 15 13 14 14 13 15 15 15 16
-c
0 4 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
8 12 12 12 13 12 12 12 12 12 12 13 12 14 13 12 12 13
12 14 16 14 14 13 15 15 15 14 14 14 14 16 14 15 15 14
On 06/05/2015 03:56 PM, Zhou Wenjian wrote:
> This patch set implements parallel processing by means of multiple threads.
> With this patch set, it is available to use multiple threads to read
> and compress pages. This parallel process will save time.
> This feature only supports creating dumpfile in kdump-compressed format from
> vmcore in kdump-compressed format or elf format. Currently, sadump and
> xen kdump are not supported.
>
> Qiao Nuohan (11):
> Add readpage_kdump_compressed_parallel
> Add mappage_elf_parallel
> Add readpage_elf_parallel
> Add read_pfn_parallel
> Add function to initial bitmap for parallel use
> Add filter_data_buffer_parallel
> Add write_kdump_pages_parallel to allow parallel process
> Add write_kdump_pages_parallel_cyclic to allow parallel process in
> cyclic_mode
> Initial and free data used for parallel process
> Make makedumpfile available to read and compress pages parallelly
> Add usage and manual about multiple threads process
>
> Makefile | 2 +
> erase_info.c | 29 +-
> erase_info.h | 2 +
> makedumpfile.8 | 24 +
> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> makedumpfile.h | 79 +++
> print_info.c | 16 +
> 7 files changed, 1652 insertions(+), 5 deletions(-)
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
--
Thanks
Zhou Wenjian
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
` (11 preceding siblings ...)
2015-06-08 3:55 ` [PATCH RFC 00/11] makedumpfile: parallel processing "Zhou, Wenjian/周文剑"
@ 2015-06-10 6:06 ` Atsushi Kumagai
2015-06-11 3:47 ` "Zhou, Wenjian/周文剑"
12 siblings, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-06-10 6:06 UTC (permalink / raw)
To: zhouwj-fnst@cn.fujitsu.com; +Cc: kexec@lists.infradead.org
Hello Zhou,
>This patch set implements parallel processing by means of multiple threads.
>With this patch set, it is available to use multiple threads to read
>and compress pages. This parallel process will save time.
>This feature only supports creating dumpfile in kdump-compressed format from
>vmcore in kdump-compressed format or elf format. Currently, sadump and
> xen kdump are not supported.
makedumpfile already has a parallel processing feature (--split),
it parallelizes not only page compression but also disk i/o, so
I think --split includes what you want to do by this patch.
In what case do you think this patch will be effective, what is
the advantage of this patch ?
Thanks
Atsushi Kumagai
>
>Qiao Nuohan (11):
> Add readpage_kdump_compressed_parallel
> Add mappage_elf_parallel
> Add readpage_elf_parallel
> Add read_pfn_parallel
> Add function to initial bitmap for parallel use
> Add filter_data_buffer_parallel
> Add write_kdump_pages_parallel to allow parallel process
> Add write_kdump_pages_parallel_cyclic to allow parallel process in
> cyclic_mode
> Initial and free data used for parallel process
> Make makedumpfile available to read and compress pages parallelly
> Add usage and manual about multiple threads process
>
> Makefile | 2 +
> erase_info.c | 29 +-
> erase_info.h | 2 +
> makedumpfile.8 | 24 +
> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> makedumpfile.h | 79 +++
> print_info.c | 16 +
> 7 files changed, 1652 insertions(+), 5 deletions(-)
>
>
>_______________________________________________
>kexec mailing list
>kexec@lists.infradead.org
>http://lists.infradead.org/mailman/listinfo/kexec
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-06-10 6:06 ` Atsushi Kumagai
@ 2015-06-11 3:47 ` "Zhou, Wenjian/周文剑"
2015-06-15 1:59 ` qiaonuohan
0 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-06-11 3:47 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: kexec@lists.infradead.org
hello,
though --split can process in parallel, it cannot produce just one core file.
more processes mean better performance, but they also mean more split core files.
people may want to produce just one core file, yet still prefer parallel
processing for its better performance.
so, parallel processing by multiple threads is needed.
in the future, multiple threads can also be used in each split process to
accelerate process.
On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
> Hello Zhou,
>
>> This patch set implements parallel processing by means of multiple threads.
>> With this patch set, it is available to use multiple threads to read
>> and compress pages. This parallel process will save time.
>> This feature only supports creating dumpfile in kdump-compressed format from
>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>> xen kdump are not supported.
>
> makedumpfile already has a parallel processing feature (--split),
> it parallelizes not only page compression but also disk i/o, so
> I think --split includes what you want to do by this patch.
>
> In what case do you think this patch will be effective, what is
> the advantage of this patch ?
>
>
> Thanks
> Atsushi Kumagai
>
>>
>> Qiao Nuohan (11):
>> Add readpage_kdump_compressed_parallel
>> Add mappage_elf_parallel
>> Add readpage_elf_parallel
>> Add read_pfn_parallel
>> Add function to initial bitmap for parallel use
>> Add filter_data_buffer_parallel
>> Add write_kdump_pages_parallel to allow parallel process
>> Add write_kdump_pages_parallel_cyclic to allow parallel process in
>> cyclic_mode
>> Initial and free data used for parallel process
>> Make makedumpfile available to read and compress pages parallelly
>> Add usage and manual about multiple threads process
>>
>> Makefile | 2 +
>> erase_info.c | 29 +-
>> erase_info.h | 2 +
>> makedumpfile.8 | 24 +
>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> makedumpfile.h | 79 +++
>> print_info.c | 16 +
>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
--
Thanks
Zhou Wenjian
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-06-11 3:47 ` "Zhou, Wenjian/周文剑"
@ 2015-06-15 1:59 ` qiaonuohan
2015-06-15 5:57 ` Atsushi Kumagai
0 siblings, 1 reply; 43+ messages in thread
From: qiaonuohan @ 2015-06-15 1:59 UTC (permalink / raw)
To: "Zhou, Wenjian/周文剑", Atsushi Kumagai
Cc: kexec@lists.infradead.org
On 06/11/2015 11:47 AM, "Zhou, Wenjian/周文剑" wrote:
> hello,
>
> though --split can parallel process, it can't just produce one core.
> more processes, better performance. but it also means more split cores.
> people may want to just produce one core, however they still prefer parallel
> processing for its better performance.
>
> so, parallel processing by multiple threads is needed.
> in the future, multiple threads can also be used in each split process to
> accelerate process.
>
>
> On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
>> Hello Zhou,
>>
Hello Atsushi,
>>> This patch set implements parallel processing by means of multiple threads.
>>> With this patch set, it is available to use multiple threads to read
>>> and compress pages. This parallel process will save time.
>>> This feature only supports creating dumpfile in kdump-compressed format from
>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>> xen kdump are not supported.
>>
>> makedumpfile already has a parallel processing feature (--split),
>> it parallelizes not only page compression but also disk i/o, so
>> I think --split includes what you want to do by this patch.
>>
>> In what case do you think this patch will be effective, what is
>> the advantage of this patch ?
Since commit 428a5e99eea929639ab9c761f33743f78a961b1a (kdumpctl: Pass
disable_cpu_apicid to kexec of capture kernel) has been merged, it is possible for
us to use multiple cpus in the 2nd kernel.
Using multiple threads is trying to take advantage of multiple cpus in 2nd kernel.
Since memory becomes bigger and bigger, dumping spends more time. Why not take
advantage of multiple cpus?
OTOH, --split does a lot help to improve performance. But more processes
means more files, saving multiple files and managing those files is not that
convenient.
Multiple threads do have some merit in improving performance. And later, as zhou
said, we can also try to combine --split with multiple threads to save more time.
--
Regards
Qiao Nuohan
>>
>>
>> Thanks
>> Atsushi Kumagai
>>
>>>
>>> Qiao Nuohan (11):
>>> Add readpage_kdump_compressed_parallel
>>> Add mappage_elf_parallel
>>> Add readpage_elf_parallel
>>> Add read_pfn_parallel
>>> Add function to initial bitmap for parallel use
>>> Add filter_data_buffer_parallel
>>> Add write_kdump_pages_parallel to allow parallel process
>>> Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>> cyclic_mode
>>> Initial and free data used for parallel process
>>> Make makedumpfile available to read and compress pages parallelly
>>> Add usage and manual about multiple threads process
>>>
>>> Makefile | 2 +
>>> erase_info.c | 29 +-
>>> erase_info.h | 2 +
>>> makedumpfile.8 | 24 +
>>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>> makedumpfile.h | 79 +++
>>> print_info.c | 16 +
>>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>>
>>>
>>> _______________________________________________
>>> kexec mailing list
>>> kexec@lists.infradead.org
>>> http://lists.infradead.org/mailman/listinfo/kexec
>
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-06-15 1:59 ` qiaonuohan
@ 2015-06-15 5:57 ` Atsushi Kumagai
2015-06-15 6:06 ` qiaonuohan
2015-06-15 6:07 ` qiaonuohan
0 siblings, 2 replies; 43+ messages in thread
From: Atsushi Kumagai @ 2015-06-15 5:57 UTC (permalink / raw)
To: qiaonuohan@cn.fujitsu.com, zhouwj-fnst@cn.fujitsu.com
Cc: kexec@lists.infradead.org
Hello Qiao,
>> On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
>>> Hello Zhou,
>>>
>
>Hello Atsushi,
>
>>>> This patch set implements parallel processing by means of multiple threads.
>>>> With this patch set, it is available to use multiple threads to read
>>>> and compress pages. This parallel process will save time.
>>>> This feature only supports creating dumpfile in kdump-compressed format from
>>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>>> xen kdump are not supported.
>>>
>>> makedumpfile already has a parallel processing feature (--split),
>>> it parallelizes not only page compression but also disk i/o, so
>>> I think --split includes what you want to do by this patch.
>>>
>>> In what case do you think this patch will be effective, what is
>>> the advantage of this patch ?
>
>Since commit 428a5e99eea929639ab9c761f33743f78a961b1a(kdumpctl: Pass
>disable_cpu_apicid to kexec of capture kernel) has been merged. It is possible for
>us to use multiple cpus in 2nd kernel.
>
>Using multiple threads is trying to take advantage of multiple cpus in 2nd kernel.
>Since memory becomes bigger and bigger, dumping spends more time. Why not take
>advantage of multiple cpus?
>
>OTOH, --split does a lot help to improve performance. But more processes
>means more files, saving multiple files and managing those files is not that
>convenient.
I see, actually I guess some users may feel lazy to use --split since
it requires concatenation for analyzing, and it seems that some improvements
by using multiple threads can be expected at least in the zlib case.
So I agree with the concept.
>Multiple threads do have some merit in improving performance. And later, as zhou
>said, we can also try to combine --split with multiple threads to save more time.
At first I thought it's enough to modify --split path to generate single vmcore.
However, if the compression process is the bottleneck, we should allot multiple
cpus to each i/o process when doing parallel i/o. For that reason, it's good to
introduce the new feature to create multiple threads in addition to --split.
Just one thing, when you make the complete version, please make it on the devel
branch because cyclic/non-cyclic codes have been changed from v1.5.8.
Thanks
Atsushi Kumagai
>--
>Regards
>Qiao Nuohan
>
>>>
>>>
>>> Thanks
>>> Atsushi Kumagai
>>>
>>>>
>>>> Qiao Nuohan (11):
>>>> Add readpage_kdump_compressed_parallel
>>>> Add mappage_elf_parallel
>>>> Add readpage_elf_parallel
>>>> Add read_pfn_parallel
>>>> Add function to initial bitmap for parallel use
>>>> Add filter_data_buffer_parallel
>>>> Add write_kdump_pages_parallel to allow parallel process
>>>> Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>>> cyclic_mode
>>>> Initial and free data used for parallel process
>>>> Make makedumpfile available to read and compress pages parallelly
>>>> Add usage and manual about multiple threads process
>>>>
>>>> Makefile | 2 +
>>>> erase_info.c | 29 +-
>>>> erase_info.h | 2 +
>>>> makedumpfile.8 | 24 +
>>>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>> makedumpfile.h | 79 +++
>>>> print_info.c | 16 +
>>>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>>>
>>>>
>>>> _______________________________________________
>>>> kexec mailing list
>>>> kexec@lists.infradead.org
>>>> http://lists.infradead.org/mailman/listinfo/kexec
>>
>>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-06-15 5:57 ` Atsushi Kumagai
@ 2015-06-15 6:06 ` qiaonuohan
2015-06-15 6:07 ` qiaonuohan
1 sibling, 0 replies; 43+ messages in thread
From: qiaonuohan @ 2015-06-15 6:06 UTC (permalink / raw)
To: Atsushi Kumagai, zhouwj-fnst@cn.fujitsu.com; +Cc: kexec@lists.infradead.org
On 06/15/2015 01:57 PM, Atsushi Kumagai wrote:
> Hello Qiao,
>
>>> On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
>>>> Hello Zhou,
>>>>
>>
>> Hello Atsushi,
>>
>>>>> This patch set implements parallel processing by means of multiple threads.
>>>>> With this patch set, it is available to use multiple threads to read
>>>>> and compress pages. This parallel process will save time.
>>>>> This feature only supports creating dumpfile in kdump-compressed format from
>>>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>>>> xen kdump are not supported.
>>>>
>>>> makedumpfile already has a parallel processing feature (--split),
>>>> it parallelizes not only page compression but also disk i/o, so
>>>> I think --split includes what you want to do by this patch.
>>>>
>>>> In what case do you think this patch will be effective, what is
>>>> the advantage of this patch ?
>>
>> Since commit 428a5e99eea929639ab9c761f33743f78a961b1a(kdumpctl: Pass
>> disable_cpu_apicid to kexec of capture kernel) has been merged. It is possible for
>> us to use multiple cpus in 2nd kernel.
>>
>> Using multiple threads is trying to take advantage of multiple cpus in 2nd kernel.
>> Since memory becomes bigger and bigger, dumping spends more time. Why not take
>> advantage of multiple cpus?
>>
>> OTOH, --split does a lot help to improve performance. But more processes
>> means more files, saving multiple files and managing those files is not that
>> convenient.
>
> I see, actually I guess some users may feel lazy to use --split since
> it requires concatenation for analyzing, and it seems that some improvements
> by using multiple threads can be expected at least in the zlib case.
> So I agree with the concept.
>
>> Multiple threads do have some merit in improving performance. And later, as zhou
>> said, we can also try to combine --split with multiple threads to save more time.
>
> At first I thought it's enough to modify --split path to generate single vmcore.
> However, if the compression process is the bottleneck, we should allot multiple
> cpus to each i/o process when doing parallel i/o. For that reason, it's good to
> introduce the new feature to create multiple threads in addition to --split.
I see.
>
> Just one thing, when you make the complete version, please make it on the devel
> branch because cyclic/non-cyclic codes have been changed from v1.5.8.
Yes, we will start rebasing the code.
>
>
> Thanks
> Atsushi Kumagai
>
>
>> --
>> Regards
>> Qiao Nuohan
>>
>>>>
>>>>
>>>> Thanks
>>>> Atsushi Kumagai
>>>>
>>>>>
>>>>> Qiao Nuohan (11):
>>>>> Add readpage_kdump_compressed_parallel
>>>>> Add mappage_elf_parallel
>>>>> Add readpage_elf_parallel
>>>>> Add read_pfn_parallel
>>>>> Add function to initial bitmap for parallel use
>>>>> Add filter_data_buffer_parallel
>>>>> Add write_kdump_pages_parallel to allow parallel process
>>>>> Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>>>> cyclic_mode
>>>>> Initial and free data used for parallel process
>>>>> Make makedumpfile available to read and compress pages parallelly
>>>>> Add usage and manual about multiple threads process
>>>>>
>>>>> Makefile | 2 +
>>>>> erase_info.c | 29 +-
>>>>> erase_info.h | 2 +
>>>>> makedumpfile.8 | 24 +
>>>>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>> makedumpfile.h | 79 +++
>>>>> print_info.c | 16 +
>>>>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> kexec mailing list
>>>>> kexec@lists.infradead.org
>>>>> http://lists.infradead.org/mailman/listinfo/kexec
>>>
>>>
>
--
Regards
Qiao Nuohan
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-06-15 5:57 ` Atsushi Kumagai
2015-06-15 6:06 ` qiaonuohan
@ 2015-06-15 6:07 ` qiaonuohan
1 sibling, 0 replies; 43+ messages in thread
From: qiaonuohan @ 2015-06-15 6:07 UTC (permalink / raw)
To: Atsushi Kumagai, zhouwj-fnst@cn.fujitsu.com; +Cc: kexec@lists.infradead.org
On 06/15/2015 01:57 PM, Atsushi Kumagai wrote:
> Hello Qiao,
>
>>> On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
>>>> Hello Zhou,
>>>>
>>
>> Hello Atsushi,
>>
>>>>> This patch set implements parallel processing by means of multiple threads.
>>>>> With this patch set, it is available to use multiple threads to read
>>>>> and compress pages. This parallel process will save time.
>>>>> This feature only supports creating dumpfile in kdump-compressed format from
>>>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>>>> xen kdump are not supported.
>>>>
>>>> makedumpfile already has a parallel processing feature (--split),
>>>> it parallelizes not only page compression but also disk i/o, so
>>>> I think --split includes what you want to do by this patch.
>>>>
>>>> In what case do you think this patch will be effective, what is
>>>> the advantage of this patch ?
>>
>> Since commit 428a5e99eea929639ab9c761f33743f78a961b1a(kdumpctl: Pass
>> disable_cpu_apicid to kexec of capture kernel) has been merged. It is possible for
>> us to use multiple cpus in 2nd kernel.
>>
>> Using multiple threads is trying to take advantage of multiple cpus in 2nd kernel.
>> Since memory becomes bigger and bigger, dumping spends more time. Why not take
>> advantage of multiple cpus?
>>
>> OTOH, --split does a lot help to improve performance. But more processes
>> means more files, saving multiple files and managing those files is not that
>> convenient.
>
> I see, actually I guess some users may feel lazy to use --split since
> it requires concatenation for analyzing, and it seems that some improvements
> by using multiple threads can be expected at least in the zlib case.
> So I agree with the concept.
>
>> Multiple threads do have some merit in improving performance. And later, as zhou
>> said, we can also try to combine --split with multiple threads to save more time.
>
> At first I thought it's enough to modify --split path to generate single vmcore.
> However, if the compression process is the bottleneck, we should allot multiple
> cpus to each i/o process when doing parallel i/o. For that reason, it's good to
> introduce the new feature to create multiple threads in addition to --split.
I see.
>
> Just one thing, when you make the complete version, please make it on the devel
> branch because cyclic/non-cyclic codes have been changed from v1.5.8.
Yes, we will start rebasing the code.
>
>
> Thanks
> Atsushi Kumagai
>
>
>> --
>> Regards
>> Qiao Nuohan
>>
>>>>
>>>>
>>>> Thanks
>>>> Atsushi Kumagai
>>>>
>>>>>
>>>>> Qiao Nuohan (11):
>>>>> Add readpage_kdump_compressed_parallel
>>>>> Add mappage_elf_parallel
>>>>> Add readpage_elf_parallel
>>>>> Add read_pfn_parallel
>>>>> Add function to initial bitmap for parallel use
>>>>> Add filter_data_buffer_parallel
>>>>> Add write_kdump_pages_parallel to allow parallel process
>>>>> Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>>>> cyclic_mode
>>>>> Initial and free data used for parallel process
>>>>> Make makedumpfile available to read and compress pages parallelly
>>>>> Add usage and manual about multiple threads process
>>>>>
>>>>> Makefile | 2 +
>>>>> erase_info.c | 29 +-
>>>>> erase_info.h | 2 +
>>>>> makedumpfile.8 | 24 +
>>>>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>> makedumpfile.h | 79 +++
>>>>> print_info.c | 16 +
>>>>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> kexec mailing list
>>>>> kexec@lists.infradead.org
>>>>> http://lists.infradead.org/mailman/listinfo/kexec
>>>
>>>
>
--
Regards
Qiao Nuohan
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-06-08 3:55 ` [PATCH RFC 00/11] makedumpfile: parallel processing "Zhou, Wenjian/周文剑"
@ 2015-12-01 8:39 ` Chao Fan
2015-12-02 5:29 ` "Zhou, Wenjian/周文剑"
0 siblings, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-01 8:39 UTC (permalink / raw)
To: Wenjian Zhou/周文剑; +Cc: Shaohui Deng, kexec
Hi Zhou Wenjian,
I did some tests according to your tables. I have a problem when I set
dump_level to 31. The machine has 1T memory, and when dump_level was set
to 31, the size of vmcore is 17G. The kernel is 3.10.0-327.el7.x86_64.
The kexec-tools is kexec-tools-2.0.7-38.el7.x86_64.
If I use
core_collector time makedumpfile -l --message-level 1 -d 31
in kdump based on makedumpfile 1.5.7, the time is
63 seconds(the average of many tests).
And then I use the kdump based on makedumpfile 1.5.9.
core_collector time makedumpfile -l --message-level 1 -d 31
the time is 58 seconds.
core_collector time makedumpfile --num-threads 1 -l --message-level 1 -d 31
the time is 240 seconds.
core_collector time makedumpfile --num-threads 2 -l --message-level 1 -d 31
the time is 189 seconds.
core_collector time makedumpfile --num-threads 4 -l --message-level 1 -d 31
the time is 220 seconds.
core_collector time makedumpfile --num-threads 8 -l --message-level 1 -d 31
the time is 417 seconds.
core_collector time makedumpfile --num-threads 12 -l --message-level 1 -d 31
the time is 579 seconds.
core_collector time makedumpfile --num-threads 16 -l --message-level 1 -d 31
the time is 756 seconds.
So I do not know why, if I add "--num-threads", makedumpfile will use more
time than without "--num-threads". Your table also shows that with
makedumpfile -d 31, when threads_num is 0, makedumpfile is fastest.
If there are any problems in my tests, please tell me.
Thanks,
Chao Fan
----- Original Message -----
> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> To: kexec@lists.infradead.org
> Sent: Monday, June 8, 2015 11:55:41 AM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>
> hello all,
>
> I test this patch set in two machines and the following is the benchmark.
>
> These tables show the time that makedumpfile spends. And the unit is second.
>
> "core-data" in the table means the context in the vmcore.
> For example:
> core-data's value is 256. It means that in the vmcore, 256 * 8 bits of each
> page
> are set to 1.
>
> "-l" in the table means producing lzo format vmcore
>
> "-c" in the table means producing kdump-compressed format vmcore
>
> ###################################machine with 128G memory
>
> ************ makedumpfile -d 0 ******************
> core-data 256 1280
> threads_num
> -l
> 0 758 881
> 8 932 1014
> 16 973 1085
> -c
> 0 3994 4071
> 8 966 1007
> 16 1053 1192
>
> ************ makedumpfile -d 3 ******************
> core-data 256 1280
> threads_num
> -l
> 0 764 847
> 8 948 1058
> 16 943 1069
> -c
> 0 4021 4050
> 8 949 1029
> 16 1051 1190
>
> ************ makedumpfile -d 31 ******************
> core-data 256 1280
> threads_num
> -l
> 0 4 4
> 8 639 610
> 16 680 680
> -c
> 0 14 13
> 8 607 610
> 16 631 662
>
> ###################################machine with 24G memory
>
> ************ makedumpfile -d 0 ******************
> core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328
> 3584 3840 4096
> threads_num
> -l
> 0 15 140 186 196 196 196 196 197 197 197 195 195 195 195 186 131 15
> 4 9 136 189 204 204 202 201 200 201 200 200 202 204 203 189 136 9
> 8 11 131 193 198 198 202 206 205 206 205 205 202 198 197 193 132 11
> 12 18 137 194 202 203 197 201 203 204 202 201 196 202 202 194 136 17
> -c
> 0 80 786 967 1031 874 849 700 608 652 603 764 768 873 1031 1016 776 80
> 4 82 262 315 321 296 256 255 220 218 221 241 268 303 320 319 259 84
> 8 58 148 174 189 179 189 196 198 199 198 196 190 178 174 170 145 57
> 12 56 112 131 157 170 189 200 204 204 203 199 191 170 157 132 111 59
>
> ************ makedumpfile -d 1 ******************
> core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328
> 3584 3840 4096
> threads_num
> -l
> 0 16 134 194 204 204 205 205 206 205 207 204 203 204 204 193 134 15
> 4 9 132 193 197 196 198 199 200 200 200 199 197 196 197 192 132 9
> 8 12 135 189 202 204 200 197 196 197 195 196 199 203 202 189 136 12
> 12 16 130 190 200 200 205 202 201 200 201 202 205 199 200 189 131 17
> -c
> 0 77 775 1009 1032 872 853 699 606 643 602 758 765 870 1026 1014 774 78
> 4 80 262 316 322 332 257 247 217 223 218 288 256 322 322 315 258 81
> 8 56 146 173 176 170 184 198 205 207 203 198 185 169 180 169 149 56
> 12 56 110 133 152 175 185 194 202 202 202 193 184 176 152 135 114 56
>
> ************ makedumpfile -d 7 ******************
> core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328
> 3584 3840 4096
> threads_num
> -l
> 0 16 138 188 197 197 197 197 197 197 198 196 197 197 197 189 137 16
> 4 10 131 187 202 205 203 202 202 203 203 201 203 204 201 187 131 8
> 8 11 135 191 199 197 201 203 205 206 204 203 200 197 199 192 134 11
> 12 18 134 195 201 203 197 199 202 202 201 199 196 203 201 197 134 19
> -c
> 0 77 770 1011 1032 871 841 698 621 645 601 763 765 870 1025 1014 773 78
> 4 81 263 311 320 319 255 240 216 242 214 240 257 300 319 314 255 80
> 8 57 157 176 172 174 191 196 199 199 199 195 191 173 171 167 146 57
> 12 55 111 136 156 170 188 203 204 204 203 201 186 168 156 136 112 56
>
> ************ makedumpfile -d 31 ******************
> core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328
> 3584 3840 4096
> threads_num
> -l
> 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
> 4 7 8 8 8 8 8 8 8 8 8 8 8 8 8 7 8 8
> 8 11 11 11 10 11 11 11 11 11 11 10 11 11 11 11 11 11
> 12 14 13 14 13 13 15 15 13 15 13 14 14 13 15 15 15 16
> -c
> 0 4 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4
> 4 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
> 8 12 12 12 13 12 12 12 12 12 12 13 12 14 13 12 12 13
> 12 14 16 14 14 13 15 15 15 14 14 14 14 16 14 15 15 14
>
>
> On 06/05/2015 03:56 PM, Zhou Wenjian wrote:
> > This patch set implements parallel processing by means of multiple threads.
> > With this patch set, it is available to use multiple threads to read
> > and compress pages. This parallel process will save time.
> > This feature only supports creating dumpfile in kdump-compressed format
> > from
> > vmcore in kdump-compressed format or elf format. Currently, sadump and
> > xen kdump are not supported.
> >
> > Qiao Nuohan (11):
> > Add readpage_kdump_compressed_parallel
> > Add mappage_elf_parallel
> > Add readpage_elf_parallel
> > Add read_pfn_parallel
> > Add function to initial bitmap for parallel use
> > Add filter_data_buffer_parallel
> > Add write_kdump_pages_parallel to allow parallel process
> > Add write_kdump_pages_parallel_cyclic to allow parallel process in
> > cyclic_mode
> > Initial and free data used for parallel process
> > Make makedumpfile available to read and compress pages parallelly
> > Add usage and manual about multiple threads process
> >
> > Makefile | 2 +
> > erase_info.c | 29 +-
> > erase_info.h | 2 +
> > makedumpfile.8 | 24 +
> > makedumpfile.c | 1505
> > +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > makedumpfile.h | 79 +++
> > print_info.c | 16 +
> > 7 files changed, 1652 insertions(+), 5 deletions(-)
> >
> >
> > _______________________________________________
> > kexec mailing list
> > kexec@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/kexec
>
>
> --
> Thanks
> Zhou Wenjian
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-01 8:39 ` Chao Fan
@ 2015-12-02 5:29 ` "Zhou, Wenjian/周文剑"
2015-12-02 7:24 ` Dave Young
0 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-02 5:29 UTC (permalink / raw)
To: Chao Fan; +Cc: Shaohui Deng, kexec
On 12/01/2015 04:39 PM, Chao Fan wrote:
> Hi Zhou Wenjian,
>
> I did some tests according to your tables. I have a problem when I set
> dump_level to 31. The machine has 1T memory, and when dump_level was set
> to 31, the size of vmcore is 17G. The kernel is 3.10.0-327.el7.x86_64.
> The kexec-tools is kexec-tools-2.0.7-38.el7.x86_64.
>
> If I use
> core_collector time makedumpfile -l --message-level 1 -d 31
> in kdump based on makedumpfile 1.5.7, the time is
> 63 seconds(the average of many tests).
>
> And then I use the kdump based on makedumpfile 1.5.9.
> core_collector time makedumpfile -l --message-level 1 -d 31
> the time is 58 seconds.
>
> core_collector time makedumpfile --num-threads 1 -l --message-level 1 -d 31
> the time is 240 seconds.
>
> core_collector time makedumpfile --num-threads 2 -l --message-level 1 -d 31
> the time is 189 seconds.
>
> core_collector time makedumpfile --num-threads 4 -l --message-level 1 -d 31
> the time is 220 seconds.
>
> core_collector time makedumpfile --num-threads 8 -l --message-level 1 -d 31
> the time is 417 seconds.
>
> core_collector time makedumpfile --num-threads 12 -l --message-level 1 -d 31
> the time is 579 seconds.
>
> core_collector time makedumpfile --num-threads 16 -l --message-level 1 -d 31
> the time is 756 seconds.
>
> So I do not know why if I add "--num-threads", the makedumpfile will use more
> time than without "--num-threads". Since your table also shows that
> makedumpfile -d 31, the threads_num is 0, the makdumpfile is fatest.
>
> If there are any problems in my tests, please tell me.
>
Hello,
I think there is no problem if other test results are as expected.
--num-threads mainly reduces the time of compressing.
So for lzo, it can't do much help at most of time.
However, when "-d 31" is specified, it will be worse.
Less than 50 buffers are used to cache the compressed page.
And even the page has been filtered, it will also take a buffer.
So if "-d 31" is specified, the filtered pages will use a lot
of buffers. Then the pages which need to be compressed can't
be compressed in parallel.
So, it's not strange that "--num-threads" will take more time
with "-l -d 31".
--
Thanks
Zhou
> Thanks,
> Chao Fan
>
> ----- Original Message -----
>> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> To: kexec@lists.infradead.org
>> Sent: Monday, June 8, 2015 11:55:41 AM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> hello all,
>>
>> I test this patch set in two machines and the following is the benchmark.
>>
>> These tables show the time that makedumpfile spends. And the unit is second.
>>
>> "core-data" in the table means the context in the vmcore.
>> For example:
>> core-data's value is 256. It means that in the vmcore, 256 * 8 bits of each
>> page
>> are set to 1.
>>
>> "-l" in the table means producing lzo format vmcore
>>
>> "-c" in the table means producing kdump-compressed format vmcore
>>
>> ###################################machine with 128G memory
>>
>> ************ makedumpfile -d 0 ******************
>> core-data 256 1280
>> threads_num
>> -l
>> 0 758 881
>> 8 932 1014
>> 16 973 1085
>> -c
>> 0 3994 4071
>> 8 966 1007
>> 16 1053 1192
>>
>> ************ makedumpfile -d 3 ******************
>> core-data 256 1280
>> threads_num
>> -l
>> 0 764 847
>> 8 948 1058
>> 16 943 1069
>> -c
>> 0 4021 4050
>> 8 949 1029
>> 16 1051 1190
>>
>> ************ makedumpfile -d 31 ******************
>> core-data 256 1280
>> threads_num
>> -l
>> 0 4 4
>> 8 639 610
>> 16 680 680
>> -c
>> 0 14 13
>> 8 607 610
>> 16 631 662
>>
>> ###################################machine with 24G memory
>>
>> ************ makedumpfile -d 0 ******************
>> core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328
>> 3584 3840 4096
>> threads_num
>> -l
>> 0 15 140 186 196 196 196 196 197 197 197 195 195 195 195 186 131 15
>> 4 9 136 189 204 204 202 201 200 201 200 200 202 204 203 189 136 9
>> 8 11 131 193 198 198 202 206 205 206 205 205 202 198 197 193 132 11
>> 12 18 137 194 202 203 197 201 203 204 202 201 196 202 202 194 136 17
>> -c
>> 0 80 786 967 1031 874 849 700 608 652 603 764 768 873 1031 1016 776 80
>> 4 82 262 315 321 296 256 255 220 218 221 241 268 303 320 319 259 84
>> 8 58 148 174 189 179 189 196 198 199 198 196 190 178 174 170 145 57
>> 12 56 112 131 157 170 189 200 204 204 203 199 191 170 157 132 111 59
>>
>> ************ makedumpfile -d 1 ******************
>> core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328
>> 3584 3840 4096
>> threads_num
>> -l
>> 0 16 134 194 204 204 205 205 206 205 207 204 203 204 204 193 134 15
>> 4 9 132 193 197 196 198 199 200 200 200 199 197 196 197 192 132 9
>> 8 12 135 189 202 204 200 197 196 197 195 196 199 203 202 189 136 12
>> 12 16 130 190 200 200 205 202 201 200 201 202 205 199 200 189 131 17
>> -c
>> 0 77 775 1009 1032 872 853 699 606 643 602 758 765 870 1026 1014 774 78
>> 4 80 262 316 322 332 257 247 217 223 218 288 256 322 322 315 258 81
>> 8 56 146 173 176 170 184 198 205 207 203 198 185 169 180 169 149 56
>> 12 56 110 133 152 175 185 194 202 202 202 193 184 176 152 135 114 56
>>
>> ************ makedumpfile -d 7 ******************
>> core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328
>> 3584 3840 4096
>> threads_num
>> -l
>> 0 16 138 188 197 197 197 197 197 197 198 196 197 197 197 189 137 16
>> 4 10 131 187 202 205 203 202 202 203 203 201 203 204 201 187 131 8
>> 8 11 135 191 199 197 201 203 205 206 204 203 200 197 199 192 134 11
>> 12 18 134 195 201 203 197 199 202 202 201 199 196 203 201 197 134 19
>> -c
>> 0 77 770 1011 1032 871 841 698 621 645 601 763 765 870 1025 1014 773 78
>> 4 81 263 311 320 319 255 240 216 242 214 240 257 300 319 314 255 80
>> 8 57 157 176 172 174 191 196 199 199 199 195 191 173 171 167 146 57
>> 12 55 111 136 156 170 188 203 204 204 203 201 186 168 156 136 112 56
>>
>> ************ makedumpfile -d 31 ******************
>> core-data 0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328
>> 3584 3840 4096
>> threads_num
>> -l
>> 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
>> 4 7 8 8 8 8 8 8 8 8 8 8 8 8 8 7 8 8
>> 8 11 11 11 10 11 11 11 11 11 11 10 11 11 11 11 11 11
>> 12 14 13 14 13 13 15 15 13 15 13 14 14 13 15 15 15 16
>> -c
>> 0 4 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4
>> 4 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
>> 8 12 12 12 13 12 12 12 12 12 12 13 12 14 13 12 12 13
>> 12 14 16 14 14 13 15 15 15 14 14 14 14 16 14 15 15 14
>>
>>
>> On 06/05/2015 03:56 PM, Zhou Wenjian wrote:
>>> This patch set implements parallel processing by means of multiple threads.
>>> With this patch set, it is available to use multiple threads to read
>>> and compress pages. This parallel process will save time.
>>> This feature only supports creating dumpfile in kdump-compressed format
>>> from
>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>> xen kdump are not supported.
>>>
>>> Qiao Nuohan (11):
>>> Add readpage_kdump_compressed_parallel
>>> Add mappage_elf_parallel
>>> Add readpage_elf_parallel
>>> Add read_pfn_parallel
>>> Add function to initial bitmap for parallel use
>>> Add filter_data_buffer_parallel
>>> Add write_kdump_pages_parallel to allow parallel process
>>> Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>> cyclic_mode
>>> Initial and free data used for parallel process
>>> Make makedumpfile available to read and compress pages parallelly
>>> Add usage and manual about multiple threads process
>>>
>>> Makefile | 2 +
>>> erase_info.c | 29 +-
>>> erase_info.h | 2 +
>>> makedumpfile.8 | 24 +
>>> makedumpfile.c | 1505
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>> makedumpfile.h | 79 +++
>>> print_info.c | 16 +
>>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>>
>>>
>>> _______________________________________________
>>> kexec mailing list
>>> kexec@lists.infradead.org
>>> http://lists.infradead.org/mailman/listinfo/kexec
>>
>>
>> --
>> Thanks
>> Zhou Wenjian
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-02 5:29 ` "Zhou, Wenjian/周文剑"
@ 2015-12-02 7:24 ` Dave Young
2015-12-02 7:38 ` "Zhou, Wenjian/周文剑"
0 siblings, 1 reply; 43+ messages in thread
From: Dave Young @ 2015-12-02 7:24 UTC (permalink / raw)
To: "Zhou, Wenjian/周文剑"
Cc: Chao Fan, Shaohui Deng, kexec
Hi,
On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> I think there is no problem if other test results are as expected.
>
> --num-threads mainly reduces the time of compressing.
> So for lzo, it can't do much help at most of time.
Seems the help of --num-threads does not say it exactly:
[--num-threads THREADNUM]:
Using multiple threads to read and compress data of each page in parallel.
And it will reduces time for saving DUMPFILE.
This feature only supports creating DUMPFILE in kdump-comressed format from
VMCORE in kdump-compressed format or elf format.
Lzo is also a compress method, it should be mentioned that --num-threads only
supports zlib compressed vmcore.
Also worth to mention about the recommended -d value for this feature.
Thanks
Dave
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-02 7:24 ` Dave Young
@ 2015-12-02 7:38 ` "Zhou, Wenjian/周文剑"
2015-12-04 2:30 ` Atsushi Kumagai
0 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-02 7:38 UTC (permalink / raw)
To: Dave Young; +Cc: Chao Fan, Shaohui Deng, kexec
On 12/02/2015 03:24 PM, Dave Young wrote:
> Hi,
>
> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> I think there is no problem if other test results are as expected.
>>
>> --num-threads mainly reduces the time of compressing.
>> So for lzo, it can't do much help at most of time.
>
> Seems the help of --num-threads does not say it exactly:
>
> [--num-threads THREADNUM]:
> Using multiple threads to read and compress data of each page in parallel.
> And it will reduces time for saving DUMPFILE.
> This feature only supports creating DUMPFILE in kdump-comressed format from
> VMCORE in kdump-compressed format or elf format.
>
> Lzo is also a compress method, it should be mentioned that --num-threads only
> supports zlib compressed vmcore.
>
Sorry, it seems that something I said was not so clear.
lzo is also supported. Since lzo compresses data at a high speed, the
improvement in performance is not so obvious most of the time.
> Also worth to mention about the recommended -d value for this feature.
>
Yes, I think it's worth. I forgot it.
--
Thanks
Zhou
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-02 7:38 ` "Zhou, Wenjian/周文剑"
@ 2015-12-04 2:30 ` Atsushi Kumagai
2015-12-04 3:33 ` "Zhou, Wenjian/周文剑"
0 siblings, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-12-04 2:30 UTC (permalink / raw)
To: zhouwj-fnst@cn.fujitsu.com; +Cc: kexec@lists.infradead.org
Hello, Zhou
>On 12/02/2015 03:24 PM, Dave Young wrote:
>> Hi,
>>
>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>> I think there is no problem if other test results are as expected.
>>>
>>> --num-threads mainly reduces the time of compressing.
>>> So for lzo, it can't do much help at most of time.
>>
>> Seems the help of --num-threads does not say it exactly:
>>
>> [--num-threads THREADNUM]:
>> Using multiple threads to read and compress data of each page in parallel.
>> And it will reduces time for saving DUMPFILE.
>> This feature only supports creating DUMPFILE in kdump-comressed format from
>> VMCORE in kdump-compressed format or elf format.
>>
>> Lzo is also a compress method, it should be mentioned that --num-threads only
>> supports zlib compressed vmcore.
>>
>
>Sorry, it seems that something I said is not so clear.
>lzo is also supported. Since lzo compresses data at a high speed, the
>improving of the performance is not so obvious at most of time.
>
>> Also worth to mention about the recommended -d value for this feature.
>>
>
>Yes, I think it's worth. I forgot it.
I saw your patch, but I think I should confirm what is the problem first.
>However, when "-d 31" is specified, it will be worse.
>Less than 50 buffers are used to cache the compressed page.
>And even the page has been filtered, it will also take a buffer.
>So if "-d 31" is specified, the filtered page will use a lot
>of buffers. Then the page which needs to be compressed can't
>be compressed parallel.
Could you explain why compression will not be parallel in more detail ?
Actually the buffers are used also for filtered pages, it sounds inefficient.
However, I don't understand why it prevents parallel compression.
Further, according to Chao's benchmark, there is a big performance
degradation even if the number of thread is 1. (58s vs 240s)
The current implementation seems to have some problems, we should
solve them.
Thanks,
Atsushi Kumagai
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-04 2:30 ` Atsushi Kumagai
@ 2015-12-04 3:33 ` "Zhou, Wenjian/周文剑"
2015-12-04 8:56 ` Chao Fan
2015-12-10 8:14 ` Atsushi Kumagai
0 siblings, 2 replies; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-04 3:33 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: kexec@lists.infradead.org
Hello Kumagai,
On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> Hello, Zhou
>
>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>> Hi,
>>>
>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>> I think there is no problem if other test results are as expected.
>>>>
>>>> --num-threads mainly reduces the time of compressing.
>>>> So for lzo, it can't do much help at most of time.
>>>
>>> Seems the help of --num-threads does not say it exactly:
>>>
>>> [--num-threads THREADNUM]:
>>> Using multiple threads to read and compress data of each page in parallel.
>>> And it will reduces time for saving DUMPFILE.
>>> This feature only supports creating DUMPFILE in kdump-comressed format from
>>> VMCORE in kdump-compressed format or elf format.
>>>
>>> Lzo is also a compress method, it should be mentioned that --num-threads only
>>> supports zlib compressed vmcore.
>>>
>>
>> Sorry, it seems that something I said is not so clear.
>> lzo is also supported. Since lzo compresses data at a high speed, the
>> improving of the performance is not so obvious at most of time.
>>
>>> Also worth to mention about the recommended -d value for this feature.
>>>
>>
>> Yes, I think it's worth. I forgot it.
>
> I saw your patch, but I think I should confirm what is the problem first.
>
>> However, when "-d 31" is specified, it will be worse.
>> Less than 50 buffers are used to cache the compressed page.
>> And even the page has been filtered, it will also take a buffer.
>> So if "-d 31" is specified, the filtered page will use a lot
>> of buffers. Then the page which needs to be compressed can't
>> be compressed parallel.
>
> Could you explain why compression will not be parallel in more detail ?
> Actually the buffers are used also for filtered pages, it sounds inefficient.
> However, I don't understand why it prevents parallel compression.
>
Think about this, in a huge memory, most of the page will be filtered, and
we have 5 buffers.
page1 page2 page3 page4 page5 page6 page7 .....
[buffer1] [2] [3] [4] [5]
unfiltered filtered filtered filtered filtered unfiltered filtered
Since filtered page will take a buffer, when compressing page1,
page6 can't be compressed at the same time.
That is why it will prevent parallel compression.
> Further, according to Chao's benchmark, there is a big performance
> degradation even if the number of thread is 1. (58s vs 240s)
> The current implementation seems to have some problems, we should
> solve them.
>
If "-d 31" is specified, on the one hand we can't save time by compressing
parallel, on the other hand we will introduce some extra work by adding
"--num-threads". So it is obvious that it will have a performance degradation.
I'm not so sure if it is a problem that the performance degradation is so big.
But I think if in other cases, it works as expected, this won't be a problem(
or a problem needs to be fixed), for the performance degradation existing
in theory.
Or the current implementation should be replaced by a new algorithm.
For example:
We can add an array to record whether the page is filtered or not.
And only the unfiltered page will take the buffer.
But I'm not sure if it is worth it.
Since "-l -d 31" is fast enough, the new algorithm also can't do much help.
--
Thanks
Zhou
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-04 3:33 ` "Zhou, Wenjian/周文剑"
@ 2015-12-04 8:56 ` Chao Fan
2015-12-07 1:09 ` "Zhou, Wenjian/周文剑"
2015-12-10 8:14 ` Atsushi Kumagai
1 sibling, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-04 8:56 UTC (permalink / raw)
To: Wenjian Zhou/周文剑; +Cc: Atsushi Kumagai, kexec
Hi Zhou Wenjian and Kumagai,
I have followed Zhou Wenjian's suggestions to do some tests; in the case of
"-c", makedumpfile 1.5.9 does perform better than "-l".
I have done more tests in a machine with 128G memory, in the condition
of "-d 0" and "-d 3", the makedumpfile 1.5.9 performs well. But if with
"--num-threads 1", it does need more time than without "--num-threads".
Here is my results(makedumpfile -c):
"-d 0" (the size of vmcore is 2.6G):
--num-threads time(seconds)
0 556
1 1186
4 307
8 186
12 131
16 123
"-d 3" (the size of vmcore is 1.3G):
--num-threads time(seconds)
0 141
1 262
2 137
4 91
8 121
16 137
So, I think makedumpfile 1.5.9 can save time in the condition of "-c"
and not "-d 31" and not "--num-threads 1".
----- Original Message -----
> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> Cc: kexec@lists.infradead.org
> Sent: Friday, December 4, 2015 11:33:36 AM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>
> Hello Kumagai,
>
> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> > Hello, Zhou
> >
> >> On 12/02/2015 03:24 PM, Dave Young wrote:
> >>> Hi,
> >>>
> >>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >>>> I think there is no problem if other test results are as expected.
> >>>>
> >>>> --num-threads mainly reduces the time of compressing.
> >>>> So for lzo, it can't do much help at most of time.
> >>>
> >>> Seems the help of --num-threads does not say it exactly:
> >>>
> >>> [--num-threads THREADNUM]:
> >>> Using multiple threads to read and compress data of each page in
> >>> parallel.
> >>> And it will reduces time for saving DUMPFILE.
> >>> This feature only supports creating DUMPFILE in kdump-comressed
> >>> format from
> >>> VMCORE in kdump-compressed format or elf format.
> >>>
> >>> Lzo is also a compress method, it should be mentioned that --num-threads
> >>> only
> >>> supports zlib compressed vmcore.
> >>>
> >>
> >> Sorry, it seems that something I said is not so clear.
> >> lzo is also supported. Since lzo compresses data at a high speed, the
> >> improving of the performance is not so obvious at most of time.
> >>
> >>> Also worth to mention about the recommended -d value for this feature.
> >>>
> >>
> >> Yes, I think it's worth. I forgot it.
> >
> > I saw your patch, but I think I should confirm what is the problem first.
> >
> >> However, when "-d 31" is specified, it will be worse.
> >> Less than 50 buffers are used to cache the compressed page.
> >> And even the page has been filtered, it will also take a buffer.
> >> So if "-d 31" is specified, the filtered page will use a lot
> >> of buffers. Then the page which needs to be compressed can't
> >> be compressed parallel.
> >
> > Could you explain why compression will not be parallel in more detail ?
> > Actually the buffers are used also for filtered pages, it sounds
> > inefficient.
> > However, I don't understand why it prevents parallel compression.
> >
>
> Think about this, in a huge memory, most of the page will be filtered, and
> we have 5 buffers.
>
> page1 page2 page3 page4 page5 page6 page7 .....
> [buffer1] [2] [3] [4] [5]
> unfiltered filtered filtered filtered filtered unfiltered filtered
>
> Since filtered page will take a buffer, when compressing page1,
> page6 can't be compressed at the same time.
> That why it will prevent parallel compression.
>
> > Further, according to Chao's benchmark, there is a big performance
> > degradation even if the number of thread is 1. (58s vs 240s)
> > The current implementation seems to have some problems, we should
> > solve them.
> >
>
> If "-d 31" is specified, on the one hand we can't save time by compressing
> parallel, on the other hand we will introduce some extra work by adding
> "--num-threads". So it is obvious that it will have a performance
> degradation.
>
> I'm not so sure if it is a problem that the performance degradation is so
> big.
> But I think if in other cases, it works as expected, this won't be a problem(
> or a problem needs to be fixed), for the performance degradation existing
> in theory.
>
> Or the current implementation should be replaced by a new arithmetic.
> For example:
> We can add an array to record whether the page is filtered or not.
> And only the unfiltered page will take the buffer.
>
> But I'm not sure if it is worth.
> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>
> --
> Thanks
> Zhou
>
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-04 8:56 ` Chao Fan
@ 2015-12-07 1:09 ` "Zhou, Wenjian/周文剑"
0 siblings, 0 replies; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-07 1:09 UTC (permalink / raw)
To: Chao Fan; +Cc: Atsushi Kumagai, kexec
On 12/04/2015 04:56 PM, Chao Fan wrote:
> Hi Zhou Wenjian and Kumagai,
>
> I have followed Zhou Wenjian's suggestion and done some tests; in the condition of
> "-c", makedumpfile 1.5.9 does perform better than "-l".
>
> I have done more tests in a machine with 128G memory, in the condition
> of "-d 0" and "-d 3", the makedumpfile 1.5.9 performs well. But if with
> "--num-threads 1", it does need more time than without "--num-threads".
>
> Here are my results (makedumpfile -c):
>
> "-d 0" (the size of vmcore is 2.6G):
> --num-threads time(seconds)
> 0 556
> 1 1186
> 4 307
> 8 186
> 12 131
> 16 123
>
>
> "-d 3" (the size of vmcore is 1.3G):
> --num-threads time(seconds)
> 0 141
> 1 262
> 2 137
> 4 91
> 8 121
> 16 137
>
Hello Chao,
This result also seems not so good.
We have tested it, and you can refer to:
http://lists.infradead.org/pipermail/kexec/2015-October/014576.html
Could you collect the information by *perf stat -e page-faults* on both
--num-threads 0 and --num-threads 1 ?
Your result looks like the performance without the patch which divides
compress2().
--
Thanks
Zhou
> So, I think makedumpfile 1.5.9 can save time in the condition of "-c"
> and not "-d 31" and not "--num-threads 1".
>
> ----- Original Message -----
>> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> Cc: kexec@lists.infradead.org
>> Sent: Friday, December 4, 2015 11:33:36 AM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> Hello Kumagai,
>>
>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>>> Hello, Zhou
>>>
>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>>>> Hi,
>>>>>
>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>>>> I think there is no problem if other test results are as expected.
>>>>>>
>>>>>> --num-threads mainly reduces the time of compressing.
>>>>>> So for lzo, it can't do much help at most of time.
>>>>>
>>>>> Seems the help of --num-threads does not say it exactly:
>>>>>
>>>>> [--num-threads THREADNUM]:
>>>>> Using multiple threads to read and compress data of each page in
>>>>> parallel.
>>>>> And it will reduces time for saving DUMPFILE.
>>>>> This feature only supports creating DUMPFILE in kdump-comressed
>>>>> format from
>>>>> VMCORE in kdump-compressed format or elf format.
>>>>>
>>>>> Lzo is also a compress method, it should be mentioned that --num-threads
>>>>> only
>>>>> supports zlib compressed vmcore.
>>>>>
>>>>
>>>> Sorry, it seems that something I said is not so clear.
>>>> lzo is also supported. Since lzo compresses data at a high speed, the
>>>> improving of the performance is not so obvious at most of time.
>>>>
>>>>> Also worth to mention about the recommended -d value for this feature.
>>>>>
>>>>
>>>> Yes, I think it's worth. I forgot it.
>>>
>>> I saw your patch, but I think I should confirm what is the problem first.
>>>
>>>> However, when "-d 31" is specified, it will be worse.
>>>> Less than 50 buffers are used to cache the compressed page.
>>>> And even the page has been filtered, it will also take a buffer.
>>>> So if "-d 31" is specified, the filtered page will use a lot
>>>> of buffers. Then the page which needs to be compressed can't
>>>> be compressed parallel.
>>>
>>> Could you explain why compression will not be parallel in more detail ?
>>> Actually the buffers are used also for filtered pages, it sounds
>>> inefficient.
>>> However, I don't understand why it prevents parallel compression.
>>>
>>
>> Think about this, in a huge memory, most of the page will be filtered, and
>> we have 5 buffers.
>>
>> page1 page2 page3 page4 page5 page6 page7 .....
>> [buffer1] [2] [3] [4] [5]
>> unfiltered filtered filtered filtered filtered unfiltered filtered
>>
>> Since filtered page will take a buffer, when compressing page1,
>> page6 can't be compressed at the same time.
>> That why it will prevent parallel compression.
>>
>>> Further, according to Chao's benchmark, there is a big performance
>>> degradation even if the number of thread is 1. (58s vs 240s)
>>> The current implementation seems to have some problems, we should
>>> solve them.
>>>
>>
>> If "-d 31" is specified, on the one hand we can't save time by compressing
>> parallel, on the other hand we will introduce some extra work by adding
>> "--num-threads". So it is obvious that it will have a performance
>> degradation.
>>
>> I'm not so sure if it is a problem that the performance degradation is so
>> big.
>> But I think if in other cases, it works as expected, this won't be a problem(
>> or a problem needs to be fixed), for the performance degradation existing
>> in theory.
>>
>> Or the current implementation should be replaced by a new arithmetic.
>> For example:
>> We can add an array to record whether the page is filtered or not.
>> And only the unfiltered page will take the buffer.
>>
>> But I'm not sure if it is worth.
>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>>
>> --
>> Thanks
>> Zhou
>>
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-04 3:33 ` "Zhou, Wenjian/周文剑"
2015-12-04 8:56 ` Chao Fan
@ 2015-12-10 8:14 ` Atsushi Kumagai
2015-12-10 9:36 ` "Zhou, Wenjian/周文剑"
1 sibling, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-12-10 8:14 UTC (permalink / raw)
To: "Zhou, Wenjian/周文剑"
Cc: kexec@lists.infradead.org
>Hello Kumagai,
>
>On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> Hello, Zhou
>>
>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>>> Hi,
>>>>
>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>>> I think there is no problem if other test results are as expected.
>>>>>
>>>>> --num-threads mainly reduces the time of compressing.
>>>>> So for lzo, it can't do much help at most of time.
>>>>
>>>> Seems the help of --num-threads does not say it exactly:
>>>>
>>>> [--num-threads THREADNUM]:
>>>> Using multiple threads to read and compress data of each page in parallel.
>>>> And it will reduces time for saving DUMPFILE.
>>>> This feature only supports creating DUMPFILE in kdump-comressed format from
>>>> VMCORE in kdump-compressed format or elf format.
>>>>
>>>> Lzo is also a compress method, it should be mentioned that --num-threads only
>>>> supports zlib compressed vmcore.
>>>>
>>>
>>> Sorry, it seems that something I said is not so clear.
>>> lzo is also supported. Since lzo compresses data at a high speed, the
>>> improving of the performance is not so obvious at most of time.
>>>
>>>> Also worth to mention about the recommended -d value for this feature.
>>>>
>>>
>>> Yes, I think it's worth. I forgot it.
>>
>> I saw your patch, but I think I should confirm what is the problem first.
>>
>>> However, when "-d 31" is specified, it will be worse.
>>> Less than 50 buffers are used to cache the compressed page.
>>> And even the page has been filtered, it will also take a buffer.
>>> So if "-d 31" is specified, the filtered page will use a lot
>>> of buffers. Then the page which needs to be compressed can't
>>> be compressed parallel.
>>
>> Could you explain why compression will not be parallel in more detail ?
>> Actually the buffers are used also for filtered pages, it sounds inefficient.
>> However, I don't understand why it prevents parallel compression.
>>
>
>Think about this, in a huge memory, most of the page will be filtered, and
>we have 5 buffers.
>
>page1 page2 page3 page4 page5 page6 page7 .....
>[buffer1] [2] [3] [4] [5]
>unfiltered filtered filtered filtered filtered unfiltered filtered
>
>Since filtered page will take a buffer, when compressing page1,
>page6 can't be compressed at the same time.
>That's why it will prevent parallel compression.
Thanks for your explanation, I understand.
This is just an issue of the current implementation, there is no
reason to stand this restriction.
>> Further, according to Chao's benchmark, there is a big performance
>> degradation even if the number of thread is 1. (58s vs 240s)
>> The current implementation seems to have some problems, we should
>> solve them.
>>
>
>If "-d 31" is specified, on the one hand we can't save time by compressing
>parallel, on the other hand we will introduce some extra work by adding
>"--num-threads". So it is obvious that it will have a performance degradation.
Sure, there must be some overhead due to "some extra work"(e.g. exclusive lock),
but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
too slow, the degradation is too big to be called "some extra work".
Both --num-threads=0 and --num-threads=1 are serial processing,
the above "buffer fairness issue" will not be related to this degradation.
What do you think is causing this degradation ?
>I'm not so sure if it is a problem that the performance degradation is so big.
>But I think if in other cases, it works as expected, this won't be a problem(
>or a problem needs to be fixed), for the performance degradation existing
>in theory.
>
>Or the current implementation should be replaced by a new arithmetic.
>For example:
>We can add an array to record whether the page is filtered or not.
>And only the unfiltered page will take the buffer.
We should discuss how to implement new mechanism, I'll mention this later.
>But I'm not sure if it is worth.
>For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
Basically the faster, the better. There is no obvious target time.
If there is room for improvement, we should do it.
Thanks,
Atsushi Kumagai
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-10 8:14 ` Atsushi Kumagai
@ 2015-12-10 9:36 ` "Zhou, Wenjian/周文剑"
2015-12-10 9:58 ` Chao Fan
2015-12-14 8:26 ` Atsushi Kumagai
0 siblings, 2 replies; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-10 9:36 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: kexec@lists.infradead.org
On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> Hello Kumagai,
>>
>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>>> Hello, Zhou
>>>
>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>>>> Hi,
>>>>>
>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>>>> I think there is no problem if other test results are as expected.
>>>>>>
>>>>>> --num-threads mainly reduces the time of compressing.
>>>>>> So for lzo, it can't do much help at most of time.
>>>>>
>>>>> Seems the help of --num-threads does not say it exactly:
>>>>>
>>>>> [--num-threads THREADNUM]:
>>>>> Using multiple threads to read and compress data of each page in parallel.
>>>>> And it will reduces time for saving DUMPFILE.
>>>>> This feature only supports creating DUMPFILE in kdump-comressed format from
>>>>> VMCORE in kdump-compressed format or elf format.
>>>>>
>>>>> Lzo is also a compress method, it should be mentioned that --num-threads only
>>>>> supports zlib compressed vmcore.
>>>>>
>>>>
>>>> Sorry, it seems that something I said is not so clear.
>>>> lzo is also supported. Since lzo compresses data at a high speed, the
>>>> improving of the performance is not so obvious at most of time.
>>>>
>>>>> Also worth to mention about the recommended -d value for this feature.
>>>>>
>>>>
>>>> Yes, I think it's worth. I forgot it.
>>>
>>> I saw your patch, but I think I should confirm what is the problem first.
>>>
>>>> However, when "-d 31" is specified, it will be worse.
>>>> Less than 50 buffers are used to cache the compressed page.
>>>> And even the page has been filtered, it will also take a buffer.
>>>> So if "-d 31" is specified, the filtered page will use a lot
>>>> of buffers. Then the page which needs to be compressed can't
>>>> be compressed parallel.
>>>
>>> Could you explain why compression will not be parallel in more detail ?
>>> Actually the buffers are used also for filtered pages, it sounds inefficient.
>>> However, I don't understand why it prevents parallel compression.
>>>
>>
>> Think about this, in a huge memory, most of the page will be filtered, and
>> we have 5 buffers.
>>
>> page1 page2 page3 page4 page5 page6 page7 .....
>> [buffer1] [2] [3] [4] [5]
>> unfiltered filtered filtered filtered filtered unfiltered filtered
>>
>> Since filtered page will take a buffer, when compressing page1,
>> page6 can't be compressed at the same time.
>> That why it will prevent parallel compression.
>
> Thanks for your explanation, I understand.
> This is just an issue of the current implementation, there is no
> reason to stand this restriction.
>
>>> Further, according to Chao's benchmark, there is a big performance
>>> degradation even if the number of thread is 1. (58s vs 240s)
>>> The current implementation seems to have some problems, we should
>>> solve them.
>>>
>>
>> If "-d 31" is specified, on the one hand we can't save time by compressing
>> parallel, on the other hand we will introduce some extra work by adding
>> "--num-threads". So it is obvious that it will have a performance degradation.
>
> Sure, there must be some overhead due to "some extra work"(e.g. exclusive lock),
> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
> too slow, the degradation is too big to be called "some extra work".
>
> Both --num-threads=0 and --num-threads=1 are serial processing,
> the above "buffer fairness issue" will not be related to this degradation.
> What do you think what make this degradation ?
>
I can't get such a result at this moment, so I can't do any further investigation
right now. I guess it may be caused by the underlying implementation of pthread.
I reviewed the test result of the patch v2 and found in different machines,
the results are quite different.
It seems that I can get almost the same result as Chao's from "PRIMEQUEST 1800E".
###################################
- System: PRIMERGY RX300 S6
- CPU: Intel(R) Xeon(R) CPU x5660
- memory: 16GB
###################################
************ makedumpfile -d 7 ******************
core-data 0 256
threads-num
-l
0 10 144
4 5 110
8 5 111
12 6 111
************ makedumpfile -d 31 ******************
core-data 0 256
threads-num
-l
0 0 0
4 2 2
8 2 3
12 2 3
###################################
- System: PRIMEQUEST 1800E
- CPU: Intel(R) Xeon(R) CPU E7540
- memory: 32GB
###################################
************ makedumpfile -d 7 ******************
core-data 0 256
threads-num
-l
0 34 270
4 63 154
8 64 131
12 65 159
************ makedumpfile -d 31 ******************
core-data 0 256
threads-num
-l
0 2 1
4 48 48
8 48 49
12 49 50
>> I'm not so sure if it is a problem that the performance degradation is so big.
>> But I think if in other cases, it works as expected, this won't be a problem(
>> or a problem needs to be fixed), for the performance degradation existing
>> in theory.
>>
>> Or the current implementation should be replaced by a new arithmetic.
>> For example:
>> We can add an array to record whether the page is filtered or not.
>> And only the unfiltered page will take the buffer.
>
> We should discuss how to implement new mechanism, I'll mention this later.
>
>> But I'm not sure if it is worth.
>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>
> Basically the faster, the better. There is no obvious target time.
> If there is room for improvement, we should do it.
>
Maybe we can improve the performance of "-c -d 31" in some case.
BTW, we can easily get the theoretical performance by using the "--split".
--
Thanks
Zhou
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-10 9:36 ` "Zhou, Wenjian/周文剑"
@ 2015-12-10 9:58 ` Chao Fan
2015-12-10 10:32 ` "Zhou, Wenjian/周文剑"
2015-12-14 8:26 ` Atsushi Kumagai
1 sibling, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-10 9:58 UTC (permalink / raw)
To: Wenjian Zhou/周文剑; +Cc: Atsushi Kumagai, kexec
----- Original Message -----
> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> Cc: kexec@lists.infradead.org
> Sent: Thursday, December 10, 2015 5:36:47 PM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>
> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >> Hello Kumagai,
> >>
> >> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >>> Hello, Zhou
> >>>
> >>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >>>>> Hi,
> >>>>>
> >>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >>>>>> I think there is no problem if other test results are as expected.
> >>>>>>
> >>>>>> --num-threads mainly reduces the time of compressing.
> >>>>>> So for lzo, it can't do much help at most of time.
> >>>>>
> >>>>> Seems the help of --num-threads does not say it exactly:
> >>>>>
> >>>>> [--num-threads THREADNUM]:
> >>>>> Using multiple threads to read and compress data of each page
> >>>>> in parallel.
> >>>>> And it will reduces time for saving DUMPFILE.
> >>>>> This feature only supports creating DUMPFILE in
> >>>>> kdump-comressed format from
> >>>>> VMCORE in kdump-compressed format or elf format.
> >>>>>
> >>>>> Lzo is also a compress method, it should be mentioned that
> >>>>> --num-threads only
> >>>>> supports zlib compressed vmcore.
> >>>>>
> >>>>
> >>>> Sorry, it seems that something I said is not so clear.
> >>>> lzo is also supported. Since lzo compresses data at a high speed, the
> >>>> improving of the performance is not so obvious at most of time.
> >>>>
> >>>>> Also worth to mention about the recommended -d value for this feature.
> >>>>>
> >>>>
> >>>> Yes, I think it's worth. I forgot it.
> >>>
> >>> I saw your patch, but I think I should confirm what is the problem first.
> >>>
> >>>> However, when "-d 31" is specified, it will be worse.
> >>>> Less than 50 buffers are used to cache the compressed page.
> >>>> And even the page has been filtered, it will also take a buffer.
> >>>> So if "-d 31" is specified, the filtered page will use a lot
> >>>> of buffers. Then the page which needs to be compressed can't
> >>>> be compressed parallel.
> >>>
> >>> Could you explain why compression will not be parallel in more detail ?
> >>> Actually the buffers are used also for filtered pages, it sounds
> >>> inefficient.
> >>> However, I don't understand why it prevents parallel compression.
> >>>
> >>
> >> Think about this, in a huge memory, most of the page will be filtered, and
> >> we have 5 buffers.
> >>
> >> page1 page2 page3 page4 page5 page6 page7
> >> .....
> >> [buffer1] [2] [3] [4] [5]
> >> unfiltered filtered filtered filtered filtered unfiltered filtered
> >>
> >> Since filtered page will take a buffer, when compressing page1,
> >> page6 can't be compressed at the same time.
> >> That why it will prevent parallel compression.
> >
> > Thanks for your explanation, I understand.
> > This is just an issue of the current implementation, there is no
> > reason to stand this restriction.
> >
> >>> Further, according to Chao's benchmark, there is a big performance
> >>> degradation even if the number of thread is 1. (58s vs 240s)
> >>> The current implementation seems to have some problems, we should
> >>> solve them.
> >>>
> >>
> >> If "-d 31" is specified, on the one hand we can't save time by compressing
> >> parallel, on the other hand we will introduce some extra work by adding
> >> "--num-threads". So it is obvious that it will have a performance
> >> degradation.
> >
> > Sure, there must be some overhead due to "some extra work"(e.g. exclusive
> > lock),
> > but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
> > too slow, the degradation is too big to be called "some extra work".
> >
> > Both --num-threads=0 and --num-threads=1 are serial processing,
> > the above "buffer fairness issue" will not be related to this degradation.
> > What do you think what make this degradation ?
> >
>
> I can't get such result at this moment, so I can't do some further
> investigation
> right now. I guess it may be caused by the underlying implementation of
> pthread.
> I reviewed the test result of the patch v2 and found in different machines,
> the results are quite different.
Hi Zhou Wenjian,
I have done more tests in another machine with 128G memory, and get the result:
the size of vmcore is 300M in "-d 31"
makedumpfile -l --message-level 1 -d 31:
time: 8.6s page-faults: 2272
makedumpfile -l --num-threads 1 --message-level 1 -d 31:
time: 28.1s page-faults: 2359
and the size of vmcore is 2.6G in "-d 0".
In this machine, I get the same result as yours:
makedumpfile -c --message-level 1 -d 0:
time: 597s page-faults: 2287
makedumpfile -c --num-threads 1 --message-level 1 -d 0:
time: 602s page-faults: 2361
makedumpfile -c --num-threads 2 --message-level 1 -d 0:
time: 337s page-faults: 2397
makedumpfile -c --num-threads 4 --message-level 1 -d 0:
time: 175s page-faults: 2461
makedumpfile -c --num-threads 8 --message-level 1 -d 0:
time: 103s page-faults: 2611
But the machine of my first test is not under my control, should I wait for
the first machine to do more tests?
If there are still some problems in my tests, please tell me.
Thanks,
Chao Fan
>
> It seems that I can get almost the same result of Chao from "PRIMEQUEST
> 1800E".
>
> ###################################
> - System: PRIMERGY RX300 S6
> - CPU: Intel(R) Xeon(R) CPU x5660
> - memory: 16GB
> ###################################
> ************ makedumpfile -d 7 ******************
> core-data 0 256
> threads-num
> -l
> 0 10 144
> 4 5 110
> 8 5 111
> 12 6 111
>
> ************ makedumpfile -d 31 ******************
> core-data 0 256
> threads-num
> -l
> 0 0 0
> 4 2 2
> 8 2 3
> 12 2 3
>
> ###################################
> - System: PRIMEQUEST 1800E
> - CPU: Intel(R) Xeon(R) CPU E7540
> - memory: 32GB
> ###################################
> ************ makedumpfile -d 7 ******************
> core-data 0 256
> threads-num
> -l
> 0 34 270
> 4 63 154
> 8 64 131
> 12 65 159
>
> ************ makedumpfile -d 31 ******************
> core-data 0 256
> threads-num
> -l
> 0 2 1
> 4 48 48
> 8 48 49
> 12 49 50
>
> >> I'm not so sure if it is a problem that the performance degradation is so
> >> big.
> >> But I think if in other cases, it works as expected, this won't be a
> >> problem(
> >> or a problem needs to be fixed), for the performance degradation existing
> >> in theory.
> >>
> >> Or the current implementation should be replaced by a new arithmetic.
> >> For example:
> >> We can add an array to record whether the page is filtered or not.
> >> And only the unfiltered page will take the buffer.
> >
> > We should discuss how to implement new mechanism, I'll mention this later.
> >
> >> But I'm not sure if it is worth.
> >> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
> >
> > Basically the faster, the better. There is no obvious target time.
> > If there is room for improvement, we should do it.
> >
>
> Maybe we can improve the performance of "-c -d 31" in some case.
>
> BTW, we can easily get the theoretical performance by using the "--split".
>
> --
> Thanks
> Zhou
>
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-10 9:58 ` Chao Fan
@ 2015-12-10 10:32 ` "Zhou, Wenjian/周文剑"
2015-12-10 10:54 ` Chao Fan
0 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-10 10:32 UTC (permalink / raw)
To: Chao Fan; +Cc: Atsushi Kumagai, kexec
On 12/10/2015 05:58 PM, Chao Fan wrote:
>
>
> ----- Original Message -----
>> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> Cc: kexec@lists.infradead.org
>> Sent: Thursday, December 10, 2015 5:36:47 PM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>>>> Hello Kumagai,
>>>>
>>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>>>>> Hello, Zhou
>>>>>
>>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>>>>>> I think there is no problem if other test results are as expected.
>>>>>>>>
>>>>>>>> --num-threads mainly reduces the time of compressing.
>>>>>>>> So for lzo, it can't do much help at most of time.
>>>>>>>
>>>>>>> Seems the help of --num-threads does not say it exactly:
>>>>>>>
>>>>>>> [--num-threads THREADNUM]:
>>>>>>> Using multiple threads to read and compress data of each page
>>>>>>> in parallel.
>>>>>>> And it will reduces time for saving DUMPFILE.
>>>>>>> This feature only supports creating DUMPFILE in
>>>>>>> kdump-comressed format from
>>>>>>> VMCORE in kdump-compressed format or elf format.
>>>>>>>
>>>>>>> Lzo is also a compress method, it should be mentioned that
>>>>>>> --num-threads only
>>>>>>> supports zlib compressed vmcore.
>>>>>>>
>>>>>>
>>>>>> Sorry, it seems that something I said is not so clear.
>>>>>> lzo is also supported. Since lzo compresses data at a high speed, the
>>>>>> improving of the performance is not so obvious at most of time.
>>>>>>
>>>>>>> Also worth to mention about the recommended -d value for this feature.
>>>>>>>
>>>>>>
>>>>>> Yes, I think it's worth. I forgot it.
>>>>>
>>>>> I saw your patch, but I think I should confirm what is the problem first.
>>>>>
>>>>>> However, when "-d 31" is specified, it will be worse.
>>>>>> Less than 50 buffers are used to cache the compressed page.
>>>>>> And even the page has been filtered, it will also take a buffer.
>>>>>> So if "-d 31" is specified, the filtered page will use a lot
>>>>>> of buffers. Then the page which needs to be compressed can't
>>>>>> be compressed parallel.
>>>>>
>>>>> Could you explain why compression will not be parallel in more detail ?
>>>>> Actually the buffers are used also for filtered pages, it sounds
>>>>> inefficient.
>>>>> However, I don't understand why it prevents parallel compression.
>>>>>
>>>>
>>>> Think about this, in a huge memory, most of the page will be filtered, and
>>>> we have 5 buffers.
>>>>
>>>> page1 page2 page3 page4 page5 page6 page7
>>>> .....
>>>> [buffer1] [2] [3] [4] [5]
>>>> unfiltered filtered filtered filtered filtered unfiltered filtered
>>>>
>>>> Since filtered page will take a buffer, when compressing page1,
>>>> page6 can't be compressed at the same time.
>>>> That why it will prevent parallel compression.
>>>
>>> Thanks for your explanation, I understand.
>>> This is just an issue of the current implementation, there is no
>>> reason to stand this restriction.
>>>
>>>>> Further, according to Chao's benchmark, there is a big performance
>>>>> degradation even if the number of thread is 1. (58s vs 240s)
>>>>> The current implementation seems to have some problems, we should
>>>>> solve them.
>>>>>
>>>>
>>>> If "-d 31" is specified, on the one hand we can't save time by compressing
>>>> parallel, on the other hand we will introduce some extra work by adding
>>>> "--num-threads". So it is obvious that it will have a performance
>>>> degradation.
>>>
>>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive
>>> lock),
>>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
>>> too slow, the degradation is too big to be called "some extra work".
>>>
>>> Both --num-threads=0 and --num-threads=1 are serial processing,
>>> the above "buffer fairness issue" will not be related to this degradation.
>>> What do you think what make this degradation ?
>>>
>>
>> I can't get such result at this moment, so I can't do some further
>> investigation
>> right now. I guess it may be caused by the underlying implementation of
>> pthread.
>> I reviewed the test result of the patch v2 and found in different machines,
>> the results are quite different.
>
> Hi Zhou Wenjian,
>
> I have done more tests in another machine with 128G memory, and get the result:
>
> the size of vmcore is 300M in "-d 31"
> makedumpfile -l --message-level 1 -d 31:
> time: 8.6s page-faults: 2272
>
> makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> time: 28.1s page-faults: 2359
>
>
> and the size of vmcore is 2.6G in "-d 0".
> In this machine, I get the same result as yours:
>
>
> makedumpfile -c --message-level 1 -d 0:
> time: 597s page-faults: 2287
>
> makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> time: 602s page-faults: 2361
>
> makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> time: 337s page-faults: 2397
>
> makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> time: 175s page-faults: 2461
>
> makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> time: 103s page-faults: 2611
>
>
> But the machine of my first test is not under my control, should I wait for
> the first machine to do more tests?
> If there are still some problems in my tests, please tell me.
>
Thanks a lot for your test, it seems that there is nothing wrong.
And I haven't got any idea about more tests...
Could you provide the information of your cpu ?
I will do some further investigation later.
But I still believe it's better not to use "-l -d 31" and "--num-threads"
at the same time, though it's very strange that the performance
degradation is so big.
--
Thanks
Zhou
> Thanks,
> Chao Fan
>
>
>>
>> It seems that I can get almost the same result of Chao from "PRIMEQUEST
>> 1800E".
>>
>> ###################################
>> - System: PRIMERGY RX300 S6
>> - CPU: Intel(R) Xeon(R) CPU x5660
>> - memory: 16GB
>> ###################################
>> ************ makedumpfile -d 7 ******************
>> core-data 0 256
>> threads-num
>> -l
>> 0 10 144
>> 4 5 110
>> 8 5 111
>> 12 6 111
>>
>> ************ makedumpfile -d 31 ******************
>> core-data 0 256
>> threads-num
>> -l
>> 0 0 0
>> 4 2 2
>> 8 2 3
>> 12 2 3
>>
>> ###################################
>> - System: PRIMEQUEST 1800E
>> - CPU: Intel(R) Xeon(R) CPU E7540
>> - memory: 32GB
>> ###################################
>> ************ makedumpfile -d 7 ******************
>> core-data 0 256
>> threads-num
>> -l
>> 0 34 270
>> 4 63 154
>> 8 64 131
>> 12 65 159
>>
>> ************ makedumpfile -d 31 ******************
>> core-data 0 256
>> threads-num
>> -l
>> 0 2 1
>> 4 48 48
>> 8 48 49
>> 12 49 50
>>
>>>> I'm not so sure if it is a problem that the performance degradation is so
>>>> big.
>>>> But I think if in other cases, it works as expected, this won't be a
>>>> problem(
>>>> or a problem needs to be fixed), for the performance degradation existing
>>>> in theory.
>>>>
>>>> Or the current implementation should be replaced by a new arithmetic.
>>>> For example:
>>>> We can add an array to record whether the page is filtered or not.
>>>> And only the unfiltered page will take the buffer.
>>>
>>> We should discuss how to implement new mechanism, I'll mention this later.
>>>
>>>> But I'm not sure if it is worth.
>>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>>>
>>> Basically the faster, the better. There is no obvious target time.
>>> If there is room for improvement, we should do it.
>>>
>>
>> Maybe we can improve the performance of "-c -d 31" in some case.
>>
>> BTW, we can easily get the theoretical performance by using the "--split".
>>
>> --
>> Thanks
>> Zhou
>>
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-10 10:32 ` "Zhou, Wenjian/周文剑"
@ 2015-12-10 10:54 ` Chao Fan
2015-12-22 8:32 ` HATAYAMA Daisuke
0 siblings, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-10 10:54 UTC (permalink / raw)
To: Wenjian Zhou/周文剑; +Cc: Atsushi Kumagai, kexec
----- Original Message -----
> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> To: "Chao Fan" <cfan@redhat.com>
> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>, kexec@lists.infradead.org
> Sent: Thursday, December 10, 2015 6:32:32 PM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>
> On 12/10/2015 05:58 PM, Chao Fan wrote:
> >
> >
> > ----- Original Message -----
> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> >> Cc: kexec@lists.infradead.org
> >> Sent: Thursday, December 10, 2015 5:36:47 PM
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >>
> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >>>> Hello Kumagai,
> >>>>
> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >>>>> Hello, Zhou
> >>>>>
> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >>>>>>> Hi,
> >>>>>>>
> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >>>>>>>> I think there is no problem if other test results are as expected.
> >>>>>>>>
> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >>>>>>>> So for lzo, it can't do much help at most of time.
> >>>>>>>
> >>>>>>> Seems the help of --num-threads does not say it exactly:
> >>>>>>>
> >>>>>>> [--num-threads THREADNUM]:
> >>>>>>> Using multiple threads to read and compress data of each
> >>>>>>> page
> >>>>>>> in parallel.
> >>>>>>> And it will reduces time for saving DUMPFILE.
> >>>>>>> This feature only supports creating DUMPFILE in
> >>>>>>> kdump-comressed format from
> >>>>>>> VMCORE in kdump-compressed format or elf format.
> >>>>>>>
> >>>>>>> Lzo is also a compress method, it should be mentioned that
> >>>>>>> --num-threads only
> >>>>>>> supports zlib compressed vmcore.
> >>>>>>>
> >>>>>>
> >>>>>> Sorry, it seems that something I said is not so clear.
> >>>>>> lzo is also supported. Since lzo compresses data at a high speed, the
> >>>>>> improving of the performance is not so obvious at most of time.
> >>>>>>
> >>>>>>> Also worth to mention about the recommended -d value for this
> >>>>>>> feature.
> >>>>>>>
> >>>>>>
> >>>>>> Yes, I think it's worth. I forgot it.
> >>>>>
> >>>>> I saw your patch, but I think I should confirm what is the problem
> >>>>> first.
> >>>>>
> >>>>>> However, when "-d 31" is specified, it will be worse.
> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >>>>>> And even the page has been filtered, it will also take a buffer.
> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
> >>>>>> of buffers. Then the page which needs to be compressed can't
> >>>>>> be compressed parallel.
> >>>>>
> >>>>> Could you explain why compression will not be parallel in more detail ?
> >>>>> Actually the buffers are used also for filtered pages, it sounds
> >>>>> inefficient.
> >>>>> However, I don't understand why it prevents parallel compression.
> >>>>>
> >>>>
> >>>> Think about this, in a huge memory, most of the page will be filtered,
> >>>> and
> >>>> we have 5 buffers.
> >>>>
> >>>> page1 page2 page3 page4 page5 page6 page7
> >>>> .....
> >>>> [buffer1] [2] [3] [4] [5]
> >>>> unfiltered filtered filtered filtered filtered unfiltered
> >>>> filtered
> >>>>
> >>>> Since filtered page will take a buffer, when compressing page1,
> >>>> page6 can't be compressed at the same time.
> >>>> That why it will prevent parallel compression.
> >>>
> >>> Thanks for your explanation, I understand.
> >>> This is just an issue of the current implementation, there is no
> >>> reason to stand this restriction.
> >>>
> >>>>> Further, according to Chao's benchmark, there is a big performance
> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
> >>>>> The current implementation seems to have some problems, we should
> >>>>> solve them.
> >>>>>
> >>>>
> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >>>> compressing
> >>>> parallel, on the other hand we will introduce some extra work by adding
> >>>> "--num-threads". So it is obvious that it will have a performance
> >>>> degradation.
> >>>
> >>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive
> >>> lock),
> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
> >>> too slow, the degradation is too big to be called "some extra work".
> >>>
> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
> >>> the above "buffer fairness issue" will not be related to this
> >>> degradation.
> >>> What do you think what make this degradation ?
> >>>
> >>
> >> I can't get such result at this moment, so I can't do some further
> >> investigation
> >> right now. I guess it may be caused by the underlying implementation of
> >> pthread.
> >> I reviewed the test result of the patch v2 and found in different
> >> machines,
> >> the results are quite different.
> >
> > Hi Zhou Wenjian,
> >
> > I have done more tests in another machine with 128G memory, and get the
> > result:
> >
> > the size of vmcore is 300M in "-d 31"
> > makedumpfile -l --message-level 1 -d 31:
> > time: 8.6s page-faults: 2272
> >
> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> > time: 28.1s page-faults: 2359
> >
> >
> > and the size of vmcore is 2.6G in "-d 0".
> > In this machine, I get the same result as yours:
> >
> >
> > makedumpfile -c --message-level 1 -d 0:
> > time: 597s page-faults: 2287
> >
> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> > time: 602s page-faults: 2361
> >
> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> > time: 337s page-faults: 2397
> >
> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> > time: 175s page-faults: 2461
> >
> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> > time: 103s page-faults: 2611
> >
> >
> > But the machine of my first test is not under my control, should I wait for
> > the first machine to do more tests?
> > If there are still some problems in my tests, please tell me.
> >
>
> Thanks a lot for your test, it seems that there is nothing wrong.
> And I haven't got any idea about more tests...
>
> Could you provide the information of your cpu ?
> I will do some further investigation later.
>
OK, of course, here is the information of cpu:
# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 48
On-line CPU(s) list: 0-47
Thread(s) per core: 1
Core(s) per socket: 6
Socket(s): 8
NUMA node(s): 8
Vendor ID: AuthenticAMD
CPU family: 16
Model: 8
Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
Stepping: 0
CPU MHz: 2793.040
BogoMIPS: 5586.22
Virtualization: AMD-V
L1d cache: 64K
L1i cache: 64K
L2 cache: 512K
L3 cache: 5118K
NUMA node0 CPU(s): 0,8,16,24,32,40
NUMA node1 CPU(s): 1,9,17,25,33,41
NUMA node2 CPU(s): 2,10,18,26,34,42
NUMA node3 CPU(s): 3,11,19,27,35,43
NUMA node4 CPU(s): 4,12,20,28,36,44
NUMA node5 CPU(s): 5,13,21,29,37,45
NUMA node6 CPU(s): 6,14,22,30,38,46
NUMA node7 CPU(s): 7,15,23,31,39,47
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt hw_pstate npt lbrv svm_lock nrip_save pausefilter vmmcall
> But I still believe it's better not to use "-l -d 31" and "--num-threads"
> at the same time, though it's very strange that the performance
> degradation is so big.
>
> --
> Thanks
> Zhou
>
> > Thanks,
> > Chao Fan
> >
> >
> >>
> >> It seems that I can get almost the same result of Chao from "PRIMEQUEST
> >> 1800E".
> >>
> >> ###################################
> >> - System: PRIMERGY RX300 S6
> >> - CPU: Intel(R) Xeon(R) CPU x5660
> >> - memory: 16GB
> >> ###################################
> >> ************ makedumpfile -d 7 ******************
> >> core-data 0 256
> >> threads-num
> >> -l
> >> 0 10 144
> >> 4 5 110
> >> 8 5 111
> >> 12 6 111
> >>
> >> ************ makedumpfile -d 31 ******************
> >> core-data 0 256
> >> threads-num
> >> -l
> >> 0 0 0
> >> 4 2 2
> >> 8 2 3
> >> 12 2 3
> >>
> >> ###################################
> >> - System: PRIMEQUEST 1800E
> >> - CPU: Intel(R) Xeon(R) CPU E7540
> >> - memory: 32GB
> >> ###################################
> >> ************ makedumpfile -d 7 ******************
> >> core-data 0 256
> >> threads-num
> >> -l
> >> 0 34 270
> >> 4 63 154
> >> 8 64 131
> >> 12 65 159
> >>
> >> ************ makedumpfile -d 31 ******************
> >> core-data 0 256
> >> threads-num
> >> -l
> >> 0 2 1
> >> 4 48 48
> >> 8 48 49
> >> 12 49 50
> >>
> >>>> I'm not so sure if it is a problem that the performance degradation is
> >>>> so
> >>>> big.
> >>>> But I think if in other cases, it works as expected, this won't be a
> >>>> problem(
> >>>> or a problem needs to be fixed), for the performance degradation
> >>>> existing
> >>>> in theory.
> >>>>
> >>>> Or the current implementation should be replaced by a new arithmetic.
> >>>> For example:
> >>>> We can add an array to record whether the page is filtered or not.
> >>>> And only the unfiltered page will take the buffer.
> >>>
> >>> We should discuss how to implement new mechanism, I'll mention this
> >>> later.
> >>>
> >>>> But I'm not sure if it is worth.
> >>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much
> >>>> help.
> >>>
> >>> Basically the faster, the better. There is no obvious target time.
> >>> If there is room for improvement, we should do it.
> >>>
> >>
> >> Maybe we can improve the performance of "-c -d 31" in some case.
> >>
> >> BTW, we can easily get the theoretical performance by using the "--split".
> >>
> >> --
> >> Thanks
> >> Zhou
> >>
> >>
> >>
> >> _______________________________________________
> >> kexec mailing list
> >> kexec@lists.infradead.org
> >> http://lists.infradead.org/mailman/listinfo/kexec
> >>
>
>
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-10 9:36 ` "Zhou, Wenjian/周文剑"
2015-12-10 9:58 ` Chao Fan
@ 2015-12-14 8:26 ` Atsushi Kumagai
2015-12-14 8:59 ` "Zhou, Wenjian/周文剑"
1 sibling, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-12-14 8:26 UTC (permalink / raw)
To: "Zhou, Wenjian/周文剑"
Cc: kexec@lists.infradead.org
>>> Think about this, in a huge memory, most of the page will be filtered, and
>>> we have 5 buffers.
>>>
>>> page1 page2 page3 page4 page5 page6 page7 .....
>>> [buffer1] [2] [3] [4] [5]
>>> unfiltered filtered filtered filtered filtered unfiltered filtered
>>>
>>> Since filtered page will take a buffer, when compressing page1,
>>> page6 can't be compressed at the same time.
>>> That why it will prevent parallel compression.
>>
>> Thanks for your explanation, I understand.
>> This is just an issue of the current implementation, there is no
>> reason to stand this restriction.
>>
>>>> Further, according to Chao's benchmark, there is a big performance
>>>> degradation even if the number of thread is 1. (58s vs 240s)
>>>> The current implementation seems to have some problems, we should
>>>> solve them.
>>>>
>>>
>>> If "-d 31" is specified, on the one hand we can't save time by compressing
>>> parallel, on the other hand we will introduce some extra work by adding
>>> "--num-threads". So it is obvious that it will have a performance degradation.
>>
>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive lock),
>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
>> too slow, the degradation is too big to be called "some extra work".
>>
>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> the above "buffer fairness issue" will not be related to this degradation.
>> What do you think what make this degradation ?
>>
>
>I can't get such result at this moment, so I can't do some further investigation
>right now. I guess it may be caused by the underlying implementation of pthread.
>I reviewed the test result of the patch v2 and found in different machines,
>the results are quite different.
Unluckily, I also can't reproduce such big degradation.
According to Chao's verification, this issue seems different from
the "too many page fault issue" that we solved.
I have no ideas, but at least I want to confirm whether this issue
is avoidable or not.
>It seems that I can get almost the same result of Chao from "PRIMEQUEST 1800E".
>
>###################################
>- System: PRIMERGY RX300 S6
>- CPU: Intel(R) Xeon(R) CPU x5660
>- memory: 16GB
>###################################
>************ makedumpfile -d 7 ******************
> core-data 0 256
> threads-num
>-l
> 0 10 144
> 4 5 110
> 8 5 111
> 12 6 111
>
>************ makedumpfile -d 31 ******************
> core-data 0 256
> threads-num
>-l
> 0 0 0
> 4 2 2
> 8 2 3
> 12 2 3
>
>###################################
>- System: PRIMEQUEST 1800E
>- CPU: Intel(R) Xeon(R) CPU E7540
>- memory: 32GB
>###################################
>************ makedumpfile -d 7 ******************
> core-data 0 256
> threads-num
>-l
> 0 34 270
> 4 63 154
> 8 64 131
> 12 65 159
>
>************ makedumpfile -d 31 ******************
> core-data 0 256
> threads-num
>-l
> 0 2 1
> 4 48 48
> 8 48 49
> 12 49 50
>
>>> I'm not so sure if it is a problem that the performance degradation is so big.
>>> But I think if in other cases, it works as expected, this won't be a problem(
>>> or a problem needs to be fixed), for the performance degradation existing
>>> in theory.
>>>
>>> Or the current implementation should be replaced by a new arithmetic.
>>> For example:
>>> We can add an array to record whether the page is filtered or not.
>>> And only the unfiltered page will take the buffer.
>>
>> We should discuss how to implement new mechanism, I'll mention this later.
>>
>>> But I'm not sure if it is worth.
>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>>
>> Basically the faster, the better. There is no obvious target time.
>> If there is room for improvement, we should do it.
>>
>
>Maybe we can improve the performance of "-c -d 31" in some case.
Yes, the buffer is used for -c, -l and -p, not only for -l.
It would be useful to improve that.
>BTW, we can easily get the theoretical performance by using the "--split".
Are you sure ? You persuaded me in the thread below:
http://lists.infradead.org/pipermail/kexec/2015-June/013881.html
--num-threads is orthogonal to --split, it's better to use the both
option since they try to solve different bottlenecks.
That's why I decided to merge your multi thread feature.
However, what you said sounds like --split is a superset of --num-threads.
You don't need the multi thread feature ?
Thanks,
Atsushi Kumagai
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-14 8:26 ` Atsushi Kumagai
@ 2015-12-14 8:59 ` "Zhou, Wenjian/周文剑"
0 siblings, 0 replies; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-14 8:59 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: kexec@lists.infradead.org
On 12/14/2015 04:26 PM, Atsushi Kumagai wrote:
>>>> Think about this, in a huge memory, most of the page will be filtered, and
>>>> we have 5 buffers.
>>>>
>>>> page1 page2 page3 page4 page5 page6 page7 .....
>>>> [buffer1] [2] [3] [4] [5]
>>>> unfiltered filtered filtered filtered filtered unfiltered filtered
>>>>
>>>> Since filtered page will take a buffer, when compressing page1,
>>>> page6 can't be compressed at the same time.
>>>> That why it will prevent parallel compression.
>>>
>>> Thanks for your explanation, I understand.
>>> This is just an issue of the current implementation, there is no
>>> reason to stand this restriction.
>>>
>>>>> Further, according to Chao's benchmark, there is a big performance
>>>>> degradation even if the number of thread is 1. (58s vs 240s)
>>>>> The current implementation seems to have some problems, we should
>>>>> solve them.
>>>>>
>>>>
>>>> If "-d 31" is specified, on the one hand we can't save time by compressing
>>>> parallel, on the other hand we will introduce some extra work by adding
>>>> "--num-threads". So it is obvious that it will have a performance degradation.
>>>
>>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive lock),
>>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
>>> too slow, the degradation is too big to be called "some extra work".
>>>
>>> Both --num-threads=0 and --num-threads=1 are serial processing,
>>> the above "buffer fairness issue" will not be related to this degradation.
>>> What do you think what make this degradation ?
>>>
>>
>> I can't get such result at this moment, so I can't do some further investigation
>> right now. I guess it may be caused by the underlying implementation of pthread.
>> I reviewed the test result of the patch v2 and found in different machines,
>> the results are quite different.
>
> Unluckily, I also can't reproduce such big degradation.
> According to the Chao's verification, this issue seems different form
> the "too many page fault issue" that we solved.
> I have no ideas, but at least I want to confirm whether this issue
> is avoidable or not.
>
>> It seems that I can get almost the same result of Chao from "PRIMEQUEST 1800E".
>>
>> ###################################
>> - System: PRIMERGY RX300 S6
>> - CPU: Intel(R) Xeon(R) CPU x5660
>> - memory: 16GB
>> ###################################
>> ************ makedumpfile -d 7 ******************
>> core-data 0 256
>> threads-num
>> -l
>> 0 10 144
>> 4 5 110
>> 8 5 111
>> 12 6 111
>>
>> ************ makedumpfile -d 31 ******************
>> core-data 0 256
>> threads-num
>> -l
>> 0 0 0
>> 4 2 2
>> 8 2 3
>> 12 2 3
>>
>> ###################################
>> - System: PRIMEQUEST 1800E
>> - CPU: Intel(R) Xeon(R) CPU E7540
>> - memory: 32GB
>> ###################################
>> ************ makedumpfile -d 7 ******************
>> core-data 0 256
>> threads-num
>> -l
>> 0 34 270
>> 4 63 154
>> 8 64 131
>> 12 65 159
>>
>> ************ makedumpfile -d 31 ******************
>> core-data 0 256
>> threads-num
>> -l
>> 0 2 1
>> 4 48 48
>> 8 48 49
>> 12 49 50
>>
>>>> I'm not so sure if it is a problem that the performance degradation is so big.
>>>> But I think if in other cases, it works as expected, this won't be a problem(
>>>> or a problem needs to be fixed), for the performance degradation existing
>>>> in theory.
>>>>
>>>> Or the current implementation should be replaced by a new arithmetic.
>>>> For example:
>>>> We can add an array to record whether the page is filtered or not.
>>>> And only the unfiltered page will take the buffer.
>>>
>>> We should discuss how to implement new mechanism, I'll mention this later.
>>>
>>>> But I'm not sure if it is worth.
>>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>>>
>>> Basically the faster, the better. There is no obvious target time.
>>> If there is room for improvement, we should do it.
>>>
>>
>> Maybe we can improve the performance of "-c -d 31" in some case.
>
> Yes, the buffer is used for -c, -l and -p, not only for -l.
> It would be useful to improve that.
>
>> BTW, we can easily get the theoretical performance by using the "--split".
>
> Are you sure ? You persuaded me in the thread below:
>
> http://lists.infradead.org/pipermail/kexec/2015-June/013881.html
>
> --num-threads is orthogonal to --split, it's better to use the both
> option since they try to solve different bottlenecks.
> That's why I decided to merge your multi thread feature.
>
> However, what you said sounds --split is a superset of --num-threads.
> You don't need the multi thread feature ?
>
I just mean the performance.
There is no doubt that we will use multi-threads in --split in the future.
But as we all know, threads and processes share some common characteristics.
And in makedumpfile, if we use "--split core1 core2 core3 core4" and
"--num-threads 4" separately, the spent time should not be quite different.
Since the logic of "--split" is more simple, if we can't improve the performance
of "-l -d 31" by "--split", we also don't have much chance to do it by "--num-threads".
I just mean that.
Of course, --split is not a superset of --num-threads.
--
Thanks
Zhou
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-10 10:54 ` Chao Fan
@ 2015-12-22 8:32 ` HATAYAMA Daisuke
2015-12-24 2:20 ` Chao Fan
0 siblings, 1 reply; 43+ messages in thread
From: HATAYAMA Daisuke @ 2015-12-22 8:32 UTC (permalink / raw)
To: cfan; +Cc: ats-kumagai, zhouwj-fnst, kexec
Chao,
From: Chao Fan <cfan@redhat.com>
Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
Date: Thu, 10 Dec 2015 05:54:28 -0500
>
>
> ----- Original Message -----
>> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> To: "Chao Fan" <cfan@redhat.com>
>> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>, kexec@lists.infradead.org
>> Sent: Thursday, December 10, 2015 6:32:32 PM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> On 12/10/2015 05:58 PM, Chao Fan wrote:
>> >
>> >
>> > ----- Original Message -----
>> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> >> Cc: kexec@lists.infradead.org
>> >> Sent: Thursday, December 10, 2015 5:36:47 PM
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >>
>> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> >>>> Hello Kumagai,
>> >>>>
>> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> >>>>> Hello, Zhou
>> >>>>>
>> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>> >>>>>>> Hi,
>> >>>>>>>
>> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> >>>>>>>> I think there is no problem if other test results are as expected.
>> >>>>>>>>
>> >>>>>>>> --num-threads mainly reduces the time of compressing.
>> >>>>>>>> So for lzo, it can't do much help at most of time.
>> >>>>>>>
>> >>>>>>> Seems the help of --num-threads does not say it exactly:
>> >>>>>>>
>> >>>>>>> [--num-threads THREADNUM]:
>> >>>>>>> Using multiple threads to read and compress data of each
>> >>>>>>> page
>> >>>>>>> in parallel.
>> >>>>>>> And it will reduces time for saving DUMPFILE.
>> >>>>>>> This feature only supports creating DUMPFILE in
>> >>>>>>> kdump-comressed format from
>> >>>>>>> VMCORE in kdump-compressed format or elf format.
>> >>>>>>>
>> >>>>>>> Lzo is also a compress method, it should be mentioned that
>> >>>>>>> --num-threads only
>> >>>>>>> supports zlib compressed vmcore.
>> >>>>>>>
>> >>>>>>
>> >>>>>> Sorry, it seems that something I said is not so clear.
>> >>>>>> lzo is also supported. Since lzo compresses data at a high speed, the
>> >>>>>> improving of the performance is not so obvious at most of time.
>> >>>>>>
>> >>>>>>> Also worth to mention about the recommended -d value for this
>> >>>>>>> feature.
>> >>>>>>>
>> >>>>>>
>> >>>>>> Yes, I think it's worth. I forgot it.
>> >>>>>
>> >>>>> I saw your patch, but I think I should confirm what is the problem
>> >>>>> first.
>> >>>>>
>> >>>>>> However, when "-d 31" is specified, it will be worse.
>> >>>>>> Less than 50 buffers are used to cache the compressed page.
>> >>>>>> And even the page has been filtered, it will also take a buffer.
>> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
>> >>>>>> of buffers. Then the page which needs to be compressed can't
>> >>>>>> be compressed parallel.
>> >>>>>
>> >>>>> Could you explain why compression will not be parallel in more detail ?
>> >>>>> Actually the buffers are used also for filtered pages, it sounds
>> >>>>> inefficient.
>> >>>>> However, I don't understand why it prevents parallel compression.
>> >>>>>
>> >>>>
>> >>>> Think about this, in a huge memory, most of the page will be filtered,
>> >>>> and
>> >>>> we have 5 buffers.
>> >>>>
>> >>>> page1 page2 page3 page4 page5 page6 page7
>> >>>> .....
>> >>>> [buffer1] [2] [3] [4] [5]
>> >>>> unfiltered filtered filtered filtered filtered unfiltered
>> >>>> filtered
>> >>>>
>> >>>> Since filtered page will take a buffer, when compressing page1,
>> >>>> page6 can't be compressed at the same time.
>> >>>> That why it will prevent parallel compression.
>> >>>
>> >>> Thanks for your explanation, I understand.
>> >>> This is just an issue of the current implementation, there is no
>> >>> reason to stand this restriction.
>> >>>
>> >>>>> Further, according to Chao's benchmark, there is a big performance
>> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
>> >>>>> The current implementation seems to have some problems, we should
>> >>>>> solve them.
>> >>>>>
>> >>>>
>> >>>> If "-d 31" is specified, on the one hand we can't save time by
>> >>>> compressing
>> >>>> parallel, on the other hand we will introduce some extra work by adding
>> >>>> "--num-threads". So it is obvious that it will have a performance
>> >>>> degradation.
>> >>>
>> >>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive
>> >>> lock),
>> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
>> >>> too slow, the degradation is too big to be called "some extra work".
>> >>>
>> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> >>> the above "buffer fairness issue" will not be related to this
>> >>> degradation.
>> >>> What do you think what make this degradation ?
>> >>>
>> >>
>> >> I can't get such result at this moment, so I can't do some further
>> >> investigation
>> >> right now. I guess it may be caused by the underlying implementation of
>> >> pthread.
>> >> I reviewed the test result of the patch v2 and found in different
>> >> machines,
>> >> the results are quite different.
>> >
>> > Hi Zhou Wenjian,
>> >
>> > I have done more tests in another machine with 128G memory, and get the
>> > result:
>> >
>> > the size of vmcore is 300M in "-d 31"
>> > makedumpfile -l --message-level 1 -d 31:
>> > time: 8.6s page-faults: 2272
>> >
>> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
>> > time: 28.1s page-faults: 2359
>> >
>> >
>> > and the size of vmcore is 2.6G in "-d 0".
>> > In this machine, I get the same result as yours:
>> >
>> >
>> > makedumpfile -c --message-level 1 -d 0:
>> > time: 597s page-faults: 2287
>> >
>> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
>> > time: 602s page-faults: 2361
>> >
>> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
>> > time: 337s page-faults: 2397
>> >
>> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
>> > time: 175s page-faults: 2461
>> >
>> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
>> > time: 103s page-faults: 2611
>> >
>> >
>> > But the machine of my first test is not under my control, should I wait for
>> > the first machine to do more tests?
>> > If there are still some problems in my tests, please tell me.
>> >
>>
>> Thanks a lot for your test, it seems that there is nothing wrong.
>> And I haven't got any idea about more tests...
>>
>> Could you provide the information of your cpu ?
>> I will do some further investigation later.
>>
>
> OK, of course, here is the information of cpu:
>
> # lscpu
> Architecture: x86_64
> CPU op-mode(s): 32-bit, 64-bit
> Byte Order: Little Endian
> CPU(s): 48
> On-line CPU(s) list: 0-47
> Thread(s) per core: 1
> Core(s) per socket: 6
> Socket(s): 8
> NUMA node(s): 8
> Vendor ID: AuthenticAMD
> CPU family: 16
> Model: 8
> Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
> Stepping: 0
> CPU MHz: 2793.040
> BogoMIPS: 5586.22
> Virtualization: AMD-V
> L1d cache: 64K
> L1i cache: 64K
> L2 cache: 512K
> L3 cache: 5118K
> NUMA node0 CPU(s): 0,8,16,24,32,40
> NUMA node1 CPU(s): 1,9,17,25,33,41
> NUMA node2 CPU(s): 2,10,18,26,34,42
> NUMA node3 CPU(s): 3,11,19,27,35,43
> NUMA node4 CPU(s): 4,12,20,28,36,44
> NUMA node5 CPU(s): 5,13,21,29,37,45
> NUMA node6 CPU(s): 6,14,22,30,38,46
> NUMA node7 CPU(s): 7,15,23,31,39,47
This CPU assignment on NUMA nodes looks interesting. Is it possible
that this affects performance of makedumpfile? This is just a guess.
Could you check whether the performance gets improved if you run each
thread on the same NUMA node? For example:
# taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
Also, if this were the cause of this performance degradation, we might
need to extend nr_cpus= kernel option to choose NUMA nodes we use;
though, we might already be able to do that in combination with other
kernel features.
> Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt hw_pstate npt lbrv svm_lock nrip_save pausefilter vmmcall
>
>> But I still believe it's better not to use "-l -d 31" and "--num-threads"
>> at the same time, though it's very strange that the performance
>> degradation is so big.
>>
>> --
>> Thanks
>> Zhou
>>
>> > Thanks,
>> > Chao Fan
>> >
>> >
>> >>
>> >> It seems that I can get almost the same result of Chao from "PRIMEQUEST
>> >> 1800E".
>> >>
>> >> ###################################
>> >> - System: PRIMERGY RX300 S6
>> >> - CPU: Intel(R) Xeon(R) CPU x5660
>> >> - memory: 16GB
>> >> ###################################
>> >> ************ makedumpfile -d 7 ******************
>> >> core-data 0 256
>> >> threads-num
>> >> -l
>> >> 0 10 144
>> >> 4 5 110
>> >> 8 5 111
>> >> 12 6 111
>> >>
>> >> ************ makedumpfile -d 31 ******************
>> >> core-data 0 256
>> >> threads-num
>> >> -l
>> >> 0 0 0
>> >> 4 2 2
>> >> 8 2 3
>> >> 12 2 3
>> >>
>> >> ###################################
>> >> - System: PRIMEQUEST 1800E
>> >> - CPU: Intel(R) Xeon(R) CPU E7540
>> >> - memory: 32GB
>> >> ###################################
>> >> ************ makedumpfile -d 7 ******************
>> >> core-data 0 256
>> >> threads-num
>> >> -l
>> >> 0 34 270
>> >> 4 63 154
>> >> 8 64 131
>> >> 12 65 159
>> >>
>> >> ************ makedumpfile -d 31 ******************
>> >> core-data 0 256
>> >> threads-num
>> >> -l
>> >> 0 2 1
>> >> 4 48 48
>> >> 8 48 49
>> >> 12 49 50
>> >>
>> >>>> I'm not so sure if it is a problem that the performance degradation is
>> >>>> so
>> >>>> big.
>> >>>> But I think if in other cases, it works as expected, this won't be a
>> >>>> problem(
>> >>>> or a problem needs to be fixed), for the performance degradation
>> >>>> existing
>> >>>> in theory.
>> >>>>
>> >>>> Or the current implementation should be replaced by a new arithmetic.
>> >>>> For example:
>> >>>> We can add an array to record whether the page is filtered or not.
>> >>>> And only the unfiltered page will take the buffer.
>> >>>
>> >>> We should discuss how to implement new mechanism, I'll mention this
>> >>> later.
>> >>>
>> >>>> But I'm not sure if it is worth.
>> >>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much
>> >>>> help.
>> >>>
>> >>> Basically the faster, the better. There is no obvious target time.
>> >>> If there is room for improvement, we should do it.
>> >>>
>> >>
>> >> Maybe we can improve the performance of "-c -d 31" in some case.
>> >>
>> >> BTW, we can easily get the theoretical performance by using the "--split".
>> >>
>> >> --
>> >> Thanks
>> >> Zhou
>> >>
>> >>
>> >>
>> >> _______________________________________________
>> >> kexec mailing list
>> >> kexec@lists.infradead.org
>> >> http://lists.infradead.org/mailman/listinfo/kexec
>> >>
>>
>>
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
--
Thanks.
HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-22 8:32 ` HATAYAMA Daisuke
@ 2015-12-24 2:20 ` Chao Fan
2015-12-24 3:22 ` HATAYAMA Daisuke
0 siblings, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-24 2:20 UTC (permalink / raw)
To: HATAYAMA Daisuke; +Cc: ats-kumagai, zhouwj-fnst, kexec
----- Original Message -----
> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> To: cfan@redhat.com
> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
> Sent: Tuesday, December 22, 2015 4:32:25 PM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>
> Chao,
>
> From: Chao Fan <cfan@redhat.com>
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> Date: Thu, 10 Dec 2015 05:54:28 -0500
>
> >
> >
> > ----- Original Message -----
> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> To: "Chao Fan" <cfan@redhat.com>
> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
> >> kexec@lists.infradead.org
> >> Sent: Thursday, December 10, 2015 6:32:32 PM
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >>
> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
> >> >
> >> >
> >> > ----- Original Message -----
> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> >> >> Cc: kexec@lists.infradead.org
> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >>
> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >> >>>> Hello Kumagai,
> >> >>>>
> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >> >>>>> Hello, Zhou
> >> >>>>>
> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >> >>>>>>> Hi,
> >> >>>>>>>
> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >> >>>>>>>> I think there is no problem if other test results are as
> >> >>>>>>>> expected.
> >> >>>>>>>>
> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >> >>>>>>>> So for lzo, it can't do much help at most of time.
> >> >>>>>>>
> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
> >> >>>>>>>
> >> >>>>>>> [--num-threads THREADNUM]:
> >> >>>>>>> Using multiple threads to read and compress data of each
> >> >>>>>>> page
> >> >>>>>>> in parallel.
>> >>>>>>> And it will reduce the time for saving DUMPFILE.
>> >>>>>>> This feature only supports creating DUMPFILE in
>> >>>>>>> kdump-compressed format from
>> >>>>>>> VMCORE in kdump-compressed format or elf format.
> >> >>>>>>>
> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
> >> >>>>>>> --num-threads only
> >> >>>>>>> supports zlib compressed vmcore.
> >> >>>>>>>
> >> >>>>>>
> >> >>>>>> Sorry, it seems that something I said is not so clear.
> >> >>>>>> lzo is also supported. Since lzo compresses data at a high speed,
> >> >>>>>> the
> >> >>>>>> improvement in performance is not so obvious most of the time.
> >> >>>>>>
> >> >>>>>>> Also worth to mention about the recommended -d value for this
> >> >>>>>>> feature.
> >> >>>>>>>
> >> >>>>>>
> >> >>>>>> Yes, I think it's worth. I forgot it.
> >> >>>>>
> >> >>>>> I saw your patch, but I think I should confirm what is the problem
> >> >>>>> first.
> >> >>>>>
> >> >>>>>> However, when "-d 31" is specified, it will be worse.
> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >> >>>>>> And even the page has been filtered, it will also take a buffer.
> >> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
> >> >>>>>> of buffers. Then the page which needs to be compressed can't
> >> >>>>>> be compressed parallel.
> >> >>>>>
> >> >>>>> Could you explain why compression will not be parallel in more
> >> >>>>> detail ?
> >> >>>>> Actually the buffers are used also for filtered pages, it sounds
> >> >>>>> inefficient.
> >> >>>>> However, I don't understand why it prevents parallel compression.
> >> >>>>>
> >> >>>>
> >> >>>> Think about this, in a huge memory, most of the page will be
> >> >>>> filtered,
> >> >>>> and
> >> >>>> we have 5 buffers.
> >> >>>>
> >> >>>> page1 page2 page3 page4 page5 page6
> >> >>>> page7
> >> >>>> .....
> >> >>>> [buffer1] [2] [3] [4] [5]
> >> >>>> unfiltered filtered filtered filtered filtered unfiltered
> >> >>>> filtered
> >> >>>>
> >> >>>> Since filtered page will take a buffer, when compressing page1,
> >> >>>> page6 can't be compressed at the same time.
>> >>>> That's why it will prevent parallel compression.
> >> >>>
> >> >>> Thanks for your explanation, I understand.
> >> >>> This is just an issue of the current implementation, there is no
> >> >>> reason to stand this restriction.
> >> >>>
> >> >>>>> Further, according to Chao's benchmark, there is a big performance
> >> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
> >> >>>>> The current implementation seems to have some problems, we should
> >> >>>>> solve them.
> >> >>>>>
> >> >>>>
> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >> >>>> compressing
> >> >>>> parallel, on the other hand we will introduce some extra work by
> >> >>>> adding
> >> >>>> "--num-threads". So it is obvious that it will have a performance
> >> >>>> degradation.
> >> >>>
> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
> >> >>> exclusive
> >> >>> lock),
> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still
> >> >>> sounds
> >> >>> too slow, the degradation is too big to be called "some extra work".
> >> >>>
> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
> >> >>> the above "buffer fairness issue" will not be related to this
> >> >>> degradation.
> >> >>> What do you think what make this degradation ?
> >> >>>
> >> >>
> >> >> I can't get such result at this moment, so I can't do some further
> >> >> investigation
> >> >> right now. I guess it may be caused by the underlying implementation of
> >> >> pthread.
> >> >> I reviewed the test result of the patch v2 and found in different
> >> >> machines,
> >> >> the results are quite different.
> >> >
> >> > Hi Zhou Wenjian,
> >> >
> >> > I have done more tests in another machine with 128G memory, and get the
> >> > result:
> >> >
> >> > the size of vmcore is 300M in "-d 31"
> >> > makedumpfile -l --message-level 1 -d 31:
> >> > time: 8.6s page-faults: 2272
> >> >
> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> >> > time: 28.1s page-faults: 2359
> >> >
> >> >
> >> > and the size of vmcore is 2.6G in "-d 0".
> >> > In this machine, I get the same result as yours:
> >> >
> >> >
> >> > makedumpfile -c --message-level 1 -d 0:
> >> > time: 597s page-faults: 2287
> >> >
> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> >> > time: 602s page-faults: 2361
> >> >
> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> >> > time: 337s page-faults: 2397
> >> >
> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> >> > time: 175s page-faults: 2461
> >> >
> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> >> > time: 103s page-faults: 2611
> >> >
> >> >
> >> > But the machine of my first test is not under my control, should I wait
> >> > for
> >> > the first machine to do more tests?
> >> > If there are still some problems in my tests, please tell me.
> >> >
> >>
> >> Thanks a lot for your test, it seems that there is nothing wrong.
> >> And I haven't got any idea about more tests...
> >>
> >> Could you provide the information of your cpu ?
> >> I will do some further investigation later.
> >>
> >
> > OK, of course, here is the information of cpu:
> >
> > # lscpu
> > Architecture: x86_64
> > CPU op-mode(s): 32-bit, 64-bit
> > Byte Order: Little Endian
> > CPU(s): 48
> > On-line CPU(s) list: 0-47
> > Thread(s) per core: 1
> > Core(s) per socket: 6
> > Socket(s): 8
> > NUMA node(s): 8
> > Vendor ID: AuthenticAMD
> > CPU family: 16
> > Model: 8
> > Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
> > Stepping: 0
> > CPU MHz: 2793.040
> > BogoMIPS: 5586.22
> > Virtualization: AMD-V
> > L1d cache: 64K
> > L1i cache: 64K
> > L2 cache: 512K
> > L3 cache: 5118K
> > NUMA node0 CPU(s): 0,8,16,24,32,40
> > NUMA node1 CPU(s): 1,9,17,25,33,41
> > NUMA node2 CPU(s): 2,10,18,26,34,42
> > NUMA node3 CPU(s): 3,11,19,27,35,43
> > NUMA node4 CPU(s): 4,12,20,28,36,44
> > NUMA node5 CPU(s): 5,13,21,29,37,45
> > NUMA node6 CPU(s): 6,14,22,30,38,46
> > NUMA node7 CPU(s): 7,15,23,31,39,47
>
> This CPU assignment on NUMA nodes looks interesting. Is it possible
> that this affects performance of makedumpfile? This is just a guess.
>
> Could you check whether the performance gets improved if you run each
> thread on the same NUMA node? For example:
>
> # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
> vmcore-cd0
>
Hi HATAYAMA,
I think your guess is right, but maybe your command has a little problem.
From my test, the NUMA did affect the performance, but not too much.
The average time of cpus in the same NUMA node:
# taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
is 314s
The average time of cpus in different NUMA node:
# taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
is 354s
But I think if you want to use "--num-threads 4", the --cpu-list numbers
following "taskset -c" should be 5 cpus at least, otherwise the time will be too
long.
Thanks,
Chao Fan
> Also, if this were cause of this performance degradation, we might
> need to extend nr_cpus= kernel option to choose NUMA nodes we use;
> though, we might already be able to do that in combination with other
> kernel features.
>
> > Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
> > mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall mmxext fxsr_opt
> > pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc
> > extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic
> > cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt
> > hw_pstate npt lbrv svm_lock nrip_save pausefilter vmmcall
> >
> >> But I still believe it's better not to use "-l -d 31" and "--num-threads"
> >> at the same time, though it's very strange that the performance
> >> degradation is so big.
> >>
> >> --
> >> Thanks
> >> Zhou
> >>
> >> > Thanks,
> >> > Chao Fan
> >> >
> >> >
> >> >>
> >> >> It seems that I can get almost the same result of Chao from "PRIMEQUEST
> >> >> 1800E".
> >> >>
> >> >> ###################################
> >> >> - System: PRIMERGY RX300 S6
> >> >> - CPU: Intel(R) Xeon(R) CPU x5660
> >> >> - memory: 16GB
> >> >> ###################################
> >> >> ************ makedumpfile -d 7 ******************
> >> >> core-data 0 256
> >> >> threads-num
> >> >> -l
> >> >> 0 10 144
> >> >> 4 5 110
> >> >> 8 5 111
> >> >> 12 6 111
> >> >>
> >> >> ************ makedumpfile -d 31 ******************
> >> >> core-data 0 256
> >> >> threads-num
> >> >> -l
> >> >> 0 0 0
> >> >> 4 2 2
> >> >> 8 2 3
> >> >> 12 2 3
> >> >>
> >> >> ###################################
> >> >> - System: PRIMEQUEST 1800E
> >> >> - CPU: Intel(R) Xeon(R) CPU E7540
> >> >> - memory: 32GB
> >> >> ###################################
> >> >> ************ makedumpfile -d 7 ******************
> >> >> core-data 0 256
> >> >> threads-num
> >> >> -l
> >> >> 0 34 270
> >> >> 4 63 154
> >> >> 8 64 131
> >> >> 12 65 159
> >> >>
> >> >> ************ makedumpfile -d 31 ******************
> >> >> core-data 0 256
> >> >> threads-num
> >> >> -l
> >> >> 0 2 1
> >> >> 4 48 48
> >> >> 8 48 49
> >> >> 12 49 50
> >> >>
> >> >>>> I'm not so sure if it is a problem that the performance degradation
> >> >>>> is
> >> >>>> so
> >> >>>> big.
> >> >>>> But I think if in other cases, it works as expected, this won't be a
> >> >>>> problem(
> >> >>>> or a problem needs to be fixed), for the performance degradation
> >> >>>> existing
> >> >>>> in theory.
> >> >>>>
> >> >>>> Or the current implementation should be replaced by a new algorithm.
> >> >>>> For example:
> >> >>>> We can add an array to record whether the page is filtered or not.
> >> >>>> And only the unfiltered page will take the buffer.
> >> >>>
> >> >>> We should discuss how to implement new mechanism, I'll mention this
> >> >>> later.
> >> >>>
> >> >>>> But I'm not sure if it is worth it.
> >> >>>> Since "-l -d 31" is fast enough, the new algorithm also can't do much
> >> >>>> help.
> >> >>>
> >> >>> Basically the faster, the better. There is no obvious target time.
> >> >>> If there is room for improvement, we should do it.
> >> >>>
> >> >>
> >> >> Maybe we can improve the performance of "-c -d 31" in some case.
> >> >>
> >> >> BTW, we can easily get the theoretical performance by using the
> >> >> "--split".
> >> >>
> >> >> --
> >> >> Thanks
> >> >> Zhou
> >> >>
> >> >>
> >> >>
> >> >> _______________________________________________
> >> >> kexec mailing list
> >> >> kexec@lists.infradead.org
> >> >> http://lists.infradead.org/mailman/listinfo/kexec
> >> >>
> >>
> >>
> >>
> >>
> >> _______________________________________________
> >> kexec mailing list
> >> kexec@lists.infradead.org
> >> http://lists.infradead.org/mailman/listinfo/kexec
> >>
> >
> > _______________________________________________
> > kexec mailing list
> > kexec@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/kexec
> --
> Thanks.
> HATAYAMA, Daisuke
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-24 2:20 ` Chao Fan
@ 2015-12-24 3:22 ` HATAYAMA Daisuke
2015-12-24 3:31 ` Chao Fan
0 siblings, 1 reply; 43+ messages in thread
From: HATAYAMA Daisuke @ 2015-12-24 3:22 UTC (permalink / raw)
To: cfan; +Cc: ats-kumagai, zhouwj-fnst, kexec
From: Chao Fan <cfan@redhat.com>
Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
Date: Wed, 23 Dec 2015 21:20:48 -0500
>
>
> ----- Original Message -----
>> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> To: cfan@redhat.com
>> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
>> Sent: Tuesday, December 22, 2015 4:32:25 PM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> Chao,
>>
>> From: Chao Fan <cfan@redhat.com>
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> Date: Thu, 10 Dec 2015 05:54:28 -0500
>>
>> >
>> >
>> > ----- Original Message -----
>> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> To: "Chao Fan" <cfan@redhat.com>
>> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
>> >> kexec@lists.infradead.org
>> >> Sent: Thursday, December 10, 2015 6:32:32 PM
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >>
>> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
>> >> >
>> >> >
>> >> > ----- Original Message -----
>> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> >> >> Cc: kexec@lists.infradead.org
>> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
>> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >>
>> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> >> >>>> Hello Kumagai,
>> >> >>>>
>> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> >> >>>>> Hello, Zhou
>> >> >>>>>
>> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>> >> >>>>>>> Hi,
>> >> >>>>>>>
>> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> >> >>>>>>>> I think there is no problem if other test results are as
>> >> >>>>>>>> expected.
>> >> >>>>>>>>
>> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
>> >> >>>>>>>> So for lzo, it can't do much help at most of time.
>> >> >>>>>>>
>> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
>> >> >>>>>>>
>> >> >>>>>>> [--num-threads THREADNUM]:
>> >> >>>>>>> Using multiple threads to read and compress data of each
>> >> >>>>>>> page
>> >> >>>>>>> in parallel.
>> >> >>>>>>> And it will reduce the time for saving DUMPFILE.
>> >> >>>>>>> This feature only supports creating DUMPFILE in
>> >> >>>>>>> kdump-compressed format from
>> >> >>>>>>> VMCORE in kdump-compressed format or elf format.
>> >> >>>>>>>
>> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
>> >> >>>>>>> --num-threads only
>> >> >>>>>>> supports zlib compressed vmcore.
>> >> >>>>>>>
>> >> >>>>>>
>> >> >>>>>> Sorry, it seems that something I said is not so clear.
>> >> >>>>>> lzo is also supported. Since lzo compresses data at a high speed,
>> >> >>>>>> the
>> >> >>>>>> improving of the performance is not so obvious at most of time.
>> >> >>>>>>
>> >> >>>>>>> Also worth to mention about the recommended -d value for this
>> >> >>>>>>> feature.
>> >> >>>>>>>
>> >> >>>>>>
>> >> >>>>>> Yes, I think it's worth. I forgot it.
>> >> >>>>>
>> >> >>>>> I saw your patch, but I think I should confirm what is the problem
>> >> >>>>> first.
>> >> >>>>>
>> >> >>>>>> However, when "-d 31" is specified, it will be worse.
>> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
>> >> >>>>>> And even the page has been filtered, it will also take a buffer.
>> >> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
>> >> >>>>>> of buffers. Then the page which needs to be compressed can't
>> >> >>>>>> be compressed parallel.
>> >> >>>>>
>> >> >>>>> Could you explain why compression will not be parallel in more
>> >> >>>>> detail ?
>> >> >>>>> Actually the buffers are used also for filtered pages, it sounds
>> >> >>>>> inefficient.
>> >> >>>>> However, I don't understand why it prevents parallel compression.
>> >> >>>>>
>> >> >>>>
>> >> >>>> Think about this, in a huge memory, most of the page will be
>> >> >>>> filtered,
>> >> >>>> and
>> >> >>>> we have 5 buffers.
>> >> >>>>
>> >> >>>> page1 page2 page3 page4 page5 page6
>> >> >>>> page7
>> >> >>>> .....
>> >> >>>> [buffer1] [2] [3] [4] [5]
>> >> >>>> unfiltered filtered filtered filtered filtered unfiltered
>> >> >>>> filtered
>> >> >>>>
>> >> >>>> Since filtered page will take a buffer, when compressing page1,
>> >> >>>> page6 can't be compressed at the same time.
>> >> >>>> That's why it will prevent parallel compression.
>> >> >>>
>> >> >>> Thanks for your explanation, I understand.
>> >> >>> This is just an issue of the current implementation, there is no
>> >> >>> reason to stand this restriction.
>> >> >>>
>> >> >>>>> Further, according to Chao's benchmark, there is a big performance
>> >> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
>> >> >>>>> The current implementation seems to have some problems, we should
>> >> >>>>> solve them.
>> >> >>>>>
>> >> >>>>
>> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
>> >> >>>> compressing
>> >> >>>> parallel, on the other hand we will introduce some extra work by
>> >> >>>> adding
>> >> >>>> "--num-threads". So it is obvious that it will have a performance
>> >> >>>> degradation.
>> >> >>>
>> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
>> >> >>> exclusive
>> >> >>> lock),
>> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still
>> >> >>> sounds
>> >> >>> too slow, the degradation is too big to be called "some extra work".
>> >> >>>
>> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> >> >>> the above "buffer fairness issue" will not be related to this
>> >> >>> degradation.
>> >> >>> What do you think what make this degradation ?
>> >> >>>
>> >> >>
>> >> >> I can't get such result at this moment, so I can't do some further
>> >> >> investigation
>> >> >> right now. I guess it may be caused by the underlying implementation of
>> >> >> pthread.
>> >> >> I reviewed the test result of the patch v2 and found in different
>> >> >> machines,
>> >> >> the results are quite different.
>> >> >
>> >> > Hi Zhou Wenjian,
>> >> >
>> >> > I have done more tests in another machine with 128G memory, and get the
>> >> > result:
>> >> >
>> >> > the size of vmcore is 300M in "-d 31"
>> >> > makedumpfile -l --message-level 1 -d 31:
>> >> > time: 8.6s page-faults: 2272
>> >> >
>> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
>> >> > time: 28.1s page-faults: 2359
>> >> >
>> >> >
>> >> > and the size of vmcore is 2.6G in "-d 0".
>> >> > In this machine, I get the same result as yours:
>> >> >
>> >> >
>> >> > makedumpfile -c --message-level 1 -d 0:
>> >> > time: 597s page-faults: 2287
>> >> >
>> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
>> >> > time: 602s page-faults: 2361
>> >> >
>> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
>> >> > time: 337s page-faults: 2397
>> >> >
>> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
>> >> > time: 175s page-faults: 2461
>> >> >
>> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
>> >> > time: 103s page-faults: 2611
>> >> >
>> >> >
>> >> > But the machine of my first test is not under my control, should I wait
>> >> > for
>> >> > the first machine to do more tests?
>> >> > If there are still some problems in my tests, please tell me.
>> >> >
>> >>
>> >> Thanks a lot for your test, it seems that there is nothing wrong.
>> >> And I haven't got any idea about more tests...
>> >>
>> >> Could you provide the information of your cpu ?
>> >> I will do some further investigation later.
>> >>
>> >
>> > OK, of course, here is the information of cpu:
>> >
>> > # lscpu
>> > Architecture: x86_64
>> > CPU op-mode(s): 32-bit, 64-bit
>> > Byte Order: Little Endian
>> > CPU(s): 48
>> > On-line CPU(s) list: 0-47
>> > Thread(s) per core: 1
>> > Core(s) per socket: 6
>> > Socket(s): 8
>> > NUMA node(s): 8
>> > Vendor ID: AuthenticAMD
>> > CPU family: 16
>> > Model: 8
>> > Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
>> > Stepping: 0
>> > CPU MHz: 2793.040
>> > BogoMIPS: 5586.22
>> > Virtualization: AMD-V
>> > L1d cache: 64K
>> > L1i cache: 64K
>> > L2 cache: 512K
>> > L3 cache: 5118K
>> > NUMA node0 CPU(s): 0,8,16,24,32,40
>> > NUMA node1 CPU(s): 1,9,17,25,33,41
>> > NUMA node2 CPU(s): 2,10,18,26,34,42
>> > NUMA node3 CPU(s): 3,11,19,27,35,43
>> > NUMA node4 CPU(s): 4,12,20,28,36,44
>> > NUMA node5 CPU(s): 5,13,21,29,37,45
>> > NUMA node6 CPU(s): 6,14,22,30,38,46
>> > NUMA node7 CPU(s): 7,15,23,31,39,47
>>
>> This CPU assignment on NUMA nodes looks interesting. Is it possible
>> that this affects performance of makedumpfile? This is just a guess.
>>
>> Could you check whether the performance gets improved if you run each
>> thread on the same NUMA node? For example:
>>
>> # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>> vmcore-cd0
>>
> Hi HATAYAMA,
>
> I think your guess is right, but maybe your command has a little problem.
>
> From my test, the NUMA did affect the performance, but not too much.
> The average time of cpus in the same NUMA node:
> # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
> is 314s
> The average time of cpus in different NUMA node:
> # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
> is 354s
>
Hmm, according to some previous discussion, what we should see here is
whether it affects performance of makedumpfile with --num-threads 1
and -d 31. So you should need to compare:
# taskset -c 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
with:
# taskset -c 0 makedumpfile -c -d 31 vmcore vmcore-d31
Also, I'm assuming that you've done these benchmark on kdump 1st
kernel, not kdump 2nd kernel. Is this correct?
> But I think if you want to use "--num-threads 4", the --cpu-list numbers
> following "taskset -c" should be 5 cpus at least, otherwise the time will be too
> long.
>
I see.
--
Thanks.
HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-24 3:22 ` HATAYAMA Daisuke
@ 2015-12-24 3:31 ` Chao Fan
2015-12-24 3:50 ` HATAYAMA Daisuke
0 siblings, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-24 3:31 UTC (permalink / raw)
To: HATAYAMA Daisuke; +Cc: ats-kumagai, zhouwj-fnst, kexec
----- Original Message -----
> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> To: cfan@redhat.com
> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
> Sent: Thursday, December 24, 2015 11:22:28 AM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>
> From: Chao Fan <cfan@redhat.com>
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> Date: Wed, 23 Dec 2015 21:20:48 -0500
>
> >
> >
> > ----- Original Message -----
> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> >> To: cfan@redhat.com
> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
> >> kexec@lists.infradead.org
> >> Sent: Tuesday, December 22, 2015 4:32:25 PM
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >>
> >> Chao,
> >>
> >> From: Chao Fan <cfan@redhat.com>
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> Date: Thu, 10 Dec 2015 05:54:28 -0500
> >>
> >> >
> >> >
> >> > ----- Original Message -----
> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> To: "Chao Fan" <cfan@redhat.com>
> >> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
> >> >> kexec@lists.infradead.org
> >> >> Sent: Thursday, December 10, 2015 6:32:32 PM
> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >>
> >> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
> >> >> >
> >> >> >
> >> >> > ----- Original Message -----
> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> >> >> >> Cc: kexec@lists.infradead.org
> >> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> >>
> >> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >> >> >>>> Hello Kumagai,
> >> >> >>>>
> >> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >> >> >>>>> Hello, Zhou
> >> >> >>>>>
> >> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >> >> >>>>>>> Hi,
> >> >> >>>>>>>
> >> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >> >> >>>>>>>> I think there is no problem if other test results are as
> >> >> >>>>>>>> expected.
> >> >> >>>>>>>>
> >> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >> >> >>>>>>>> So for lzo, it can't do much help at most of time.
> >> >> >>>>>>>
> >> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
> >> >> >>>>>>>
> >> >> >>>>>>> [--num-threads THREADNUM]:
> >> >> >>>>>>> Using multiple threads to read and compress data of
> >> >> >>>>>>> each
> >> >> >>>>>>> page
> >> >> >>>>>>> in parallel.
> >> >> >>>>>>> And it will reduce the time for saving DUMPFILE.
> >> >> >>>>>>> This feature only supports creating DUMPFILE in
> >> >> >>>>>>> kdump-compressed format from
> >> >> >>>>>>> VMCORE in kdump-compressed format or elf format.
> >> >> >>>>>>>
> >> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
> >> >> >>>>>>> --num-threads only
> >> >> >>>>>>> supports zlib compressed vmcore.
> >> >> >>>>>>>
> >> >> >>>>>>
> >> >> >>>>>> Sorry, it seems that something I said is not so clear.
> >> >> >>>>>> lzo is also supported. Since lzo compresses data at a high
> >> >> >>>>>> speed,
> >> >> >>>>>> the
> >> >> >>>>>> improving of the performance is not so obvious at most of time.
> >> >> >>>>>>
> >> >> >>>>>>> Also worth to mention about the recommended -d value for this
> >> >> >>>>>>> feature.
> >> >> >>>>>>>
> >> >> >>>>>>
> >> >> >>>>>> Yes, I think it's worth. I forgot it.
> >> >> >>>>>
> >> >> >>>>> I saw your patch, but I think I should confirm what is the
> >> >> >>>>> problem
> >> >> >>>>> first.
> >> >> >>>>>
> >> >> >>>>>> However, when "-d 31" is specified, it will be worse.
> >> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >> >> >>>>>> And even the page has been filtered, it will also take a buffer.
> >> >> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
> >> >> >>>>>> of buffers. Then the page which needs to be compressed can't
> >> >> >>>>>> be compressed parallel.
> >> >> >>>>>
> >> >> >>>>> Could you explain why compression will not be parallel in more
> >> >> >>>>> detail ?
> >> >> >>>>> Actually the buffers are used also for filtered pages, it sounds
> >> >> >>>>> inefficient.
> >> >> >>>>> However, I don't understand why it prevents parallel compression.
> >> >> >>>>>
> >> >> >>>>
> >> >> >>>> Think about this, in a huge memory, most of the page will be
> >> >> >>>> filtered,
> >> >> >>>> and
> >> >> >>>> we have 5 buffers.
> >> >> >>>>
> >> >> >>>> page1 page2 page3 page4 page5 page6
> >> >> >>>> page7
> >> >> >>>> .....
> >> >> >>>> [buffer1] [2] [3] [4] [5]
> >> >> >>>> unfiltered filtered filtered filtered filtered unfiltered
> >> >> >>>> filtered
> >> >> >>>>
> >> >> >>>> Since filtered page will take a buffer, when compressing page1,
> >> >> >>>> page6 can't be compressed at the same time.
> >> >> >>>> That's why it will prevent parallel compression.
> >> >> >>>
> >> >> >>> Thanks for your explanation, I understand.
> >> >> >>> This is just an issue of the current implementation, there is no
> >> >> >>> reason to stand this restriction.
> >> >> >>>
> >> >> >>>>> Further, according to Chao's benchmark, there is a big
> >> >> >>>>> performance
> >> >> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
> >> >> >>>>> The current implementation seems to have some problems, we should
> >> >> >>>>> solve them.
> >> >> >>>>>
> >> >> >>>>
> >> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >> >> >>>> compressing
> >> >> >>>> parallel, on the other hand we will introduce some extra work by
> >> >> >>>> adding
> >> >> >>>> "--num-threads". So it is obvious that it will have a performance
> >> >> >>>> degradation.
> >> >> >>>
> >> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
> >> >> >>> exclusive
> >> >> >>> lock),
> >> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still
> >> >> >>> sounds
> >> >> >>> too slow, the degradation is too big to be called "some extra
> >> >> >>> work".
> >> >> >>>
> >> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
> >> >> >>> the above "buffer fairness issue" will not be related to this
> >> >> >>> degradation.
> >> >> >>> What do you think what make this degradation ?
> >> >> >>>
> >> >> >>
> >> >> >> I can't get such result at this moment, so I can't do some further
> >> >> >> investigation
> >> >> >> right now. I guess it may be caused by the underlying implementation
> >> >> >> of
> >> >> >> pthread.
> >> >> >> I reviewed the test result of the patch v2 and found in different
> >> >> >> machines,
> >> >> >> the results are quite different.
> >> >> >
> >> >> > Hi Zhou Wenjian,
> >> >> >
> >> >> > I have done more tests in another machine with 128G memory, and get
> >> >> > the
> >> >> > result:
> >> >> >
> >> >> > the size of vmcore is 300M in "-d 31"
> >> >> > makedumpfile -l --message-level 1 -d 31:
> >> >> > time: 8.6s page-faults: 2272
> >> >> >
> >> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> >> >> > time: 28.1s page-faults: 2359
> >> >> >
> >> >> >
> >> >> > and the size of vmcore is 2.6G in "-d 0".
> >> >> > In this machine, I get the same result as yours:
> >> >> >
> >> >> >
> >> >> > makedumpfile -c --message-level 1 -d 0:
> >> >> > time: 597s page-faults: 2287
> >> >> >
> >> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> >> >> > time: 602s page-faults: 2361
> >> >> >
> >> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> >> >> > time: 337s page-faults: 2397
> >> >> >
> >> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> >> >> > time: 175s page-faults: 2461
> >> >> >
> >> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> >> >> > time: 103s page-faults: 2611
> >> >> >
> >> >> >
> >> >> > But the machine of my first test is not under my control, should I
> >> >> > wait
> >> >> > for
> >> >> > the first machine to do more tests?
> >> >> > If there are still some problems in my tests, please tell me.
> >> >> >
> >> >>
> >> >> Thanks a lot for your test, it seems that there is nothing wrong.
> >> >> And I haven't got any idea about more tests...
> >> >>
> >> >> Could you provide the information of your cpu ?
> >> >> I will do some further investigation later.
> >> >>
> >> >
> >> > OK, of course, here is the information of cpu:
> >> >
> >> > # lscpu
> >> > Architecture: x86_64
> >> > CPU op-mode(s): 32-bit, 64-bit
> >> > Byte Order: Little Endian
> >> > CPU(s): 48
> >> > On-line CPU(s) list: 0-47
> >> > Thread(s) per core: 1
> >> > Core(s) per socket: 6
> >> > Socket(s): 8
> >> > NUMA node(s): 8
> >> > Vendor ID: AuthenticAMD
> >> > CPU family: 16
> >> > Model: 8
> >> > Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
> >> > Stepping: 0
> >> > CPU MHz: 2793.040
> >> > BogoMIPS: 5586.22
> >> > Virtualization: AMD-V
> >> > L1d cache: 64K
> >> > L1i cache: 64K
> >> > L2 cache: 512K
> >> > L3 cache: 5118K
> >> > NUMA node0 CPU(s): 0,8,16,24,32,40
> >> > NUMA node1 CPU(s): 1,9,17,25,33,41
> >> > NUMA node2 CPU(s): 2,10,18,26,34,42
> >> > NUMA node3 CPU(s): 3,11,19,27,35,43
> >> > NUMA node4 CPU(s): 4,12,20,28,36,44
> >> > NUMA node5 CPU(s): 5,13,21,29,37,45
> >> > NUMA node6 CPU(s): 6,14,22,30,38,46
> >> > NUMA node7 CPU(s): 7,15,23,31,39,47
> >>
> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
> >> that this affects performance of makedumpfile? This is just a guess.
> >>
> >> Could you check whether the performance gets improved if you run each
> >> thread on the same NUMA node? For example:
> >>
> >> # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> vmcore-cd0
> >>
> > Hi HATAYAMA,
> >
> > I think your guess is right, but maybe your command has a little problem.
> >
> > From my test, the NUMA did affect the performance, but not too much.
> > The average time of cpus in the same NUMA node:
> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
> > vmcore-cd0
> > is 314s
> > The average time of cpus in different NUMA node:
> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
> > vmcore-cd0
> > is 354s
> >
>
> Hmm, according to some previous discussion, what we should see here is
> whether it affects performance of makedumpfile with --num-threads 1
> and -d 31. So you should need to compare:
>
> # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
>
> with:
>
> # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
>
> Also, I'm assuming that you've done these benchmark on kdump 1st
> kernel, not kdump 2nd kernel. Is this correct?
>
Hi HATAYAMA,
I test in the first kernel, not in the kdump second kernel.
Thanks,
Chao Fan
> > But I think if you want to use "--num-threads 4", the --cpu-list numbers
> > following "taskset -c" should be 5 cpus at least, otherwise the time will
> > be too
> > long.
> >
>
> I see.
>
> --
> Thanks.
> HATAYAMA, Daisuke
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-24 3:31 ` Chao Fan
@ 2015-12-24 3:50 ` HATAYAMA Daisuke
2015-12-24 6:02 ` Chao Fan
0 siblings, 1 reply; 43+ messages in thread
From: HATAYAMA Daisuke @ 2015-12-24 3:50 UTC (permalink / raw)
To: cfan; +Cc: ats-kumagai, zhouwj-fnst, kexec
From: Chao Fan <cfan@redhat.com>
Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
Date: Wed, 23 Dec 2015 22:31:37 -0500
>
>
> ----- Original Message -----
>> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> To: cfan@redhat.com
>> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
>> Sent: Thursday, December 24, 2015 11:22:28 AM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> From: Chao Fan <cfan@redhat.com>
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> Date: Wed, 23 Dec 2015 21:20:48 -0500
>>
>> >
>> >
>> > ----- Original Message -----
>> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> >> To: cfan@redhat.com
>> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
>> >> kexec@lists.infradead.org
>> >> Sent: Tuesday, December 22, 2015 4:32:25 PM
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >>
>> >> Chao,
>> >>
>> >> From: Chao Fan <cfan@redhat.com>
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> Date: Thu, 10 Dec 2015 05:54:28 -0500
>> >>
>> >> >
>> >> >
>> >> > ----- Original Message -----
>> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> To: "Chao Fan" <cfan@redhat.com>
>> >> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
>> >> >> kexec@lists.infradead.org
>> >> >> Sent: Thursday, December 10, 2015 6:32:32 PM
>> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >>
>> >> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
>> >> >> >
>> >> >> >
>> >> >> > ----- Original Message -----
>> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> >> >> >> Cc: kexec@lists.infradead.org
>> >> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
>> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> >>
>> >> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> >> >> >>>> Hello Kumagai,
>> >> >> >>>>
>> >> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> >> >> >>>>> Hello, Zhou
>> >> >> >>>>>
>> >> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>> >> >> >>>>>>> Hi,
>> >> >> >>>>>>>
>> >> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> >> >> >>>>>>>> I think there is no problem if other test results are as
>> >> >> >>>>>>>> expected.
>> >> >> >>>>>>>>
>> >> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
>> >> >> >>>>>>>> So for lzo, it can't do much help at most of time.
>> >> >> >>>>>>>
>> >> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
>> >> >> >>>>>>>
>> >> >> >>>>>>> [--num-threads THREADNUM]:
>> >> >> >>>>>>> Using multiple threads to read and compress data of
>> >> >> >>>>>>> each
>> >> >> >>>>>>> page
>> >> >> >>>>>>> in parallel.
> >> >> >> >>>>>>> And it will reduce time for saving DUMPFILE.
>> >> >> >>>>>>> This feature only supports creating DUMPFILE in
> >> >> >> >>>>>>> kdump-compressed format from
>> >> >> >>>>>>> VMCORE in kdump-compressed format or elf format.
>> >> >> >>>>>>>
>> >> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
>> >> >> >>>>>>> --num-threads only
>> >> >> >>>>>>> supports zlib compressed vmcore.
>> >> >> >>>>>>>
>> >> >> >>>>>>
>> >> >> >>>>>> Sorry, it seems that something I said is not so clear.
>> >> >> >>>>>> lzo is also supported. Since lzo compresses data at a high
>> >> >> >>>>>> speed,
>> >> >> >>>>>> the
>> >> >> >>>>>> improving of the performance is not so obvious at most of time.
>> >> >> >>>>>>
>> >> >> >>>>>>> Also worth to mention about the recommended -d value for this
>> >> >> >>>>>>> feature.
>> >> >> >>>>>>>
>> >> >> >>>>>>
>> >> >> >>>>>> Yes, I think it's worth. I forgot it.
>> >> >> >>>>>
>> >> >> >>>>> I saw your patch, but I think I should confirm what is the
>> >> >> >>>>> problem
>> >> >> >>>>> first.
>> >> >> >>>>>
>> >> >> >>>>>> However, when "-d 31" is specified, it will be worse.
>> >> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
>> >> >> >>>>>> And even the page has been filtered, it will also take a buffer.
>> >> >> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
>> >> >> >>>>>> of buffers. Then the page which needs to be compressed can't
>> >> >> >>>>>> be compressed parallel.
>> >> >> >>>>>
>> >> >> >>>>> Could you explain why compression will not be parallel in more
>> >> >> >>>>> detail ?
>> >> >> >>>>> Actually the buffers are used also for filtered pages, it sounds
>> >> >> >>>>> inefficient.
>> >> >> >>>>> However, I don't understand why it prevents parallel compression.
>> >> >> >>>>>
>> >> >> >>>>
>> >> >> >>>> Think about this, in a huge memory, most of the page will be
>> >> >> >>>> filtered,
>> >> >> >>>> and
>> >> >> >>>> we have 5 buffers.
>> >> >> >>>>
>> >> >> >>>> page1 page2 page3 page4 page5 page6
>> >> >> >>>> page7
>> >> >> >>>> .....
>> >> >> >>>> [buffer1] [2] [3] [4] [5]
>> >> >> >>>> unfiltered filtered filtered filtered filtered unfiltered
>> >> >> >>>> filtered
>> >> >> >>>>
>> >> >> >>>> Since filtered page will take a buffer, when compressing page1,
>> >> >> >>>> page6 can't be compressed at the same time.
>> >> >> >>>> That's why it will prevent parallel compression.
>> >> >> >>>
>> >> >> >>> Thanks for your explanation, I understand.
>> >> >> >>> This is just an issue of the current implementation, there is no
>> >> >> >>> reason to stand this restriction.
>> >> >> >>>
>> >> >> >>>>> Further, according to Chao's benchmark, there is a big
>> >> >> >>>>> performance
>> >> >> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
>> >> >> >>>>> The current implementation seems to have some problems, we should
>> >> >> >>>>> solve them.
>> >> >> >>>>>
>> >> >> >>>>
>> >> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
>> >> >> >>>> compressing
>> >> >> >>>> parallel, on the other hand we will introduce some extra work by
>> >> >> >>>> adding
>> >> >> >>>> "--num-threads". So it is obvious that it will have a performance
>> >> >> >>>> degradation.
>> >> >> >>>
>> >> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
>> >> >> >>> exclusive
>> >> >> >>> lock),
>> >> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still
>> >> >> >>> sounds
>> >> >> >>> too slow, the degradation is too big to be called "some extra
>> >> >> >>> work".
>> >> >> >>>
>> >> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> >> >> >>> the above "buffer fairness issue" will not be related to this
>> >> >> >>> degradation.
>> >> >> >>> What do you think what make this degradation ?
>> >> >> >>>
>> >> >> >>
>> >> >> >> I can't get such result at this moment, so I can't do some further
>> >> >> >> investigation
>> >> >> >> right now. I guess it may be caused by the underlying implementation
>> >> >> >> of
>> >> >> >> pthread.
>> >> >> >> I reviewed the test result of the patch v2 and found in different
>> >> >> >> machines,
>> >> >> >> the results are quite different.
>> >> >> >
>> >> >> > Hi Zhou Wenjian,
>> >> >> >
>> >> >> > I have done more tests in another machine with 128G memory, and get
>> >> >> > the
>> >> >> > result:
>> >> >> >
>> >> >> > the size of vmcore is 300M in "-d 31"
>> >> >> > makedumpfile -l --message-level 1 -d 31:
>> >> >> > time: 8.6s page-faults: 2272
>> >> >> >
>> >> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
>> >> >> > time: 28.1s page-faults: 2359
>> >> >> >
>> >> >> >
>> >> >> > and the size of vmcore is 2.6G in "-d 0".
>> >> >> > In this machine, I get the same result as yours:
>> >> >> >
>> >> >> >
>> >> >> > makedumpfile -c --message-level 1 -d 0:
>> >> >> > time: 597s page-faults: 2287
>> >> >> >
>> >> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
>> >> >> > time: 602s page-faults: 2361
>> >> >> >
>> >> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
>> >> >> > time: 337s page-faults: 2397
>> >> >> >
>> >> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
>> >> >> > time: 175s page-faults: 2461
>> >> >> >
>> >> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
>> >> >> > time: 103s page-faults: 2611
>> >> >> >
>> >> >> >
>> >> >> > But the machine of my first test is not under my control, should I
>> >> >> > wait
>> >> >> > for
>> >> >> > the first machine to do more tests?
>> >> >> > If there are still some problems in my tests, please tell me.
>> >> >> >
>> >> >>
>> >> >> Thanks a lot for your test, it seems that there is nothing wrong.
>> >> >> And I haven't got any idea about more tests...
>> >> >>
>> >> >> Could you provide the information of your cpu ?
>> >> >> I will do some further investigation later.
>> >> >>
>> >> >
>> >> > OK, of course, here is the information of cpu:
>> >> >
>> >> > # lscpu
>> >> > Architecture: x86_64
>> >> > CPU op-mode(s): 32-bit, 64-bit
>> >> > Byte Order: Little Endian
>> >> > CPU(s): 48
>> >> > On-line CPU(s) list: 0-47
>> >> > Thread(s) per core: 1
>> >> > Core(s) per socket: 6
>> >> > Socket(s): 8
>> >> > NUMA node(s): 8
>> >> > Vendor ID: AuthenticAMD
>> >> > CPU family: 16
>> >> > Model: 8
>> >> > Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
>> >> > Stepping: 0
>> >> > CPU MHz: 2793.040
>> >> > BogoMIPS: 5586.22
>> >> > Virtualization: AMD-V
>> >> > L1d cache: 64K
>> >> > L1i cache: 64K
>> >> > L2 cache: 512K
>> >> > L3 cache: 5118K
>> >> > NUMA node0 CPU(s): 0,8,16,24,32,40
>> >> > NUMA node1 CPU(s): 1,9,17,25,33,41
>> >> > NUMA node2 CPU(s): 2,10,18,26,34,42
>> >> > NUMA node3 CPU(s): 3,11,19,27,35,43
>> >> > NUMA node4 CPU(s): 4,12,20,28,36,44
>> >> > NUMA node5 CPU(s): 5,13,21,29,37,45
>> >> > NUMA node6 CPU(s): 6,14,22,30,38,46
>> >> > NUMA node7 CPU(s): 7,15,23,31,39,47
>> >>
>> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
>> >> that this affects performance of makedumpfile? This is just a guess.
>> >>
>> >> Could you check whether the performance gets improved if you run each
>> >> thread on the same NUMA node? For example:
>> >>
>> >> # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> vmcore-cd0
>> >>
>> > Hi HATAYAMA,
>> >
>> > I think your guess is right, but maybe your command has a little problem.
>> >
>> > From my test, the NUMA did affect the performance, but not too much.
>> > The average time of cpus in the same NUMA node:
>> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
>> > vmcore-cd0
>> > is 314s
>> > The average time of cpus in different NUMA node:
>> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
>> > vmcore-cd0
>> > is 354s
>> >
>>
>> Hmm, according to some previous discussion, what we should see here is
>> whether it affects performance of makedumpfile with --num-threads 1
>> and -d 31. So you should need to compare:
>>
>> # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
>>
>> with:
>>
>> # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
I removed -c option wrongly. What I wanted to write is:
# taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
and:
# taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
just in case...
>>
>> Also, I'm assuming that you've done these benchmark on kdump 1st
>> kernel, not kdump 2nd kernel. Is this correct?
>>
> Hi HATAYAMA,
>
> I test in the first kernel, not in the kdump second kernel.
>
I see.
--
Thanks.
HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-24 3:50 ` HATAYAMA Daisuke
@ 2015-12-24 6:02 ` Chao Fan
2015-12-24 7:22 ` HATAYAMA Daisuke
2015-12-24 8:20 ` Atsushi Kumagai
0 siblings, 2 replies; 43+ messages in thread
From: Chao Fan @ 2015-12-24 6:02 UTC (permalink / raw)
To: HATAYAMA Daisuke; +Cc: ats-kumagai, zhouwj-fnst, kexec
----- Original Message -----
> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> To: cfan@redhat.com
> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
> Sent: Thursday, December 24, 2015 11:50:08 AM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>
> From: Chao Fan <cfan@redhat.com>
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> Date: Wed, 23 Dec 2015 22:31:37 -0500
>
> >
> >
> > ----- Original Message -----
> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> >> To: cfan@redhat.com
> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
> >> kexec@lists.infradead.org
> >> Sent: Thursday, December 24, 2015 11:22:28 AM
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >>
> >> From: Chao Fan <cfan@redhat.com>
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> Date: Wed, 23 Dec 2015 21:20:48 -0500
> >>
> >> >
> >> >
> >> > ----- Original Message -----
> >> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> >> >> To: cfan@redhat.com
> >> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
> >> >> kexec@lists.infradead.org
> >> >> Sent: Tuesday, December 22, 2015 4:32:25 PM
> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >>
> >> >> Chao,
> >> >>
> >> >> From: Chao Fan <cfan@redhat.com>
> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> Date: Thu, 10 Dec 2015 05:54:28 -0500
> >> >>
> >> >> >
> >> >> >
> >> >> > ----- Original Message -----
> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> >> To: "Chao Fan" <cfan@redhat.com>
> >> >> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
> >> >> >> kexec@lists.infradead.org
> >> >> >> Sent: Thursday, December 10, 2015 6:32:32 PM
> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> >>
> >> >> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
> >> >> >> >
> >> >> >> >
> >> >> >> > ----- Original Message -----
> >> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> >> >> >> >> Cc: kexec@lists.infradead.org
> >> >> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
> >> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> >> >>
> >> >> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >> >> >> >>>> Hello Kumagai,
> >> >> >> >>>>
> >> >> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >> >> >> >>>>> Hello, Zhou
> >> >> >> >>>>>
> >> >> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >> >> >> >>>>>>> Hi,
> >> >> >> >>>>>>>
> >> >> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >> >> >> >>>>>>>> I think there is no problem if other test results are as
> >> >> >> >>>>>>>> expected.
> >> >> >> >>>>>>>>
> >> >> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >> >> >> >>>>>>>> So for lzo, it can't do much help at most of time.
> >> >> >> >>>>>>>
> >> >> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
> >> >> >> >>>>>>>
> >> >> >> >>>>>>> [--num-threads THREADNUM]:
> >> >> >> >>>>>>> Using multiple threads to read and compress data
> >> >> >> >>>>>>> of
> >> >> >> >>>>>>> each
> >> >> >> >>>>>>> page
> >> >> >> >>>>>>> in parallel.
> >> >> >> >> >>>>>>> And it will reduce time for saving DUMPFILE.
> >> >> >> >>>>>>> This feature only supports creating DUMPFILE in
> >> >> >> >> >>>>>>> kdump-compressed format from
> >> >> >> >>>>>>> VMCORE in kdump-compressed format or elf format.
> >> >> >> >>>>>>>
> >> >> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
> >> >> >> >>>>>>> --num-threads only
> >> >> >> >>>>>>> supports zlib compressed vmcore.
> >> >> >> >>>>>>>
> >> >> >> >>>>>>
> >> >> >> >>>>>> Sorry, it seems that something I said is not so clear.
> >> >> >> >>>>>> lzo is also supported. Since lzo compresses data at a high
> >> >> >> >>>>>> speed,
> >> >> >> >>>>>> the
> >> >> >> >>>>>> improving of the performance is not so obvious at most of
> >> >> >> >>>>>> time.
> >> >> >> >>>>>>
> >> >> >> >>>>>>> Also worth to mention about the recommended -d value for
> >> >> >> >>>>>>> this
> >> >> >> >>>>>>> feature.
> >> >> >> >>>>>>>
> >> >> >> >>>>>>
> >> >> >> >>>>>> Yes, I think it's worth. I forgot it.
> >> >> >> >>>>>
> >> >> >> >>>>> I saw your patch, but I think I should confirm what is the
> >> >> >> >>>>> problem
> >> >> >> >>>>> first.
> >> >> >> >>>>>
> >> >> >> >>>>>> However, when "-d 31" is specified, it will be worse.
> >> >> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >> >> >> >>>>>> And even the page has been filtered, it will also take a
> >> >> >> >>>>>> buffer.
> >> >> >> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
> >> >> >> >>>>>> of buffers. Then the page which needs to be compressed can't
> >> >> >> >>>>>> be compressed parallel.
> >> >> >> >>>>>
> >> >> >> >>>>> Could you explain why compression will not be parallel in more
> >> >> >> >>>>> detail ?
> >> >> >> >>>>> Actually the buffers are used also for filtered pages, it
> >> >> >> >>>>> sounds
> >> >> >> >>>>> inefficient.
> >> >> >> >>>>> However, I don't understand why it prevents parallel
> >> >> >> >>>>> compression.
> >> >> >> >>>>>
> >> >> >> >>>>
> >> >> >> >>>> Think about this, in a huge memory, most of the page will be
> >> >> >> >>>> filtered,
> >> >> >> >>>> and
> >> >> >> >>>> we have 5 buffers.
> >> >> >> >>>>
> >> >> >> >>>> page1 page2 page3 page4 page5 page6
> >> >> >> >>>> page7
> >> >> >> >>>> .....
> >> >> >> >>>> [buffer1] [2] [3] [4] [5]
> >> >> >> >>>> unfiltered filtered filtered filtered filtered
> >> >> >> >>>> unfiltered
> >> >> >> >>>> filtered
> >> >> >> >>>>
> >> >> >> >>>> Since filtered page will take a buffer, when compressing page1,
> >> >> >> >>>> page6 can't be compressed at the same time.
> >> >> >> >>>> That's why it will prevent parallel compression.
> >> >> >> >>>
> >> >> >> >>> Thanks for your explanation, I understand.
> >> >> >> >>> This is just an issue of the current implementation, there is no
> >> >> >> >>> reason to stand this restriction.
> >> >> >> >>>
> >> >> >> >>>>> Further, according to Chao's benchmark, there is a big
> >> >> >> >>>>> performance
> >> >> >> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
> >> >> >> >>>>> The current implementation seems to have some problems, we
> >> >> >> >>>>> should
> >> >> >> >>>>> solve them.
> >> >> >> >>>>>
> >> >> >> >>>>
> >> >> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >> >> >> >>>> compressing
> >> >> >> >>>> parallel, on the other hand we will introduce some extra work
> >> >> >> >>>> by
> >> >> >> >>>> adding
> >> >> >> >>>> "--num-threads". So it is obvious that it will have a
> >> >> >> >>>> performance
> >> >> >> >>>> degradation.
> >> >> >> >>>
> >> >> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
> >> >> >> >>> exclusive
> >> >> >> >>> lock),
> >> >> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0"
> >> >> >> >>> still
> >> >> >> >>> sounds
> >> >> >> >>> too slow, the degradation is too big to be called "some extra
> >> >> >> >>> work".
> >> >> >> >>>
> >> >> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
> >> >> >> >>> the above "buffer fairness issue" will not be related to this
> >> >> >> >>> degradation.
> >> >> >> >>> What do you think what make this degradation ?
> >> >> >> >>>
> >> >> >> >>
> >> >> >> >> I can't get such result at this moment, so I can't do some
> >> >> >> >> further
> >> >> >> >> investigation
> >> >> >> >> right now. I guess it may be caused by the underlying
> >> >> >> >> implementation
> >> >> >> >> of
> >> >> >> >> pthread.
> >> >> >> >> I reviewed the test result of the patch v2 and found in different
> >> >> >> >> machines,
> >> >> >> >> the results are quite different.
> >> >> >> >
> >> >> >> > Hi Zhou Wenjian,
> >> >> >> >
> >> >> >> > I have done more tests in another machine with 128G memory, and
> >> >> >> > get
> >> >> >> > the
> >> >> >> > result:
> >> >> >> >
> >> >> >> > the size of vmcore is 300M in "-d 31"
> >> >> >> > makedumpfile -l --message-level 1 -d 31:
> >> >> >> > time: 8.6s page-faults: 2272
> >> >> >> >
> >> >> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> >> >> >> > time: 28.1s page-faults: 2359
> >> >> >> >
> >> >> >> >
> >> >> >> > and the size of vmcore is 2.6G in "-d 0".
> >> >> >> > In this machine, I get the same result as yours:
> >> >> >> >
> >> >> >> >
> >> >> >> > makedumpfile -c --message-level 1 -d 0:
> >> >> >> > time: 597s page-faults: 2287
> >> >> >> >
> >> >> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> >> >> >> > time: 602s page-faults: 2361
> >> >> >> >
> >> >> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> >> >> >> > time: 337s page-faults: 2397
> >> >> >> >
> >> >> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> >> >> >> > time: 175s page-faults: 2461
> >> >> >> >
> >> >> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> >> >> >> > time: 103s page-faults: 2611
> >> >> >> >
> >> >> >> >
> >> >> >> > But the machine of my first test is not under my control, should I
> >> >> >> > wait
> >> >> >> > for
> >> >> >> > the first machine to do more tests?
> >> >> >> > If there are still some problems in my tests, please tell me.
> >> >> >> >
> >> >> >>
> >> >> >> Thanks a lot for your test, it seems that there is nothing wrong.
> >> >> >> And I haven't got any idea about more tests...
> >> >> >>
> >> >> >> Could you provide the information of your cpu ?
> >> >> >> I will do some further investigation later.
> >> >> >>
> >> >> >
> >> >> > OK, of course, here is the information of cpu:
> >> >> >
> >> >> > # lscpu
> >> >> > Architecture: x86_64
> >> >> > CPU op-mode(s): 32-bit, 64-bit
> >> >> > Byte Order: Little Endian
> >> >> > CPU(s): 48
> >> >> > On-line CPU(s) list: 0-47
> >> >> > Thread(s) per core: 1
> >> >> > Core(s) per socket: 6
> >> >> > Socket(s): 8
> >> >> > NUMA node(s): 8
> >> >> > Vendor ID: AuthenticAMD
> >> >> > CPU family: 16
> >> >> > Model: 8
> >> >> > Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
> >> >> > Stepping: 0
> >> >> > CPU MHz: 2793.040
> >> >> > BogoMIPS: 5586.22
> >> >> > Virtualization: AMD-V
> >> >> > L1d cache: 64K
> >> >> > L1i cache: 64K
> >> >> > L2 cache: 512K
> >> >> > L3 cache: 5118K
> >> >> > NUMA node0 CPU(s): 0,8,16,24,32,40
> >> >> > NUMA node1 CPU(s): 1,9,17,25,33,41
> >> >> > NUMA node2 CPU(s): 2,10,18,26,34,42
> >> >> > NUMA node3 CPU(s): 3,11,19,27,35,43
> >> >> > NUMA node4 CPU(s): 4,12,20,28,36,44
> >> >> > NUMA node5 CPU(s): 5,13,21,29,37,45
> >> >> > NUMA node6 CPU(s): 6,14,22,30,38,46
> >> >> > NUMA node7 CPU(s): 7,15,23,31,39,47
> >> >>
> >> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
> >> >> that this affects performance of makedumpfile? This is just a guess.
> >> >>
> >> >> Could you check whether the performance gets improved if you run each
> >> >> thread on the same NUMA node? For example:
> >> >>
> >> >> # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> >> vmcore-cd0
> >> >>
> >> > Hi HATAYAMA,
> >> >
> >> > I think your guess is right, but maybe your command has a little
> >> > problem.
> >> >
> >> > From my test, the NUMA did affect the performance, but not too much.
> >> > The average time of cpus in the same NUMA node:
> >> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> > vmcore-cd0
> >> > is 314s
> >> > The average time of cpus in different NUMA node:
> >> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> > vmcore-cd0
> >> > is 354s
> >> >
> >>
> >> Hmm, according to some previous discussion, what we should see here is
> >> whether it affects performance of makedumpfile with --num-threads 1
> >> and -d 31. So you should need to compare:
> >>
> >> # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
> >>
> >> with:
> >>
> >> # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
>
> I removed -c option wrongly. What I wanted to write is:
>
> # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>
> and:
>
> # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>
> just in case...
>
Hi HATAYAMA,
the average time of
# taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
is 33s.
the average time of
# taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
is 18s.
My test steps:
1. change /etc/kdump/conf with
"core_collector makedumpfile -l --message-level 1 -d 31"
2. make a crash
3. cd into the directory of the vmcore made by kdump
4. in the directory of vmcore do
# taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
or
# taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
If there are any problems, please tell me.
Thanks,
Chao Fan
> >>
> >> Also, I'm assuming that you've done these benchmark on kdump 1st
> >> kernel, not kdump 2nd kernel. Is this correct?
> >>
> > Hi HATAYAMA,
> >
> > I test in the first kernel, not in the kdump second kernel.
> >
>
> I see.
>
> --
> Thanks.
> HATAYAMA, Daisuke
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-24 6:02 ` Chao Fan
@ 2015-12-24 7:22 ` HATAYAMA Daisuke
2015-12-24 8:20 ` Atsushi Kumagai
1 sibling, 0 replies; 43+ messages in thread
From: HATAYAMA Daisuke @ 2015-12-24 7:22 UTC (permalink / raw)
To: cfan; +Cc: ats-kumagai, zhouwj-fnst, kexec
From: Chao Fan <cfan@redhat.com>
Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
Date: Thu, 24 Dec 2015 01:02:38 -0500
>
>
> ----- Original Message -----
>> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> To: cfan@redhat.com
>> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
>> Sent: Thursday, December 24, 2015 11:50:08 AM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> From: Chao Fan <cfan@redhat.com>
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> Date: Wed, 23 Dec 2015 22:31:37 -0500
>>
>> >
>> >
>> > ----- Original Message -----
>> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> >> To: cfan@redhat.com
>> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
>> >> kexec@lists.infradead.org
>> >> Sent: Thursday, December 24, 2015 11:22:28 AM
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >>
>> >> From: Chao Fan <cfan@redhat.com>
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> Date: Wed, 23 Dec 2015 21:20:48 -0500
>> >>
>> >> >
>> >> >
>> >> > ----- Original Message -----
>> >> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> >> >> To: cfan@redhat.com
>> >> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
>> >> >> kexec@lists.infradead.org
>> >> >> Sent: Tuesday, December 22, 2015 4:32:25 PM
>> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >>
>> >> >> Chao,
>> >> >>
>> >> >> From: Chao Fan <cfan@redhat.com>
>> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> Date: Thu, 10 Dec 2015 05:54:28 -0500
>> >> >>
>> >> >> >
>> >> >> >
>> >> >> > ----- Original Message -----
>> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> >> To: "Chao Fan" <cfan@redhat.com>
>> >> >> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
>> >> >> >> kexec@lists.infradead.org
>> >> >> >> Sent: Thursday, December 10, 2015 6:32:32 PM
>> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> >>
>> >> >> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > ----- Original Message -----
>> >> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> >> >> >> >> Cc: kexec@lists.infradead.org
>> >> >> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
>> >> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> >> >>
>> >> >> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> >> >> >> >>>> Hello Kumagai,
>> >> >> >> >>>>
>> >> >> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> >> >> >> >>>>> Hello, Zhou
>> >> >> >> >>>>>
>> >> >> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>> >> >> >> >>>>>>> Hi,
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> >> >> >> >>>>>>>> I think there is no problem if other test results are as
>> >> >> >> >>>>>>>> expected.
>> >> >> >> >>>>>>>>
>> >> >> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
>> >> >> >> >>>>>>>> So for lzo, it can't do much help at most of time.
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>> [--num-threads THREADNUM]:
>> >> >> >> >>>>>>> Using multiple threads to read and compress data
>> >> >> >> >>>>>>> of
>> >> >> >> >>>>>>> each
>> >> >> >> >>>>>>> page
>> >> >> >> >>>>>>> in parallel.
>> >> >> >> >>>>>>> And it will reduces time for saving DUMPFILE.
>> >> >> >> >>>>>>> This feature only supports creating DUMPFILE in
>> >> >> >> >>>>>>> kdump-comressed format from
>> >> >> >> >>>>>>> VMCORE in kdump-compressed format or elf format.
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
>> >> >> >> >>>>>>> --num-threads only
>> >> >> >> >>>>>>> supports zlib compressed vmcore.
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>
>> >> >> >> >>>>>> Sorry, it seems that something I said is not so clear.
>> >> >> >> >>>>>> lzo is also supported. Since lzo compresses data at a high
>> >> >> >> >>>>>> speed,
>> >> >> >> >>>>>> the
>> >> >> >> >>>>>> improving of the performance is not so obvious at most of
>> >> >> >> >>>>>> time.
>> >> >> >> >>>>>>
>> >> >> >> >>>>>>> Also worth to mention about the recommended -d value for
>> >> >> >> >>>>>>> this
>> >> >> >> >>>>>>> feature.
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>
>> >> >> >> >>>>>> Yes, I think it's worth. I forgot it.
>> >> >> >> >>>>>
>> >> >> >> >>>>> I saw your patch, but I think I should confirm what is the
>> >> >> >> >>>>> problem
>> >> >> >> >>>>> first.
>> >> >> >> >>>>>
>> >> >> >> >>>>>> However, when "-d 31" is specified, it will be worse.
>> >> >> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
>> >> >> >> >>>>>> And even the page has been filtered, it will also take a
>> >> >> >> >>>>>> buffer.
>> >> >> >> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
>> >> >> >> >>>>>> of buffers. Then the page which needs to be compressed can't
>> >> >> >> >>>>>> be compressed parallel.
>> >> >> >> >>>>>
>> >> >> >> >>>>> Could you explain why compression will not be parallel in more
>> >> >> >> >>>>> detail ?
>> >> >> >> >>>>> Actually the buffers are used also for filtered pages, it
>> >> >> >> >>>>> sounds
>> >> >> >> >>>>> inefficient.
>> >> >> >> >>>>> However, I don't understand why it prevents parallel
>> >> >> >> >>>>> compression.
>> >> >> >> >>>>>
>> >> >> >> >>>>
>> >> >> >> >>>> Think about this, in a huge memory, most of the page will be
>> >> >> >> >>>> filtered,
>> >> >> >> >>>> and
>> >> >> >> >>>> we have 5 buffers.
>> >> >> >> >>>>
>> >> >> >> >>>> page1 page2 page3 page4 page5 page6
>> >> >> >> >>>> page7
>> >> >> >> >>>> .....
>> >> >> >> >>>> [buffer1] [2] [3] [4] [5]
>> >> >> >> >>>> unfiltered filtered filtered filtered filtered
>> >> >> >> >>>> unfiltered
>> >> >> >> >>>> filtered
>> >> >> >> >>>>
>> >> >> >> >>>> Since filtered page will take a buffer, when compressing page1,
>> >> >> >> >>>> page6 can't be compressed at the same time.
>> >> >> >> >>>> That why it will prevent parallel compression.
>> >> >> >> >>>
>> >> >> >> >>> Thanks for your explanation, I understand.
>> >> >> >> >>> This is just an issue of the current implementation, there is no
>> >> >> >> >>> reason to stand this restriction.
>> >> >> >> >>>
>> >> >> >> >>>>> Further, according to Chao's benchmark, there is a big
>> >> >> >> >>>>> performance
>> >> >> >> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
>> >> >> >> >>>>> The current implementation seems to have some problems, we
>> >> >> >> >>>>> should
>> >> >> >> >>>>> solve them.
>> >> >> >> >>>>>
>> >> >> >> >>>>
>> >> >> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
>> >> >> >> >>>> compressing
>> >> >> >> >>>> parallel, on the other hand we will introduce some extra work
>> >> >> >> >>>> by
>> >> >> >> >>>> adding
>> >> >> >> >>>> "--num-threads". So it is obvious that it will have a
>> >> >> >> >>>> performance
>> >> >> >> >>>> degradation.
>> >> >> >> >>>
>> >> >> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
>> >> >> >> >>> exclusive
>> >> >> >> >>> lock),
>> >> >> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0"
>> >> >> >> >>> still
>> >> >> >> >>> sounds
>> >> >> >> >>> too slow, the degradation is too big to be called "some extra
>> >> >> >> >>> work".
>> >> >> >> >>>
>> >> >> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> >> >> >> >>> the above "buffer fairness issue" will not be related to this
>> >> >> >> >>> degradation.
>> >> >> >> >>> What do you think what make this degradation ?
>> >> >> >> >>>
>> >> >> >> >>
>> >> >> >> >> I can't get such result at this moment, so I can't do some
>> >> >> >> >> further
>> >> >> >> >> investigation
>> >> >> >> >> right now. I guess it may be caused by the underlying
>> >> >> >> >> implementation
>> >> >> >> >> of
>> >> >> >> >> pthread.
>> >> >> >> >> I reviewed the test result of the patch v2 and found in different
>> >> >> >> >> machines,
>> >> >> >> >> the results are quite different.
>> >> >> >> >
>> >> >> >> > Hi Zhou Wenjian,
>> >> >> >> >
>> >> >> >> > I have done more tests in another machine with 128G memory, and
>> >> >> >> > get
>> >> >> >> > the
>> >> >> >> > result:
>> >> >> >> >
>> >> >> >> > the size of vmcore is 300M in "-d 31"
>> >> >> >> > makedumpfile -l --message-level 1 -d 31:
>> >> >> >> > time: 8.6s page-faults: 2272
>> >> >> >> >
>> >> >> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
>> >> >> >> > time: 28.1s page-faults: 2359
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > and the size of vmcore is 2.6G in "-d 0".
>> >> >> >> > In this machine, I get the same result as yours:
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > makedumpfile -c --message-level 1 -d 0:
>> >> >> >> > time: 597s page-faults: 2287
>> >> >> >> >
>> >> >> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
>> >> >> >> > time: 602s page-faults: 2361
>> >> >> >> >
>> >> >> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
>> >> >> >> > time: 337s page-faults: 2397
>> >> >> >> >
>> >> >> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
>> >> >> >> > time: 175s page-faults: 2461
>> >> >> >> >
>> >> >> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
>> >> >> >> > time: 103s page-faults: 2611
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > But the machine of my first test is not under my control, should I
>> >> >> >> > wait
>> >> >> >> > for
>> >> >> >> > the first machine to do more tests?
>> >> >> >> > If there are still some problems in my tests, please tell me.
>> >> >> >> >
>> >> >> >>
>> >> >> >> Thanks a lot for your test, it seems that there is nothing wrong.
>> >> >> >> And I haven't got any idea about more tests...
>> >> >> >>
>> >> >> >> Could you provide the information of your cpu ?
>> >> >> >> I will do some further investigation later.
>> >> >> >>
>> >> >> >
>> >> >> > OK, of course, here is the information of cpu:
>> >> >> >
>> >> >> > # lscpu
>> >> >> > Architecture: x86_64
>> >> >> > CPU op-mode(s): 32-bit, 64-bit
>> >> >> > Byte Order: Little Endian
>> >> >> > CPU(s): 48
>> >> >> > On-line CPU(s) list: 0-47
>> >> >> > Thread(s) per core: 1
>> >> >> > Core(s) per socket: 6
>> >> >> > Socket(s): 8
>> >> >> > NUMA node(s): 8
>> >> >> > Vendor ID: AuthenticAMD
>> >> >> > CPU family: 16
>> >> >> > Model: 8
>> >> >> > Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
>> >> >> > Stepping: 0
>> >> >> > CPU MHz: 2793.040
>> >> >> > BogoMIPS: 5586.22
>> >> >> > Virtualization: AMD-V
>> >> >> > L1d cache: 64K
>> >> >> > L1i cache: 64K
>> >> >> > L2 cache: 512K
>> >> >> > L3 cache: 5118K
>> >> >> > NUMA node0 CPU(s): 0,8,16,24,32,40
>> >> >> > NUMA node1 CPU(s): 1,9,17,25,33,41
>> >> >> > NUMA node2 CPU(s): 2,10,18,26,34,42
>> >> >> > NUMA node3 CPU(s): 3,11,19,27,35,43
>> >> >> > NUMA node4 CPU(s): 4,12,20,28,36,44
>> >> >> > NUMA node5 CPU(s): 5,13,21,29,37,45
>> >> >> > NUMA node6 CPU(s): 6,14,22,30,38,46
>> >> >> > NUMA node7 CPU(s): 7,15,23,31,39,47
>> >> >>
>> >> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
>> >> >> that this affects performance of makedumpfile? This is just a guess.
>> >> >>
>> >> >> Could you check whether the performance gets improved if you run each
>> >> >> thread on the same NUMA node? For example:
>> >> >>
>> >> >> # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> >> vmcore-cd0
>> >> >>
>> >> > Hi HATAYAMA,
>> >> >
>> >> > I think your guess is right, but maybe your command has a little
>> >> > problem.
>> >> >
>> >> > From my test, the NUMA did affect the performance, but not too much.
>> >> > The average time of cpus in the same NUMA node:
>> >> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> > vmcore-cd0
>> >> > is 314s
>> >> > The average time of cpus in different NUMA node:
>> >> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> > vmcore-cd0
>> >> > is 354s
>> >> >
>> >>
>> >> Hmm, according to some previous discussion, what we should see here is
>> >> whether it affects performance of makedumpfile with --num-threads 1
>> >> and -d 31. So you should need to compare:
>> >>
>> >> # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
>> >>
>> >> with:
>> >>
>> >> # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
>>
>> I removed -c option wrongly. What I wanted to write is:
>>
>> # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>>
>> and:
>>
>> # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>>
>> just in case...
>>
> Hi HATAYAMA,
>
> the average time of
> # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> is 33s.
> the average time of
> # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> is 18s.
>
Thanks. I found out that NUMA nodes is irrelevant here.
> My test steps:
> 1. change /etc/kdump.conf with
> "core_collector makedumpfile -l --message-level 1 -d 31"
> 2. make a crash
> 3. cd into the directory of the vmcore made by kdump
> 4. in the directory of vmcore do
> # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> or
> # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>
> if there are any problems, please tell me.
>
I'll investigate this with Zhou. Please provide us with information
when necessary.
--
Thanks.
HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-24 6:02 ` Chao Fan
2015-12-24 7:22 ` HATAYAMA Daisuke
@ 2015-12-24 8:20 ` Atsushi Kumagai
2015-12-24 9:04 ` Chao Fan
1 sibling, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-12-24 8:20 UTC (permalink / raw)
To: HATAYAMA Daisuke (d.hatayama@jp.fujitsu.com), Chao Fan
Cc: zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
>> >> >> >> Could you provide the information of your cpu ?
>> >> >> >> I will do some further investigation later.
>> >> >> >>
>> >> >> >
>> >> >> > OK, of course, here is the information of cpu:
>> >> >> >
>> >> >> > # lscpu
>> >> >> > Architecture: x86_64
>> >> >> > CPU op-mode(s): 32-bit, 64-bit
>> >> >> > Byte Order: Little Endian
>> >> >> > CPU(s): 48
>> >> >> > On-line CPU(s) list: 0-47
>> >> >> > Thread(s) per core: 1
>> >> >> > Core(s) per socket: 6
>> >> >> > Socket(s): 8
>> >> >> > NUMA node(s): 8
>> >> >> > Vendor ID: AuthenticAMD
>> >> >> > CPU family: 16
>> >> >> > Model: 8
>> >> >> > Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
>> >> >> > Stepping: 0
>> >> >> > CPU MHz: 2793.040
>> >> >> > BogoMIPS: 5586.22
>> >> >> > Virtualization: AMD-V
>> >> >> > L1d cache: 64K
>> >> >> > L1i cache: 64K
>> >> >> > L2 cache: 512K
>> >> >> > L3 cache: 5118K
>> >> >> > NUMA node0 CPU(s): 0,8,16,24,32,40
>> >> >> > NUMA node1 CPU(s): 1,9,17,25,33,41
>> >> >> > NUMA node2 CPU(s): 2,10,18,26,34,42
>> >> >> > NUMA node3 CPU(s): 3,11,19,27,35,43
>> >> >> > NUMA node4 CPU(s): 4,12,20,28,36,44
>> >> >> > NUMA node5 CPU(s): 5,13,21,29,37,45
>> >> >> > NUMA node6 CPU(s): 6,14,22,30,38,46
>> >> >> > NUMA node7 CPU(s): 7,15,23,31,39,47
>> >> >>
>> >> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
>> >> >> that this affects performance of makedumpfile? This is just a guess.
>> >> >>
>> >> >> Could you check whether the performance gets improved if you run each
>> >> >> thread on the same NUMA node? For example:
>> >> >>
>> >> >> # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> >> vmcore-cd0
>> >> >>
>> >> > Hi HATAYAMA,
>> >> >
>> >> > I think your guess is right, but maybe your command has a little
>> >> > problem.
>> >> >
>> >> > From my test, the NUMA did affect the performance, but not too much.
>> >> > The average time of cpus in the same NUMA node:
>> >> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> > vmcore-cd0
>> >> > is 314s
>> >> > The average time of cpus in different NUMA node:
>> >> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> > vmcore-cd0
>> >> > is 354s
>> >> >
>> >>
>> >> Hmm, according to some previous discussion, what we should see here is
>> >> whether it affects performance of makedumpfile with --num-threads 1
>> >> and -d 31. So you should need to compare:
>> >>
>> >> # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
>> >>
>> >> with:
>> >>
>> >> # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
>>
>> I removed -c option wrongly. What I wanted to write is:
>>
>> # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>>
>> and:
>>
>> # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>>
>> just in case...
Why did you remove -c option from makedumpfile ?
We are discussing the performance with compression.
I think the below is correct:
# taskset -c 0,8 makedumpfile --num-threads 1 [-c|-l|-p] -d 31 vmcore vmcore-d31
and:
# taskset -c 0 makedumpfile [-c|-l|-p] -d 31 vmcore vmcore-d31
Thanks,
Atsushi Kumagai
>Hi HATAYAMA,
>
>the average time of
># taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>is 33s.
>the average time of
># taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>is 18s.
>
>My test steps:
>1. change /etc/kdump.conf with
>"core_collector makedumpfile -l --message-level 1 -d 31"
>2. make a crash
>3. cd into the directory of the vmcore made by kdump
>4. in the directory of vmcore do
># taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>or
># taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>
>if there are any problems, please tell me.
>
>Thanks,
>Chao Fan
>
>> >>
>> >> Also, I'm assuming that you've done these benchmark on kdump 1st
>> >> kernel, not kdump 2nd kernel. Is this correct?
>> >>
>> > Hi HATAYAMA,
>> >
>> > I test in the first kernel, not in the kdump second kernel.
>> >
>>
>> I see.
>>
>> --
>> Thanks.
>> HATAYAMA, Daisuke
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>
>
>_______________________________________________
>kexec mailing list
>kexec@lists.infradead.org
>http://lists.infradead.org/mailman/listinfo/kexec
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
2015-12-24 8:20 ` Atsushi Kumagai
@ 2015-12-24 9:04 ` Chao Fan
0 siblings, 0 replies; 43+ messages in thread
From: Chao Fan @ 2015-12-24 9:04 UTC (permalink / raw)
To: Atsushi Kumagai
Cc: HATAYAMA Daisuke (d.hatayama@jp.fujitsu.com), zhouwj-fnst, kexec
----- Original Message -----
> From: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> To: "HATAYAMA Daisuke (d.hatayama@jp.fujitsu.com)" <d.hatayama@jp.fujitsu.com>, "Chao Fan" <cfan@redhat.com>
> Cc: zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
> Sent: Thursday, December 24, 2015 4:20:42 PM
> Subject: RE: [PATCH RFC 00/11] makedumpfile: parallel processing
>
> >> >> >> >> Could you provide the information of your cpu ?
> >> >> >> >> I will do some further investigation later.
> >> >> >> >>
> >> >> >> >
> >> >> >> > OK, of course, here is the information of cpu:
> >> >> >> >
> >> >> >> > # lscpu
> >> >> >> > Architecture: x86_64
> >> >> >> > CPU op-mode(s): 32-bit, 64-bit
> >> >> >> > Byte Order: Little Endian
> >> >> >> > CPU(s): 48
> >> >> >> > On-line CPU(s) list: 0-47
> >> >> >> > Thread(s) per core: 1
> >> >> >> > Core(s) per socket: 6
> >> >> >> > Socket(s): 8
> >> >> >> > NUMA node(s): 8
> >> >> >> > Vendor ID: AuthenticAMD
> >> >> >> > CPU family: 16
> >> >> >> > Model: 8
> >> >> >> > Model name: Six-Core AMD Opteron(tm) Processor 8439 SE
> >> >> >> > Stepping: 0
> >> >> >> > CPU MHz: 2793.040
> >> >> >> > BogoMIPS: 5586.22
> >> >> >> > Virtualization: AMD-V
> >> >> >> > L1d cache: 64K
> >> >> >> > L1i cache: 64K
> >> >> >> > L2 cache: 512K
> >> >> >> > L3 cache: 5118K
> >> >> >> > NUMA node0 CPU(s): 0,8,16,24,32,40
> >> >> >> > NUMA node1 CPU(s): 1,9,17,25,33,41
> >> >> >> > NUMA node2 CPU(s): 2,10,18,26,34,42
> >> >> >> > NUMA node3 CPU(s): 3,11,19,27,35,43
> >> >> >> > NUMA node4 CPU(s): 4,12,20,28,36,44
> >> >> >> > NUMA node5 CPU(s): 5,13,21,29,37,45
> >> >> >> > NUMA node6 CPU(s): 6,14,22,30,38,46
> >> >> >> > NUMA node7 CPU(s): 7,15,23,31,39,47
> >> >> >>
> >> >> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
> >> >> >> that this affects performance of makedumpfile? This is just a guess.
> >> >> >>
> >> >> >> Could you check whether the performance gets improved if you run
> >> >> >> each
> >> >> >> thread on the same NUMA node? For example:
> >> >> >>
> >> >> >> # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> >> >> vmcore-cd0
> >> >> >>
> >> >> > Hi HATAYAMA,
> >> >> >
> >> >> > I think your guess is right, but maybe your command has a little
> >> >> > problem.
> >> >> >
> >> >> > From my test, the NUMA did affect the performance, but not too much.
> >> >> > The average time of cpus in the same NUMA node:
> >> >> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> >> > vmcore-cd0
> >> >> > is 314s
> >> >> > The average time of cpus in different NUMA node:
> >> >> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> >> > vmcore-cd0
> >> >> > is 354s
> >> >> >
> >> >>
> >> >> Hmm, according to some previous discussion, what we should see here is
> >> >> whether it affects performance of makedumpfile with --num-threads 1
> >> >> and -d 31. So you should need to compare:
> >> >>
> >> >> # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore
> >> >> vmcore-d31
> >> >>
> >> >> with:
> >> >>
> >> >> # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
> >>
> >> I removed -c option wrongly. What I wanted to write is:
> >>
> >> # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> >>
> >> and:
> >>
> >> # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> >>
> >> just in case...
>
> Why did you remove -c option from makedumpfile ?
> We are discussing the performance with compression.
> I think the below is correct:
>
> # taskset -c 0,8 makedumpfile --num-threads 1 [-c|-l|-p] -d 31 vmcore
> vmcore-d31
>
> and:
>
> # taskset -c 0 makedumpfile [-c|-l|-p] -d 31 vmcore vmcore-d31
>
Hi Atsushi Kumagai,
"taskset -c 0,8 makedumpfile --num-threads 1" "taskset -c 0 makedumpfile"
-c 52s 61s
-l 33s 17s
-p 33s 18s
Thanks,
Chao Fan
>
> Thanks,
> Atsushi Kumagai
>
> >Hi HATAYAMA,
> >
> >the average time of
> ># taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> >is 33s.
> >the average time of
> ># taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> >is 18s.
> >
> >My test steps:
> >1. change /etc/kdump.conf with
> >"core_collector makedumpfile -l --message-level 1 -d 31"
> >2. make a crash
> >3. cd into the directory of the vmcore made by kdump
> >4. in the directory of vmcore do
> ># taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> >or
> ># taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> >
> >if there are any problems, please tell me.
> >
> >Thanks,
> >Chao Fan
> >
> >> >>
> >> >> Also, I'm assuming that you've done these benchmark on kdump 1st
> >> >> kernel, not kdump 2nd kernel. Is this correct?
> >> >>
> >> > Hi HATAYAMA,
> >> >
> >> > I test in the first kernel, not in the kdump second kernel.
> >> >
> >>
> >> I see.
> >>
> >> --
> >> Thanks.
> >> HATAYAMA, Daisuke
> >> _______________________________________________
> >> kexec mailing list
> >> kexec@lists.infradead.org
> >> http://lists.infradead.org/mailman/listinfo/kexec
> >>
> >
> >_______________________________________________
> >kexec mailing list
> >kexec@lists.infradead.org
> >http://lists.infradead.org/mailman/listinfo/kexec
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 43+ messages in thread
end of thread, other threads:[~2015-12-24 9:05 UTC | newest]
Thread overview: 43+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-06-05 7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 02/11] Add mappage_elf_parallel Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 03/11] Add readpage_elf_parallel Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 04/11] Add read_pfn_parallel Zhou Wenjian
2015-06-05 7:56 ` [PATCH RFC 05/11] Add function to initial bitmap for parallel use Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 06/11] Add filter_data_buffer_parallel Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 07/11] Add write_kdump_pages_parallel to allow parallel process Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 08/11] Add write_kdump_pages_parallel_cyclic to allow parallel process in cyclic_mode Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 09/11] Initial and free data used for parallel process Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 10/11] Make makedumpfile available to read and compress pages parallelly Zhou Wenjian
2015-06-05 7:57 ` [PATCH RFC 11/11] Add usage and manual about multiple threads process Zhou Wenjian
2015-06-08 3:55 ` [PATCH RFC 00/11] makedumpfile: parallel processing "Zhou, Wenjian/周文剑"
2015-12-01 8:39 ` Chao Fan
2015-12-02 5:29 ` "Zhou, Wenjian/周文剑"
2015-12-02 7:24 ` Dave Young
2015-12-02 7:38 ` "Zhou, Wenjian/周文剑"
2015-12-04 2:30 ` Atsushi Kumagai
2015-12-04 3:33 ` "Zhou, Wenjian/周文剑"
2015-12-04 8:56 ` Chao Fan
2015-12-07 1:09 ` "Zhou, Wenjian/周文剑"
2015-12-10 8:14 ` Atsushi Kumagai
2015-12-10 9:36 ` "Zhou, Wenjian/周文剑"
2015-12-10 9:58 ` Chao Fan
2015-12-10 10:32 ` "Zhou, Wenjian/周文剑"
2015-12-10 10:54 ` Chao Fan
2015-12-22 8:32 ` HATAYAMA Daisuke
2015-12-24 2:20 ` Chao Fan
2015-12-24 3:22 ` HATAYAMA Daisuke
2015-12-24 3:31 ` Chao Fan
2015-12-24 3:50 ` HATAYAMA Daisuke
2015-12-24 6:02 ` Chao Fan
2015-12-24 7:22 ` HATAYAMA Daisuke
2015-12-24 8:20 ` Atsushi Kumagai
2015-12-24 9:04 ` Chao Fan
2015-12-14 8:26 ` Atsushi Kumagai
2015-12-14 8:59 ` "Zhou, Wenjian/周文剑"
2015-06-10 6:06 ` Atsushi Kumagai
2015-06-11 3:47 ` "Zhou, Wenjian/周文剑"
2015-06-15 1:59 ` qiaonuohan
2015-06-15 5:57 ` Atsushi Kumagai
2015-06-15 6:06 ` qiaonuohan
2015-06-15 6:07 ` qiaonuohan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox