Kexec Archive on lore.kernel.org
* [PATCH RFC 00/11] makedumpfile: parallel processing
@ 2015-06-05  7:56 Zhou Wenjian
  2015-06-05  7:56 ` [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel Zhou Wenjian
                   ` (12 more replies)
  0 siblings, 13 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:56 UTC (permalink / raw)
  To: kexec

This patch set implements parallel processing by means of multiple threads.
With it, makedumpfile can use multiple threads to read and compress pages,
which shortens the time needed to create a dumpfile.
The feature only supports creating a dumpfile in kdump-compressed format from
a vmcore in kdump-compressed or ELF format. Currently, sadump and
Xen kdump formats are not supported.
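
For illustration, an invocation of the new feature might look like the
following, assuming the option added by patch 11 is named --num-threads
(an assumption of this example; patch 11's text is not shown here):

  # create a kdump-compressed dumpfile with 4 reader/compressor threads
  makedumpfile -c -d 31 --num-threads 4 /proc/vmcore dumpfile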

Qiao Nuohan (11):
  Add readpage_kdump_compressed_parallel
  Add mappage_elf_parallel
  Add readpage_elf_parallel
  Add read_pfn_parallel
  Add function to initial bitmap for parallel use
  Add filter_data_buffer_parallel
  Add write_kdump_pages_parallel to allow parallel process
  Add write_kdump_pages_parallel_cyclic to allow parallel process in
    cyclic_mode
  Initial and free data used for parallel process
  Make makedumpfile available to read and compress pages parallelly
  Add usage and manual about multiple threads process

 Makefile       |    2 +
 erase_info.c   |   29 +-
 erase_info.h   |    2 +
 makedumpfile.8 |   24 +
 makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 makedumpfile.h |   79 +++
 print_info.c   |   16 +
 7 files changed, 1652 insertions(+), 5 deletions(-)



* [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
@ 2015-06-05  7:56 ` Zhou Wenjian
  2015-06-05  7:56 ` [PATCH RFC 02/11] Add mappage_elf_parallel Zhou Wenjian
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:56 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

readpage_kdump_compressed_parallel is used to enable reading pages from a
vmcore in kdump-compressed format in parallel. fd_memory and bitmap_memory
must be initialized and provided to each thread individually to avoid
conflicts.
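
The reason the descriptor must be per-thread is that lseek() and read()
operate on a file offset shared by all users of one descriptor. A minimal,
self-contained sketch of the pattern (hypothetical path, error handling
trimmed; not the patch's code):

#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>

/* Each worker opens its own descriptor for the same vmcore, so its
 * lseek()+read() pair cannot race with another thread's. */
static void *reader(void *path)
{
	char page[4096];
	int fd = open(path, O_RDONLY);	/* private fd_memory */

	if (fd < 0)
		return (void *)-1;
	if (lseek(fd, 0, SEEK_SET) != (off_t)-1)
		(void)read(fd, page, sizeof(page));
	close(fd);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, reader, "/proc/vmcore");
	pthread_join(tid, NULL);
	return 0;
}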

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.c |  137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 137 insertions(+), 0 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 32f5459..10b6738 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -252,6 +252,20 @@ pfn_to_pos(mdf_pfn_t pfn)
 	return desc_pos;
 }
 
+unsigned long
+pfn_to_pos_parallel(mdf_pfn_t pfn, struct dump_bitmap* bitmap_memory_parallel)
+{
+	unsigned long desc_pos;
+	mdf_pfn_t i;
+
+	desc_pos = info->valid_pages[pfn / BITMAP_SECT_LEN];
+	for (i = round(pfn, BITMAP_SECT_LEN); i < pfn; i++)
+		if (is_dumpable(bitmap_memory_parallel, i))
+			desc_pos++;
+
+	return desc_pos;
+}
+
 int
 read_page_desc(unsigned long long paddr, page_desc_t *pd)
 {
@@ -294,6 +308,50 @@ read_page_desc(unsigned long long paddr, page_desc_t *pd)
 	return TRUE;
 }
 
+int
+read_page_desc_parallel(int fd_memory, unsigned long long paddr,
+			page_desc_t *pd,
+			struct dump_bitmap* bitmap_memory_parallel)
+{
+	struct disk_dump_header *dh;
+	unsigned long desc_pos;
+	mdf_pfn_t pfn;
+	off_t offset;
+
+	/*
+	 * Find page descriptor
+	 */
+	dh = info->dh_memory;
+	offset
+	    = (DISKDUMP_HEADER_BLOCKS + dh->sub_hdr_size + dh->bitmap_blocks)
+		* dh->block_size;
+	pfn = paddr_to_pfn(paddr);
+	desc_pos = pfn_to_pos_parallel(pfn, bitmap_memory_parallel);
+	offset += (off_t)desc_pos * sizeof(page_desc_t);
+	if (lseek(fd_memory, offset, SEEK_SET) < 0) {
+		ERRMSG("Can't seek %s. %s\n",
+				 info->name_memory, strerror(errno));
+		return FALSE;
+	}
+
+	/*
+	 * Read page descriptor
+	 */
+	if (read(fd_memory, pd, sizeof(*pd)) != sizeof(*pd)) {
+		ERRMSG("Can't read %s. %s\n",
+				info->name_memory, strerror(errno));
+		return FALSE;
+	}
+
+	/*
+	 * Sanity check
+	 */
+	if (pd->size > dh->block_size)
+		return FALSE;
+
+	return TRUE;
+}
+
 static void
 unmap_cache(struct cache_entry *entry)
 {
@@ -590,6 +648,85 @@ readpage_kdump_compressed(unsigned long long paddr, void *bufptr)
 	return TRUE;
 }
 
+static int
+readpage_kdump_compressed_parallel(int fd_memory, unsigned long long paddr,
+				   void *bufptr,
+				   struct dump_bitmap* bitmap_memory_parallel)
+{
+	page_desc_t pd;
+	char buf[info->page_size], *rdbuf;
+	int ret;
+	unsigned long retlen;
+
+	if (!is_dumpable(bitmap_memory_parallel, paddr_to_pfn(paddr))) {
+		ERRMSG("pfn(%llx) is excluded from %s.\n",
+				paddr_to_pfn(paddr), info->name_memory);
+		return FALSE;
+	}
+
+	if (!read_page_desc_parallel(fd_memory, paddr, &pd,
+						bitmap_memory_parallel)) {
+		ERRMSG("Can't read page_desc: %llx\n", paddr);
+		return FALSE;
+	}
+
+	if (lseek(fd_memory, pd.offset, SEEK_SET) < 0) {
+		ERRMSG("Can't seek %s. %s\n",
+				info->name_memory, strerror(errno));
+		return FALSE;
+	}
+
+	/*
+	 * Read page data
+	 */
+	rdbuf = pd.flags & (DUMP_DH_COMPRESSED_ZLIB | DUMP_DH_COMPRESSED_LZO |
+		DUMP_DH_COMPRESSED_SNAPPY) ? buf : bufptr;
+	if (read(fd_memory, rdbuf, pd.size) != pd.size) {
+		ERRMSG("Can't read %s. %s\n",
+				info->name_memory, strerror(errno));
+		return FALSE;
+	}
+
+	if (pd.flags & DUMP_DH_COMPRESSED_ZLIB) {
+		retlen = info->page_size;
+		ret = uncompress((unsigned char *)bufptr, &retlen,
+					(unsigned char *)buf, pd.size);
+		if ((ret != Z_OK) || (retlen != info->page_size)) {
+			ERRMSG("Uncompress failed: %d\n", ret);
+			return FALSE;
+		}
+#ifdef USELZO
+	} else if (info->flag_lzo_support
+		   && (pd.flags & DUMP_DH_COMPRESSED_LZO)) {
+		retlen = info->page_size;
+		ret = lzo1x_decompress_safe((unsigned char *)buf, pd.size,
+					    (unsigned char *)bufptr, &retlen,
+					    LZO1X_MEM_DECOMPRESS);
+		if ((ret != LZO_E_OK) || (retlen != info->page_size)) {
+			ERRMSG("Uncompress failed: %d\n", ret);
+			return FALSE;
+		}
+#endif
+#ifdef USESNAPPY
+	} else if ((pd.flags & DUMP_DH_COMPRESSED_SNAPPY)) {
+
+		ret = snappy_uncompressed_length(buf, pd.size, (size_t *)&retlen);
+		if (ret != SNAPPY_OK) {
+			ERRMSG("Uncompress failed: %d\n", ret);
+			return FALSE;
+		}
+
+		ret = snappy_uncompress(buf, pd.size, bufptr, (size_t *)&retlen);
+		if ((ret != SNAPPY_OK) || (retlen != info->page_size)) {
+			ERRMSG("Uncompress failed: %d\n", ret);
+			return FALSE;
+		}
+#endif
+	}
+
+	return TRUE;
+}
+
 int
 readmem(int type_addr, unsigned long long addr, void *bufptr, size_t size)
 {
-- 
1.7.1



* [PATCH RFC 02/11] Add mappage_elf_parallel
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
  2015-06-05  7:56 ` [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel Zhou Wenjian
@ 2015-06-05  7:56 ` Zhou Wenjian
  2015-06-05  7:56 ` [PATCH RFC 03/11] Add readpage_elf_parallel Zhou Wenjian
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:56 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

mappage_elf_parallel is used to enable mmap()ing the ELF-format vmcore into
memory in parallel. A later patch will use the mapped memory to get the
data of each page. fd_memory and mmap_cache must be initialized and
provided to each thread individually to avoid conflicts.
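
A minimal sketch of the remapping discipline the per-thread cache follows
(struct layout copied from the hunk below; callers must initialize
mmap_buf to the MAP_FAILED sentinel before the first call, as the real
code does):

#include <sys/types.h>
#include <sys/mman.h>

struct mmap_cache {
	char	*mmap_buf;
	off_t	mmap_start_offset;
	off_t	mmap_end_offset;
};

/* Move this thread's window over the vmcore; the old window is unmapped
 * first, mirroring update_mmap_range_parallel(). Returns 1 on success. */
static int move_window(int fd, struct mmap_cache *mc, off_t start, off_t len)
{
	if (mc->mmap_buf != MAP_FAILED)
		munmap(mc->mmap_buf,
		       mc->mmap_end_offset - mc->mmap_start_offset);

	mc->mmap_buf = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, start);
	if (mc->mmap_buf == MAP_FAILED)
		return 0;

	mc->mmap_start_offset = start;
	mc->mmap_end_offset = start + len;
	return 1;
}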

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.c |   97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 makedumpfile.h |   14 ++++++++
 2 files changed, 111 insertions(+), 0 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 10b6738..0f71ce7 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -395,6 +395,46 @@ update_mmap_range(off_t offset, int initial) {
 }
 
 static int
+update_mmap_range_parallel(int fd_memory, off_t offset,
+			   struct mmap_cache *mmap_cache)
+{
+	off_t start_offset, end_offset;
+	off_t map_size;
+	off_t max_offset = get_max_file_offset();
+	off_t pt_load_end = offset_to_pt_load_end(offset);
+
+	/*
+	 * mmap_buf must be cleaned
+	 */
+	if (mmap_cache->mmap_buf != MAP_FAILED)
+		munmap(mmap_cache->mmap_buf, mmap_cache->mmap_end_offset
+					     - mmap_cache->mmap_start_offset);
+
+	/*
+	 * offset for mmap() must be page aligned.
+	 */
+	start_offset = roundup(offset, info->page_size);
+	end_offset = MIN(max_offset, round(pt_load_end, info->page_size));
+
+	if (!pt_load_end || (end_offset - start_offset) <= 0)
+		return FALSE;
+
+	map_size = MIN(end_offset - start_offset, info->mmap_region_size);
+
+	mmap_cache->mmap_buf = mmap(NULL, map_size, PROT_READ, MAP_PRIVATE,
+					fd_memory, start_offset);
+
+	if (mmap_cache->mmap_buf == MAP_FAILED) {
+		return FALSE;
+	}
+
+	mmap_cache->mmap_start_offset = start_offset;
+	mmap_cache->mmap_end_offset = start_offset + map_size;
+
+	return TRUE;
+}
+
+static int
 is_mapped_with_mmap(off_t offset) {
 
 	if (info->flag_usemmap == MMAP_ENABLE
@@ -405,6 +445,15 @@ is_mapped_with_mmap(off_t offset) {
 		return FALSE;
 }
 
+static int
+is_mapped_with_mmap_parallel(off_t offset, struct mmap_cache *mmap_cache) {
+	if (offset >= mmap_cache->mmap_start_offset
+	    && offset < mmap_cache->mmap_end_offset)
+		return TRUE;
+	else
+		return FALSE;
+}
+
 int
 initialize_mmap(void) {
 	unsigned long long phys_start;
@@ -459,6 +508,54 @@ mappage_elf(unsigned long long paddr)
 	return info->mmap_buf + (offset - info->mmap_start_offset);
 }
 
+static char *
+mappage_elf_parallel(int fd_memory, unsigned long long paddr,
+		     struct mmap_cache *mmap_cache)
+{
+	off_t offset, offset2;
+	int flag_usemmap;
+
+	pthread_rwlock_rdlock(&info->usemmap_rwlock);
+	flag_usemmap = info->flag_usemmap;
+	pthread_rwlock_unlock(&info->usemmap_rwlock);
+	if (flag_usemmap != MMAP_ENABLE)
+		return NULL;
+
+	offset = paddr_to_offset(paddr);
+	if (!offset || page_is_fractional(offset))
+		return NULL;
+
+	offset2 = paddr_to_offset(paddr + info->page_size - 1);
+	if (!offset2)
+		return NULL;
+
+	if (offset2 - offset != info->page_size - 1)
+		return NULL;
+
+	if (!is_mapped_with_mmap_parallel(offset, mmap_cache) &&
+	    !update_mmap_range_parallel(fd_memory, offset, mmap_cache)) {
+		ERRMSG("Can't read the dump memory(%s) with mmap().\n",
+		       info->name_memory);
+
+		ERRMSG("This kernel might have some problems about mmap().\n");
+		ERRMSG("read() will be used instead of mmap() from now.\n");
+
+		/*
+		 * Fall back to read().
+		 */
+		pthread_rwlock_wrlock(&info->usemmap_rwlock);
+		info->flag_usemmap = MMAP_DISABLE;
+		pthread_rwlock_unlock(&info->usemmap_rwlock);
+		return NULL;
+	}
+
+	if (offset < mmap_cache->mmap_start_offset ||
+	    offset + info->page_size > mmap_cache->mmap_end_offset)
+		return NULL;
+
+	return mmap_cache->mmap_buf + (offset - mmap_cache->mmap_start_offset);
+}
+
 static int
 read_from_vmcore(off_t offset, void *bufptr, unsigned long size)
 {
diff --git a/makedumpfile.h b/makedumpfile.h
index d2fadbd..939850f 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -42,6 +42,7 @@
 #include "dwarf_info.h"
 #include "diskdump_mod.h"
 #include "sadump_mod.h"
+#include <pthread.h>
 
 /*
  * Result of command
@@ -913,6 +914,15 @@ typedef unsigned long int ulong;
 typedef unsigned long long int ulonglong;
 
 /*
+ * for parallel process
+ */
+struct mmap_cache {
+	char	*mmap_buf;
+	off_t	mmap_start_offset;
+	off_t   mmap_end_offset;
+};
+
+/*
  * makedumpfile header
  *   For re-arranging the dump data on different architecture, all the
  *   variables are defined by 64bits. The size of signature is aligned
@@ -1177,6 +1187,10 @@ struct DumpInfo {
 	 * for cyclic_splitting mode, setup splitblock_size
 	 */
 	long long splitblock_size;
+	/*
+	 * for parallel process
+	 */
+	pthread_rwlock_t usemmap_rwlock;
 };
 extern struct DumpInfo		*info;
 
-- 
1.7.1



* [PATCH RFC 03/11] Add readpage_elf_parallel
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
  2015-06-05  7:56 ` [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel Zhou Wenjian
  2015-06-05  7:56 ` [PATCH RFC 02/11] Add mappage_elf_parallel Zhou Wenjian
@ 2015-06-05  7:56 ` Zhou Wenjian
  2015-06-05  7:56 ` [PATCH RFC 04/11] Add read_pfn_parallel Zhou Wenjian
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:56 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

readpage_elf_parallel is used to enable reading pages from an ELF-format
vmcore in parallel. fd_memory must be initialized and provided to each
thread individually to avoid conflicts.
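
To make the fractional-page case concrete, using the numbers from the
diagram in the hunk below: with a 16 KiB page size, pfn N maps to paddr
0x40ffda4000 while its PT_LOAD segment only starts at 0x40ffda7000, so
frac_head = 0x40ffda7000 - 0x40ffda4000 = 0x3000; the first 0x3000 bytes
of the output buffer are zero-filled and only the remaining 0xd000 bytes
are read from the vmcore.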

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.c |   98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 98 insertions(+), 0 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 0f71ce7..9f12865 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -576,6 +576,27 @@ read_from_vmcore(off_t offset, void *bufptr, unsigned long size)
 	return TRUE;
 }
 
+static int
+read_from_vmcore_parallel(int fd_memory, off_t offset, void *bufptr,
+			  unsigned long size)
+{
+	const off_t failed = (off_t)-1;
+
+	if (lseek(fd_memory, offset, SEEK_SET) == failed) {
+		ERRMSG("Can't seek the dump memory(%s). (offset: %llx) %s\n",
+		       info->name_memory, (unsigned long long)offset, strerror(errno));
+		return FALSE;
+	}
+
+	if (read(fd_memory, bufptr, size) != size) {
+		ERRMSG("Can't read the dump memory(%s). %s\n",
+		       info->name_memory, strerror(errno));
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
 /*
  * This function is specific for reading page from ELF.
  *
@@ -670,6 +691,83 @@ readpage_elf(unsigned long long paddr, void *bufptr)
 }
 
 static int
+readpage_elf_parallel(int fd_memory, unsigned long long paddr, void *bufptr)
+{
+	off_t offset1, offset2;
+	size_t size1, size2;
+	unsigned long long phys_start, phys_end, frac_head = 0;
+
+	offset1 = paddr_to_offset(paddr);
+	offset2 = paddr_to_offset(paddr + info->page_size);
+	phys_start = paddr;
+	phys_end = paddr + info->page_size;
+
+	/*
+	 * Check the case phys_start isn't aligned by page size like below:
+	 *
+	 *                           phys_start
+	 *                           = 0x40ffda7000
+	 *         |<-- frac_head -->|------------- PT_LOAD -------------
+	 *     ----+-----------------------+---------------------+----
+	 *         |         pfn:N         |       pfn:N+1       | ...
+	 *     ----+-----------------------+---------------------+----
+	 *         |
+	 *     pfn_to_paddr(pfn:N)               # page size = 16k
+	 *     = 0x40ffda4000
+	 */
+	if (!offset1) {
+		phys_start = page_head_to_phys_start(paddr);
+		offset1 = paddr_to_offset(phys_start);
+		frac_head = phys_start - paddr;
+		memset(bufptr, 0, frac_head);
+	}
+
+	/*
+	 * Check the case phys_end isn't aligned by page size like the
+	 * phys_start's case.
+	 */
+	if (!offset2) {
+		phys_end = page_head_to_phys_end(paddr);
+		offset2 = paddr_to_offset(phys_end);
+		memset(bufptr + (phys_end - paddr), 0, info->page_size
+							- (phys_end - paddr));
+	}
+
+	/*
+	 * Check the separated page on different PT_LOAD segments.
+	 */
+	if (offset1 + (phys_end - phys_start) == offset2) {
+		size1 = phys_end - phys_start;
+	} else {
+		for (size1 = 1; size1 < info->page_size - frac_head; size1++) {
+			offset2 = paddr_to_offset(phys_start + size1);
+			if (offset1 + size1 != offset2)
+				break;
+		}
+	}
+
+	if(!read_from_vmcore_parallel(fd_memory, offset1, bufptr + frac_head,
+								size1)) {
+		ERRMSG("Can't read the dump memory(%s).\n",
+		       info->name_memory);
+		return FALSE;
+	}
+
+	if (size1 + frac_head != info->page_size) {
+		size2 = phys_end - (phys_start + size1);
+
+		if(!read_from_vmcore_parallel(fd_memory, offset2,
+					bufptr + frac_head + size1, size2)) {
+			ERRMSG("Can't read the dump memory(%s).\n",
+			       info->name_memory);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+static int
 readpage_kdump_compressed(unsigned long long paddr, void *bufptr)
 {
 	page_desc_t pd;
-- 
1.7.1



* [PATCH RFC 04/11] Add read_pfn_parallel
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (2 preceding siblings ...)
  2015-06-05  7:56 ` [PATCH RFC 03/11] Add readpage_elf_parallel Zhou Wenjian
@ 2015-06-05  7:56 ` Zhou Wenjian
  2015-06-05  7:56 ` [PATCH RFC 05/11] Add function to initial bitmap for parallel use Zhou Wenjian
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:56 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

read_pfn_parallel is used to enable reading pages from the vmcore in
parallel. The currently supported formats are kdump-compressed and ELF;
reading ELF via mmap() is also supported.
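
The dispatch is easiest to see in condensed form (a sketch with the three
readers from patches 01-03 abstracted behind function pointers; the real
code passes the per-thread fd, bitmap and mmap cache instead):

typedef int (*reader_fn)(int fd, unsigned long long pgaddr, void *buf);

static int read_page(int refiltering, reader_fn compressed, reader_fn map,
		     reader_fn plain, int fd, unsigned long long pgaddr,
		     void *buf)
{
	if (refiltering)		/* kdump-compressed vmcore */
		return compressed(fd, pgaddr, buf);
	if (map(fd, pgaddr, buf))	/* ELF: try the mmap window first */
		return 1;
	return plain(fd, pgaddr, buf);	/* ELF: fall back to read() */
}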

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 Makefile       |    2 ++
 makedumpfile.c |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/Makefile b/Makefile
index 2d2b1b7..0b10312 100644
--- a/Makefile
+++ b/Makefile
@@ -66,6 +66,8 @@ LIBS := -lsnappy $(LIBS)
 CFLAGS += -DUSESNAPPY
 endif
 
+LIBS := -lpthread $(LIBS)
+
 all: makedumpfile
 
 $(OBJ_PART): $(SRC_PART)
diff --git a/makedumpfile.c b/makedumpfile.c
index 9f12865..8a69321 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -6667,6 +6667,40 @@ read_pfn(mdf_pfn_t pfn, unsigned char *buf)
 }
 
 int
+read_pfn_parallel(int fd_memory, mdf_pfn_t pfn, unsigned char *buf,
+		  struct dump_bitmap* bitmap_memory_parallel,
+		  struct mmap_cache *mmap_cache)
+{
+	unsigned long long paddr;
+	unsigned long long pgaddr;
+
+	paddr = pfn_to_paddr(pfn);
+
+	pgaddr = PAGEBASE(paddr);
+
+	if (info->flag_refiltering) {
+		if (!readpage_kdump_compressed_parallel(fd_memory, pgaddr, buf,
+						      bitmap_memory_parallel)) {
+			ERRMSG("Can't get the page data.\n");
+			return FALSE;
+		}
+	} else {
+		char *mapbuf = mappage_elf_parallel(fd_memory, pgaddr,
+						    mmap_cache);
+		if (mapbuf) {
+			memcpy(buf, mapbuf, info->page_size);
+		} else {
+			if (!readpage_elf_parallel(fd_memory, pgaddr, buf)) {
+				ERRMSG("Can't get the page data.\n");
+				return FALSE;
+			}
+		}
+	}
+
+	return TRUE;
+}
+
+int
 get_loads_dumpfile_cyclic(void)
 {
 	int i, phnum, num_new_load = 0;
-- 
1.7.1



* [PATCH RFC 05/11] Add function to initial bitmap for parallel use
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (3 preceding siblings ...)
  2015-06-05  7:56 ` [PATCH RFC 04/11] Add read_pfn_parallel Zhou Wenjian
@ 2015-06-05  7:56 ` Zhou Wenjian
  2015-06-05  7:57 ` [PATCH RFC 06/11] Add filter_data_buffer_parallel Zhou Wenjian
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:56 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

initialize_bitmap_memory_parallel and initialize_2nd_bitmap_parallel
are used by the parallel threads to avoid conflicts on the bitmaps.
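
A minimal sketch of what "per-thread bitmap state" means here (the field
names mirror struct dump_bitmap; BUFSIZE_BITMAP and the exact layout are
assumptions of the sketch, not copied from the headers):

#include <string.h>

#define BUFSIZE_BITMAP 4096

struct dump_bitmap_sketch {
	int	fd;		/* descriptor reserved for this thread */
	char	*file_name;
	long	no_block;	/* which block buf currently caches */
	char	buf[BUFSIZE_BITMAP];
	long	offset;
};

static void init_private_bitmap(struct dump_bitmap_sketch *b, int fd,
				char *name, long offset)
{
	b->fd = fd;
	b->file_name = name;
	b->no_block = -1;	/* nothing cached yet */
	memset(b->buf, 0, BUFSIZE_BITMAP);
	b->offset = offset;
}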

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.c |   20 ++++++++++++++++++++
 makedumpfile.h |   18 ++++++++++++++++++
 2 files changed, 38 insertions(+), 0 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 8a69321..05859a3 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -3398,6 +3398,16 @@ initialize_bitmap_memory(void)
 	return TRUE;
 }
 
+void
+initialize_bitmap_memory_parallel(struct dump_bitmap *bitmap, int thread_num)
+{
+	bitmap->fd = FD_BITMAP_MEMORY_PARALLEL(thread_num);
+	bitmap->file_name = info->name_memory;
+	bitmap->no_block = -1;
+	memset(bitmap->buf, 0, BUFSIZE_BITMAP);
+	bitmap->offset = info->bitmap_memory->offset;
+}
+
 int
 calibrate_machdep_info(void)
 {
@@ -3713,6 +3723,16 @@ initialize_2nd_bitmap(struct dump_bitmap *bitmap)
 	bitmap->offset = info->len_bitmap / 2;
 }
 
+void
+initialize_2nd_bitmap_parallel(struct dump_bitmap *bitmap, int thread_num)
+{
+	bitmap->fd = FD_BITMAP_PARALLEL(thread_num);
+	bitmap->file_name = info->name_bitmap;
+	bitmap->no_block = -1;
+	memset(bitmap->buf, 0, BUFSIZE_BITMAP);
+	bitmap->offset = info->len_bitmap / 2;
+}
+
 int
 set_bitmap(struct dump_bitmap *bitmap, mdf_pfn_t pfn, int val)
 {
diff --git a/makedumpfile.h b/makedumpfile.h
index 939850f..b1ff561 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -429,6 +429,11 @@ do { \
 #define SPLITTING_SIZE_EI(i)	info->splitting_info[i].size_eraseinfo
 
 /*
+ * Macro for getting parallel info.
+ */
+#define FD_BITMAP_MEMORY_PARALLEL(i)	info->parallel_info[i].fd_bitmap_memory
+#define FD_BITMAP_PARALLEL(i)		info->parallel_info[i].fd_bitmap
+/*
  * kernel version
  *
  * NOTE: the format of kernel_version is as follows
@@ -957,6 +962,18 @@ struct splitting_info {
 	unsigned long		size_eraseinfo;
 } splitting_info_t;
 
+struct parallel_info {
+	int			fd_memory;
+	int 			fd_bitmap_memory;
+	int			fd_bitmap;
+	unsigned char		*buf;
+	unsigned char 		*buf_out;
+	struct mmap_cache	*mmap_cache;
+#ifdef USELZO
+	lzo_bytep		wrkmem;
+#endif
+} parallel_info_t;
+
 struct ppc64_vmemmap {
 	unsigned long		phys;
 	unsigned long		virt;
@@ -1093,6 +1110,7 @@ struct DumpInfo {
 	char			*name_dumpfile;
 	int			num_dumpfile;
 	struct splitting_info	*splitting_info;
+	struct parallel_info	*parallel_info;
 
 	/*
 	 * bitmap info:
-- 
1.7.1



* [PATCH RFC 06/11] Add filter_data_buffer_parallel
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (4 preceding siblings ...)
  2015-06-05  7:56 ` [PATCH RFC 05/11] Add function to initial bitmap for parallel use Zhou Wenjian
@ 2015-06-05  7:57 ` Zhou Wenjian
  2015-06-05  7:57 ` [PATCH RFC 07/11] Add write_kdump_pages_parallel to allow parallel process Zhou Wenjian
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:57 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

filter_data_buffer_parallel is used to enable filtering the page buffer in
parallel.
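
The point of the new mutex parameter is that only the lookup into the
shared filter list needs serializing; erasing the caller's private buffer
does not. A self-contained sketch of that pattern (the lookup callback
stands in for extract_filter_info()):

#include <pthread.h>
#include <string.h>

static pthread_mutex_t filter_mutex = PTHREAD_MUTEX_INITIALIZER;

static void filter_buf(unsigned char *buf,
		       int (*lookup)(size_t *off, size_t *len, int *nullify))
{
	size_t off, len;
	int nullify, found;

	for (;;) {
		pthread_mutex_lock(&filter_mutex);
		found = lookup(&off, &len, &nullify);	/* shared state */
		pthread_mutex_unlock(&filter_mutex);
		if (!found)
			break;
		/* erase the private buffer outside the lock */
		memset(buf + off, nullify ? 0 : 'X', len);
	}
}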

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 erase_info.c |   29 ++++++++++++++++++++++++++++-
 erase_info.h |    2 ++
 2 files changed, 30 insertions(+), 1 deletions(-)

diff --git a/erase_info.c b/erase_info.c
index e0e0f71..0b253d7 100644
--- a/erase_info.c
+++ b/erase_info.c
@@ -2328,7 +2328,6 @@ extract_filter_info(unsigned long long start_paddr,
 	return TRUE;
 }
 
-
 /*
  * External functions.
  */
@@ -2413,6 +2412,34 @@ filter_data_buffer(unsigned char *buf, unsigned long long paddr,
 	}
 }
 
+/*
+ * Filter buffer if the physical address is in filter_info.
+ */
+void
+filter_data_buffer_parallel(unsigned char *buf, unsigned long long paddr,
+					size_t size, pthread_mutex_t *mutex)
+{
+	struct filter_info fl_info;
+	unsigned char *buf_ptr;
+	int found = FALSE;
+
+	while (TRUE) {
+		pthread_mutex_lock(mutex);
+		found = extract_filter_info(paddr, paddr + size, &fl_info);
+		pthread_mutex_unlock(mutex);
+
+		if (found) {
+			buf_ptr = buf + (fl_info.paddr - paddr);
+			if (fl_info.nullify)
+				memset(buf_ptr, 0, fl_info.size);
+			else
+				memset(buf_ptr, fl_info.erase_ch, fl_info.size);
+		} else {
+			break;
+		}
+	}
+}
+
 unsigned long
 get_size_eraseinfo(void)
 {
diff --git a/erase_info.h b/erase_info.h
index 4d4957e..b363a40 100644
--- a/erase_info.h
+++ b/erase_info.h
@@ -60,6 +60,8 @@ extern unsigned long		num_erase_info;
 int gather_filter_info(void);
 void clear_filter_info(void);
 void filter_data_buffer(unsigned char *buf, unsigned long long paddr, size_t size);
+void filter_data_buffer_parallel(unsigned char *buf, unsigned long long paddr,
+					size_t size, pthread_mutex_t *mutex);
 unsigned long get_size_eraseinfo(void);
 int update_filter_info_raw(unsigned long long, int, int);
 
-- 
1.7.1



* [PATCH RFC 07/11] Add write_kdump_pages_parallel to allow parallel process
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (5 preceding siblings ...)
  2015-06-05  7:57 ` [PATCH RFC 06/11] Add filter_data_buffer_parallel Zhou Wenjian
@ 2015-06-05  7:57 ` Zhou Wenjian
  2015-06-05  7:57 ` [PATCH RFC 08/11] Add write_kdump_pages_parallel_cyclic to allow parallel process in cyclic_mode Zhou Wenjian
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:57 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

Use several threads to read and compress pages, and one thread to write
the produced pages into the dumpfile. The produced pages are stored in a
ring of buffers, from which the consumer thread takes them in pfn order.
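
Condensed, the handoff works like this: producers claim the next pfn under
a mutex, fill any free slot in the ring, and tag it with that pfn; the
single consumer takes slots strictly in pfn order, so pages are written
sequentially even though they are produced out of order. A sketch of the
consumer side (emit() stands in for writing the page descriptor and data;
the patch's timeout handling is omitted):

#include <pthread.h>

struct slot {
	pthread_mutex_t		mutex;
	unsigned long long	pfn;	/* which page the slot holds */
	int			ready;	/* producer done; may be consumed */
};

static void consume(struct slot *ring, int n, unsigned long long start,
		    unsigned long long end, void (*emit)(struct slot *))
{
	unsigned long long next = start;
	int i = -1;

	while (next < end) {
		i = (i + 1) % n;
		if (ring[i].pfn != next)	/* cheap unlocked peek */
			continue;
		pthread_mutex_lock(&ring[i].mutex);
		if (ring[i].pfn == next && ring[i].ready == 1) {
			emit(&ring[i]);
			ring[i].ready = 0;
			next++;
		}
		pthread_mutex_unlock(&ring[i].mutex);
	}
}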

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.c |  450 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 makedumpfile.h |   44 ++++++
 2 files changed, 494 insertions(+), 0 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 05859a3..bce6dc3 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -235,6 +235,31 @@ is_in_same_page(unsigned long vaddr1, unsigned long vaddr2)
 	return FALSE;
 }
 
+static inline unsigned long
+calculate_len_buf_out(long page_size)
+{
+	unsigned long len_buf_out_zlib, len_buf_out_lzo, len_buf_out_snappy;
+	unsigned long len_buf_out;
+
+	len_buf_out_zlib = len_buf_out_lzo = len_buf_out_snappy = 0;
+
+#ifdef USELZO
+	len_buf_out_lzo = page_size + page_size / 16 + 64 + 3;
+#endif
+
+#ifdef USESNAPPY
+	len_buf_out_snappy = snappy_max_compressed_length(page_size);
+#endif
+
+	len_buf_out_zlib = compressBound(page_size);
+
+	len_buf_out = MAX(len_buf_out_zlib,
+			  MAX(len_buf_out_lzo,
+			      len_buf_out_snappy));
+
+	return len_buf_out;
+}
+
 #define BITMAP_SECT_LEN 4096
 static inline int is_dumpable(struct dump_bitmap *, mdf_pfn_t);
 static inline int is_dumpable_cyclic(char *bitmap, mdf_pfn_t, struct cycle *cycle);
@@ -7016,6 +7041,431 @@ write_elf_pages_cyclic(struct cache_data *cd_header, struct cache_data *cd_page)
 	return TRUE;
 }
 
+void *
+kdump_thread_function(void *arg) {
+	void *retval = PTHREAD_FAIL;
+	struct thread_args *kdump_thread_args = (struct thread_args *)arg;
+	struct page_data *page_data_buf = kdump_thread_args->page_data_buf;
+	int page_data_num = kdump_thread_args->page_data_num;
+	mdf_pfn_t pfn;
+	mdf_pfn_t consumed_pfn;
+	int index;
+	int found;
+	int fd_memory = 0;
+	struct dump_bitmap bitmap_parallel, bitmap_memory_parallel;
+	unsigned char *buf = NULL, *buf_out = NULL;
+	struct mmap_cache *mmap_cache =
+			MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
+	unsigned long size_out;
+#ifdef USELZO
+	lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
+#endif
+#ifdef USESNAPPY
+	unsigned long len_buf_out_snappy =
+				snappy_max_compressed_length(info->page_size);
+#endif
+
+	buf = BUF_PARALLEL(kdump_thread_args->thread_num);
+	buf_out = BUF_OUT_PARALLEL(kdump_thread_args->thread_num);
+
+	fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->thread_num);
+
+	initialize_2nd_bitmap_parallel(&bitmap_parallel, kdump_thread_args->thread_num);
+
+	if (info->flag_refiltering) {
+		initialize_bitmap_memory_parallel(&bitmap_memory_parallel,
+						kdump_thread_args->thread_num);
+	}
+
+	while (1) {
+		/* get next pfn */
+		pthread_mutex_lock(&info->current_pfn_mutex);
+		pfn = info->current_pfn;
+		info->current_pfn++;
+		pthread_mutex_unlock(&info->current_pfn_mutex);
+
+		if (pfn >= kdump_thread_args->end_pfn)
+			break;
+
+		index = -1;
+		found = FALSE;
+
+		while (found == FALSE) {
+			/*
+			 * need a cancellation point here
+			 */
+			sleep(0);
+
+			index = (index + 1) % page_data_num;
+
+			if (pthread_mutex_trylock(&page_data_buf[index].mutex) != 0)
+				continue;
+
+			if (page_data_buf[index].ready != 0)
+				goto unlock;
+
+			pthread_mutex_lock(&info->consumed_pfn_mutex);
+			if ((long)page_data_buf[index].pfn >
+						(long)info->consumed_pfn)
+				info->consumed_pfn = page_data_buf[index].pfn;
+			consumed_pfn = info->consumed_pfn;
+			pthread_mutex_unlock(&info->consumed_pfn_mutex);
+
+			/*
+			 * leave space for slow producer
+			 */
+			if ((long)pfn - (long)consumed_pfn > page_data_num)
+				goto unlock;
+
+			found = TRUE;
+
+			page_data_buf[index].pfn = pfn;
+			page_data_buf[index].ready = 1;
+
+			if (!is_dumpable(&bitmap_parallel, pfn)) {
+				page_data_buf[index].dumpable = FALSE;
+				goto unlock;
+			}
+
+			page_data_buf[index].dumpable = TRUE;
+
+			if (!read_pfn_parallel(fd_memory, pfn, buf,
+					       &bitmap_memory_parallel,
+					       mmap_cache))
+				goto fail;
+
+			filter_data_buffer_parallel(buf, pfn_to_paddr(pfn),
+							info->page_size,
+							&info->filter_mutex);
+
+			if ((info->dump_level & DL_EXCLUDE_ZERO)
+			    && is_zero_page(buf, info->page_size)) {
+				page_data_buf[index].zero = TRUE;
+				goto unlock;
+			}
+
+			page_data_buf[index].zero = FALSE;
+
+			/*
+			 * Compress the page data.
+			 */
+			size_out = kdump_thread_args->len_buf_out;
+			if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
+			    && ((size_out = kdump_thread_args->len_buf_out),
+				compress2(buf_out, &size_out, buf,
+					  info->page_size,
+					  Z_BEST_SPEED)	== Z_OK)
+			    && (size_out < info->page_size)) {
+				page_data_buf[index].flags =
+							DUMP_DH_COMPRESSED_ZLIB;
+				page_data_buf[index].size  = size_out;
+				memcpy(page_data_buf[index].buf, buf_out,
+								size_out);
+#ifdef USELZO
+			} else if (info->flag_lzo_support
+				   && (info->flag_compress
+				       & DUMP_DH_COMPRESSED_LZO)
+				   && ((size_out = info->page_size),
+				       lzo1x_1_compress(buf, info->page_size,
+							buf_out, &size_out,
+							wrkmem) == LZO_E_OK)
+				   && (size_out < info->page_size)) {
+				page_data_buf[index].flags =
+							DUMP_DH_COMPRESSED_LZO;
+				page_data_buf[index].size  = size_out;
+				memcpy(page_data_buf[index].buf, buf_out,
+								size_out);
+#endif
+#ifdef USESNAPPY
+			} else if ((info->flag_compress
+				    & DUMP_DH_COMPRESSED_SNAPPY)
+				   && ((size_out = len_buf_out_snappy),
+				       snappy_compress((char *)buf,
+						       info->page_size,
+						       (char *)buf_out,
+						       (size_t *)&size_out)
+				       == SNAPPY_OK)
+				   && (size_out < info->page_size)) {
+				page_data_buf[index].flags =
+						DUMP_DH_COMPRESSED_SNAPPY;
+				page_data_buf[index].size  = size_out;
+				memcpy(page_data_buf[index].buf, buf_out,
+								size_out);
+#endif
+			} else {
+				page_data_buf[index].flags = 0;
+				page_data_buf[index].size  = info->page_size;
+				memcpy(page_data_buf[index].buf, buf,
+							info->page_size);
+			}
+unlock:
+			pthread_mutex_unlock(&page_data_buf[index].mutex);
+		}
+	}
+
+	retval = NULL;
+
+fail:
+	if (bitmap_memory_parallel.fd > 0)
+		close(bitmap_memory_parallel.fd);
+
+	pthread_exit(retval);
+}
+
+int
+write_kdump_pages_parallel(struct cache_data *cd_header,
+			   struct cache_data *cd_page)
+{
+	int ret = FALSE;
+	int res;
+	unsigned long len_buf_out;
+	mdf_pfn_t per, num_dumpable;
+	mdf_pfn_t start_pfn, end_pfn;
+	struct disk_dump_header *dh = info->dump_header;
+	struct page_desc pd, pd_zero;
+	off_t offset_data = 0;
+	struct timeval tv_start;
+	struct timeval last, new;
+	unsigned char buf[info->page_size];
+	unsigned long long consuming_pfn;
+	pthread_t **threads = NULL;
+	struct thread_args *kdump_thread_args = NULL;
+	void *thread_result;
+	int page_data_num;
+	struct page_data *page_data_buf = NULL;
+	int index;
+	int i;
+
+	if (info->flag_elf_dumpfile)
+		return ret;
+
+	res = pthread_mutex_init(&info->current_pfn_mutex, NULL);
+	if (res != 0) {
+		ERRMSG("Can't initialize current_pfn_mutex. %s\n",
+				strerror(res));
+		goto out;
+	}
+
+	res = pthread_mutex_init(&info->consumed_pfn_mutex, NULL);
+	if (res != 0) {
+		ERRMSG("Can't initialize consumed_pfn_mutex. %s\n",
+				strerror(res));
+		goto out;
+	}
+
+	res = pthread_mutex_init(&info->filter_mutex, NULL);
+	if (res != 0) {
+		ERRMSG("Can't initialize filter_mutex. %s\n", strerror(res));
+		goto out;
+	}
+
+	res = pthread_rwlock_init(&info->usemmap_rwlock, NULL);
+	if (res != 0) {
+		ERRMSG("Can't initialize usemmap_rwlock. %s\n", strerror(res));
+		goto out;
+	}
+
+	len_buf_out = calculate_len_buf_out(info->page_size);
+
+	num_dumpable = get_num_dumpable();
+	per = num_dumpable / 10000;
+	per = per ? per : 1;
+
+	/*
+	 * Calculate the offset of the page data.
+	 */
+	cd_header->offset
+	    = (DISKDUMP_HEADER_BLOCKS + dh->sub_hdr_size + dh->bitmap_blocks)
+		* dh->block_size;
+	cd_page->offset = cd_header->offset + sizeof(page_desc_t)*num_dumpable;
+	offset_data  = cd_page->offset;
+
+	/*
+	 * Write the data of zero-filled page.
+	 */
+	gettimeofday(&tv_start, NULL);
+	if (info->dump_level & DL_EXCLUDE_ZERO) {
+		pd_zero.size = info->page_size;
+		pd_zero.flags = 0;
+		pd_zero.offset = offset_data;
+		pd_zero.page_flags = 0;
+		memset(buf, 0, pd_zero.size);
+		if (!write_cache(cd_page, buf, pd_zero.size))
+			goto out;
+		offset_data  += pd_zero.size;
+	}
+
+	start_pfn = 0;
+	end_pfn   = info->max_mapnr;
+
+	info->current_pfn = start_pfn;
+	info->consumed_pfn = start_pfn - 1;
+
+	threads = info->threads;
+	kdump_thread_args = info->kdump_thread_args;
+
+	page_data_num = info->num_buffers;
+	page_data_buf = info->page_data_buf;
+
+	for (i = 0; i < page_data_num; i++) {
+		/*
+		 * producer will use pfn in page_data_buf to decide the
+		 * consumed pfn
+		 */
+		page_data_buf[i].pfn = start_pfn - 1;
+		page_data_buf[i].ready = 0;
+		res = pthread_mutex_init(&page_data_buf[i].mutex, NULL);
+		if (res != 0) {
+			ERRMSG("Can't initialize mutex of page_data_buf. %s\n",
+					strerror(res));
+			goto out;
+		}
+	}
+
+	for (i = 0; i < info->num_threads; i++) {
+		kdump_thread_args[i].thread_num = i;
+		kdump_thread_args[i].len_buf_out = len_buf_out;
+		kdump_thread_args[i].start_pfn = start_pfn;
+		kdump_thread_args[i].end_pfn = end_pfn;
+		kdump_thread_args[i].page_data_num = page_data_num;
+		kdump_thread_args[i].page_data_buf = page_data_buf;
+
+		res = pthread_create(threads[i], NULL,
+				     kdump_thread_function,
+				     (void *)&kdump_thread_args[i]);
+		if (res != 0) {
+			ERRMSG("Can't create thread %d. %s\n",
+					i, strerror(res));
+			goto out;
+		}
+	}
+
+	consuming_pfn = start_pfn;
+	index = -1;
+
+	gettimeofday(&last, NULL);
+
+	while (consuming_pfn < end_pfn) {
+		index = (index + 1) % page_data_num;
+
+		gettimeofday(&new, NULL);
+		if (new.tv_sec - last.tv_sec > WAIT_TIME) {
+			ERRMSG("Can't get data of pfn %llx.\n", consuming_pfn);
+			goto out;
+		}
+
+		/*
+		 * check pfn first without mutex locked to reduce the time
+		 * trying to lock the mutex
+		 */
+		if (page_data_buf[index].pfn != consuming_pfn)
+			continue;
+
+		pthread_mutex_lock(&page_data_buf[index].mutex);
+
+		/* check whether the found one is ready to be consumed */
+		if (page_data_buf[index].pfn != consuming_pfn ||
+		    page_data_buf[index].ready != 1) {
+			goto unlock;
+		}
+
+		if ((num_dumped % per) == 0)
+			print_progress(PROGRESS_COPY, num_dumped, num_dumpable);
+
+		/* next pfn is found, refresh last here */
+		last = new;
+		consuming_pfn++;
+		page_data_buf[index].ready = 0;
+
+		if (page_data_buf[index].dumpable == FALSE)
+			goto unlock;
+
+		num_dumped++;
+
+		if (page_data_buf[index].zero == TRUE) {
+			if (!write_cache(cd_header, &pd_zero, sizeof(page_desc_t)))
+				goto out;
+			pfn_zero++;
+		} else {
+			pd.flags      = page_data_buf[index].flags;
+			pd.size       = page_data_buf[index].size;
+			pd.page_flags = 0;
+			pd.offset     = offset_data;
+			offset_data  += pd.size;
+			/*
+			 * Write the page header.
+			 */
+			if (!write_cache(cd_header, &pd, sizeof(page_desc_t)))
+				goto out;
+			/*
+			 * Write the page data.
+			 */
+			if (!write_cache(cd_page, page_data_buf[index].buf, pd.size))
+				goto out;
+		}
+unlock:
+		pthread_mutex_unlock(&page_data_buf[index].mutex);
+	}
+
+	/*
+	 * Write the remainder.
+	 */
+	if (!write_cache_bufsz(cd_page))
+		goto out;
+	if (!write_cache_bufsz(cd_header))
+		goto out;
+
+	ret = TRUE;
+	/*
+	 * print [100 %]
+	 */
+	print_progress(PROGRESS_COPY, num_dumped, num_dumpable);
+	print_execution_time(PROGRESS_COPY, &tv_start);
+	PROGRESS_MSG("\n");
+
+out:
+	if (threads != NULL) {
+		for (i = 0; i < info->num_threads; i++) {
+			if (threads[i] != NULL) {
+				res = pthread_cancel(*threads[i]);
+				if (res != 0 && res != ESRCH)
+					ERRMSG("Can't cancel thread %d. %s\n",
+							i, strerror(res));
+			}
+		}
+
+		for (i = 0; i < info->num_threads; i++) {
+			if (threads[i] != NULL) {
+				res = pthread_join(*threads[i], &thread_result);
+				if (res != 0)
+					ERRMSG("Can't join with thread %d. %s\n",
+							i, strerror(res));
+
+				if (thread_result == PTHREAD_CANCELED)
+					DEBUG_MSG("Thread %d is cancelled.\n", i);
+				else if (thread_result == PTHREAD_FAIL)
+					DEBUG_MSG("Thread %d fails.\n", i);
+				else
+					DEBUG_MSG("Thread %d finishes.\n", i);
+
+			}
+		}
+	}
+
+	if (page_data_buf != NULL) {
+		for (i = 0; i < page_data_num; i++) {
+			pthread_mutex_destroy(&page_data_buf[i].mutex);
+		}
+	}
+
+	pthread_rwlock_destroy(&info->usemmap_rwlock);
+	pthread_mutex_destroy(&info->filter_mutex);
+	pthread_mutex_destroy(&info->consumed_pfn_mutex);
+	pthread_mutex_destroy(&info->current_pfn_mutex);
+
+	return ret;
+}
+
 int
 write_kdump_pages(struct cache_data *cd_header, struct cache_data *cd_page)
 {
diff --git a/makedumpfile.h b/makedumpfile.h
index b1ff561..bca3d56 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -431,8 +431,15 @@ do { \
 /*
  * Macro for getting parallel info.
  */
+#define FD_MEMORY_PARALLEL(i)		info->parallel_info[i].fd_memory
 #define FD_BITMAP_MEMORY_PARALLEL(i)	info->parallel_info[i].fd_bitmap_memory
 #define FD_BITMAP_PARALLEL(i)		info->parallel_info[i].fd_bitmap
+#define BUF_PARALLEL(i)			info->parallel_info[i].buf
+#define BUF_OUT_PARALLEL(i)		info->parallel_info[i].buf_out
+#define MMAP_CACHE_PARALLEL(i)		info->parallel_info[i].mmap_cache
+#ifdef USELZO
+#define WRKMEM_PARALLEL(i)		info->parallel_info[i].wrkmem
+#endif
 /*
  * kernel version
  *
@@ -921,12 +928,39 @@ typedef unsigned long long int ulonglong;
 /*
  * for parallel process
  */
+
+#define WAIT_TIME	(60 * 10)
+#define PTHREAD_FAIL	((void *)-2)
+
 struct mmap_cache {
 	char	*mmap_buf;
 	off_t	mmap_start_offset;
 	off_t   mmap_end_offset;
 };
 
+struct page_data
+{
+	mdf_pfn_t pfn;
+	int dumpable;
+	int zero;
+	unsigned int flags;
+	long size;
+	unsigned char *buf;
+	pthread_mutex_t mutex;
+	/*
+	 * whether the page_data is ready to be consumed
+	 */
+	int ready;
+};
+
+struct thread_args {
+	int thread_num;
+	unsigned long len_buf_out;
+	mdf_pfn_t start_pfn, end_pfn;
+	int page_data_num;
+	struct page_data *page_data_buf;
+};
+
 /*
  * makedumpfile header
  *   For re-arranging the dump data on different architecture, all the
@@ -1208,7 +1242,17 @@ struct DumpInfo {
 	/*
 	 * for parallel process
 	 */
+	int num_threads;
+	int num_buffers;
+	pthread_t **threads;
+	struct thread_args *kdump_thread_args;
+	struct page_data *page_data_buf;
 	pthread_rwlock_t usemmap_rwlock;
+	mdf_pfn_t current_pfn;
+	pthread_mutex_t current_pfn_mutex;
+	mdf_pfn_t consumed_pfn;
+	pthread_mutex_t consumed_pfn_mutex;
+	pthread_mutex_t filter_mutex;
 };
 extern struct DumpInfo		*info;
 
-- 
1.7.1



* [PATCH RFC 08/11] Add write_kdump_pages_parallel_cyclic to allow parallel process in cyclic_mode
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (6 preceding siblings ...)
  2015-06-05  7:57 ` [PATCH RFC 07/11] Add write_kdump_pages_parallel to allow parallel process Zhou Wenjian
@ 2015-06-05  7:57 ` Zhou Wenjian
  2015-06-05  7:57 ` [PATCH RFC 09/11] Initial and free data used for parallel process Zhou Wenjian
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:57 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

Use several threads to read and compress pages, and one thread to write
the produced pages into the dumpfile. This patch covers cyclic mode.
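
The thread function is otherwise the same as in the previous patch; the
substantive difference is the dumpable test, which checks the shared
per-cycle partial bitmap with is_on(info->partial_bitmap2,
pfn - start_pfn) instead of a per-thread copy of the 2nd bitmap.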

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.c |  390 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 390 insertions(+), 0 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index bce6dc3..86426d8 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -7672,6 +7672,396 @@ out:
 	return ret;
 }
 
+void *
+kdump_thread_function_cyclic(void *arg) {
+	void *retval = PTHREAD_FAIL;
+	struct thread_args *kdump_thread_args = (struct thread_args *)arg;
+	struct page_data *page_data_buf = kdump_thread_args->page_data_buf;
+	int page_data_num = kdump_thread_args->page_data_num;
+	mdf_pfn_t pfn;
+	mdf_pfn_t consumed_pfn;
+	int index;
+	int found;
+	int fd_memory = 0;
+	struct dump_bitmap bitmap_memory_parallel;
+	unsigned char *buf = NULL, *buf_out = NULL;
+	struct mmap_cache *mmap_cache =
+			MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
+	unsigned long size_out;
+#ifdef USELZO
+	lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
+#endif
+#ifdef USESNAPPY
+	unsigned long len_buf_out_snappy =
+				snappy_max_compressed_length(info->page_size);
+#endif
+
+	buf = BUF_PARALLEL(kdump_thread_args->thread_num);
+	buf_out = BUF_OUT_PARALLEL(kdump_thread_args->thread_num);
+
+	fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->thread_num);
+
+	if (info->flag_refiltering) {
+		initialize_bitmap_memory_parallel(&bitmap_memory_parallel,
+						kdump_thread_args->thread_num);
+	}
+
+	while (1) {
+		/* get next pfn */
+		pthread_mutex_lock(&info->current_pfn_mutex);
+		pfn = info->current_pfn;
+		info->current_pfn++;
+		pthread_mutex_unlock(&info->current_pfn_mutex);
+
+		if (pfn >= kdump_thread_args->end_pfn)
+			break;
+
+		index = -1;
+		found = FALSE;
+
+		while (found == FALSE) {
+			/*
+			 * need a cancellation point here
+			 */
+			sleep(0);
+
+			index = (index + 1) % page_data_num;
+
+			if (pthread_mutex_trylock(&page_data_buf[index].mutex) != 0)
+				continue;
+
+			if (page_data_buf[index].ready != 0)
+				goto unlock;
+
+			pthread_mutex_lock(&info->consumed_pfn_mutex);
+			if ((long)page_data_buf[index].pfn >
+						(long)info->consumed_pfn)
+				info->consumed_pfn = page_data_buf[index].pfn;
+			consumed_pfn = info->consumed_pfn;
+			pthread_mutex_unlock(&info->consumed_pfn_mutex);
+
+			/*
+			 * leave space for slow producer
+			 */
+			if ((long)pfn - (long)consumed_pfn > page_data_num)
+				goto unlock;
+
+			found = TRUE;
+
+			page_data_buf[index].pfn = pfn;
+			page_data_buf[index].ready = 1;
+
+			if (!is_on(info->partial_bitmap2,
+					pfn - kdump_thread_args->start_pfn)) {
+				page_data_buf[index].dumpable = FALSE;
+				goto unlock;
+			}
+
+			page_data_buf[index].dumpable = TRUE;
+
+			if (!read_pfn_parallel(fd_memory, pfn, buf,
+					       &bitmap_memory_parallel,
+					       mmap_cache))
+					goto fail;
+
+			filter_data_buffer_parallel(buf, pfn_to_paddr(pfn),
+							info->page_size,
+							&info->filter_mutex);
+
+			if ((info->dump_level & DL_EXCLUDE_ZERO)
+			    && is_zero_page(buf, info->page_size)) {
+				page_data_buf[index].zero = TRUE;
+				goto unlock;
+			}
+
+			page_data_buf[index].zero = FALSE;
+
+			/*
+			 * Compress the page data.
+			 */
+			size_out = kdump_thread_args->len_buf_out;
+			if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
+			    && ((size_out = kdump_thread_args->len_buf_out),
+				compress2(buf_out, &size_out, buf,
+					  info->page_size,
+					  Z_BEST_SPEED) == Z_OK)
+			    && (size_out < info->page_size)) {
+				page_data_buf[index].flags =
+							DUMP_DH_COMPRESSED_ZLIB;
+				page_data_buf[index].size  = size_out;
+				memcpy(page_data_buf[index].buf, buf_out, size_out);
+#ifdef USELZO
+			} else if (info->flag_lzo_support
+				   && (info->flag_compress
+				       & DUMP_DH_COMPRESSED_LZO)
+				   && ((size_out = info->page_size),
+				       lzo1x_1_compress(buf, info->page_size,
+							buf_out, &size_out,
+							wrkmem) == LZO_E_OK)
+				   && (size_out < info->page_size)) {
+				page_data_buf[index].flags =
+							DUMP_DH_COMPRESSED_LZO;
+				page_data_buf[index].size  = size_out;
+				memcpy(page_data_buf[index].buf, buf_out, size_out);
+#endif
+#ifdef USESNAPPY
+			} else if ((info->flag_compress
+				    & DUMP_DH_COMPRESSED_SNAPPY)
+				   && ((size_out = len_buf_out_snappy),
+				       snappy_compress((char *)buf,
+						       info->page_size,
+						       (char *)buf_out,
+						       (size_t *)&size_out)
+				       == SNAPPY_OK)
+				   && (size_out < info->page_size)) {
+				page_data_buf[index].flags =
+						DUMP_DH_COMPRESSED_SNAPPY;
+				page_data_buf[index].size  = size_out;
+				memcpy(page_data_buf[index].buf, buf_out, size_out);
+#endif
+			} else {
+				page_data_buf[index].flags = 0;
+				page_data_buf[index].size  = info->page_size;
+				memcpy(page_data_buf[index].buf, buf, info->page_size);
+			}
+unlock:
+			pthread_mutex_unlock(&page_data_buf[index].mutex);
+
+		}
+	}
+
+	retval = NULL;
+
+fail:
+	if (bitmap_memory_parallel.fd > 0)
+		close(bitmap_memory_parallel.fd);
+
+	pthread_exit(retval);
+}
+
+int
+write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
+				  struct cache_data *cd_page,
+				  struct page_desc *pd_zero,
+				  off_t *offset_data, struct cycle *cycle)
+{
+	int ret = FALSE;
+	int res;
+	unsigned long len_buf_out;
+	mdf_pfn_t per;
+	mdf_pfn_t start_pfn, end_pfn;
+	struct page_desc pd;
+	struct timeval tv_start;
+	struct timeval last, new;
+	unsigned long long consuming_pfn;
+	pthread_t **threads = NULL;
+	struct thread_args *kdump_thread_args = NULL;
+	void *thread_result;
+	int page_data_num;
+	struct page_data *page_data_buf = NULL;
+	int i;
+	int index;
+
+	if (info->flag_elf_dumpfile)
+		return FALSE;
+
+	res = pthread_mutex_init(&info->current_pfn_mutex, NULL);
+	if (res != 0) {
+		ERRMSG("Can't initialize current_pfn_mutex. %s\n",
+				strerror(res));
+		goto out;
+	}
+
+	res = pthread_mutex_init(&info->consumed_pfn_mutex, NULL);
+	if (res != 0) {
+		ERRMSG("Can't initialize consumed_pfn_mutex. %s\n",
+				strerror(res));
+		goto out;
+	}
+
+	res = pthread_mutex_init(&info->filter_mutex, NULL);
+	if (res != 0) {
+		ERRMSG("Can't initialize filter_mutex. %s\n", strerror(res));
+		goto out;
+	}
+
+	res = pthread_rwlock_init(&info->usemmap_rwlock, NULL);
+	if (res != 0) {
+		ERRMSG("Can't initialize usemmap_rwlock. %s\n", strerror(res));
+		goto out;
+	}
+
+	len_buf_out = calculate_len_buf_out(info->page_size);
+
+	per = info->num_dumpable / 10000;
+	per = per ? per : 1;
+
+	gettimeofday(&tv_start, NULL);
+
+	start_pfn = cycle->start_pfn;
+	end_pfn   = cycle->end_pfn;
+
+	info->current_pfn = start_pfn;
+	info->consumed_pfn = start_pfn - 1;
+
+	threads = info->threads;
+	kdump_thread_args = info->kdump_thread_args;
+
+	page_data_num = info->num_buffers;
+	page_data_buf = info->page_data_buf;
+
+	for (i = 0; i < page_data_num; i++) {
+		/*
+		 * producer will use pfn in page_data_buf to decide the
+		 * consumed pfn
+		 */
+		page_data_buf[i].pfn = start_pfn - 1;
+		page_data_buf[i].ready = 0;
+		res = pthread_mutex_init(&page_data_buf[i].mutex, NULL);
+		if (res != 0) {
+			ERRMSG("Can't initialize mutex of page_data_buf. %s\n",
+					strerror(res));
+			goto out;
+		}
+	}
+
+	for (i = 0; i < info->num_threads; i++) {
+		kdump_thread_args[i].thread_num = i;
+		kdump_thread_args[i].len_buf_out = len_buf_out;
+		kdump_thread_args[i].start_pfn = start_pfn;
+		kdump_thread_args[i].end_pfn = end_pfn;
+		kdump_thread_args[i].page_data_num = page_data_num;
+		kdump_thread_args[i].page_data_buf = page_data_buf;
+
+		res = pthread_create(threads[i], NULL,
+				     kdump_thread_function_cyclic,
+				     (void *)&kdump_thread_args[i]);
+		if (res != 0) {
+			ERRMSG("Can't create thread %d. %s\n",
+					i, strerror(res));
+			goto out;
+		}
+	}
+
+	consuming_pfn = start_pfn;
+	index = -1;
+
+	gettimeofday(&last, NULL);
+
+	while (consuming_pfn < end_pfn) {
+		index = (index + 1) % page_data_num;
+
+		gettimeofday(&new, NULL);
+		if (new.tv_sec - last.tv_sec > WAIT_TIME) {
+			ERRMSG("Can't get data of pfn %llx.\n", consuming_pfn);
+			goto out;
+		}
+
+		/*
+		 * check pfn first without mutex locked to reduce the time
+		 * trying to lock the mutex
+		 */
+		if (page_data_buf[index].pfn != consuming_pfn)
+			continue;
+
+		pthread_mutex_lock(&page_data_buf[index].mutex);
+
+		/* check whether the found one is ready to be consumed */
+		if (page_data_buf[index].pfn != consuming_pfn ||
+		    page_data_buf[index].ready != 1) {
+			goto unlock;
+		}
+
+		if ((num_dumped % per) == 0)
+			print_progress(PROGRESS_COPY, num_dumped, info->num_dumpable);
+
+		/* next pfn is found, refresh last here */
+		last = new;
+		consuming_pfn++;
+		page_data_buf[index].ready = 0;
+
+		if (page_data_buf[index].dumpable == FALSE)
+			goto unlock;
+
+		num_dumped++;
+
+		if (page_data_buf[index].zero == TRUE) {
+			if (!write_cache(cd_header, pd_zero, sizeof(page_desc_t)))
+				goto out;
+			pfn_zero++;
+		} else {
+			pd.flags      = page_data_buf[index].flags;
+			pd.size       = page_data_buf[index].size;
+			pd.page_flags = 0;
+			pd.offset     = *offset_data;
+			*offset_data  += pd.size;
+			/*
+			 * Write the page header.
+			 */
+			if (!write_cache(cd_header, &pd, sizeof(page_desc_t)))
+				goto out;
+			/*
+			 * Write the page data.
+			 */
+			if (!write_cache(cd_page, page_data_buf[index].buf, pd.size))
+				goto out;
+
+		}
+unlock:
+		pthread_mutex_unlock(&page_data_buf[index].mutex);
+	}
+
+	ret = TRUE;
+	/*
+	 * print [100 %]
+	 */
+	print_progress(PROGRESS_COPY, num_dumped, info->num_dumpable);
+	print_execution_time(PROGRESS_COPY, &tv_start);
+	PROGRESS_MSG("\n");
+
+out:
+	if (threads != NULL) {
+		for (i = 0; i < info->num_threads; i++) {
+			if (threads[i] != NULL) {
+				res = pthread_cancel(*threads[i]);
+				if (res != 0 && res != ESRCH)
+					ERRMSG("Can't cancel thread %d. %s\n",
+							i, strerror(res));
+			}
+		}
+
+		for (i = 0; i < info->num_threads; i++) {
+			if (threads[i] != NULL) {
+				res = pthread_join(*threads[i], &thread_result);
+				if (res != 0)
+					ERRMSG("Can't join with thread %d. %s\n",
+							i, strerror(res));
+
+				if (thread_result == PTHREAD_CANCELED)
+					DEBUG_MSG("Thread %d is cancelled.\n", i);
+				else if (thread_result == PTHREAD_FAIL)
+					DEBUG_MSG("Thread %d fails.\n", i);
+				else
+					DEBUG_MSG("Thread %d finishes.\n", i);
+
+			}
+		}
+	}
+
+	if (page_data_buf != NULL) {
+		for (i = 0; i < page_data_num; i++) {
+			pthread_mutex_destroy(&page_data_buf[i].mutex);
+		}
+	}
+
+	pthread_rwlock_destroy(&info->usemmap_rwlock);
+	pthread_mutex_destroy(&info->filter_mutex);
+	pthread_mutex_destroy(&info->consumed_pfn_mutex);
+	pthread_mutex_destroy(&info->current_pfn_mutex);
+
+	return ret;
+}
+
 int
 write_kdump_pages_cyclic(struct cache_data *cd_header, struct cache_data *cd_page,
 			 struct page_desc *pd_zero, off_t *offset_data, struct cycle *cycle)
-- 
1.7.1



* [PATCH RFC 09/11] Initial and free data used for parallel process
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (7 preceding siblings ...)
  2015-06-05  7:57 ` [PATCH RFC 08/11] Add write_kdump_pages_parallel_cyclic to allow parallel process in cyclic_mode Zhou Wenjian
@ 2015-06-05  7:57 ` Zhou Wenjian
  2015-06-05  7:57 ` [PATCH RFC 10/11] Make makedumpfile available to read and compress pages parallelly Zhou Wenjian
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:57 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

This patch initializes and frees the data used for parallel processing;
the available memory limit is taken into account when sizing the
page_data buffers.
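
As a worked example of the sizing done in initial_for_parallel() below
(the concrete numbers are illustrative assumptions): each buffer slot
holds MAX(len_buf_out, page_size) bytes, about 4.8 KiB for 4 KiB pages
when snappy support is compiled in. Assuming 100 MiB of free memory, 4
threads and a 4 MiB MAP_REGION, limit_size is (100 MiB - 4 * 4 MiB) * 0.6,
about 50 MiB, allowing roughly 10,000 slots; the default then caps
info->num_buffers at PAGE_DATA_NUM (50), or at the user-requested count
if one was given.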

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.c |  202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 makedumpfile.h |    1 +
 2 files changed, 203 insertions(+), 0 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 86426d8..04a6c45 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -1426,6 +1426,23 @@ open_dump_bitmap(void)
 			SPLITTING_FD_BITMAP(i) = fd;
 		}
 	}
+
+	if (info->num_threads) {
+		/*
+		 * Reserve file descriptors of bitmap for creating dumpfiles
+		 * parallelly, because a bitmap file will be unlinked just after
+		 * this and it is not possible to open a bitmap file later.
+		 */
+		for (i = 0; i < info->num_threads; i++) {
+			if ((fd = open(info->name_bitmap, O_RDONLY)) < 0) {
+				ERRMSG("Can't open the bitmap file(%s). %s\n",
+				    info->name_bitmap, strerror(errno));
+				return FALSE;
+			}
+			FD_BITMAP_PARALLEL(i) = fd;
+		}
+	}
+
 	unlink(info->name_bitmap);
 
 	return TRUE;
@@ -3446,6 +3463,191 @@ calibrate_machdep_info(void)
 }
 
 int
+initial_for_parallel()
+{
+	unsigned long len_buf_out;
+	unsigned long page_data_buf_size;
+	unsigned long limit_size;
+	int page_data_num;
+	int i;
+
+	len_buf_out = calculate_len_buf_out(info->page_size);
+
+	/*
+	 * allocate memory for threads
+	 */
+	if ((info->threads = malloc(sizeof(pthread_t *) * info->num_threads))
+	    == NULL) {
+		MSG("Can't allocate memory for threads. %s\n",
+				strerror(errno));
+		return FALSE;
+	}
+	memset(info->threads, 0, sizeof(pthread_t *) * info->num_threads);
+
+	if ((info->kdump_thread_args =
+			malloc(sizeof(struct thread_args) * info->num_threads))
+	    == NULL) {
+		MSG("Can't allocate memory for arguments of threads. %s\n",
+				strerror(errno));
+		return FALSE;
+	}
+	memset(info->kdump_thread_args, 0, sizeof(struct thread_args) * info->num_threads);
+
+	for (i = 0; i < info->num_threads; i++) {
+		if ((info->threads[i] = malloc(sizeof(pthread_t))) == NULL) {
+			MSG("Can't allocate memory for thread %d. %s",
+					i, strerror(errno));
+			return FALSE;
+		}
+
+		if ((BUF_PARALLEL(i) = malloc(info->page_size)) == NULL) {
+			MSG("Can't allocate memory for the memory buffer. %s\n",
+					strerror(errno));
+			return FALSE;
+		}
+
+		if ((BUF_OUT_PARALLEL(i) = malloc(len_buf_out)) == NULL) {
+			MSG("Can't allocate memory for the compression buffer. %s\n",
+					strerror(errno));
+			return FALSE;
+		}
+
+		if ((MMAP_CACHE_PARALLEL(i) = malloc(sizeof(struct mmap_cache))) == NULL) {
+			MSG("Can't allocate memory for mmap_cache. %s\n",
+					strerror(errno));
+			return FALSE;
+		}
+
+		/*
+		 * initial for mmap_cache
+		 */
+		MMAP_CACHE_PARALLEL(i)->mmap_buf = MAP_FAILED;
+		MMAP_CACHE_PARALLEL(i)->mmap_start_offset = 0;
+		MMAP_CACHE_PARALLEL(i)->mmap_end_offset = 0;
+
+#ifdef USELZO
+		if ((WRKMEM_PARALLEL(i) = malloc(LZO1X_1_MEM_COMPRESS)) == NULL) {
+			MSG("Can't allocate memory for the working memory. %s\n",
+					strerror(errno));
+			return FALSE;
+		}
+#endif
+	}
+
+	/*
+	 * get a safe number of page_data
+	 */
+	page_data_buf_size = MAX(len_buf_out, info->page_size);
+
+	limit_size = (get_free_memory_size()
+		      - MAP_REGION * info->num_threads) * 0.6;
+
+	page_data_num = limit_size / page_data_buf_size;
+
+	if (info->num_buffers != 0)
+		info->num_buffers = MIN(info->num_buffers, page_data_num);
+	else
+		info->num_buffers = MIN(PAGE_DATA_NUM, page_data_num);
+
+	DEBUG_MSG("Number of struct page_data for produce/consume: %d\n",
+			info->num_buffers);
+
+	/*
+	 * allocate memory for page_data
+	 */
+	if ((info->page_data_buf = malloc(sizeof(struct page_data) * info->num_buffers))
+	    == NULL) {
+		MSG("Can't allocate memory for page_data_buf. %s\n",
+				strerror(errno));
+		return FALSE;
+	}
+	memset(info->page_data_buf, 0, sizeof(struct page_data) * info->num_buffers);
+
+	for (i = 0; i < info->num_buffers; i++) {
+		if ((info->page_data_buf[i].buf = malloc(page_data_buf_size)) == NULL) {
+			MSG("Can't allocate memory for buf of page_data_buf. %s\n",
+					strerror(errno));
+			return FALSE;
+		}
+	}
+
+	/*
+	 * initialize fd_memory for each thread
+	 */
+	for (i = 0; i < info->num_threads; i++) {
+		if ((FD_MEMORY_PARALLEL(i) = open(info->name_memory, O_RDONLY))
+									< 0) {
+			ERRMSG("Can't open the dump memory(%s). %s\n",
+					info->name_memory, strerror(errno));
+			return FALSE;
+		}
+
+		if ((FD_BITMAP_MEMORY_PARALLEL(i) =
+				open(info->name_memory, O_RDONLY)) < 0) {
+			ERRMSG("Can't open the dump memory(%s). %s\n",
+					info->name_memory, strerror(errno));
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+void
+free_for_parallel()
+{
+	int i;
+
+	if (info->threads != NULL) {
+		for (i = 0; i < info->num_threads; i++) {
+			if (info->threads[i] != NULL)
+				free(info->threads[i]);
+
+			if (BUF_PARALLEL(i) != NULL)
+				free(BUF_PARALLEL(i));
+
+			if (BUF_OUT_PARALLEL(i) != NULL)
+				free(BUF_OUT_PARALLEL(i));
+
+			if (MMAP_CACHE_PARALLEL(i) != NULL) {
+				if (MMAP_CACHE_PARALLEL(i)->mmap_buf !=
+								MAP_FAILED)
+					munmap(MMAP_CACHE_PARALLEL(i)->mmap_buf,
+					       MMAP_CACHE_PARALLEL(i)->mmap_end_offset
+					       - MMAP_CACHE_PARALLEL(i)->mmap_start_offset);
+
+				free(MMAP_CACHE_PARALLEL(i));
+			}
+#ifdef USELZO
+			if (WRKMEM_PARALLEL(i) != NULL)
+				free(WRKMEM_PARALLEL(i));
+#endif
+
+		}
+		free(info->threads);
+	}
+
+	if (info->kdump_thread_args != NULL)
+		free(info->kdump_thread_args);
+
+	if (info->page_data_buf != NULL) {
+		for (i = 0; i < info->num_buffers; i++) {
+			if (info->page_data_buf[i].buf != NULL)
+				free(info->page_data_buf[i].buf);
+		}
+		free(info->page_data_buf);
+	}
+
+	for (i = 0; i < info->num_threads; i++) {
+		if (FD_MEMORY_PARALLEL(i) > 0)
+			close(FD_MEMORY_PARALLEL(i));
+
+		if (FD_BITMAP_MEMORY_PARALLEL(i) > 0)
+			close(FD_BITMAP_MEMORY_PARALLEL(i));
+	}
+}
+
+int
 initial(void)
 {
 	off_t offset;
diff --git a/makedumpfile.h b/makedumpfile.h
index bca3d56..67c2a38 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -929,6 +929,7 @@ typedef unsigned long long int ulonglong;
  * for parallel process
  */
 
+#define PAGE_DATA_NUM	(50)
 #define WAIT_TIME	(60 * 10)
 #define PTHREAD_FAIL	((void *)-2)
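As background for the open_dump_bitmap() change above, here is a minimal
standalone sketch (not makedumpfile code) of the POSIX behaviour it relies
on: a descriptor that is already open keeps working after unlink(), but
the name cannot be opened again afterwards.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[4];
	int fd = open("bitmap.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;
	write(fd, "ok", 2);

	unlink("bitmap.tmp");			/* the name is gone ...       */
	lseek(fd, 0, SEEK_SET);
	printf("%zd\n", read(fd, buf, 2));	/* ... but the fd still reads */

	/*
	 * A second open("bitmap.tmp", O_RDONLY) would fail now, which is
	 * why every thread's descriptor is opened before the unlink.
	 */
	close(fd);
	return 0;
}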
 
-- 
1.7.1


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH RFC 10/11] Make makedumpfile available to read and compress pages parallelly
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (8 preceding siblings ...)
  2015-06-05  7:57 ` [PATCH RFC 09/11] Initial and free data used for parallel process Zhou Wenjian
@ 2015-06-05  7:57 ` Zhou Wenjian
  2015-06-05  7:57 ` [PATCH RFC 11/11] Add usage and manual about multiple threads process Zhou Wenjian
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:57 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

With this patch, multiple threads can be used to read and compress
pages, which saves time.

Currently, sadump and xen kdump are not supported.
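The shape of the new write path can be pictured with a minimal,
self-contained sketch (this is not makedumpfile code: "compression" is
faked with a byte sum, the buffer pool is reduced to one result slot per
page, and error handling is omitted; build with cc sketch.c -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define NUM_PAGES	16
#define NUM_THREADS	4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int next_pfn;			/* next page a worker should take */
static unsigned long result[NUM_PAGES];	/* "compressed" output per page   */
static int done[NUM_PAGES];

static void *
worker(void *arg)
{
	char page[4096];

	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		int pfn = (next_pfn < NUM_PAGES) ? next_pfn++ : -1;
		pthread_mutex_unlock(&lock);
		if (pfn < 0)
			break;

		/* stand-in for reading the page and compressing it */
		memset(page, pfn, sizeof(page));
		unsigned long sum = 0;
		for (size_t i = 0; i < sizeof(page); i++)
			sum += (unsigned char)page[i];

		pthread_mutex_lock(&lock);
		result[pfn] = sum;
		done[pfn] = 1;
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int
main(void)
{
	pthread_t tid[NUM_THREADS];
	int i;

	for (i = 0; i < NUM_THREADS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);

	/* the main thread consumes pages strictly in pfn order, as the
	 * writer must to keep the dumpfile layout sequential */
	for (i = 0; i < NUM_PAGES; i++) {
		int ready = 0;
		while (!ready) {
			pthread_mutex_lock(&lock);
			ready = done[i];
			pthread_mutex_unlock(&lock);
		}
		printf("pfn %2d -> %lu\n", i, result[i]);
	}

	for (i = 0; i < NUM_THREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}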

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.c |   77 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 makedumpfile.h |    2 +
 2 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 04a6c45..bb931c3 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -3846,6 +3846,27 @@ out:
 		DEBUG_MSG("Buffer size for the cyclic mode: %ld\n", info->bufsize_cyclic);
 	}
 
+	if (info->num_threads) {
+		if (is_xen_memory()) {
+			MSG("'--num-threads' option is disabled,\n");
+			MSG("because %s is Xen's memory core image.\n",
+							info->name_memory);
+			return FALSE;
+		}
+
+		if (info->flag_sadump) {
+			MSG("'--num-threads' option is disabled,\n");
+			MSG("because %s is sadump %s format.\n",
+			    info->name_memory, sadump_format_type_name());
+			return FALSE;
+		}
+
+		if (!initial_for_parallel()) {
+			MSG("Failed to initialize for parallel processing.\n");
+			return FALSE;
+		}
+	}
+
 	if (!is_xen_memory() && !cache_init())
 		return FALSE;
 
@@ -8823,9 +8844,16 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d
 		if (!write_kdump_bitmap2_cyclic(&cycle))
 			return FALSE;
 
-		if (!write_kdump_pages_cyclic(cd_header, cd_page, &pd_zero,
+		if (info->num_threads) {
+			if (!write_kdump_pages_parallel_cyclic(cd_header,
+							cd_page, &pd_zero,
+							&offset_data, &cycle))
+				return FALSE;
+		} else {
+			if (!write_kdump_pages_cyclic(cd_header, cd_page, &pd_zero,
 					&offset_data, &cycle))
-			return FALSE;
+				return FALSE;
+		}
 	}
 
 	free_bitmap2_buffer_cyclic();
@@ -9832,8 +9860,13 @@ writeout_dumpfile(void)
 			goto out;
 		if (!write_kdump_bitmap())
 			goto out;
-		if (!write_kdump_pages(&cd_header, &cd_page))
-			goto out;
+		if (info->num_threads) {
+			if (!write_kdump_pages_parallel(&cd_header, &cd_page))
+				goto out;
+		} else {
+			if (!write_kdump_pages(&cd_header, &cd_page))
+				goto out;
+		}
 		if (!write_kdump_eraseinfo(&cd_page))
 			goto out;
 	}
@@ -10847,6 +10880,18 @@ check_param_for_creating_dumpfile(int argc, char *argv[])
 	if (info->flag_sadump_diskset && !sadump_is_supported_arch())
 		return FALSE;
 
+	if (info->num_threads) {
+		if (info->flag_split) {
+			MSG("--num-threads cannot be used with --split.\n");
+			return FALSE;
+		}
+
+		if (info->flag_elf_dumpfile) {
+			MSG("--num-threads cannot be used with ELF format.\n");
+			return FALSE;
+		}
+	}
+
 	if ((argc == optind + 2) && !info->flag_flatten
 				 && !info->flag_split
 				 && !info->flag_sadump_diskset) {
@@ -10911,6 +10956,18 @@ check_param_for_creating_dumpfile(int argc, char *argv[])
 	} else
 		return FALSE;
 
+	if (info->num_threads) {
+		if ((info->parallel_info =
+		     malloc(sizeof(parallel_info_t) * info->num_threads))
+		    == NULL) {
+			MSG("Can't allocate memory for parallel_info.\n");
+			return FALSE;
+		}
+
+		memset(info->parallel_info, 0, sizeof(parallel_info_t)
+							* info->num_threads);
+	}
+
 	return TRUE;
 }
 
@@ -11223,6 +11280,8 @@ static struct option longopts[] = {
 	{"non-mmap", no_argument, NULL, OPT_NON_MMAP},
 	{"mem-usage", no_argument, NULL, OPT_MEM_USAGE},
 	{"splitblock-size", required_argument, NULL, OPT_SPLITBLOCK_SIZE},
+	{"num-threads", required_argument, NULL, OPT_NUM_THREADS},
+	{"num-buffers", required_argument, NULL, OPT_NUM_BUFFERS},
 	{0, 0, 0, 0}
 };
 
@@ -11366,6 +11425,12 @@ main(int argc, char *argv[])
 		case OPT_SPLITBLOCK_SIZE:
 			info->splitblock_size = atoi(optarg);
 			break;
+		case OPT_NUM_THREADS:
+			info->num_threads = atoi(optarg);
+			break;
+		case OPT_NUM_BUFFERS:
+			info->num_buffers = atoi(optarg);
+			break;
 		case '?':
 			MSG("Commandline parameter is invalid.\n");
 			MSG("Try `makedumpfile --help' for more information.\n");
@@ -11509,6 +11574,8 @@ out:
 	else if (!info->flag_mem_usage)
 		MSG("makedumpfile Completed.\n");
 
+	free_for_parallel();
+
 	if (info) {
 		if (info->dh_memory)
 			free(info->dh_memory);
@@ -11536,6 +11603,8 @@ out:
 			free(info->p2m_mfn_frame_list);
 		if (info->page_buf != NULL)
 			free(info->page_buf);
+		if (info->parallel_info != NULL)
+			free(info->parallel_info);
 		free(info);
 
 		if (splitblock) {
diff --git a/makedumpfile.h b/makedumpfile.h
index 67c2a38..42a8ee3 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -1974,6 +1974,8 @@ struct elf_prstatus {
 #define OPT_NON_MMAP            OPT_START+13
 #define OPT_MEM_USAGE            OPT_START+14
 #define OPT_SPLITBLOCK_SIZE	OPT_START+15
+#define OPT_NUM_THREADS		OPT_START+16
+#define OPT_NUM_BUFFERS		OPT_START+17
 
 /*
  * Function Prototype.
-- 
1.7.1


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH RFC 11/11] Add usage and manual about multiple threads process
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (9 preceding siblings ...)
  2015-06-05  7:57 ` [PATCH RFC 10/11] Make makedumpfile available to read and compress pages parallelly Zhou Wenjian
@ 2015-06-05  7:57 ` Zhou Wenjian
  2015-06-08  3:55 ` [PATCH RFC 00/11] makedumpfile: parallel processing "Zhou, Wenjian/周文剑"
  2015-06-10  6:06 ` Atsushi Kumagai
  12 siblings, 0 replies; 43+ messages in thread
From: Zhou Wenjian @ 2015-06-05  7:57 UTC (permalink / raw)
  To: kexec; +Cc: Qiao Nuohan

From: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>

Signed-off-by: Qiao Nuohan <qiaonuohan@cn.fujitsu.com>
---
 makedumpfile.8 |   24 ++++++++++++++++++++++++
 print_info.c   |   16 ++++++++++++++++
 2 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/makedumpfile.8 b/makedumpfile.8
index 9752671..5a760c8 100644
--- a/makedumpfile.8
+++ b/makedumpfile.8
@@ -12,6 +12,8 @@ makedumpfile \- make a small dumpfile of kdump
 .br
 \fBmakedumpfile\fR \-\-split [\fIOPTION\fR] [\-x \fIVMLINUX\fR|\-i \fIVMCOREINFO\fR] \fIVMCORE\fR \fIDUMPFILE1\fR \fIDUMPFILE2\fR [\fIDUMPFILE3\fR ..]
 .br
+\fBmakedumpfile\fR [\fIOPTION\fR] [\-x \fIVMLINUX\fR|\-i \fIVMCOREINFO\fR] \-\-num\-threads \fITHREADNUM\fR [\-\-num\-buffers \fIBUFNUM\fR] \fIVMCORE\fR \fIDUMPFILE\fR
+.br
 \fBmakedumpfile\fR \-\-reassemble \fIDUMPFILE1\fR \fIDUMPFILE2\fR [\fIDUMPFILE3\fR ..] \fIDUMPFILE\fR
 .br
 \fBmakedumpfile\fR \-g \fIVMCOREINFO\fR \-x \fIVMLINUX\fR
@@ -371,6 +373,28 @@ the kdump\-compressed format.
 # makedumpfile \-\-split \-d 31 \-x vmlinux /proc/vmcore dumpfile1 dumpfile2
 
 .TP
+\fB\-\-num\-threads\fR \fITHREADNUM\fR
+Read and compress the data of each page in parallel using multiple threads,
+which reduces the time needed for saving \fIDUMPFILE\fR.
+This feature only supports creating \fIDUMPFILE\fR in kdump\-compressed
+format from \fIVMCORE\fR in kdump\-compressed format or ELF format.
+.br
+.B Example:
+.br
+# makedumpfile \-d 31 \-\-num\-threads 4 /proc/vmcore dumpfile
+
+.TP
+\fB\-\-num\-buffers\fR \fIBUFNUM\fR
+This option is used together with \-\-num\-threads; see that option for
+details. The multi\-threaded process needs buffers to temporarily store the
+page data generated by the threads, and this option specifies the number
+of pages that can be stored.
+.br
+.B Example:
+.br
+# makedumpfile \-d 31 \-\-num\-threads 4 \-\-num\-buffers 30 /proc/vmcore dumpfile
+
+.TP
 \fB\-\-reassemble\fR
 Reassemble multiple \fIDUMPFILE\fRs, which are created by \-\-split option,
 into one \fIDUMPFILE\fR. dumpfile1 and dumpfile2 are reassembled into dumpfile
diff --git a/print_info.c b/print_info.c
index 9215e0f..a830ee2 100644
--- a/print_info.c
+++ b/print_info.c
@@ -76,6 +76,10 @@ print_usage(void)
 	MSG("  # makedumpfile --split [OPTION] [-x VMLINUX|-i VMCOREINFO] VMCORE DUMPFILE1\n");
 	MSG("    DUMPFILE2 [DUMPFILE3 ..]\n");
 	MSG("\n");
+	MSG("  Using multiple threads to create DUMPFILE in parallel:\n");
+	MSG("  # makedumpfile [OPTION] [-x VMLINUX|-i VMCOREINFO] --num-threads THREADNUM\n");
+	MSG("    [--num-buffers BUFNUM] VMCORE DUMPFILE\n");
+	MSG("\n");
 	MSG("  Reassemble multiple DUMPFILEs:\n");
 	MSG("  # makedumpfile --reassemble DUMPFILE1 DUMPFILE2 [DUMPFILE3 ..] DUMPFILE\n");
 	MSG("\n");
@@ -184,6 +188,18 @@ print_usage(void)
 	MSG("      by the number of DUMPFILEs.\n");
 	MSG("      This feature supports only the kdump-compressed format.\n");
 	MSG("\n");
+	MSG("  [--num-threads THREADNUM]:\n");
+	MSG("      Read and compress the data of each page in parallel using multiple\n");
+	MSG("      threads, which reduces the time needed for saving DUMPFILE.\n");
+	MSG("      This feature only supports creating DUMPFILE in kdump-compressed format\n");
+	MSG("      from VMCORE in kdump-compressed format or ELF format.\n");
+	MSG("\n");
+	MSG("  [--num-buffers BUFNUM]:\n");
+	MSG("      This option is used together with --num-threads; see that option for\n");
+	MSG("      details. The multi-threaded process needs buffers to temporarily store\n");
+	MSG("      the page data generated by the threads, and this option specifies the\n");
+	MSG("      number of pages that can be stored.\n");
+	MSG("\n");
 	MSG("  [--reassemble]:\n");
 	MSG("      Reassemble multiple DUMPFILEs, which are created by --split option,\n");
 	MSG("      into one DUMPFILE. dumpfile1 and dumpfile2 are reassembled into dumpfile.\n");
-- 
1.7.1


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (10 preceding siblings ...)
  2015-06-05  7:57 ` [PATCH RFC 11/11] Add usage and manual about multiple threads process Zhou Wenjian
@ 2015-06-08  3:55 ` "Zhou, Wenjian/周文剑"
  2015-12-01  8:39   ` Chao Fan
  2015-06-10  6:06 ` Atsushi Kumagai
  12 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-06-08  3:55 UTC (permalink / raw)
  To: kexec

Hello all,

I tested this patch set on two machines; the benchmark results follow.

These tables show the time that makedumpfile spends, in seconds.

"core-data" in the table describes the contents of the vmcore.
	For example, a core-data value of 256 means that 256 * 8 bits of
	each page in the vmcore are set to 1.

"-l" in the table means producing an lzo-compressed vmcore

"-c" in the table means producing a zlib-compressed vmcore

###################################machine with 128G memory

************ makedumpfile -d 0 ******************
		core-data	256	1280
	threads_num
-l
	0			758	881	
	8			932	1014
	16			973	1085
-c
	0			3994	4071
	8			966	1007
	16			1053	1192

************ makedumpfile -d 3 ******************
		core-data	256	1280
	threads_num
-l
	0			764	847
	8			948	1058
	16			943	1069
-c
	0			4021	4050
	8			949	1029
	16			1051	1190

************ makedumpfile -d 31 ******************
		core-data	256	1280
	threads_num
-l
	0			4	4
	8			639	610
	16			680	680
-c
	0			14	13
	8			607	610
	16			631	662

###################################machine with 24G memory

************ makedumpfile -d 0 ******************
		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
	threads_num	
-l	
	0			15	140	186	196	196	196	196	197	197	197	195	195	195	195	186	131	15
	4			9	136	189	204	204	202	201	200	201	200	200	202	204	203	189	136	9
	8			11	131	193	198	198	202	206	205	206	205	205	202	198	197	193	132	11
	12			18	137	194	202	203	197	201	203	204	202	201	196	202	202	194	136	17
-c	
	0			80	786	967	1031	874	849	700	608	652	603	764	768	873	1031	1016	776	80
	4			82	262	315	321	296	256	255	220	218	221	241	268	303	320	319	259	84
	8			58	148	174	189	179	189	196	198	199	198	196	190	178	174	170	145	57
	12			56	112	131	157	170	189	200	204	204	203	199	191	170	157	132	111	59

************ makedumpfile -d 1 ******************
		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
	threads_num	
-l	
	0			16	134	194	204	204	205	205	206	205	207	204	203	204	204	193	134	15
	4			9	132	193	197	196	198	199	200	200	200	199	197	196	197	192	132	9
	8			12	135	189	202	204	200	197	196	197	195	196	199	203	202	189	136	12
	12			16	130	190	200	200	205	202	201	200	201	202	205	199	200	189	131	17
-c	
	0			77	775	1009	1032	872	853	699	606	643	602	758	765	870	1026	1014	774	78
	4			80	262	316	322	332	257	247	217	223	218	288	256	322	322	315	258	81
	8			56	146	173	176	170	184	198	205	207	203	198	185	169	180	169	149	56
	12			56	110	133	152	175	185	194	202	202	202	193	184	176	152	135	114	56

************ makedumpfile -d 7 ******************
		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
	threads_num	
-l	
	0			16	138	188	197	197	197	197	197	197	198	196	197	197	197	189	137	16
	4			10	131	187	202	205	203	202	202	203	203	201	203	204	201	187	131	8
	8			11	135	191	199	197	201	203	205	206	204	203	200	197	199	192	134	11
	12			18	134	195	201	203	197	199	202	202	201	199	196	203	201	197	134	19
-c	
	0			77	770	1011	1032	871	841	698	621	645	601	763	765	870	1025	1014	773	78
	4			81	263	311	320	319	255	240	216	242	214	240	257	300	319	314	255	80
	8			57	157	176	172	174	191	196	199	199	199	195	191	173	171	167	146	57
	12			55	111	136	156	170	188	203	204	204	203	201	186	168	156	136	112	56

************ makedumpfile -d 31 ******************
		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
	threads_num	
-l	
	0			1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1
	4			7	8	8	8	8	8	8	8	8	8	8	8	8	8	7	8	8
	8			11	11	11	10	11	11	11	11	11	11	10	11	11	11	11	11	11
	12			14	13	14	13	13	15	15	13	15	13	14	14	13	15	15	15	16
-c	
	0			4	4	5	4	4	4	4	4	4	4	4	4	4	4	4	4	4
	4			10	10	10	10	10	10	10	10	10	10	10	10	10	10	10	10	10
	8			12	12	12	13	12	12	12	12	12	12	13	12	14	13	12	12	13
	12			14	16	14	14	13	15	15	15	14	14	14	14	16	14	15	15	14


On 06/05/2015 03:56 PM, Zhou Wenjian wrote:
> This patch set implements parallel processing by means of multiple threads.
> With this patch set, it is available to use multiple threads to read
> and compress pages. This parallel process will save time.
> This feature only supports creating dumpfile in kdump-compressed format from
> vmcore in kdump-compressed format or elf format. Currently, sadump and
>   xen kdump are not supported.
>
> Qiao Nuohan (11):
>    Add readpage_kdump_compressed_parallel
>    Add mappage_elf_parallel
>    Add readpage_elf_parallel
>    Add read_pfn_parallel
>    Add function to initial bitmap for parallel use
>    Add filter_data_buffer_parallel
>    Add write_kdump_pages_parallel to allow parallel process
>    Add write_kdump_pages_parallel_cyclic to allow parallel process in
>      cyclic_mode
>    Initial and free data used for parallel process
>    Make makedumpfile available to read and compress pages parallelly
>    Add usage and manual about multiple threads process
>
>   Makefile       |    2 +
>   erase_info.c   |   29 +-
>   erase_info.h   |    2 +
>   makedumpfile.8 |   24 +
>   makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   makedumpfile.h |   79 +++
>   print_info.c   |   16 +
>   7 files changed, 1652 insertions(+), 5 deletions(-)
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec


-- 
Thanks
Zhou Wenjian

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
                   ` (11 preceding siblings ...)
  2015-06-08  3:55 ` [PATCH RFC 00/11] makedumpfile: parallel processing "Zhou, Wenjian/周文剑"
@ 2015-06-10  6:06 ` Atsushi Kumagai
  2015-06-11  3:47   ` "Zhou, Wenjian/周文剑"
  12 siblings, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-06-10  6:06 UTC (permalink / raw)
  To: zhouwj-fnst@cn.fujitsu.com; +Cc: kexec@lists.infradead.org

Hello Zhou,

>This patch set implements parallel processing by means of multiple threads.
>With this patch set, it is available to use multiple threads to read
>and compress pages. This parallel process will save time.
>This feature only supports creating dumpfile in kdump-compressed format from
>vmcore in kdump-compressed format or elf format. Currently, sadump and
> xen kdump are not supported.

makedumpfile already has a parallel processing feature (--split);
it parallelizes not only page compression but also disk i/o, so
I think --split includes what you want to do with this patch.

In what cases do you think this patch will be effective, and what
is the advantage of this patch?


Thanks
Atsushi Kumagai

>
>Qiao Nuohan (11):
>  Add readpage_kdump_compressed_parallel
>  Add mappage_elf_parallel
>  Add readpage_elf_parallel
>  Add read_pfn_parallel
>  Add function to initial bitmap for parallel use
>  Add filter_data_buffer_parallel
>  Add write_kdump_pages_parallel to allow parallel process
>  Add write_kdump_pages_parallel_cyclic to allow parallel process in
>    cyclic_mode
>  Initial and free data used for parallel process
>  Make makedumpfile available to read and compress pages parallelly
>  Add usage and manual about multiple threads process
>
> Makefile       |    2 +
> erase_info.c   |   29 +-
> erase_info.h   |    2 +
> makedumpfile.8 |   24 +
> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> makedumpfile.h |   79 +++
> print_info.c   |   16 +
> 7 files changed, 1652 insertions(+), 5 deletions(-)
>
>
>_______________________________________________
>kexec mailing list
>kexec@lists.infradead.org
>http://lists.infradead.org/mailman/listinfo/kexec

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-06-10  6:06 ` Atsushi Kumagai
@ 2015-06-11  3:47   ` "Zhou, Wenjian/周文剑"
  2015-06-15  1:59     ` qiaonuohan
  0 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-06-11  3:47 UTC (permalink / raw)
  To: Atsushi Kumagai; +Cc: kexec@lists.infradead.org

Hello,

Though --split processes in parallel, it cannot produce just one core file.
More processes bring better performance, but they also mean more split
core files. People may want to produce just one core file while still
getting the better performance of parallel processing.

So parallel processing by multiple threads is needed.
In the future, multiple threads could also be used within each split
process to accelerate it further.


On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
> Hello Zhou,
>
>> This patch set implements parallel processing by means of multiple threads.
>> With this patch set, it is available to use multiple threads to read
>> and compress pages. This parallel process will save time.
>> This feature only supports creating dumpfile in kdump-compressed format from
>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>> xen kdump are not supported.
>
> makedumpfile already has a parallel processing feature (--split);
> it parallelizes not only page compression but also disk i/o, so
> I think --split includes what you want to do with this patch.
>
> In what cases do you think this patch will be effective, and what
> is the advantage of this patch?
>
>
> Thanks
> Atsushi Kumagai
>
>>
>> Qiao Nuohan (11):
>>   Add readpage_kdump_compressed_parallel
>>   Add mappage_elf_parallel
>>   Add readpage_elf_parallel
>>   Add read_pfn_parallel
>>   Add function to initial bitmap for parallel use
>>   Add filter_data_buffer_parallel
>>   Add write_kdump_pages_parallel to allow parallel process
>>   Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>     cyclic_mode
>>   Initial and free data used for parallel process
>>   Make makedumpfile available to read and compress pages parallelly
>>   Add usage and manual about multiple threads process
>>
>> Makefile       |    2 +
>> erase_info.c   |   29 +-
>> erase_info.h   |    2 +
>> makedumpfile.8 |   24 +
>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> makedumpfile.h |   79 +++
>> print_info.c   |   16 +
>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec


-- 
Thanks
Zhou Wenjian

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-06-11  3:47   ` "Zhou, Wenjian/周文剑"
@ 2015-06-15  1:59     ` qiaonuohan
  2015-06-15  5:57       ` Atsushi Kumagai
  0 siblings, 1 reply; 43+ messages in thread
From: qiaonuohan @ 2015-06-15  1:59 UTC (permalink / raw)
  To: "Zhou, Wenjian/周文剑", Atsushi Kumagai
  Cc: kexec@lists.infradead.org

On 06/11/2015 11:47 AM, "Zhou, Wenjian/周文剑" wrote:
> Hello,
>
> Though --split processes in parallel, it cannot produce just one core file.
> More processes bring better performance, but they also mean more split
> core files. People may want to produce just one core file while still
> getting the better performance of parallel processing.
>
> So parallel processing by multiple threads is needed.
> In the future, multiple threads could also be used within each split
> process to accelerate it further.
>
>
> On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
>> Hello Zhou,
>>

Hello Atsushi,

>>> This patch set implements parallel processing by means of multiple threads.
>>> With this patch set, it is available to use multiple threads to read
>>> and compress pages. This parallel process will save time.
>>> This feature only supports creating dumpfile in kdump-compressed format from
>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>> xen kdump are not supported.
>>
>> makedumpfile already has a parallel processing feature (--split);
>> it parallelizes not only page compression but also disk i/o, so
>> I think --split includes what you want to do with this patch.
>>
>> In what cases do you think this patch will be effective, and what
>> is the advantage of this patch?

Since commit 428a5e99eea929639ab9c761f33743f78a961b1a (kdumpctl: Pass
disable_cpu_apicid to kexec of capture kernel) was merged, it has been
possible for us to use multiple cpus in the 2nd kernel.

Using multiple threads takes advantage of those multiple cpus in the 2nd
kernel. As memory grows bigger and bigger, dumping takes more time, so why
not take advantage of multiple cpus?

OTOH, --split does help a lot to improve performance. But more processes
mean more files, and saving and managing multiple files is not that
convenient.

Multiple threads do have some merit in improving performance. And later, as
Zhou said, we can also try combining --split with multiple threads to save
more time.

-- 
Regards
Qiao Nuohan

>>
>>
>> Thanks
>> Atsushi Kumagai
>>
>>>
>>> Qiao Nuohan (11):
>>>   Add readpage_kdump_compressed_parallel
>>>   Add mappage_elf_parallel
>>>   Add readpage_elf_parallel
>>>   Add read_pfn_parallel
>>>   Add function to initial bitmap for parallel use
>>>   Add filter_data_buffer_parallel
>>>   Add write_kdump_pages_parallel to allow parallel process
>>>   Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>>     cyclic_mode
>>>   Initial and free data used for parallel process
>>>   Make makedumpfile available to read and compress pages parallelly
>>>   Add usage and manual about multiple threads process
>>>
>>> Makefile       |    2 +
>>> erase_info.c   |   29 +-
>>> erase_info.h   |    2 +
>>> makedumpfile.8 |   24 +
>>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>> makedumpfile.h |   79 +++
>>> print_info.c   |   16 +
>>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>>
>>>
>>> _______________________________________________
>>> kexec mailing list
>>> kexec@lists.infradead.org
>>> http://lists.infradead.org/mailman/listinfo/kexec
>
>


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-06-15  1:59     ` qiaonuohan
@ 2015-06-15  5:57       ` Atsushi Kumagai
  2015-06-15  6:06         ` qiaonuohan
  0 siblings, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-06-15  5:57 UTC (permalink / raw)
  To: qiaonuohan@cn.fujitsu.com, zhouwj-fnst@cn.fujitsu.com
  Cc: kexec@lists.infradead.org

Hello Qiao,

>> On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
>>> Hello Zhou,
>>>
>
>Hello Atsushi,
>
>>>> This patch set implements parallel processing by means of multiple threads.
>>>> With this patch set, it is available to use multiple threads to read
>>>> and compress pages. This parallel process will save time.
>>>> This feature only supports creating dumpfile in kdump-compressed format from
>>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>>> xen kdump are not supported.
>>>
>>> makedumpfile already has a parallel processing feature (--split);
>>> it parallelizes not only page compression but also disk i/o, so
>>> I think --split includes what you want to do with this patch.
>>>
>>> In what cases do you think this patch will be effective, and what
>>> is the advantage of this patch?
>
>Since commit 428a5e99eea929639ab9c761f33743f78a961b1a (kdumpctl: Pass
>disable_cpu_apicid to kexec of capture kernel) was merged, it has been
>possible for us to use multiple cpus in the 2nd kernel.
>
>Using multiple threads takes advantage of those multiple cpus in the 2nd
>kernel. As memory grows bigger and bigger, dumping takes more time, so why
>not take advantage of multiple cpus?
>
>OTOH, --split does help a lot to improve performance. But more processes
>mean more files, and saving and managing multiple files is not that
>convenient.

I see. Actually, I guess some users may be reluctant to use --split since
it requires concatenation before analysis, and it seems that some improvement
from using multiple threads can be expected at least in the zlib case.
So I agree with the concept.

>Multiple threads do have some merit in improving performance. And later, as
>Zhou said, we can also try combining --split with multiple threads to save
>more time.

At first I thought it would be enough to modify the --split path to generate
a single vmcore. However, if the compression process is the bottleneck, we
should allot multiple cpus to each i/o process when doing parallel i/o. For
that reason, it's good to introduce the new feature to create multiple
threads in addition to --split.

Just one thing: when you make the complete version, please base it on the
devel branch, because the cyclic/non-cyclic code has changed since v1.5.8.


Thanks
Atsushi Kumagai


>--
>Regards
>Qiao Nuohan
>
>>>
>>>
>>> Thanks
>>> Atsushi Kumagai
>>>
>>>>
>>>> Qiao Nuohan (11):
>>>>   Add readpage_kdump_compressed_parallel
>>>>   Add mappage_elf_parallel
>>>>   Add readpage_elf_parallel
>>>>   Add read_pfn_parallel
>>>>   Add function to initial bitmap for parallel use
>>>>   Add filter_data_buffer_parallel
>>>>   Add write_kdump_pages_parallel to allow parallel process
>>>>   Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>>>     cyclic_mode
>>>>   Initial and free data used for parallel process
>>>>   Make makedumpfile available to read and compress pages parallelly
>>>>   Add usage and manual about multiple threads process
>>>>
>>>> Makefile       |    2 +
>>>> erase_info.c   |   29 +-
>>>> erase_info.h   |    2 +
>>>> makedumpfile.8 |   24 +
>>>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>> makedumpfile.h |   79 +++
>>>> print_info.c   |   16 +
>>>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>>>
>>>>
>>>> _______________________________________________
>>>> kexec mailing list
>>>> kexec@lists.infradead.org
>>>> http://lists.infradead.org/mailman/listinfo/kexec
>>
>>

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-06-15  5:57       ` Atsushi Kumagai
@ 2015-06-15  6:06         ` qiaonuohan
  0 siblings, 0 replies; 43+ messages in thread
From: qiaonuohan @ 2015-06-15  6:06 UTC (permalink / raw)
  To: Atsushi Kumagai, zhouwj-fnst@cn.fujitsu.com; +Cc: kexec@lists.infradead.org

On 06/15/2015 01:57 PM, Atsushi Kumagai wrote:
> Hello Qiao,
>
>>> On 06/10/2015 02:06 PM, Atsushi Kumagai wrote:
>>>> Hello Zhou,
>>>>
>>
>> Hello Atsushi,
>>
>>>>> This patch set implements parallel processing by means of multiple threads.
>>>>> With this patch set, it is available to use multiple threads to read
>>>>> and compress pages. This parallel process will save time.
>>>>> This feature only supports creating dumpfile in kdump-compressed format from
>>>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>>>> xen kdump are not supported.
>>>>
>>>> makedumpfile already has a parallel processing feature (--split);
>>>> it parallelizes not only page compression but also disk i/o, so
>>>> I think --split includes what you want to do with this patch.
>>>>
>>>> In what cases do you think this patch will be effective, and what
>>>> is the advantage of this patch?
>>
>> Since commit 428a5e99eea929639ab9c761f33743f78a961b1a (kdumpctl: Pass
>> disable_cpu_apicid to kexec of capture kernel) was merged, it has been
>> possible for us to use multiple cpus in the 2nd kernel.
>>
>> Using multiple threads takes advantage of those multiple cpus in the 2nd
>> kernel. As memory grows bigger and bigger, dumping takes more time, so why
>> not take advantage of multiple cpus?
>>
>> OTOH, --split does help a lot to improve performance. But more processes
>> mean more files, and saving and managing multiple files is not that
>> convenient.
>
> I see. Actually, I guess some users may be reluctant to use --split since
> it requires concatenation before analysis, and it seems that some improvement
> from using multiple threads can be expected at least in the zlib case.
> So I agree with the concept.
>
>> Multiple threads do have some merit in improving performance. And later, as
>> Zhou said, we can also try combining --split with multiple threads to save
>> more time.
>
> At first I thought it would be enough to modify the --split path to generate
> a single vmcore. However, if the compression process is the bottleneck, we
> should allot multiple cpus to each i/o process when doing parallel i/o. For
> that reason, it's good to introduce the new feature to create multiple
> threads in addition to --split.

I see.

>
> Just one thing: when you make the complete version, please base it on the
> devel branch, because the cyclic/non-cyclic code has changed since v1.5.8.

Yes, we will start rebasing the code.

>
>
> Thanks
> Atsushi Kumagai
>
>
>> --
>> Regards
>> Qiao Nuohan
>>
>>>>
>>>>
>>>> Thanks
>>>> Atsushi Kumagai
>>>>
>>>>>
>>>>> Qiao Nuohan (11):
>>>>>    Add readpage_kdump_compressed_parallel
>>>>>    Add mappage_elf_parallel
>>>>>    Add readpage_elf_parallel
>>>>>    Add read_pfn_parallel
>>>>>    Add function to initial bitmap for parallel use
>>>>>    Add filter_data_buffer_parallel
>>>>>    Add write_kdump_pages_parallel to allow parallel process
>>>>>    Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>>>>      cyclic_mode
>>>>>    Initial and free data used for parallel process
>>>>>    Make makedumpfile available to read and compress pages parallelly
>>>>>    Add usage and manual about multiple threads process
>>>>>
>>>>> Makefile       |    2 +
>>>>> erase_info.c   |   29 +-
>>>>> erase_info.h   |    2 +
>>>>> makedumpfile.8 |   24 +
>>>>> makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>> makedumpfile.h |   79 +++
>>>>> print_info.c   |   16 +
>>>>> 7 files changed, 1652 insertions(+), 5 deletions(-)
>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> kexec mailing list
>>>>> kexec@lists.infradead.org
>>>>> http://lists.infradead.org/mailman/listinfo/kexec
>>>
>>>
>


-- 
Regards
Qiao Nuohan

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-06-08  3:55 ` [PATCH RFC 00/11] makedumpfile: parallel processing "Zhou, Wenjian/周文剑"
@ 2015-12-01  8:39   ` Chao Fan
  2015-12-02  5:29     ` "Zhou, Wenjian/周文剑"
  0 siblings, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-01  8:39 UTC (permalink / raw)
  To: Wenjian Zhou/周文剑; +Cc: Shaohui Deng, kexec

Hi Zhou Wenjian,

I ran some tests following your tables and hit a problem when I set
dump_level to 31. The machine has 1T of memory, and with dump_level 31
the size of the vmcore is 17G. The kernel is 3.10.0-327.el7.x86_64.
The kexec-tools is kexec-tools-2.0.7-38.el7.x86_64.

If I use 
core_collector time makedumpfile -l --message-level 1 -d 31 
in kdump based on makedumpfile 1.5.7, the time is 
63 seconds (the average of many tests).

And then I used the kdump based on makedumpfile 1.5.9.
core_collector time makedumpfile -l --message-level 1 -d 31
the time is 58 seconds.

core_collector time makedumpfile --num-threads 1 -l --message-level 1 -d 31
the time is 240 seconds.

core_collector time makedumpfile --num-threads 2 -l --message-level 1 -d 31
the time is 189 seconds.

core_collector time makedumpfile --num-threads 4 -l --message-level 1 -d 31
the time is 220 seconds.

core_collector time makedumpfile --num-threads 8 -l --message-level 1 -d 31
the time is 417 seconds.

core_collector time makedumpfile --num-threads 12 -l --message-level 1 -d 31
the time is 579 seconds.

core_collector time makedumpfile --num-threads 16 -l --message-level 1 -d 31
the time is 756 seconds.

So I do not understand why adding "--num-threads" makes makedumpfile take
more time than running without it. Your table also shows that with
makedumpfile -d 31, makedumpfile is fastest when threads_num is 0.

If there are any problems in my tests, please tell me.

Thanks,
Chao Fan

----- Original Message -----
> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> To: kexec@lists.infradead.org
> Sent: Monday, June 8, 2015 11:55:41 AM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> 
> Hello all,
>
> I tested this patch set on two machines; the benchmark results follow.
>
> These tables show the time that makedumpfile spends, in seconds.
>
> "core-data" in the table describes the contents of the vmcore.
> 	For example, a core-data value of 256 means that 256 * 8 bits of
> 	each page in the vmcore are set to 1.
>
> "-l" in the table means producing an lzo-compressed vmcore
>
> "-c" in the table means producing a zlib-compressed vmcore
> 
> ###################################machine with 128G memory
> 
> ************ makedumpfile -d 0 ******************
> 		core-data	256	1280
> 	threads_num
> -l
> 	0			758	881
> 	8			932	1014
> 	16			973	1085
> -c
> 	0			3994	4071
> 	8			966	1007
> 	16			1053	1192
> 
> ************ makedumpfile -d 3 ******************
> 		core-data	256	1280
> 	threads_num
> -l
> 	0			764	847
> 	8			948	1058
> 	16			943	1069
> -c
> 	0			4021	4050
> 	8			949	1029
> 	16			1051	1190
> 
> ************ makedumpfile -d 31 ******************
> 		core-data	256	1280
> 	threads_num
> -l
> 	0			4	4
> 	8			639	610
> 	16			680	680
> -c
> 	0			14	13
> 	8			607	610
> 	16			631	662
> 
> ###################################machine with 24G memory
> 
> ************ makedumpfile -d 0 ******************
> 		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
> 	threads_num
> -l
> 	0			15	140	186	196	196	196	196	197	197	197	195	195	195	195	186	131	15
> 	4			9	136	189	204	204	202	201	200	201	200	200	202	204	203	189	136	9
> 	8			11	131	193	198	198	202	206	205	206	205	205	202	198	197	193	132	11
> 	12			18	137	194	202	203	197	201	203	204	202	201	196	202	202	194	136	17
> -c
> 	0			80	786	967	1031	874	849	700	608	652	603	764	768	873	1031	1016	776	80
> 	4			82	262	315	321	296	256	255	220	218	221	241	268	303	320	319	259	84
> 	8			58	148	174	189	179	189	196	198	199	198	196	190	178	174	170	145	57
> 	12			56	112	131	157	170	189	200	204	204	203	199	191	170	157	132	111	59
> 
> ************ makedumpfile -d 1 ******************
> 		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
> 	threads_num
> -l
> 	0			16	134	194	204	204	205	205	206	205	207	204	203	204	204	193	134	15
> 	4			9	132	193	197	196	198	199	200	200	200	199	197	196	197	192	132	9
> 	8			12	135	189	202	204	200	197	196	197	195	196	199	203	202	189	136	12
> 	12			16	130	190	200	200	205	202	201	200	201	202	205	199	200	189	131	17
> -c
> 	0			77	775	1009	1032	872	853	699	606	643	602	758	765	870	1026	1014	774	78
> 	4			80	262	316	322	332	257	247	217	223	218	288	256	322	322	315	258	81
> 	8			56	146	173	176	170	184	198	205	207	203	198	185	169	180	169	149	56
> 	12			56	110	133	152	175	185	194	202	202	202	193	184	176	152	135	114	56
> 
> ************ makedumpfile -d 7 ******************
> 		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
> 	threads_num
> -l
> 	0			16	138	188	197	197	197	197	197	197	198	196	197	197	197	189	137	16
> 	4			10	131	187	202	205	203	202	202	203	203	201	203	204	201	187	131	8
> 	8			11	135	191	199	197	201	203	205	206	204	203	200	197	199	192	134	11
> 	12			18	134	195	201	203	197	199	202	202	201	199	196	203	201	197	134	19
> -c
> 	0			77	770	1011	1032	871	841	698	621	645	601	763	765	870	1025	1014	773	78
> 	4			81	263	311	320	319	255	240	216	242	214	240	257	300	319	314	255	80
> 	8			57	157	176	172	174	191	196	199	199	199	195	191	173	171	167	146	57
> 	12			55	111	136	156	170	188	203	204	204	203	201	186	168	156	136	112	56
> 
> ************ makedumpfile -d 31 ******************
> 		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
> 	threads_num
> -l
> 	0			1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1
> 	4			7	8	8	8	8	8	8	8	8	8	8	8	8	8	7	8	8
> 	8			11	11	11	10	11	11	11	11	11	11	10	11	11	11	11	11	11
> 	12			14	13	14	13	13	15	15	13	15	13	14	14	13	15	15	15	16
> -c
> 	0			4	4	5	4	4	4	4	4	4	4	4	4	4	4	4	4	4
> 	4			10	10	10	10	10	10	10	10	10	10	10	10	10	10	10	10	10
> 	8			12	12	12	13	12	12	12	12	12	12	13	12	14	13	12	12	13
> 	12			14	16	14	14	13	15	15	15	14	14	14	14	16	14	15	15	14
> 
> 
> On 06/05/2015 03:56 PM, Zhou Wenjian wrote:
> > This patch set implements parallel processing by means of multiple threads.
> > With this patch set, it is available to use multiple threads to read
> > and compress pages. This parallel process will save time.
> > This feature only supports creating dumpfile in kdump-compressed format from
> > vmcore in kdump-compressed format or elf format. Currently, sadump and
> >   xen kdump are not supported.
> >
> > Qiao Nuohan (11):
> >    Add readpage_kdump_compressed_parallel
> >    Add mappage_elf_parallel
> >    Add readpage_elf_parallel
> >    Add read_pfn_parallel
> >    Add function to initial bitmap for parallel use
> >    Add filter_data_buffer_parallel
> >    Add write_kdump_pages_parallel to allow parallel process
> >    Add write_kdump_pages_parallel_cyclic to allow parallel process in
> >      cyclic_mode
> >    Initial and free data used for parallel process
> >    Make makedumpfile available to read and compress pages parallelly
> >    Add usage and manual about multiple threads process
> >
> >   Makefile       |    2 +
> >   erase_info.c   |   29 +-
> >   erase_info.h   |    2 +
> >   makedumpfile.8 |   24 +
> >   makedumpfile.c | 1505 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >   makedumpfile.h |   79 +++
> >   print_info.c   |   16 +
> >   7 files changed, 1652 insertions(+), 5 deletions(-)
> >
> >
> > _______________________________________________
> > kexec mailing list
> > kexec@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/kexec
> 
> 
> --
> Thanks
> Zhou Wenjian
> 
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
> 

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-01  8:39   ` Chao Fan
@ 2015-12-02  5:29     ` "Zhou, Wenjian/周文剑"
  2015-12-02  7:24       ` Dave Young
  0 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-02  5:29 UTC (permalink / raw)
  To: Chao Fan; +Cc: Shaohui Deng, kexec

On 12/01/2015 04:39 PM, Chao Fan wrote:
> Hi Zhou Wenjian,
>
> I ran some tests following your tables and hit a problem when I set
> dump_level to 31. The machine has 1T of memory, and with dump_level 31
> the size of the vmcore is 17G. The kernel is 3.10.0-327.el7.x86_64.
> The kexec-tools is kexec-tools-2.0.7-38.el7.x86_64.
>
> If I use
> core_collector time makedumpfile -l --message-level 1 -d 31
> in kdump based on makedumpfile 1.5.7, the time is
> 63 seconds (the average of many tests).
>
> And then I used the kdump based on makedumpfile 1.5.9.
> core_collector time makedumpfile -l --message-level 1 -d 31
> the time is 58 seconds.
>
> core_collector time makedumpfile --num-threads 1 -l --message-level 1 -d 31
> the time is 240 seconds.
>
> core_collector time makedumpfile --num-threads 2 -l --message-level 1 -d 31
> the time is 189 seconds.
>
> core_collector time makedumpfile --num-threads 4 -l --message-level 1 -d 31
> the time is 220 seconds.
>
> core_collector time makedumpfile --num-threads 8 -l --message-level 1 -d 31
> the time is 417 seconds.
>
> core_collector time makedumpfile --num-threads 12 -l --message-level 1 -d 31
> the time is 579 seconds.
>
> core_collector time makedumpfile --num-threads 16 -l --message-level 1 -d 31
> the time is 756 seconds.
>
> So I do not understand why adding "--num-threads" makes makedumpfile take
> more time than running without it. Your table also shows that with
> makedumpfile -d 31, makedumpfile is fastest when threads_num is 0.
>
> If there are any problems in my tests, please tell me.
>
Hello,

I think there is no problem, provided the other test results are as expected.

--num-threads mainly reduces compression time,
so for lzo it usually does not help much.

However, when "-d 31" is specified it gets worse.
Fewer than 50 buffers are used to cache the compressed pages,
and even a page that has been filtered still takes a buffer.
So when "-d 31" is specified, the filtered pages occupy most
of the buffers, and the pages that actually need compression
cannot be compressed in parallel.

So it is not surprising that "--num-threads" takes more time
with "-l -d 31".

-- 
Thanks
Zhou

> Thanks,
> Chao Fan
>
> ----- Original Message -----
>> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> To: kexec@lists.infradead.org
>> Sent: Monday, June 8, 2015 11:55:41 AM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> Hello all,
>>
>> I tested this patch set on two machines; the benchmark results follow.
>>
>> These tables show the time that makedumpfile spends, in seconds.
>>
>> "core-data" in the table describes the contents of the vmcore.
>> 	For example, a core-data value of 256 means that 256 * 8 bits of
>> 	each page in the vmcore are set to 1.
>>
>> "-l" in the table means producing an lzo-compressed vmcore
>>
>> "-c" in the table means producing a zlib-compressed vmcore
>>
>> ###################################machine with 128G memory
>>
>> ************ makedumpfile -d 0 ******************
>> 		core-data	256	1280
>> 	threads_num
>> -l
>> 	0			758	881
>> 	8			932	1014
>> 	16			973	1085
>> -c
>> 	0			3994	4071
>> 	8			966	1007
>> 	16			1053	1192
>>
>> ************ makedumpfile -d 3 ******************
>> 		core-data	256	1280
>> 	threads_num
>> -l
>> 	0			764	847
>> 	8			948	1058
>> 	16			943	1069
>> -c
>> 	0			4021	4050
>> 	8			949	1029
>> 	16			1051	1190
>>
>> ************ makedumpfile -d 31 ******************
>> 		core-data	256	1280
>> 	threads_num
>> -l
>> 	0			4	4
>> 	8			639	610
>> 	16			680	680
>> -c
>> 	0			14	13
>> 	8			607	610
>> 	16			631	662
>>
>> ###################################machine with 24G memory
>>
>> ************ makedumpfile -d 0 ******************
>> 		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
>> 	threads_num
>> -l
>> 	0			15	140	186	196	196	196	196	197	197	197	195	195	195	195	186	131	15
>> 	4			9	136	189	204	204	202	201	200	201	200	200	202	204	203	189	136	9
>> 	8			11	131	193	198	198	202	206	205	206	205	205	202	198	197	193	132	11
>> 	12			18	137	194	202	203	197	201	203	204	202	201	196	202	202	194	136	17
>> -c
>> 	0			80	786	967	1031	874	849	700	608	652	603	764	768	873	1031	1016	776	80
>> 	4			82	262	315	321	296	256	255	220	218	221	241	268	303	320	319	259	84
>> 	8			58	148	174	189	179	189	196	198	199	198	196	190	178	174	170	145	57
>> 	12			56	112	131	157	170	189	200	204	204	203	199	191	170	157	132	111	59
>>
>> ************ makedumpfile -d 1 ******************
>> 		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
>> 	threads_num
>> -l
>> 	0			16	134	194	204	204	205	205	206	205	207	204	203	204	204	193	134	15
>> 	4			9	132	193	197	196	198	199	200	200	200	199	197	196	197	192	132	9
>> 	8			12	135	189	202	204	200	197	196	197	195	196	199	203	202	189	136	12
>> 	12			16	130	190	200	200	205	202	201	200	201	202	205	199	200	189	131	17
>> -c
>> 	0			77	775	1009	1032	872	853	699	606	643	602	758	765	870	1026	1014	774	78
>> 	4			80	262	316	322	332	257	247	217	223	218	288	256	322	322	315	258	81
>> 	8			56	146	173	176	170	184	198	205	207	203	198	185	169	180	169	149	56
>> 	12			56	110	133	152	175	185	194	202	202	202	193	184	176	152	135	114	56
>>
>> ************ makedumpfile -d 7 ******************
>> 		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
>> 	threads_num
>> -l
>> 	0			16	138	188	197	197	197	197	197	197	198	196	197	197	197	189	137	16
>> 	4			10	131	187	202	205	203	202	202	203	203	201	203	204	201	187	131	8
>> 	8			11	135	191	199	197	201	203	205	206	204	203	200	197	199	192	134	11
>> 	12			18	134	195	201	203	197	199	202	202	201	199	196	203	201	197	134	19
>> -c
>> 	0			77	770	1011	1032	871	841	698	621	645	601	763	765	870	1025	1014	773	78
>> 	4			81	263	311	320	319	255	240	216	242	214	240	257	300	319	314	255	80
>> 	8			57	157	176	172	174	191	196	199	199	199	195	191	173	171	167	146	57
>> 	12			55	111	136	156	170	188	203	204	204	203	201	186	168	156	136	112	56
>>
>> ************ makedumpfile -d 31 ******************
>> 		core-data	0	256	512	768	1024	1280	1536	1792	2048	2304	2560	2816	3072	3328	3584	3840	4096
>> 	threads_num
>> -l
>> 	0			1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1
>> 	4			7	8	8	8	8	8	8	8	8	8	8	8	8	8	7	8	8
>> 	8			11	11	11	10	11	11	11	11	11	11	10	11	11	11	11	11	11
>> 	12			14	13	14	13	13	15	15	13	15	13	14	14	13	15	15	15	16
>> -c
>> 	0			4	4	5	4	4	4	4	4	4	4	4	4	4	4	4	4	4
>> 	4			10	10	10	10	10	10	10	10	10	10	10	10	10	10	10	10	10
>> 	8			12	12	12	13	12	12	12	12	12	12	13	12	14	13	12	12	13
>> 	12			14	16	14	14	13	15	15	15	14	14	14	14	16	14	15	15	14
>>
>>
>> On 06/05/2015 03:56 PM, Zhou Wenjian wrote:
>>> This patch set implements parallel processing by means of multiple threads.
>>> With this patch set, it is available to use multiple threads to read
>>> and compress pages. This parallel process will save time.
>>> This feature only supports creating dumpfile in kdump-compressed format
>>> from
>>> vmcore in kdump-compressed format or elf format. Currently, sadump and
>>>    xen kdump are not supported.
>>>
>>> Qiao Nuohan (11):
>>>     Add readpage_kdump_compressed_parallel
>>>     Add mappage_elf_parallel
>>>     Add readpage_elf_parallel
>>>     Add read_pfn_parallel
>>>     Add function to initial bitmap for parallel use
>>>     Add filter_data_buffer_parallel
>>>     Add write_kdump_pages_parallel to allow parallel process
>>>     Add write_kdump_pages_parallel_cyclic to allow parallel process in
>>>       cyclic_mode
>>>     Initial and free data used for parallel process
>>>     Make makedumpfile available to read and compress pages parallelly
>>>     Add usage and manual about multiple threads process
>>>
>>>    Makefile       |    2 +
>>>    erase_info.c   |   29 +-
>>>    erase_info.h   |    2 +
>>>    makedumpfile.8 |   24 +
>>>    makedumpfile.c | 1505
>>>    +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>    makedumpfile.h |   79 +++
>>>    print_info.c   |   16 +
>>>    7 files changed, 1652 insertions(+), 5 deletions(-)
>>>
>>>
>>> _______________________________________________
>>> kexec mailing list
>>> kexec@lists.infradead.org
>>> http://lists.infradead.org/mailman/listinfo/kexec
>>
>>
>> --
>> Thanks
>> Zhou Wenjian
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>




_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-02  5:29     ` "Zhou, Wenjian/周文剑"
@ 2015-12-02  7:24       ` Dave Young
  2015-12-02  7:38         ` "Zhou, Wenjian/周文剑"
  0 siblings, 1 reply; 43+ messages in thread
From: Dave Young @ 2015-12-02  7:24 UTC (permalink / raw)
  To: "Zhou, Wenjian/周文剑"
  Cc: Chao Fan, Shaohui Deng, kexec

Hi,

On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> I think there is no problem if other test results are as expected.
> 
> --num-threads mainly reduces the time of compressing.
> So for lzo, it can't do much help at most of time.

It seems the help text of --num-threads does not state this exactly:

  [--num-threads THREADNUM]:
      Using multiple threads to read and compress data of each page in parallel.
      And it will reduces time for saving DUMPFILE.
      This feature only supports creating DUMPFILE in kdump-comressed format from
      VMCORE in kdump-compressed format or elf format.

Lzo is also a compression method; it should be mentioned that --num-threads
only supports zlib-compressed vmcores.

It is also worth mentioning the recommended -d value for this feature.

Thanks
Dave

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-02  7:24       ` Dave Young
@ 2015-12-02  7:38         ` "Zhou, Wenjian/周文剑"
  2015-12-04  2:30           ` Atsushi Kumagai
  0 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-02  7:38 UTC (permalink / raw)
  To: Dave Young; +Cc: Chao Fan, Shaohui Deng, kexec

On 12/02/2015 03:24 PM, Dave Young wrote:
> Hi,
>
> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> I think there is no problem if other test results are as expected.
>>
>> --num-threads mainly reduces the time of compressing.
>> So for lzo, it can't do much help at most of time.
>
> Seems the help of --num-threads does not say it exactly:
>
>    [--num-threads THREADNUM]:
>        Using multiple threads to read and compress data of each page in parallel.
>        And it will reduces time for saving DUMPFILE.
>        This feature only supports creating DUMPFILE in kdump-comressed format from
>        VMCORE in kdump-compressed format or elf format.
>
> Lzo is also a compress method, it should be mentioned that --num-threads only
> supports zlib compressed vmcore.
>

Sorry, it seems that something I said was not so clear.
lzo is also supported. Since lzo compresses data at a high speed, the
improvement in performance is not so obvious most of the time.

> Also worth to mention about the recommended -d value for this feature.
>

Yes, I think it's worth mentioning. I forgot it.

-- 
Thanks
Zhou



_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-02  7:38         ` "Zhou, Wenjian/周文剑"
@ 2015-12-04  2:30           ` Atsushi Kumagai
  2015-12-04  3:33             ` "Zhou, Wenjian/周文剑"
  0 siblings, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-12-04  2:30 UTC (permalink / raw)
  To: zhouwj-fnst@cn.fujitsu.com; +Cc: kexec@lists.infradead.org

Hello, Zhou

>On 12/02/2015 03:24 PM, Dave Young wrote:
>> Hi,
>>
>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>> I think there is no problem if other test results are as expected.
>>>
>>> --num-threads mainly reduces the time of compressing.
>>> So for lzo, it can't do much help at most of time.
>>
>> Seems the help of --num-threads does not say it exactly:
>>
>>    [--num-threads THREADNUM]:
>>        Using multiple threads to read and compress data of each page in parallel.
>>        And it will reduces time for saving DUMPFILE.
>>        This feature only supports creating DUMPFILE in kdump-comressed format from
>>        VMCORE in kdump-compressed format or elf format.
>>
>> Lzo is also a compress method, it should be mentioned that --num-threads only
>> supports zlib compressed vmcore.
>>
>
>Sorry, it seems that something I said is not so clear.
>lzo is also supported. Since lzo compresses data at a high speed, the
>improving of the performance is not so obvious at most of time.
>
>> Also worth to mention about the recommended -d value for this feature.
>>
>
>Yes, I think it's worth. I forgot it.

I saw your patch, but I think I should confirm what the problem is first.

>However, when "-d 31" is specified, it will be worse.
>Less than 50 buffers are used to cache the compressed page.
>And even the page has been filtered, it will also take a buffer.
>So if "-d 31" is specified, the filtered page will use a lot
>of buffers. Then the page which needs to be compressed can't
>be compressed parallel.

Could you explain in more detail why compression will not be parallel?
Indeed, the buffers are also used for filtered pages, which sounds inefficient.
However, I don't understand why it prevents parallel compression.

Further, according to Chao's benchmark, there is a big performance
degradation even if the number of threads is 1 (58s vs. 240s).
The current implementation seems to have some problems; we should
solve them.


Thanks,
Atsushi Kumagai
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-04  2:30           ` Atsushi Kumagai
@ 2015-12-04  3:33             ` "Zhou, Wenjian/周文剑"
  2015-12-04  8:56               ` Chao Fan
  2015-12-10  8:14               ` Atsushi Kumagai
  0 siblings, 2 replies; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-04  3:33 UTC (permalink / raw)
  To: Atsushi Kumagai; +Cc: kexec@lists.infradead.org

Hello Kumagai,

On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> Hello, Zhou
>
>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>> Hi,
>>>
>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>> I think there is no problem if other test results are as expected.
>>>>
>>>> --num-threads mainly reduces the time of compressing.
>>>> So for lzo, it can't do much help at most of time.
>>>
>>> Seems the help of --num-threads does not say it exactly:
>>>
>>>     [--num-threads THREADNUM]:
>>>         Using multiple threads to read and compress data of each page in parallel.
>>>         And it will reduces time for saving DUMPFILE.
>>>         This feature only supports creating DUMPFILE in kdump-comressed format from
>>>         VMCORE in kdump-compressed format or elf format.
>>>
>>> Lzo is also a compress method, it should be mentioned that --num-threads only
>>> supports zlib compressed vmcore.
>>>
>>
>> Sorry, it seems that something I said is not so clear.
>> lzo is also supported. Since lzo compresses data at a high speed, the
>> improving of the performance is not so obvious at most of time.
>>
>>> Also worth to mention about the recommended -d value for this feature.
>>>
>>
>> Yes, I think it's worth. I forgot it.
>
> I saw your patch, but I think I should confirm what is the problem first.
>
>> However, when "-d 31" is specified, it will be worse.
>> Less than 50 buffers are used to cache the compressed page.
>> And even the page has been filtered, it will also take a buffer.
>> So if "-d 31" is specified, the filtered page will use a lot
>> of buffers. Then the page which needs to be compressed can't
>> be compressed parallel.
>
> Could you explain why compression will not be parallel in more detail ?
> Actually the buffers are used also for filtered pages, it sounds inefficient.
> However, I don't understand why it prevents parallel compression.
>

Think about this: on a machine with huge memory, most of the pages will be
filtered, and we have 5 buffers.

page1       page2      page3     page4     page5      page6       page7 .....
[buffer1]   [2]        [3]       [4]       [5]
unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered

Since a filtered page also takes a buffer, page6 can't be compressed
while page1 is being compressed.
That is why it prevents parallel compression.
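
To make the bottleneck concrete, here is a toy C model of the scheme
described above; the buffer count, the variable names, and the
one-unfiltered-page-in-six ratio are illustrative assumptions, not the
actual makedumpfile code. Every page, filtered or not, claims a slot in
a small fixed ring that is retired strictly in page order:

/* Toy model of the current buffer scheme, NOT makedumpfile code. */
#include <stdio.h>

#define NR_BUFFERS 5
#define NR_PAGES   24

int main(void)
{
	int slot_has_work[NR_BUFFERS];
	int head = 0;	/* next slot to fill; oldest slot is retired first */

	for (int page = 0; page < NR_PAGES; page++) {
		/* Assume roughly one page in six survives filtering,
		 * as in the page1..page7 example above. */
		int unfiltered = (page % 6 == 0);

		/* Every page, filtered or not, occupies a slot. */
		slot_has_work[head % NR_BUFFERS] = unfiltered;
		head++;

		if (head >= NR_BUFFERS) {
			int work = 0;
			for (int i = 0; i < NR_BUFFERS; i++)
				work += slot_has_work[i];
			printf("after page %2d: %d of %d slots hold compressible data\n",
			       page + 1, work, NR_BUFFERS);
		}
	}
	return 0;
}

With these numbers, at most one of the five slots ever holds compressible
data, so no matter how many compressor threads exist, only one can be
busy at a time.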

> Further, according to Chao's benchmark, there is a big performance
> degradation even if the number of thread is 1. (58s vs 240s)
> The current implementation seems to have some problems, we should
> solve them.
>

If "-d 31" is specified, on the one hand we can't save time by compressing
parallel, on the other hand we will introduce some extra work by adding
"--num-threads". So it is obvious that it will have a performance degradation.

I'm not so sure whether it is a problem that the performance degradation is so big.
But I think that if it works as expected in the other cases, this won't be a problem
(or a problem that needs to be fixed), since the performance degradation exists
in theory.

Or the current implementation could be replaced by a new algorithm.
For example (a rough sketch follows):
we can add an array to record whether each page is filtered or not,
and only unfiltered pages will take a buffer.
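
A minimal sketch of that bookkeeping, with hypothetical names; this is
the idea only, not a patch against makedumpfile:

#include <stdio.h>

#define NR_BUFFERS 5
#define NR_PAGES   7

int main(void)
{
	/* Per-page filtered flags: only unfiltered pages claim one of
	 * the scarce compression buffers; filtered pages are recorded
	 * in the array and never touch the ring. */
	unsigned char filtered[NR_PAGES] = { 0, 1, 1, 1, 1, 0, 1 };
	int buffer_of_page[NR_PAGES];
	int next_free = 0;

	for (int page = 0; page < NR_PAGES; page++) {
		if (filtered[page]) {
			buffer_of_page[page] = -1;	/* no slot wasted */
			continue;
		}
		buffer_of_page[page] = next_free;	/* hand to a thread */
		next_free = (next_free + 1) % NR_BUFFERS;
	}

	/* The writer walks pages in order: filtered pages are handled
	 * immediately, unfiltered pages wait for their own buffer. */
	for (int page = 0; page < NR_PAGES; page++)
		printf("page%d -> %s\n", page + 1,
		       buffer_of_page[page] < 0 ? "filtered, no buffer"
						: "compress in a buffer");
	return 0;
}

In this scheme page1 and page6 can be compressed by two threads at the
same time, since the four filtered pages between them no longer pin
four of the five buffers.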

But I'm not sure whether it is worth it.
Since "-l -d 31" is already fast enough, the new algorithm can't help much there.

-- 
Thanks
Zhou



_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-04  3:33             ` "Zhou, Wenjian/周文剑"
@ 2015-12-04  8:56               ` Chao Fan
  2015-12-07  1:09                 ` "Zhou, Wenjian/周文剑"
  2015-12-10  8:14               ` Atsushi Kumagai
  1 sibling, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-04  8:56 UTC (permalink / raw)
  To: Wenjian Zhou/周文剑; +Cc: Atsushi Kumagai, kexec

Hi Zhou Wenjian and Kumagai,

I have followed Zhou Wenjian's suggestion and done some tests; in the case of
"-c", makedumpfile 1.5.9 does perform better than with "-l".

I have done more tests on a machine with 128G of memory; with
"-d 0" and "-d 3", makedumpfile 1.5.9 performs well. But with
"--num-threads 1", it does need more time than without "--num-threads".

Here are my results (makedumpfile -c):

"-d 0" (the size of vmcore is 2.6G):
--num-threads        time(seconds)
    0                 556
    1                1186
    4                 307
    8                 186
   12                 131
   16                 123

  
"-d 3" (the size of vmcore is 1.3G):
--num-threads        time(seconds)
    0                 141
    1                 262
    2                 137
    4                  91
    8                 121
   16                 137

So, I think makedumpfile 1.5.9 can save time with "-c",
as long as neither "-d 31" nor "--num-threads 1" is used.

----- Original Message -----
> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> Cc: kexec@lists.infradead.org
> Sent: Friday, December 4, 2015 11:33:36 AM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> 
> Hello Kumagai,
> 
> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> > Hello, Zhou
> >
> >> On 12/02/2015 03:24 PM, Dave Young wrote:
> >>> Hi,
> >>>
> >>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >>>> I think there is no problem if other test results are as expected.
> >>>>
> >>>> --num-threads mainly reduces the time of compressing.
> >>>> So for lzo, it can't do much help at most of time.
> >>>
> >>> Seems the help of --num-threads does not say it exactly:
> >>>
> >>>     [--num-threads THREADNUM]:
> >>>         Using multiple threads to read and compress data of each page in
> >>>         parallel.
> >>>         And it will reduces time for saving DUMPFILE.
> >>>         This feature only supports creating DUMPFILE in kdump-comressed
> >>>         format from
> >>>         VMCORE in kdump-compressed format or elf format.
> >>>
> >>> Lzo is also a compress method, it should be mentioned that --num-threads
> >>> only
> >>> supports zlib compressed vmcore.
> >>>
> >>
> >> Sorry, it seems that something I said is not so clear.
> >> lzo is also supported. Since lzo compresses data at a high speed, the
> >> improving of the performance is not so obvious at most of time.
> >>
> >>> Also worth to mention about the recommended -d value for this feature.
> >>>
> >>
> >> Yes, I think it's worth. I forgot it.
> >
> > I saw your patch, but I think I should confirm what is the problem first.
> >
> >> However, when "-d 31" is specified, it will be worse.
> >> Less than 50 buffers are used to cache the compressed page.
> >> And even the page has been filtered, it will also take a buffer.
> >> So if "-d 31" is specified, the filtered page will use a lot
> >> of buffers. Then the page which needs to be compressed can't
> >> be compressed parallel.
> >
> > Could you explain why compression will not be parallel in more detail ?
> > Actually the buffers are used also for filtered pages, it sounds
> > inefficient.
> > However, I don't understand why it prevents parallel compression.
> >
> 
> Think about this, in a huge memory, most of the page will be filtered, and
> we have 5 buffers.
> 
> page1       page2      page3     page4     page5      page6       page7 .....
> [buffer1]   [2]        [3]       [4]       [5]
> unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered
> 
> Since filtered page will take a buffer, when compressing page1,
> page6 can't be compressed at the same time.
> That why it will prevent parallel compression.
> 
> > Further, according to Chao's benchmark, there is a big performance
> > degradation even if the number of thread is 1. (58s vs 240s)
> > The current implementation seems to have some problems, we should
> > solve them.
> >
> 
> If "-d 31" is specified, on the one hand we can't save time by compressing
> parallel, on the other hand we will introduce some extra work by adding
> "--num-threads". So it is obvious that it will have a performance
> degradation.
> 
> I'm not so sure if it is a problem that the performance degradation is so
> big.
> But I think if in other cases, it works as expected, this won't be a problem(
> or a problem needs to be fixed), for the performance degradation existing
> in theory.
> 
> Or the current implementation should be replaced by a new arithmetic.
> For example:
> We can add an array to record whether the page is filtered or not.
> And only the unfiltered page will take the buffer.
> 
> But I'm not sure if it is worth.
> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
> 
> --
> Thanks
> Zhou
> 
> 
> 
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
> 

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-04  8:56               ` Chao Fan
@ 2015-12-07  1:09                 ` "Zhou, Wenjian/周文剑"
  0 siblings, 0 replies; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-07  1:09 UTC (permalink / raw)
  To: Chao Fan; +Cc: Atsushi Kumagai, kexec

On 12/04/2015 04:56 PM, Chao Fan wrote:
> Hi Zhou Wenjian and Kumagai,
>
> I have follow Zhou Wenjian's words to do some tests, in the condition of
> "-c", makdumpfile 1.5.9 does perform better than "-l".
>
> I have done more tests in a machine with 128G memory, in the condition
> of "-d 0" and "-d 3", the makedumpfile 1.5.9 performs well. But if with
> "--num-threads 1", it does need more time than without "--num-threads".
>
> Here is my results(makedumpfile -c):
>
> "-d 0" (the size of vmcore is 2.6G):
> --num-threads        time(seconds)
>      0                 556
>      1                1186
>      4                 307
>      8                 186
>     12                 131
>     16                 123
>
>
> "-d 3" (the size of vmcore is 1.3G):
> --num-threads        time(seconds)
>      0                 141
>      1                 262
>      2                 137
>      4                  91
>      8                 121
>     16                 137
>

Hello Chao,

This result also seems not so good.
We have tested it, and you can refer to:
http://lists.infradead.org/pipermail/kexec/2015-October/014576.html

Could you collect the information with *perf stat -e page-faults* for both
--num-threads 0 and --num-threads 1?
Your result looks like the performance without the patch that divides
compress2().
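
For example, something like this (the vmcore and dumpfile paths are
placeholders; substitute your actual files):

perf stat -e page-faults makedumpfile -l --message-level 1 -d 31 vmcore dumpfile.0
perf stat -e page-faults makedumpfile -l --num-threads 1 --message-level 1 -d 31 vmcore dumpfile.1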

-- 
Thanks
Zhou
> So, I think makedumpfile 1.5.9 can save time in the condition of "-c"
> and not "-d 31" and not "--num-threads 1".
>
> ----- Original Message -----
>> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> Cc: kexec@lists.infradead.org
>> Sent: Friday, December 4, 2015 11:33:36 AM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> Hello Kumagai,
>>
>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>>> Hello, Zhou
>>>
>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>>>> Hi,
>>>>>
>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>>>> I think there is no problem if other test results are as expected.
>>>>>>
>>>>>> --num-threads mainly reduces the time of compressing.
>>>>>> So for lzo, it can't do much help at most of time.
>>>>>
>>>>> Seems the help of --num-threads does not say it exactly:
>>>>>
>>>>>      [--num-threads THREADNUM]:
>>>>>          Using multiple threads to read and compress data of each page in
>>>>>          parallel.
>>>>>          And it will reduces time for saving DUMPFILE.
>>>>>          This feature only supports creating DUMPFILE in kdump-comressed
>>>>>          format from
>>>>>          VMCORE in kdump-compressed format or elf format.
>>>>>
>>>>> Lzo is also a compress method, it should be mentioned that --num-threads
>>>>> only
>>>>> supports zlib compressed vmcore.
>>>>>
>>>>
>>>> Sorry, it seems that something I said is not so clear.
>>>> lzo is also supported. Since lzo compresses data at a high speed, the
>>>> improving of the performance is not so obvious at most of time.
>>>>
>>>>> Also worth to mention about the recommended -d value for this feature.
>>>>>
>>>>
>>>> Yes, I think it's worth. I forgot it.
>>>
>>> I saw your patch, but I think I should confirm what is the problem first.
>>>
>>>> However, when "-d 31" is specified, it will be worse.
>>>> Less than 50 buffers are used to cache the compressed page.
>>>> And even the page has been filtered, it will also take a buffer.
>>>> So if "-d 31" is specified, the filtered page will use a lot
>>>> of buffers. Then the page which needs to be compressed can't
>>>> be compressed parallel.
>>>
>>> Could you explain why compression will not be parallel in more detail ?
>>> Actually the buffers are used also for filtered pages, it sounds
>>> inefficient.
>>> However, I don't understand why it prevents parallel compression.
>>>
>>
>> Think about this, in a huge memory, most of the page will be filtered, and
>> we have 5 buffers.
>>
>> page1       page2      page3     page4     page5      page6       page7 .....
>> [buffer1]   [2]        [3]       [4]       [5]
>> unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered
>>
>> Since filtered page will take a buffer, when compressing page1,
>> page6 can't be compressed at the same time.
>> That why it will prevent parallel compression.
>>
>>> Further, according to Chao's benchmark, there is a big performance
>>> degradation even if the number of thread is 1. (58s vs 240s)
>>> The current implementation seems to have some problems, we should
>>> solve them.
>>>
>>
>> If "-d 31" is specified, on the one hand we can't save time by compressing
>> parallel, on the other hand we will introduce some extra work by adding
>> "--num-threads". So it is obvious that it will have a performance
>> degradation.
>>
>> I'm not so sure if it is a problem that the performance degradation is so
>> big.
>> But I think if in other cases, it works as expected, this won't be a problem(
>> or a problem needs to be fixed), for the performance degradation existing
>> in theory.
>>
>> Or the current implementation should be replaced by a new arithmetic.
>> For example:
>> We can add an array to record whether the page is filtered or not.
>> And only the unfiltered page will take the buffer.
>>
>> But I'm not sure if it is worth.
>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>>
>> --
>> Thanks
>> Zhou
>>
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>




_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-04  3:33             ` "Zhou, Wenjian/周文剑"
  2015-12-04  8:56               ` Chao Fan
@ 2015-12-10  8:14               ` Atsushi Kumagai
  2015-12-10  9:36                 ` "Zhou, Wenjian/周文剑"
  1 sibling, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-12-10  8:14 UTC (permalink / raw)
  To: "Zhou, Wenjian/周文剑"
  Cc: kexec@lists.infradead.org

>Hello Kumagai,
>
>On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> Hello, Zhou
>>
>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>>> Hi,
>>>>
>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>>> I think there is no problem if other test results are as expected.
>>>>>
>>>>> --num-threads mainly reduces the time of compressing.
>>>>> So for lzo, it can't do much help at most of time.
>>>>
>>>> Seems the help of --num-threads does not say it exactly:
>>>>
>>>>     [--num-threads THREADNUM]:
>>>>         Using multiple threads to read and compress data of each page in parallel.
>>>>         And it will reduces time for saving DUMPFILE.
>>>>         This feature only supports creating DUMPFILE in kdump-comressed format from
>>>>         VMCORE in kdump-compressed format or elf format.
>>>>
>>>> Lzo is also a compress method, it should be mentioned that --num-threads only
>>>> supports zlib compressed vmcore.
>>>>
>>>
>>> Sorry, it seems that something I said is not so clear.
>>> lzo is also supported. Since lzo compresses data at a high speed, the
>>> improving of the performance is not so obvious at most of time.
>>>
>>>> Also worth to mention about the recommended -d value for this feature.
>>>>
>>>
>>> Yes, I think it's worth. I forgot it.
>>
>> I saw your patch, but I think I should confirm what is the problem first.
>>
>>> However, when "-d 31" is specified, it will be worse.
>>> Less than 50 buffers are used to cache the compressed page.
>>> And even the page has been filtered, it will also take a buffer.
>>> So if "-d 31" is specified, the filtered page will use a lot
>>> of buffers. Then the page which needs to be compressed can't
>>> be compressed parallel.
>>
>> Could you explain why compression will not be parallel in more detail ?
>> Actually the buffers are used also for filtered pages, it sounds inefficient.
>> However, I don't understand why it prevents parallel compression.
>>
>
>Think about this, in a huge memory, most of the page will be filtered, and
>we have 5 buffers.
>
>page1       page2      page3     page4     page5      page6       page7 .....
>[buffer1]   [2]        [3]       [4]       [5]
>unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered
>
>Since filtered page will take a buffer, when compressing page1,
>page6 can't be compressed at the same time.
>That why it will prevent parallel compression.

Thanks for your explanation, I understand.
This is just an issue of the current implementation; there is no
reason to accept this restriction.

>> Further, according to Chao's benchmark, there is a big performance
>> degradation even if the number of thread is 1. (58s vs 240s)
>> The current implementation seems to have some problems, we should
>> solve them.
>>
>
>If "-d 31" is specified, on the one hand we can't save time by compressing
>parallel, on the other hand we will introduce some extra work by adding
>"--num-threads". So it is obvious that it will have a performance degradation.

Sure, there must be some overhead due to "some extra work" (e.g. exclusive locking),
but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
too slow; the degradation is too big to be called "some extra work".

Both --num-threads=0 and --num-threads=1 are serial processing,
so the "buffer fairness issue" above should not be related to this degradation.
What do you think makes this degradation?

>I'm not so sure if it is a problem that the performance degradation is so big.
>But I think if in other cases, it works as expected, this won't be a problem(
>or a problem needs to be fixed), for the performance degradation existing
>in theory.	
>
>Or the current implementation should be replaced by a new arithmetic.
>For example:
>We can add an array to record whether the page is filtered or not.
>And only the unfiltered page will take the buffer.

We should discuss how to implement the new mechanism; I'll mention this later.

>But I'm not sure if it is worth.
>For "-l -d 31" is fast enough, the new arithmetic also can't do much help.

Basically the faster, the better. There is no obvious target time.
If there is room for improvement, we should do it.


Thanks,
Atsushi Kumagai
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-10  8:14               ` Atsushi Kumagai
@ 2015-12-10  9:36                 ` "Zhou, Wenjian/周文剑"
  2015-12-10  9:58                   ` Chao Fan
  2015-12-14  8:26                   ` Atsushi Kumagai
  0 siblings, 2 replies; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-10  9:36 UTC (permalink / raw)
  To: Atsushi Kumagai; +Cc: kexec@lists.infradead.org

On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> Hello Kumagai,
>>
>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>>> Hello, Zhou
>>>
>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>>>> Hi,
>>>>>
>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>>>> I think there is no problem if other test results are as expected.
>>>>>>
>>>>>> --num-threads mainly reduces the time of compressing.
>>>>>> So for lzo, it can't do much help at most of time.
>>>>>
>>>>> Seems the help of --num-threads does not say it exactly:
>>>>>
>>>>>      [--num-threads THREADNUM]:
>>>>>          Using multiple threads to read and compress data of each page in parallel.
>>>>>          And it will reduces time for saving DUMPFILE.
>>>>>          This feature only supports creating DUMPFILE in kdump-comressed format from
>>>>>          VMCORE in kdump-compressed format or elf format.
>>>>>
>>>>> Lzo is also a compress method, it should be mentioned that --num-threads only
>>>>> supports zlib compressed vmcore.
>>>>>
>>>>
>>>> Sorry, it seems that something I said is not so clear.
>>>> lzo is also supported. Since lzo compresses data at a high speed, the
>>>> improving of the performance is not so obvious at most of time.
>>>>
>>>>> Also worth to mention about the recommended -d value for this feature.
>>>>>
>>>>
>>>> Yes, I think it's worth. I forgot it.
>>>
>>> I saw your patch, but I think I should confirm what is the problem first.
>>>
>>>> However, when "-d 31" is specified, it will be worse.
>>>> Less than 50 buffers are used to cache the compressed page.
>>>> And even the page has been filtered, it will also take a buffer.
>>>> So if "-d 31" is specified, the filtered page will use a lot
>>>> of buffers. Then the page which needs to be compressed can't
>>>> be compressed parallel.
>>>
>>> Could you explain why compression will not be parallel in more detail ?
>>> Actually the buffers are used also for filtered pages, it sounds inefficient.
>>> However, I don't understand why it prevents parallel compression.
>>>
>>
>> Think about this, in a huge memory, most of the page will be filtered, and
>> we have 5 buffers.
>>
>> page1       page2      page3     page4     page5      page6       page7 .....
>> [buffer1]   [2]        [3]       [4]       [5]
>> unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered
>>
>> Since filtered page will take a buffer, when compressing page1,
>> page6 can't be compressed at the same time.
>> That why it will prevent parallel compression.
>
> Thanks for your explanation, I understand.
> This is just an issue of the current implementation, there is no
> reason to stand this restriction.
>
>>> Further, according to Chao's benchmark, there is a big performance
>>> degradation even if the number of thread is 1. (58s vs 240s)
>>> The current implementation seems to have some problems, we should
>>> solve them.
>>>
>>
>> If "-d 31" is specified, on the one hand we can't save time by compressing
>> parallel, on the other hand we will introduce some extra work by adding
>> "--num-threads". So it is obvious that it will have a performance degradation.
>
> Sure, there must be some overhead due to "some extra work"(e.g. exclusive lock),
> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
> too slow, the degradation is too big to be called "some extra work".
>
> Both --num-threads=0 and --num-threads=1 are serial processing,
> the above "buffer fairness issue" will not be related to this degradation.
> What do you think what make this degradation ?
>

I can't reproduce such a result at the moment, so I can't do further investigation
right now. I guess it may be caused by the underlying implementation of pthread.
I reviewed the test results of patch v2 and found that on different machines
the results are quite different.

It seems that I can get almost the same result as Chao's from the "PRIMEQUEST 1800E".

###################################
- System: PRIMERGY RX300 S6
- CPU: Intel(R) Xeon(R) CPU x5660
- memory: 16GB
###################################
************ makedumpfile -d 7 ******************
                 core-data       0       256
         threads-num
-l
         0                       10      144
         4                       5       110
         8                       5       111
         12                      6       111

************ makedumpfile -d 31 ******************
                 core-data       0       256
         threads-num
-l
         0                       0       0
         4                       2       2
         8                       2       3
         12                      2       3

###################################
- System: PRIMEQUEST 1800E
- CPU: Intel(R) Xeon(R) CPU E7540
- memory: 32GB
###################################
************ makedumpfile -d 7 ******************
                 core-data        0       256
         threads-num
-l
         0                        34      270
         4                        63      154
         8                        64      131
         12                       65      159

************ makedumpfile -d 31 ******************
                 core-data        0       256
         threads-num
-l
         0                        2       1
         4                        48      48
         8                        48      49
         12                       49      50

>> I'm not so sure if it is a problem that the performance degradation is so big.
>> But I think if in other cases, it works as expected, this won't be a problem(
>> or a problem needs to be fixed), for the performance degradation existing
>> in theory.	
>>
>> Or the current implementation should be replaced by a new arithmetic.
>> For example:
>> We can add an array to record whether the page is filtered or not.
>> And only the unfiltered page will take the buffer.
>
> We should discuss how to implement new mechanism, I'll mention this later.
>
>> But I'm not sure if it is worth.
>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>
> Basically the faster, the better. There is no obvious target time.
> If there is room for improvement, we should do it.
>

Maybe we can improve the performance of "-c -d 31" in some cases.

BTW, we can easily get the theoretical best performance by using "--split".
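
For example, a split run along these lines (file names are placeholders)
gives that upper bound with fully independent processes:

makedumpfile --split -l -d 31 vmcore dumpfile.1 dumpfile.2 dumpfile.3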

-- 
Thanks
Zhou



_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-10  9:36                 ` "Zhou, Wenjian/周文剑"
@ 2015-12-10  9:58                   ` Chao Fan
  2015-12-10 10:32                     ` "Zhou, Wenjian/周文剑"
  2015-12-14  8:26                   ` Atsushi Kumagai
  1 sibling, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-10  9:58 UTC (permalink / raw)
  To: Wenjian Zhou/周文剑; +Cc: Atsushi Kumagai, kexec



----- Original Message -----
> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> Cc: kexec@lists.infradead.org
> Sent: Thursday, December 10, 2015 5:36:47 PM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> 
> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >> Hello Kumagai,
> >>
> >> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >>> Hello, Zhou
> >>>
> >>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >>>>> Hi,
> >>>>>
> >>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >>>>>> I think there is no problem if other test results are as expected.
> >>>>>>
> >>>>>> --num-threads mainly reduces the time of compressing.
> >>>>>> So for lzo, it can't do much help at most of time.
> >>>>>
> >>>>> Seems the help of --num-threads does not say it exactly:
> >>>>>
> >>>>>      [--num-threads THREADNUM]:
> >>>>>          Using multiple threads to read and compress data of each page
> >>>>>          in parallel.
> >>>>>          And it will reduces time for saving DUMPFILE.
> >>>>>          This feature only supports creating DUMPFILE in
> >>>>>          kdump-comressed format from
> >>>>>          VMCORE in kdump-compressed format or elf format.
> >>>>>
> >>>>> Lzo is also a compress method, it should be mentioned that
> >>>>> --num-threads only
> >>>>> supports zlib compressed vmcore.
> >>>>>
> >>>>
> >>>> Sorry, it seems that something I said is not so clear.
> >>>> lzo is also supported. Since lzo compresses data at a high speed, the
> >>>> improving of the performance is not so obvious at most of time.
> >>>>
> >>>>> Also worth to mention about the recommended -d value for this feature.
> >>>>>
> >>>>
> >>>> Yes, I think it's worth. I forgot it.
> >>>
> >>> I saw your patch, but I think I should confirm what is the problem first.
> >>>
> >>>> However, when "-d 31" is specified, it will be worse.
> >>>> Less than 50 buffers are used to cache the compressed page.
> >>>> And even the page has been filtered, it will also take a buffer.
> >>>> So if "-d 31" is specified, the filtered page will use a lot
> >>>> of buffers. Then the page which needs to be compressed can't
> >>>> be compressed parallel.
> >>>
> >>> Could you explain why compression will not be parallel in more detail ?
> >>> Actually the buffers are used also for filtered pages, it sounds
> >>> inefficient.
> >>> However, I don't understand why it prevents parallel compression.
> >>>
> >>
> >> Think about this, in a huge memory, most of the page will be filtered, and
> >> we have 5 buffers.
> >>
> >> page1       page2      page3     page4     page5      page6       page7
> >> .....
> >> [buffer1]   [2]        [3]       [4]       [5]
> >> unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered
> >>
> >> Since filtered page will take a buffer, when compressing page1,
> >> page6 can't be compressed at the same time.
> >> That why it will prevent parallel compression.
> >
> > Thanks for your explanation, I understand.
> > This is just an issue of the current implementation, there is no
> > reason to stand this restriction.
> >
> >>> Further, according to Chao's benchmark, there is a big performance
> >>> degradation even if the number of thread is 1. (58s vs 240s)
> >>> The current implementation seems to have some problems, we should
> >>> solve them.
> >>>
> >>
> >> If "-d 31" is specified, on the one hand we can't save time by compressing
> >> parallel, on the other hand we will introduce some extra work by adding
> >> "--num-threads". So it is obvious that it will have a performance
> >> degradation.
> >
> > Sure, there must be some overhead due to "some extra work"(e.g. exclusive
> > lock),
> > but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
> > too slow, the degradation is too big to be called "some extra work".
> >
> > Both --num-threads=0 and --num-threads=1 are serial processing,
> > the above "buffer fairness issue" will not be related to this degradation.
> > What do you think what make this degradation ?
> >
> 
> I can't get such result at this moment, so I can't do some further
> investigation
> right now. I guess it may be caused by the underlying implementation of
> pthread.
> I reviewed the test result of the patch v2 and found in different machines,
> the results are quite different.

Hi Zhou Wenjian,

I have done more tests on another machine with 128G of memory, and got these results:

The size of the vmcore is 300M with "-d 31".
makedumpfile -l --message-level 1 -d 31:
time: 8.6s      page-faults: 2272

makedumpfile -l --num-threads 1 --message-level 1 -d 31:
time: 28.1s     page-faults: 2359


And the size of the vmcore is 2.6G with "-d 0".
On this machine, I get the same result as yours:


makedumpfile -c --message-level 1 -d 0:
time: 597s      page-faults: 2287

makedumpfile -c --num-threads 1 --message-level 1 -d 0:
time: 602s      page-faults: 2361

makedumpfile -c --num-threads 2 --message-level 1 -d 0:
time: 337s      page-faults: 2397

makedumpfile -c --num-threads 4 --message-level 1 -d 0:
time: 175s      page-faults: 2461

makedumpfile -c --num-threads 8 --message-level 1 -d 0:
time: 103s      page-faults: 2611


But the machine from my first test is not under my control; should I wait for
that machine to do more tests?
If there are still problems in my tests, please tell me.

Thanks,
Chao Fan


> 
> It seems that I can get almost the same result of Chao from "PRIMEQUEST
> 1800E".
> 
> ###################################
> - System: PRIMERGY RX300 S6
> - CPU: Intel(R) Xeon(R) CPU x5660
> - memory: 16GB
> ###################################
> ************ makedumpfile -d 7 ******************
>                  core-data       0       256
>          threads-num
> -l
>          0                       10      144
>          4                       5       110
>          8                       5       111
>          12                      6       111
> 
> ************ makedumpfile -d 31 ******************
>                  core-data       0       256
>          threads-num
> -l
>          0                       0       0
>          4                       2       2
>          8                       2       3
>          12                      2       3
> 
> ###################################
> - System: PRIMEQUEST 1800E
> - CPU: Intel(R) Xeon(R) CPU E7540
> - memory: 32GB
> ###################################
> ************ makedumpfile -d 7 ******************
>                  core-data        0       256
>          threads-num
> -l
>          0                        34      270
>          4                        63      154
>          8                        64      131
>          12                       65      159
> 
> ************ makedumpfile -d 31 ******************
>                  core-data        0       256
>          threads-num
> -l
>          0                        2       1
>          4                        48      48
>          8                        48      49
>          12                       49      50
> 
> >> I'm not so sure if it is a problem that the performance degradation is so
> >> big.
> >> But I think if in other cases, it works as expected, this won't be a
> >> problem(
> >> or a problem needs to be fixed), for the performance degradation existing
> >> in theory.
> >>
> >> Or the current implementation should be replaced by a new arithmetic.
> >> For example:
> >> We can add an array to record whether the page is filtered or not.
> >> And only the unfiltered page will take the buffer.
> >
> > We should discuss how to implement new mechanism, I'll mention this later.
> >
> >> But I'm not sure if it is worth.
> >> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
> >
> > Basically the faster, the better. There is no obvious target time.
> > If there is room for improvement, we should do it.
> >
> 
> Maybe we can improve the performance of "-c -d 31" in some case.
> 
> BTW, we can easily get the theoretical performance by using the "--split".
> 
> --
> Thanks
> Zhou
> 
> 
> 
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-10  9:58                   ` Chao Fan
@ 2015-12-10 10:32                     ` "Zhou, Wenjian/周文剑"
  2015-12-10 10:54                       ` Chao Fan
  0 siblings, 1 reply; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-10 10:32 UTC (permalink / raw)
  To: Chao Fan; +Cc: Atsushi Kumagai, kexec

On 12/10/2015 05:58 PM, Chao Fan wrote:
>
>
> ----- Original Message -----
>> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> Cc: kexec@lists.infradead.org
>> Sent: Thursday, December 10, 2015 5:36:47 PM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>>
>> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>>>> Hello Kumagai,
>>>>
>>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>>>>> Hello, Zhou
>>>>>
>>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>>>>>>>> I think there is no problem if other test results are as expected.
>>>>>>>>
>>>>>>>> --num-threads mainly reduces the time of compressing.
>>>>>>>> So for lzo, it can't do much help at most of time.
>>>>>>>
>>>>>>> Seems the help of --num-threads does not say it exactly:
>>>>>>>
>>>>>>>       [--num-threads THREADNUM]:
>>>>>>>           Using multiple threads to read and compress data of each page
>>>>>>>           in parallel.
>>>>>>>           And it will reduces time for saving DUMPFILE.
>>>>>>>           This feature only supports creating DUMPFILE in
>>>>>>>           kdump-comressed format from
>>>>>>>           VMCORE in kdump-compressed format or elf format.
>>>>>>>
>>>>>>> Lzo is also a compress method, it should be mentioned that
>>>>>>> --num-threads only
>>>>>>> supports zlib compressed vmcore.
>>>>>>>
>>>>>>
>>>>>> Sorry, it seems that something I said is not so clear.
>>>>>> lzo is also supported. Since lzo compresses data at a high speed, the
>>>>>> improving of the performance is not so obvious at most of time.
>>>>>>
>>>>>>> Also worth to mention about the recommended -d value for this feature.
>>>>>>>
>>>>>>
>>>>>> Yes, I think it's worth. I forgot it.
>>>>>
>>>>> I saw your patch, but I think I should confirm what is the problem first.
>>>>>
>>>>>> However, when "-d 31" is specified, it will be worse.
>>>>>> Less than 50 buffers are used to cache the compressed page.
>>>>>> And even the page has been filtered, it will also take a buffer.
>>>>>> So if "-d 31" is specified, the filtered page will use a lot
>>>>>> of buffers. Then the page which needs to be compressed can't
>>>>>> be compressed parallel.
>>>>>
>>>>> Could you explain why compression will not be parallel in more detail ?
>>>>> Actually the buffers are used also for filtered pages, it sounds
>>>>> inefficient.
>>>>> However, I don't understand why it prevents parallel compression.
>>>>>
>>>>
>>>> Think about this, in a huge memory, most of the page will be filtered, and
>>>> we have 5 buffers.
>>>>
>>>> page1       page2      page3     page4     page5      page6       page7
>>>> .....
>>>> [buffer1]   [2]        [3]       [4]       [5]
>>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered
>>>>
>>>> Since filtered page will take a buffer, when compressing page1,
>>>> page6 can't be compressed at the same time.
>>>> That why it will prevent parallel compression.
>>>
>>> Thanks for your explanation, I understand.
>>> This is just an issue of the current implementation, there is no
>>> reason to stand this restriction.
>>>
>>>>> Further, according to Chao's benchmark, there is a big performance
>>>>> degradation even if the number of thread is 1. (58s vs 240s)
>>>>> The current implementation seems to have some problems, we should
>>>>> solve them.
>>>>>
>>>>
>>>> If "-d 31" is specified, on the one hand we can't save time by compressing
>>>> parallel, on the other hand we will introduce some extra work by adding
>>>> "--num-threads". So it is obvious that it will have a performance
>>>> degradation.
>>>
>>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive
>>> lock),
>>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
>>> too slow, the degradation is too big to be called "some extra work".
>>>
>>> Both --num-threads=0 and --num-threads=1 are serial processing,
>>> the above "buffer fairness issue" will not be related to this degradation.
>>> What do you think what make this degradation ?
>>>
>>
>> I can't get such result at this moment, so I can't do some further
>> investigation
>> right now. I guess it may be caused by the underlying implementation of
>> pthread.
>> I reviewed the test result of the patch v2 and found in different machines,
>> the results are quite different.
>
> Hi Zhou Wenjian,
>
> I have done more tests in another machine with 128G memory, and get the result:
>
> the size of vmcore is 300M in "-d 31"
> makedumpfile -l --message-level 1 -d 31:
> time: 8.6s      page-faults: 2272
>
> makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> time: 28.1s     page-faults: 2359
>
>
> and the size of vmcore is 2.6G in "-d 0".
> In this machine, I get the same result as yours:
>
>
> makedumpfile -c --message-level 1 -d 0:
> time: 597s      page-faults: 2287
>
> makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> time: 602s      page-faults: 2361
>
> makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> time: 337s      page-faults: 2397
>
> makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> time: 175s      page-faults: 2461
>
> makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> time: 103s      page-faults: 2611
>
>
> But the machine of my first test is not under my control, should I wait for
> the first machine to do more tests?
> If there are still some problems in my tests, please tell me.
>

Thanks a lot for your tests; it seems that there is nothing wrong.
And I haven't got any ideas for more tests...

Could you provide the information about your CPU?
I will do some further investigation later.

But I still believe it's better not to use "-l -d 31" and "--num-threads"
at the same time, though it's very strange that the performance
degradation is so big.

-- 
Thanks
Zhou

> Thanks,
> Chao Fan
>
>
>>
>> It seems that I can get almost the same result of Chao from "PRIMEQUEST
>> 1800E".
>>
>> ###################################
>> - System: PRIMERGY RX300 S6
>> - CPU: Intel(R) Xeon(R) CPU x5660
>> - memory: 16GB
>> ###################################
>> ************ makedumpfile -d 7 ******************
>>                   core-data       0       256
>>           threads-num
>> -l
>>           0                       10      144
>>           4                       5       110
>>           8                       5       111
>>           12                      6       111
>>
>> ************ makedumpfile -d 31 ******************
>>                   core-data       0       256
>>           threads-num
>> -l
>>           0                       0       0
>>           4                       2       2
>>           8                       2       3
>>           12                      2       3
>>
>> ###################################
>> - System: PRIMEQUEST 1800E
>> - CPU: Intel(R) Xeon(R) CPU E7540
>> - memory: 32GB
>> ###################################
>> ************ makedumpfile -d 7 ******************
>>                   core-data        0       256
>>           threads-num
>> -l
>>           0                        34      270
>>           4                        63      154
>>           8                        64      131
>>           12                       65      159
>>
>> ************ makedumpfile -d 31 ******************
>>                   core-data        0       256
>>           threads-num
>> -l
>>           0                        2       1
>>           4                        48      48
>>           8                        48      49
>>           12                       49      50
>>
>>>> I'm not so sure if it is a problem that the performance degradation is so
>>>> big.
>>>> But I think if in other cases, it works as expected, this won't be a
>>>> problem(
>>>> or a problem needs to be fixed), for the performance degradation existing
>>>> in theory.
>>>>
>>>> Or the current implementation should be replaced by a new arithmetic.
>>>> For example:
>>>> We can add an array to record whether the page is filtered or not.
>>>> And only the unfiltered page will take the buffer.
>>>
>>> We should discuss how to implement new mechanism, I'll mention this later.
>>>
>>>> But I'm not sure if it is worth.
>>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>>>
>>> Basically the faster, the better. There is no obvious target time.
>>> If there is room for improvement, we should do it.
>>>
>>
>> Maybe we can improve the performance of "-c -d 31" in some case.
>>
>> BTW, we can easily get the theoretical performance by using the "--split".
>>
>> --
>> Thanks
>> Zhou
>>
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>




_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-10 10:32                     ` "Zhou, Wenjian/周文剑"
@ 2015-12-10 10:54                       ` Chao Fan
  2015-12-22  8:32                         ` HATAYAMA Daisuke
  0 siblings, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-10 10:54 UTC (permalink / raw)
  To: Wenjian Zhou/周文剑; +Cc: Atsushi Kumagai, kexec



----- Original Message -----
> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> To: "Chao Fan" <cfan@redhat.com>
> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>, kexec@lists.infradead.org
> Sent: Thursday, December 10, 2015 6:32:32 PM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> 
> On 12/10/2015 05:58 PM, Chao Fan wrote:
> >
> >
> > ----- Original Message -----
> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> >> Cc: kexec@lists.infradead.org
> >> Sent: Thursday, December 10, 2015 5:36:47 PM
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >>
> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >>>> Hello Kumagai,
> >>>>
> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >>>>> Hello, Zhou
> >>>>>
> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >>>>>>> Hi,
> >>>>>>>
> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >>>>>>>> I think there is no problem if other test results are as expected.
> >>>>>>>>
> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >>>>>>>> So for lzo, it can't do much help at most of time.
> >>>>>>>
> >>>>>>> Seems the help of --num-threads does not say it exactly:
> >>>>>>>
> >>>>>>>       [--num-threads THREADNUM]:
> >>>>>>>           Using multiple threads to read and compress data of each
> >>>>>>>           page
> >>>>>>>           in parallel.
> >>>>>>>           And it will reduces time for saving DUMPFILE.
> >>>>>>>           This feature only supports creating DUMPFILE in
> >>>>>>>           kdump-comressed format from
> >>>>>>>           VMCORE in kdump-compressed format or elf format.
> >>>>>>>
> >>>>>>> Lzo is also a compress method, it should be mentioned that
> >>>>>>> --num-threads only
> >>>>>>> supports zlib compressed vmcore.
> >>>>>>>
> >>>>>>
> >>>>>> Sorry, it seems that something I said is not so clear.
> >>>>>> lzo is also supported. Since lzo compresses data at a high speed, the
> >>>>>> improving of the performance is not so obvious at most of time.
> >>>>>>
> >>>>>>> Also worth to mention about the recommended -d value for this
> >>>>>>> feature.
> >>>>>>>
> >>>>>>
> >>>>>> Yes, I think it's worth. I forgot it.
> >>>>>
> >>>>> I saw your patch, but I think I should confirm what is the problem
> >>>>> first.
> >>>>>
> >>>>>> However, when "-d 31" is specified, it will be worse.
> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >>>>>> And even the page has been filtered, it will also take a buffer.
> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
> >>>>>> of buffers. Then the page which needs to be compressed can't
> >>>>>> be compressed parallel.
> >>>>>
> >>>>> Could you explain why compression will not be parallel in more detail ?
> >>>>> Actually the buffers are used also for filtered pages, it sounds
> >>>>> inefficient.
> >>>>> However, I don't understand why it prevents parallel compression.
> >>>>>
> >>>>
> >>>> Think about this, in a huge memory, most of the page will be filtered,
> >>>> and
> >>>> we have 5 buffers.
> >>>>
> >>>> page1       page2      page3     page4     page5      page6       page7
> >>>> .....
> >>>> [buffer1]   [2]        [3]       [4]       [5]
> >>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered
> >>>> filtered
> >>>>
> >>>> Since filtered page will take a buffer, when compressing page1,
> >>>> page6 can't be compressed at the same time.
> >>>> That why it will prevent parallel compression.
> >>>
> >>> Thanks for your explanation, I understand.
> >>> This is just an issue of the current implementation, there is no
> >>> reason to stand this restriction.
> >>>
> >>>>> Further, according to Chao's benchmark, there is a big performance
> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
> >>>>> The current implementation seems to have some problems, we should
> >>>>> solve them.
> >>>>>
> >>>>
> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >>>> compressing
> >>>> parallel, on the other hand we will introduce some extra work by adding
> >>>> "--num-threads". So it is obvious that it will have a performance
> >>>> degradation.
> >>>
> >>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive
> >>> lock),
> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
> >>> too slow, the degradation is too big to be called "some extra work".
> >>>
> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
> >>> the above "buffer fairness issue" will not be related to this
> >>> degradation.
> >>> What do you think what make this degradation ?
> >>>
> >>
> >> I can't get such result at this moment, so I can't do some further
> >> investigation
> >> right now. I guess it may be caused by the underlying implementation of
> >> pthread.
> >> I reviewed the test result of the patch v2 and found in different
> >> machines,
> >> the results are quite different.
> >
> > Hi Zhou Wenjian,
> >
> > I have done more tests in another machine with 128G memory, and get the
> > result:
> >
> > the size of vmcore is 300M in "-d 31"
> > makedumpfile -l --message-level 1 -d 31:
> > time: 8.6s      page-faults: 2272
> >
> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> > time: 28.1s     page-faults: 2359
> >
> >
> > and the size of vmcore is 2.6G in "-d 0".
> > In this machine, I get the same result as yours:
> >
> >
> > makedumpfile -c --message-level 1 -d 0:
> > time: 597s      page-faults: 2287
> >
> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> > time: 602s      page-faults: 2361
> >
> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> > time: 337s      page-faults: 2397
> >
> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> > time: 175s      page-faults: 2461
> >
> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> > time: 103s      page-faults: 2611
> >
> >
> > But the machine of my first test is not under my control, should I wait for
> > the first machine to do more tests?
> > If there are still some problems in my tests, please tell me.
> >
> 
> Thanks a lot for your test, it seems that there is nothing wrong.
> And I haven't got any idea about more tests...
> 
> Could you provide the information of your cpu ?
> I will do some further investigation later.
> 

OK, of course, here is the information of cpu:

# lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                48
On-line CPU(s) list:   0-47
Thread(s) per core:    1
Core(s) per socket:    6
Socket(s):             8
NUMA node(s):          8
Vendor ID:             AuthenticAMD
CPU family:            16
Model:                 8
Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
Stepping:              0
CPU MHz:               2793.040
BogoMIPS:              5586.22
Virtualization:        AMD-V
L1d cache:             64K
L1i cache:             64K
L2 cache:              512K
L3 cache:              5118K
NUMA node0 CPU(s):     0,8,16,24,32,40
NUMA node1 CPU(s):     1,9,17,25,33,41
NUMA node2 CPU(s):     2,10,18,26,34,42
NUMA node3 CPU(s):     3,11,19,27,35,43
NUMA node4 CPU(s):     4,12,20,28,36,44
NUMA node5 CPU(s):     5,13,21,29,37,45
NUMA node6 CPU(s):     6,14,22,30,38,46
NUMA node7 CPU(s):     7,15,23,31,39,47
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt hw_pstate npt lbrv svm_lock nrip_save pausefilter vmmcall

> But I still believe it's better not to use "-l -d 31" and "--num-threads"
> at the same time, though it's very strange that the performance
> degradation is so big.
> 
> --
> Thanks
> Zhou
> 
> > Thanks,
> > Chao Fan
> >
> >
> >>
> >> It seems that I can get almost the same result of Chao from "PRIMEQUEST
> >> 1800E".
> >>
> >> ###################################
> >> - System: PRIMERGY RX300 S6
> >> - CPU: Intel(R) Xeon(R) CPU x5660
> >> - memory: 16GB
> >> ###################################
> >> ************ makedumpfile -d 7 ******************
> >>                   core-data       0       256
> >>           threads-num
> >> -l
> >>           0                       10      144
> >>           4                       5       110
> >>           8                       5       111
> >>           12                      6       111
> >>
> >> ************ makedumpfile -d 31 ******************
> >>                   core-data       0       256
> >>           threads-num
> >> -l
> >>           0                       0       0
> >>           4                       2       2
> >>           8                       2       3
> >>           12                      2       3
> >>
> >> ###################################
> >> - System: PRIMEQUEST 1800E
> >> - CPU: Intel(R) Xeon(R) CPU E7540
> >> - memory: 32GB
> >> ###################################
> >> ************ makedumpfile -d 7 ******************
> >>                   core-data        0       256
> >>           threads-num
> >> -l
> >>           0                        34      270
> >>           4                        63      154
> >>           8                        64      131
> >>           12                       65      159
> >>
> >> ************ makedumpfile -d 31 ******************
> >>                   core-data        0       256
> >>           threads-num
> >> -l
> >>           0                        2       1
> >>           4                        48      48
> >>           8                        48      49
> >>           12                       49      50
> >>
> >>>> I'm not so sure if it is a problem that the performance degradation is
> >>>> so
> >>>> big.
> >>>> But I think if in other cases, it works as expected, this won't be a
> >>>> problem(
> >>>> or a problem needs to be fixed), for the performance degradation
> >>>> existing
> >>>> in theory.
> >>>>
> >>>> Or the current implementation should be replaced by a new arithmetic.
> >>>> For example:
> >>>> We can add an array to record whether the page is filtered or not.
> >>>> And only the unfiltered page will take the buffer.
> >>>
> >>> We should discuss how to implement new mechanism, I'll mention this
> >>> later.
> >>>
> >>>> But I'm not sure if it is worth.
> >>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much
> >>>> help.
> >>>
> >>> Basically the faster, the better. There is no obvious target time.
> >>> If there is room for improvement, we should do it.
> >>>
> >>
> >> Maybe we can improve the performance of "-c -d 31" in some case.
> >>
> >> BTW, we can easily get the theoretical performance by using the "--split".
> >>
> >> --
> >> Thanks
> >> Zhou
> >>
> >>
> >>
> >>
> 
> 
> 
> 
> 

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-10  9:36                 ` "Zhou, Wenjian/周文剑"
  2015-12-10  9:58                   ` Chao Fan
@ 2015-12-14  8:26                   ` Atsushi Kumagai
  2015-12-14  8:59                     ` "Zhou, Wenjian/周文剑"
  1 sibling, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-12-14  8:26 UTC (permalink / raw)
  To: "Zhou, Wenjian/周文剑"
  Cc: kexec@lists.infradead.org

>>> Think about this, in a huge memory, most of the page will be filtered, and
>>> we have 5 buffers.
>>>
>>> page1       page2      page3     page4     page5      page6       page7 .....
>>> [buffer1]   [2]        [3]       [4]       [5]
>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered
>>>
>>> Since filtered page will take a buffer, when compressing page1,
>>> page6 can't be compressed at the same time.
>>> That why it will prevent parallel compression.
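To make the slot accounting above concrete, here is a minimal model of it
(illustrative names and values only, not makedumpfile's actual code). With a
mostly-filtered page pattern it reports that at most one compression is ever
in flight:

/* Every page, filtered or not, claims the slot pfn % NUM_BUFFERS in
 * page order, so two unfiltered pages more than NUM_BUFFERS apart can
 * never be compressed at the same time. */
#include <stdio.h>
#include <stdbool.h>

#define NUM_BUFFERS 5
#define NUM_PAGES   12

int main(void)
{
    /* 1 = unfiltered (needs compression), 0 = filtered */
    bool unfiltered[NUM_PAGES] = {1,0,0,0,0,1,0,0,0,0,1,0};
    int in_flight = 0, max_parallel = 0;

    for (int pfn = 0; pfn < NUM_PAGES; pfn++) {
        /* the slot is reused, so the page NUM_BUFFERS positions back
         * must be fully retired before this page can take its slot */
        if (pfn >= NUM_BUFFERS && unfiltered[pfn - NUM_BUFFERS])
            in_flight--;
        if (unfiltered[pfn])
            in_flight++;
        if (in_flight > max_parallel)
            max_parallel = in_flight;
    }
    printf("max parallel compressions: %d\n", max_parallel);  /* prints 1 */
    return 0;
}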
>>
>> Thanks for your explanation, I understand.
>> This is just an issue of the current implementation, there is no
>> reason to stand this restriction.
>>
>>>> Further, according to Chao's benchmark, there is a big performance
>>>> degradation even if the number of thread is 1. (58s vs 240s)
>>>> The current implementation seems to have some problems, we should
>>>> solve them.
>>>>
>>>
>>> If "-d 31" is specified, on the one hand we can't save time by compressing
>>> parallel, on the other hand we will introduce some extra work by adding
>>> "--num-threads". So it is obvious that it will have a performance degradation.
>>
>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive lock),
>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
>> too slow, the degradation is too big to be called "some extra work".
>>
>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> the above "buffer fairness issue" will not be related to this degradation.
>> What do you think what make this degradation ?
>>
>
>I can't get such result at this moment, so I can't do some further investigation
>right now. I guess it may be caused by the underlying implementation of pthread.
>I reviewed the test result of the patch v2 and found in different machines,
>the results are quite different.

Unfortunately, I also can't reproduce such a big degradation.
According to Chao's verification, this issue seems different from
the "too many page fault issue" that we solved.
I have no idea yet, but at least I want to confirm whether this issue
is avoidable or not.

>It seems that I can get almost the same result of Chao from "PRIMEQUEST 1800E".
>
>###################################
>- System: PRIMERGY RX300 S6
>- CPU: Intel(R) Xeon(R) CPU x5660
>- memory: 16GB
>###################################
>************ makedumpfile -d 7 ******************
>                 core-data       0       256
>         threads-num
>-l
>         0                       10      144
>         4                       5       110
>         8                       5       111
>         12                      6       111
>
>************ makedumpfile -d 31 ******************
>                 core-data       0       256
>         threads-num
>-l
>         0                       0       0
>         4                       2       2
>         8                       2       3
>         12                      2       3
>
>###################################
>- System: PRIMEQUEST 1800E
>- CPU: Intel(R) Xeon(R) CPU E7540
>- memory: 32GB
>###################################
>************ makedumpfile -d 7 ******************
>                 core-data        0       256
>         threads-num
>-l
>         0                        34      270
>         4                        63      154
>         8                        64      131
>         12                       65      159
>
>************ makedumpfile -d 31 ******************
>                 core-data        0       256
>         threads-num
>-l
>         0                        2       1
>         4                        48      48
>         8                        48      49
>         12                       49      50
>
>>> I'm not so sure if it is a problem that the performance degradation is so big.
>>> But I think if in other cases, it works as expected, this won't be a problem(
>>> or a problem needs to be fixed), for the performance degradation existing
>>> in theory.
>>>
>>> Or the current implementation should be replaced by a new arithmetic.
>>> For example:
>>> We can add an array to record whether the page is filtered or not.
>>> And only the unfiltered page will take the buffer.
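A sketch of how that could look (hypothetical names, not a patch): the
per-page flag lives in a plain array, and the bounded set of buffers is
handed out only to pages that actually need compression:

/* Hypothetical helper: pick the next batch of up to NUM_BUFFERS
 * unfiltered pages starting at *next; filtered pages are skipped and
 * consume no buffer slot at all. */
#include <stdbool.h>
#include <stddef.h>

#define NUM_BUFFERS 5

static size_t next_batch(const bool *filtered, size_t npages,
                         size_t *next, size_t batch[NUM_BUFFERS])
{
    size_t used = 0;

    while (*next < npages && used < NUM_BUFFERS) {
        if (!filtered[*next])
            batch[used++] = *next;  /* this page claims a buffer slot */
        (*next)++;                  /* filtered pages claim nothing */
    }
    return used;                    /* pages handed to worker threads */
}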
>>
>> We should discuss how to implement new mechanism, I'll mention this later.
>>
>>> But I'm not sure if it is worth.
>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>>
>> Basically the faster, the better. There is no obvious target time.
>> If there is room for improvement, we should do it.
>>
>
>Maybe we can improve the performance of "-c -d 31" in some case.

Yes, the buffer is used for -c, -l and -p, not only for -l.
It would be useful to improve that.

>BTW, we can easily get the theoretical performance by using the "--split".

Are you sure ? You persuaded me in the thread below:

  http://lists.infradead.org/pipermail/kexec/2015-June/013881.html

--num-threads is orthogonal to --split, it's better to use both
options since they try to solve different bottlenecks.
That's why I decided to merge your multi thread feature.

However, what you said sounds like --split is a superset of --num-threads.
Do you no longer need the multi thread feature ?


Thanks,
Atsushi Kumagai
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-14  8:26                   ` Atsushi Kumagai
@ 2015-12-14  8:59                     ` "Zhou, Wenjian/周文剑"
  0 siblings, 0 replies; 43+ messages in thread
From: "Zhou, Wenjian/周文剑" @ 2015-12-14  8:59 UTC (permalink / raw)
  To: Atsushi Kumagai; +Cc: kexec@lists.infradead.org

On 12/14/2015 04:26 PM, Atsushi Kumagai wrote:
>>>> Think about this, in a huge memory, most of the page will be filtered, and
>>>> we have 5 buffers.
>>>>
>>>> page1       page2      page3     page4     page5      page6       page7 .....
>>>> [buffer1]   [2]        [3]       [4]       [5]
>>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered  filtered
>>>>
>>>> Since filtered page will take a buffer, when compressing page1,
>>>> page6 can't be compressed at the same time.
>>>> That why it will prevent parallel compression.
>>>
>>> Thanks for your explanation, I understand.
>>> This is just an issue of the current implementation, there is no
>>> reason to stand this restriction.
>>>
>>>>> Further, according to Chao's benchmark, there is a big performance
>>>>> degradation even if the number of thread is 1. (58s vs 240s)
>>>>> The current implementation seems to have some problems, we should
>>>>> solve them.
>>>>>
>>>>
>>>> If "-d 31" is specified, on the one hand we can't save time by compressing
>>>> parallel, on the other hand we will introduce some extra work by adding
>>>> "--num-threads". So it is obvious that it will have a performance degradation.
>>>
>>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive lock),
>>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
>>> too slow, the degradation is too big to be called "some extra work".
>>>
>>> Both --num-threads=0 and --num-threads=1 are serial processing,
>>> the above "buffer fairness issue" will not be related to this degradation.
>>> What do you think what make this degradation ?
>>>
>>
>> I can't get such result at this moment, so I can't do some further investigation
>> right now. I guess it may be caused by the underlying implementation of pthread.
>> I reviewed the test result of the patch v2 and found in different machines,
>> the results are quite different.
>
> Unluckily, I also can't reproduce such big degradation.
> According to the Chao's verification, this issue seems different form
> the "too many page fault issue" that we solved.
> I have no ideas, but at least I want to confirm whether this issue
> is avoidable or not.
>
>> It seems that I can get almost the same result of Chao from "PRIMEQUEST 1800E".
>>
>> ###################################
>> - System: PRIMERGY RX300 S6
>> - CPU: Intel(R) Xeon(R) CPU x5660
>> - memory: 16GB
>> ###################################
>> ************ makedumpfile -d 7 ******************
>>                  core-data       0       256
>>          threads-num
>> -l
>>          0                       10      144
>>          4                       5       110
>>          8                       5       111
>>          12                      6       111
>>
>> ************ makedumpfile -d 31 ******************
>>                  core-data       0       256
>>          threads-num
>> -l
>>          0                       0       0
>>          4                       2       2
>>          8                       2       3
>>          12                      2       3
>>
>> ###################################
>> - System: PRIMEQUEST 1800E
>> - CPU: Intel(R) Xeon(R) CPU E7540
>> - memory: 32GB
>> ###################################
>> ************ makedumpfile -d 7 ******************
>>                  core-data        0       256
>>          threads-num
>> -l
>>          0                        34      270
>>          4                        63      154
>>          8                        64      131
>>          12                       65      159
>>
>> ************ makedumpfile -d 31 ******************
>>                  core-data        0       256
>>          threads-num
>> -l
>>          0                        2       1
>>          4                        48      48
>>          8                        48      49
>>          12                       49      50
>>
>>>> I'm not so sure if it is a problem that the performance degradation is so big.
>>>> But I think if in other cases, it works as expected, this won't be a problem(
>>>> or a problem needs to be fixed), for the performance degradation existing
>>>> in theory.
>>>>
>>>> Or the current implementation should be replaced by a new arithmetic.
>>>> For example:
>>>> We can add an array to record whether the page is filtered or not.
>>>> And only the unfiltered page will take the buffer.
>>>
>>> We should discuss how to implement new mechanism, I'll mention this later.
>>>
>>>> But I'm not sure if it is worth.
>>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much help.
>>>
>>> Basically the faster, the better. There is no obvious target time.
>>> If there is room for improvement, we should do it.
>>>
>>
>> Maybe we can improve the performance of "-c -d 31" in some case.
>
> Yes, the buffer is used for -c, -l and -p, not only for -l.
> It would be useful to improve that.
>
>> BTW, we can easily get the theoretical performance by using the "--split".
>
> Are you sure ? You persuaded me in the thread below:
>
>    http://lists.infradead.org/pipermail/kexec/2015-June/013881.html
>
> --num-threads is orthogonal to --split, it's better to use the both
> option since they try to solve different bottlenecks.
> That's why I decided to merge your multi thread feature.
>
> However, what you said sounds --split is a superset of --num-threads.
> You don't need the multi thread feature ?
>

I just mean the performance.
There is no doubt that we will use multiple threads together with --split in the future.

But as we all know, threads and processes have much in common.
And in makedumpfile, if we use "--split core1 core2 core3 core4" and
"--num-threads 4" separately, the time spent should not differ much.

Since the logic of "--split" is simpler, if we can't improve the performance
of "-l -d 31" with "--split", we don't have much chance to do it with
"--num-threads" either.

That is all I mean.
Of course, --split is not a superset of --num-threads.
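
For reference, the two invocations being compared would look roughly like
this (file names are illustrative):

# makedumpfile --split -l -d 31 vmcore core1 core2 core3 core4
# makedumpfile --num-threads 4 -l -d 31 vmcore core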

-- 
Thanks
Zhou



_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-10 10:54                       ` Chao Fan
@ 2015-12-22  8:32                         ` HATAYAMA Daisuke
  2015-12-24  2:20                           ` Chao Fan
  0 siblings, 1 reply; 43+ messages in thread
From: HATAYAMA Daisuke @ 2015-12-22  8:32 UTC (permalink / raw)
  To: cfan; +Cc: ats-kumagai, zhouwj-fnst, kexec

Chao,

From: Chao Fan <cfan@redhat.com>
Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
Date: Thu, 10 Dec 2015 05:54:28 -0500

> 
> 
> ----- Original Message -----
>> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> To: "Chao Fan" <cfan@redhat.com>
>> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>, kexec@lists.infradead.org
>> Sent: Thursday, December 10, 2015 6:32:32 PM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> 
>> On 12/10/2015 05:58 PM, Chao Fan wrote:
>> >
>> >
>> > ----- Original Message -----
>> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> >> Cc: kexec@lists.infradead.org
>> >> Sent: Thursday, December 10, 2015 5:36:47 PM
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >>
>> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> >>>> Hello Kumagai,
>> >>>>
>> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> >>>>> Hello, Zhou
>> >>>>>
>> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>> >>>>>>> Hi,
>> >>>>>>>
>> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> >>>>>>>> I think there is no problem if other test results are as expected.
>> >>>>>>>>
>> >>>>>>>> --num-threads mainly reduces the time of compressing.
>> >>>>>>>> So for lzo, it can't do much help at most of time.
>> >>>>>>>
>> >>>>>>> Seems the help of --num-threads does not say it exactly:
>> >>>>>>>
>> >>>>>>>       [--num-threads THREADNUM]:
>> >>>>>>>           Using multiple threads to read and compress data of each
>> >>>>>>>           page
>> >>>>>>>           in parallel.
>> >>>>>>>           And it will reduces time for saving DUMPFILE.
>> >>>>>>>           This feature only supports creating DUMPFILE in
>> >>>>>>>           kdump-comressed format from
>> >>>>>>>           VMCORE in kdump-compressed format or elf format.
>> >>>>>>>
>> >>>>>>> Lzo is also a compress method, it should be mentioned that
>> >>>>>>> --num-threads only
>> >>>>>>> supports zlib compressed vmcore.
>> >>>>>>>
>> >>>>>>
>> >>>>>> Sorry, it seems that something I said is not so clear.
>> >>>>>> lzo is also supported. Since lzo compresses data at a high speed, the
>> >>>>>> improving of the performance is not so obvious at most of time.
>> >>>>>>
>> >>>>>>> Also worth to mention about the recommended -d value for this
>> >>>>>>> feature.
>> >>>>>>>
>> >>>>>>
>> >>>>>> Yes, I think it's worth. I forgot it.
>> >>>>>
>> >>>>> I saw your patch, but I think I should confirm what is the problem
>> >>>>> first.
>> >>>>>
>> >>>>>> However, when "-d 31" is specified, it will be worse.
>> >>>>>> Less than 50 buffers are used to cache the compressed page.
>> >>>>>> And even the page has been filtered, it will also take a buffer.
>> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
>> >>>>>> of buffers. Then the page which needs to be compressed can't
>> >>>>>> be compressed parallel.
>> >>>>>
>> >>>>> Could you explain why compression will not be parallel in more detail ?
>> >>>>> Actually the buffers are used also for filtered pages, it sounds
>> >>>>> inefficient.
>> >>>>> However, I don't understand why it prevents parallel compression.
>> >>>>>
>> >>>>
>> >>>> Think about this, in a huge memory, most of the page will be filtered,
>> >>>> and
>> >>>> we have 5 buffers.
>> >>>>
>> >>>> page1       page2      page3     page4     page5      page6       page7
>> >>>> .....
>> >>>> [buffer1]   [2]        [3]       [4]       [5]
>> >>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered
>> >>>> filtered
>> >>>>
>> >>>> Since filtered page will take a buffer, when compressing page1,
>> >>>> page6 can't be compressed at the same time.
>> >>>> That why it will prevent parallel compression.
>> >>>
>> >>> Thanks for your explanation, I understand.
>> >>> This is just an issue of the current implementation, there is no
>> >>> reason to stand this restriction.
>> >>>
>> >>>>> Further, according to Chao's benchmark, there is a big performance
>> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
>> >>>>> The current implementation seems to have some problems, we should
>> >>>>> solve them.
>> >>>>>
>> >>>>
>> >>>> If "-d 31" is specified, on the one hand we can't save time by
>> >>>> compressing
>> >>>> parallel, on the other hand we will introduce some extra work by adding
>> >>>> "--num-threads". So it is obvious that it will have a performance
>> >>>> degradation.
>> >>>
>> >>> Sure, there must be some overhead due to "some extra work"(e.g. exclusive
>> >>> lock),
>> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still sounds
>> >>> too slow, the degradation is too big to be called "some extra work".
>> >>>
>> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> >>> the above "buffer fairness issue" will not be related to this
>> >>> degradation.
>> >>> What do you think what make this degradation ?
>> >>>
>> >>
>> >> I can't get such result at this moment, so I can't do some further
>> >> investigation
>> >> right now. I guess it may be caused by the underlying implementation of
>> >> pthread.
>> >> I reviewed the test result of the patch v2 and found in different
>> >> machines,
>> >> the results are quite different.
>> >
>> > Hi Zhou Wenjian,
>> >
>> > I have done more tests in another machine with 128G memory, and get the
>> > result:
>> >
>> > the size of vmcore is 300M in "-d 31"
>> > makedumpfile -l --message-level 1 -d 31:
>> > time: 8.6s      page-faults: 2272
>> >
>> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
>> > time: 28.1s     page-faults: 2359
>> >
>> >
>> > and the size of vmcore is 2.6G in "-d 0".
>> > In this machine, I get the same result as yours:
>> >
>> >
>> > makedumpfile -c --message-level 1 -d 0:
>> > time: 597s      page-faults: 2287
>> >
>> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
>> > time: 602s      page-faults: 2361
>> >
>> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
>> > time: 337s      page-faults: 2397
>> >
>> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
>> > time: 175s      page-faults: 2461
>> >
>> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
>> > time: 103s      page-faults: 2611
>> >
>> >
>> > But the machine of my first test is not under my control, should I wait for
>> > the first machine to do more tests?
>> > If there are still some problems in my tests, please tell me.
>> >
>> 
>> Thanks a lot for your test, it seems that there is nothing wrong.
>> And I haven't got any idea about more tests...
>> 
>> Could you provide the information of your cpu ?
>> I will do some further investigation later.
>> 
> 
> OK, of course, here is the information of cpu:
> 
> # lscpu
> Architecture:          x86_64
> CPU op-mode(s):        32-bit, 64-bit
> Byte Order:            Little Endian
> CPU(s):                48
> On-line CPU(s) list:   0-47
> Thread(s) per core:    1
> Core(s) per socket:    6
> Socket(s):             8
> NUMA node(s):          8
> Vendor ID:             AuthenticAMD
> CPU family:            16
> Model:                 8
> Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
> Stepping:              0
> CPU MHz:               2793.040
> BogoMIPS:              5586.22
> Virtualization:        AMD-V
> L1d cache:             64K
> L1i cache:             64K
> L2 cache:              512K
> L3 cache:              5118K
> NUMA node0 CPU(s):     0,8,16,24,32,40
> NUMA node1 CPU(s):     1,9,17,25,33,41
> NUMA node2 CPU(s):     2,10,18,26,34,42
> NUMA node3 CPU(s):     3,11,19,27,35,43
> NUMA node4 CPU(s):     4,12,20,28,36,44
> NUMA node5 CPU(s):     5,13,21,29,37,45
> NUMA node6 CPU(s):     6,14,22,30,38,46
> NUMA node7 CPU(s):     7,15,23,31,39,47

This CPU assignment on NUMA nodes looks interesting. Is it possible
that this affects the performance of makedumpfile? This is just a guess.

Could you check whether the performance gets improved if you run each
thread on the same NUMA node? For example:

  # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0

Also, if this were the cause of this performance degradation, we might
need to extend the nr_cpus= kernel option to choose which NUMA nodes we
use; though, we might already be able to do that in combination with
other kernel features.
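
If pinning turns out to matter, the same placement could also be tried from
inside the program. A minimal sketch using the glibc-specific
pthread_setaffinity_np (the worker function and the CPU list are
illustrative, this is not part of the posted patches):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

/* the real program would compress pages here */
static void *worker(void *arg) { (void)arg; return NULL; }

int main(void)
{
    int node0_cpus[] = {0, 8, 16, 24};  /* node0 CPUs from the lscpu output */
    pthread_t tid[4];

    for (int i = 0; i < 4; i++) {
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(node0_cpus[i], &set);
        if (pthread_create(&tid[i], NULL, worker, NULL) != 0)
            return 1;
        /* pthread_setaffinity_np returns an errno value, 0 on success */
        if (pthread_setaffinity_np(tid[i], sizeof(set), &set) != 0)
            fprintf(stderr, "failed to pin thread %d\n", i);
    }
    for (int i = 0; i < 4; i++)
        pthread_join(tid[i], NULL);
    return 0;
}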

> Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt hw_pstate npt lbrv svm_lock nrip_save pausefilter vmmcall
> 
>> But I still believe it's better not to use "-l -d 31" and "--num-threads"
>> at the same time, though it's very strange that the performance
>> degradation is so big.
>> 
>> --
>> Thanks
>> Zhou
>> 
>> > Thanks,
>> > Chao Fan
>> >
>> >
>> >>
>> >> It seems that I can get almost the same result of Chao from "PRIMEQUEST
>> >> 1800E".
>> >>
>> >> ###################################
>> >> - System: PRIMERGY RX300 S6
>> >> - CPU: Intel(R) Xeon(R) CPU x5660
>> >> - memory: 16GB
>> >> ###################################
>> >> ************ makedumpfile -d 7 ******************
>> >>                   core-data       0       256
>> >>           threads-num
>> >> -l
>> >>           0                       10      144
>> >>           4                       5       110
>> >>           8                       5       111
>> >>           12                      6       111
>> >>
>> >> ************ makedumpfile -d 31 ******************
>> >>                   core-data       0       256
>> >>           threads-num
>> >> -l
>> >>           0                       0       0
>> >>           4                       2       2
>> >>           8                       2       3
>> >>           12                      2       3
>> >>
>> >> ###################################
>> >> - System: PRIMEQUEST 1800E
>> >> - CPU: Intel(R) Xeon(R) CPU E7540
>> >> - memory: 32GB
>> >> ###################################
>> >> ************ makedumpfile -d 7 ******************
>> >>                   core-data        0       256
>> >>           threads-num
>> >> -l
>> >>           0                        34      270
>> >>           4                        63      154
>> >>           8                        64      131
>> >>           12                       65      159
>> >>
>> >> ************ makedumpfile -d 31 ******************
>> >>                   core-data        0       256
>> >>           threads-num
>> >> -l
>> >>           0                        2       1
>> >>           4                        48      48
>> >>           8                        48      49
>> >>           12                       49      50
>> >>
>> >>>> I'm not so sure if it is a problem that the performance degradation is
>> >>>> so
>> >>>> big.
>> >>>> But I think if in other cases, it works as expected, this won't be a
>> >>>> problem(
>> >>>> or a problem needs to be fixed), for the performance degradation
>> >>>> existing
>> >>>> in theory.
>> >>>>
>> >>>> Or the current implementation should be replaced by a new arithmetic.
>> >>>> For example:
>> >>>> We can add an array to record whether the page is filtered or not.
>> >>>> And only the unfiltered page will take the buffer.
>> >>>
>> >>> We should discuss how to implement new mechanism, I'll mention this
>> >>> later.
>> >>>
>> >>>> But I'm not sure if it is worth.
>> >>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much
>> >>>> help.
>> >>>
>> >>> Basically the faster, the better. There is no obvious target time.
>> >>> If there is room for improvement, we should do it.
>> >>>
>> >>
>> >> Maybe we can improve the performance of "-c -d 31" in some case.
>> >>
>> >> BTW, we can easily get the theoretical performance by using the "--split".
>> >>
>> >> --
>> >> Thanks
>> >> Zhou
>> >>
>> >>
>> >>
>> >>
>> 
>> 
>> 
>> 
>> 
> 
--
Thanks.
HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-22  8:32                         ` HATAYAMA Daisuke
@ 2015-12-24  2:20                           ` Chao Fan
  2015-12-24  3:22                             ` HATAYAMA Daisuke
  0 siblings, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-24  2:20 UTC (permalink / raw)
  To: HATAYAMA Daisuke; +Cc: ats-kumagai, zhouwj-fnst, kexec



----- Original Message -----
> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> To: cfan@redhat.com
> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
> Sent: Tuesday, December 22, 2015 4:32:25 PM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> 
> Chao,
> 
> From: Chao Fan <cfan@redhat.com>
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> Date: Thu, 10 Dec 2015 05:54:28 -0500
> 
> > 
> > 
> > ----- Original Message -----
> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> To: "Chao Fan" <cfan@redhat.com>
> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
> >> kexec@lists.infradead.org
> >> Sent: Thursday, December 10, 2015 6:32:32 PM
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> 
> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
> >> >
> >> >
> >> > ----- Original Message -----
> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> >> >> Cc: kexec@lists.infradead.org
> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >>
> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >> >>>> Hello Kumagai,
> >> >>>>
> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >> >>>>> Hello, Zhou
> >> >>>>>
> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >> >>>>>>> Hi,
> >> >>>>>>>
> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >> >>>>>>>> I think there is no problem if other test results are as
> >> >>>>>>>> expected.
> >> >>>>>>>>
> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >> >>>>>>>> So for lzo, it can't do much help at most of time.
> >> >>>>>>>
> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
> >> >>>>>>>
> >> >>>>>>>       [--num-threads THREADNUM]:
> >> >>>>>>>           Using multiple threads to read and compress data of each
> >> >>>>>>>           page
> >> >>>>>>>           in parallel.
> >> >>>>>>>           And it will reduces time for saving DUMPFILE.
> >> >>>>>>>           This feature only supports creating DUMPFILE in
> >> >>>>>>>           kdump-comressed format from
> >> >>>>>>>           VMCORE in kdump-compressed format or elf format.
> >> >>>>>>>
> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
> >> >>>>>>> --num-threads only
> >> >>>>>>> supports zlib compressed vmcore.
> >> >>>>>>>
> >> >>>>>>
> >> >>>>>> Sorry, it seems that something I said is not so clear.
> >> >>>>>> lzo is also supported. Since lzo compresses data at a high speed,
> >> >>>>>> the
> >> >>>>>> improving of the performance is not so obvious at most of time.
> >> >>>>>>
> >> >>>>>>> Also worth to mention about the recommended -d value for this
> >> >>>>>>> feature.
> >> >>>>>>>
> >> >>>>>>
> >> >>>>>> Yes, I think it's worth. I forgot it.
> >> >>>>>
> >> >>>>> I saw your patch, but I think I should confirm what is the problem
> >> >>>>> first.
> >> >>>>>
> >> >>>>>> However, when "-d 31" is specified, it will be worse.
> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >> >>>>>> And even the page has been filtered, it will also take a buffer.
> >> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
> >> >>>>>> of buffers. Then the page which needs to be compressed can't
> >> >>>>>> be compressed parallel.
> >> >>>>>
> >> >>>>> Could you explain why compression will not be parallel in more
> >> >>>>> detail ?
> >> >>>>> Actually the buffers are used also for filtered pages, it sounds
> >> >>>>> inefficient.
> >> >>>>> However, I don't understand why it prevents parallel compression.
> >> >>>>>
> >> >>>>
> >> >>>> Think about this, in a huge memory, most of the page will be
> >> >>>> filtered,
> >> >>>> and
> >> >>>> we have 5 buffers.
> >> >>>>
> >> >>>> page1       page2      page3     page4     page5      page6
> >> >>>> page7
> >> >>>> .....
> >> >>>> [buffer1]   [2]        [3]       [4]       [5]
> >> >>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered
> >> >>>> filtered
> >> >>>>
> >> >>>> Since filtered page will take a buffer, when compressing page1,
> >> >>>> page6 can't be compressed at the same time.
> >> >>>> That why it will prevent parallel compression.
> >> >>>
> >> >>> Thanks for your explanation, I understand.
> >> >>> This is just an issue of the current implementation, there is no
> >> >>> reason to stand this restriction.
> >> >>>
> >> >>>>> Further, according to Chao's benchmark, there is a big performance
> >> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
> >> >>>>> The current implementation seems to have some problems, we should
> >> >>>>> solve them.
> >> >>>>>
> >> >>>>
> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >> >>>> compressing
> >> >>>> parallel, on the other hand we will introduce some extra work by
> >> >>>> adding
> >> >>>> "--num-threads". So it is obvious that it will have a performance
> >> >>>> degradation.
> >> >>>
> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
> >> >>> exclusive
> >> >>> lock),
> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still
> >> >>> sounds
> >> >>> too slow, the degradation is too big to be called "some extra work".
> >> >>>
> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
> >> >>> the above "buffer fairness issue" will not be related to this
> >> >>> degradation.
> >> >>> What do you think what make this degradation ?
> >> >>>
> >> >>
> >> >> I can't get such result at this moment, so I can't do some further
> >> >> investigation
> >> >> right now. I guess it may be caused by the underlying implementation of
> >> >> pthread.
> >> >> I reviewed the test result of the patch v2 and found in different
> >> >> machines,
> >> >> the results are quite different.
> >> >
> >> > Hi Zhou Wenjian,
> >> >
> >> > I have done more tests in another machine with 128G memory, and get the
> >> > result:
> >> >
> >> > the size of vmcore is 300M in "-d 31"
> >> > makedumpfile -l --message-level 1 -d 31:
> >> > time: 8.6s      page-faults: 2272
> >> >
> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> >> > time: 28.1s     page-faults: 2359
> >> >
> >> >
> >> > and the size of vmcore is 2.6G in "-d 0".
> >> > In this machine, I get the same result as yours:
> >> >
> >> >
> >> > makedumpfile -c --message-level 1 -d 0:
> >> > time: 597s      page-faults: 2287
> >> >
> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> >> > time: 602s      page-faults: 2361
> >> >
> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> >> > time: 337s      page-faults: 2397
> >> >
> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> >> > time: 175s      page-faults: 2461
> >> >
> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> >> > time: 103s      page-faults: 2611
> >> >
> >> >
> >> > But the machine of my first test is not under my control, should I wait
> >> > for
> >> > the first machine to do more tests?
> >> > If there are still some problems in my tests, please tell me.
> >> >
> >> 
> >> Thanks a lot for your test, it seems that there is nothing wrong.
> >> And I haven't got any idea about more tests...
> >> 
> >> Could you provide the information of your cpu ?
> >> I will do some further investigation later.
> >> 
> > 
> > OK, of course, here is the information of cpu:
> > 
> > # lscpu
> > Architecture:          x86_64
> > CPU op-mode(s):        32-bit, 64-bit
> > Byte Order:            Little Endian
> > CPU(s):                48
> > On-line CPU(s) list:   0-47
> > Thread(s) per core:    1
> > Core(s) per socket:    6
> > Socket(s):             8
> > NUMA node(s):          8
> > Vendor ID:             AuthenticAMD
> > CPU family:            16
> > Model:                 8
> > Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
> > Stepping:              0
> > CPU MHz:               2793.040
> > BogoMIPS:              5586.22
> > Virtualization:        AMD-V
> > L1d cache:             64K
> > L1i cache:             64K
> > L2 cache:              512K
> > L3 cache:              5118K
> > NUMA node0 CPU(s):     0,8,16,24,32,40
> > NUMA node1 CPU(s):     1,9,17,25,33,41
> > NUMA node2 CPU(s):     2,10,18,26,34,42
> > NUMA node3 CPU(s):     3,11,19,27,35,43
> > NUMA node4 CPU(s):     4,12,20,28,36,44
> > NUMA node5 CPU(s):     5,13,21,29,37,45
> > NUMA node6 CPU(s):     6,14,22,30,38,46
> > NUMA node7 CPU(s):     7,15,23,31,39,47
> 
> This CPU assignment on NUMA nodes looks interesting. Is it possible
> that this affects performance of makedumpfile? This is just a guess.
> 
> Could you check whether the performance gets imporoved if you run each
> thread on the same NUMA node? For example:
> 
>   # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>   vmcore-cd0
> 
Hi HATAYAMA,

I think your guess is right, but your command may have a small problem.

From my tests, NUMA placement did affect the performance, but not by much.
The average time with all CPUs on the same NUMA node:
# taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
is 314s
The average time with CPUs spread across different NUMA nodes:
# taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
is 354s

But if you want to use "--num-threads 4", the CPU list following "taskset -c"
should contain at least 5 CPUs (presumably the main thread plus the four
worker threads), otherwise the run takes far too long.

Thanks,
Chao Fan

> Also, if this were cause of this performance degradation, we might
> need to extend nr_cpus= kernel option to choose NUMA nodes we use;
> though, we might already be able to do that in combination with other
> kernel features.
> 
> > Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
> > mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall mmxext fxsr_opt
> > pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc
> > extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic
> > cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt
> > hw_pstate npt lbrv svm_lock nrip_save pausefilter vmmcall
> > 
> >> But I still believe it's better not to use "-l -d 31" and "--num-threads"
> >> at the same time, though it's very strange that the performance
> >> degradation is so big.
> >> 
> >> --
> >> Thanks
> >> Zhou
> >> 
> >> > Thanks,
> >> > Chao Fan
> >> >
> >> >
> >> >>
> >> >> It seems that I can get almost the same result of Chao from "PRIMEQUEST
> >> >> 1800E".
> >> >>
> >> >> ###################################
> >> >> - System: PRIMERGY RX300 S6
> >> >> - CPU: Intel(R) Xeon(R) CPU x5660
> >> >> - memory: 16GB
> >> >> ###################################
> >> >> ************ makedumpfile -d 7 ******************
> >> >>                   core-data       0       256
> >> >>           threads-num
> >> >> -l
> >> >>           0                       10      144
> >> >>           4                       5       110
> >> >>           8                       5       111
> >> >>           12                      6       111
> >> >>
> >> >> ************ makedumpfile -d 31 ******************
> >> >>                   core-data       0       256
> >> >>           threads-num
> >> >> -l
> >> >>           0                       0       0
> >> >>           4                       2       2
> >> >>           8                       2       3
> >> >>           12                      2       3
> >> >>
> >> >> ###################################
> >> >> - System: PRIMEQUEST 1800E
> >> >> - CPU: Intel(R) Xeon(R) CPU E7540
> >> >> - memory: 32GB
> >> >> ###################################
> >> >> ************ makedumpfile -d 7 ******************
> >> >>                   core-data        0       256
> >> >>           threads-num
> >> >> -l
> >> >>           0                        34      270
> >> >>           4                        63      154
> >> >>           8                        64      131
> >> >>           12                       65      159
> >> >>
> >> >> ************ makedumpfile -d 31 ******************
> >> >>                   core-data        0       256
> >> >>           threads-num
> >> >> -l
> >> >>           0                        2       1
> >> >>           4                        48      48
> >> >>           8                        48      49
> >> >>           12                       49      50
> >> >>
> >> >>>> I'm not so sure if it is a problem that the performance degradation
> >> >>>> is
> >> >>>> so
> >> >>>> big.
> >> >>>> But I think if in other cases, it works as expected, this won't be a
> >> >>>> problem(
> >> >>>> or a problem needs to be fixed), for the performance degradation
> >> >>>> existing
> >> >>>> in theory.
> >> >>>>
> >> >>>> Or the current implementation should be replaced by a new arithmetic.
> >> >>>> For example:
> >> >>>> We can add an array to record whether the page is filtered or not.
> >> >>>> And only the unfiltered page will take the buffer.
> >> >>>
> >> >>> We should discuss how to implement new mechanism, I'll mention this
> >> >>> later.
> >> >>>
> >> >>>> But I'm not sure if it is worth.
> >> >>>> For "-l -d 31" is fast enough, the new arithmetic also can't do much
> >> >>>> help.
> >> >>>
> >> >>> Basically the faster, the better. There is no obvious target time.
> >> >>> If there is room for improvement, we should do it.
> >> >>>
> >> >>
> >> >> Maybe we can improve the performance of "-c -d 31" in some case.
> >> >>
> >> >> BTW, we can easily get the theoretical performance by using the
> >> >> "--split".
> >> >>
> >> >> --
> >> >> Thanks
> >> >> Zhou
> >> >>
> >> >>
> >> >>
> >> >>
> >> 
> >> 
> >> 
> >> 
> >> 
> > 
> --
> Thanks.
> HATAYAMA, Daisuke
>

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-24  2:20                           ` Chao Fan
@ 2015-12-24  3:22                             ` HATAYAMA Daisuke
  2015-12-24  3:31                               ` Chao Fan
  0 siblings, 1 reply; 43+ messages in thread
From: HATAYAMA Daisuke @ 2015-12-24  3:22 UTC (permalink / raw)
  To: cfan; +Cc: ats-kumagai, zhouwj-fnst, kexec

From: Chao Fan <cfan@redhat.com>
Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
Date: Wed, 23 Dec 2015 21:20:48 -0500

> 
> 
> ----- Original Message -----
>> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> To: cfan@redhat.com
>> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
>> Sent: Tuesday, December 22, 2015 4:32:25 PM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> 
>> Chao,
>> 
>> From: Chao Fan <cfan@redhat.com>
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> Date: Thu, 10 Dec 2015 05:54:28 -0500
>> 
>> > 
>> > 
>> > ----- Original Message -----
>> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> To: "Chao Fan" <cfan@redhat.com>
>> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
>> >> kexec@lists.infradead.org
>> >> Sent: Thursday, December 10, 2015 6:32:32 PM
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> 
>> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
>> >> >
>> >> >
>> >> > ----- Original Message -----
>> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> >> >> Cc: kexec@lists.infradead.org
>> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
>> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >>
>> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> >> >>>> Hello Kumagai,
>> >> >>>>
>> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> >> >>>>> Hello, Zhou
>> >> >>>>>
>> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>> >> >>>>>>> Hi,
>> >> >>>>>>>
>> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> >> >>>>>>>> I think there is no problem if other test results are as
>> >> >>>>>>>> expected.
>> >> >>>>>>>>
>> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
>> >> >>>>>>>> So for lzo, it can't do much help at most of time.
>> >> >>>>>>>
>> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
>> >> >>>>>>>
>> >> >>>>>>>       [--num-threads THREADNUM]:
>> >> >>>>>>>           Using multiple threads to read and compress data of each
>> >> >>>>>>>           page
>> >> >>>>>>>           in parallel.
>> >> >>>>>>>           And it will reduces time for saving DUMPFILE.
>> >> >>>>>>>           This feature only supports creating DUMPFILE in
>> >> >>>>>>>           kdump-comressed format from
>> >> >>>>>>>           VMCORE in kdump-compressed format or elf format.
>> >> >>>>>>>
>> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
>> >> >>>>>>> --num-threads only
>> >> >>>>>>> supports zlib compressed vmcore.
>> >> >>>>>>>
>> >> >>>>>>
>> >> >>>>>> Sorry, it seems that something I said is not so clear.
>> >> >>>>>> lzo is also supported. Since lzo compresses data at a high speed,
>> >> >>>>>> the
>> >> >>>>>> improving of the performance is not so obvious at most of time.
>> >> >>>>>>
>> >> >>>>>>> Also worth to mention about the recommended -d value for this
>> >> >>>>>>> feature.
>> >> >>>>>>>
>> >> >>>>>>
>> >> >>>>>> Yes, I think it's worth. I forgot it.
>> >> >>>>>
>> >> >>>>> I saw your patch, but I think I should confirm what is the problem
>> >> >>>>> first.
>> >> >>>>>
>> >> >>>>>> However, when "-d 31" is specified, it will be worse.
>> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
>> >> >>>>>> And even the page has been filtered, it will also take a buffer.
>> >> >>>>>> So if "-d 31" is specified, the filtered page will use a lot
>> >> >>>>>> of buffers. Then the page which needs to be compressed can't
>> >> >>>>>> be compressed parallel.
>> >> >>>>>
>> >> >>>>> Could you explain why compression will not be parallel in more
>> >> >>>>> detail ?
>> >> >>>>> Actually the buffers are used also for filtered pages, it sounds
>> >> >>>>> inefficient.
>> >> >>>>> However, I don't understand why it prevents parallel compression.
>> >> >>>>>
>> >> >>>>
>> >> >>>> Think about this, in a huge memory, most of the page will be
>> >> >>>> filtered,
>> >> >>>> and
>> >> >>>> we have 5 buffers.
>> >> >>>>
>> >> >>>> page1       page2      page3     page4     page5      page6
>> >> >>>> page7
>> >> >>>> .....
>> >> >>>> [buffer1]   [2]        [3]       [4]       [5]
>> >> >>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered
>> >> >>>> filtered
>> >> >>>>
>> >> >>>> Since filtered page will take a buffer, when compressing page1,
>> >> >>>> page6 can't be compressed at the same time.
>> >> >>>> That why it will prevent parallel compression.
>> >> >>>
>> >> >>> Thanks for your explanation, I understand.
>> >> >>> This is just an issue of the current implementation, there is no
>> >> >>> reason to stand this restriction.
>> >> >>>
>> >> >>>>> Further, according to Chao's benchmark, there is a big performance
>> >> >>>>> degradation even if the number of thread is 1. (58s vs 240s)
>> >> >>>>> The current implementation seems to have some problems, we should
>> >> >>>>> solve them.
>> >> >>>>>
>> >> >>>>
>> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
>> >> >>>> compressing
>> >> >>>> parallel, on the other hand we will introduce some extra work by
>> >> >>>> adding
>> >> >>>> "--num-threads". So it is obvious that it will have a performance
>> >> >>>> degradation.
>> >> >>>
>> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
>> >> >>> exclusive
>> >> >>> lock),
>> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still
>> >> >>> sounds
>> >> >>> too slow, the degradation is too big to be called "some extra work".
>> >> >>>
>> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> >> >>> the above "buffer fairness issue" will not be related to this
>> >> >>> degradation.
>> >> >>> What do you think what make this degradation ?
>> >> >>>
>> >> >>
>> >> >> I can't get such result at this moment, so I can't do some further
>> >> >> investigation
>> >> >> right now. I guess it may be caused by the underlying implementation of
>> >> >> pthread.
>> >> >> I reviewed the test result of the patch v2 and found in different
>> >> >> machines,
>> >> >> the results are quite different.
>> >> >
>> >> > Hi Zhou Wenjian,
>> >> >
>> >> > I have done more tests in another machine with 128G memory, and get the
>> >> > result:
>> >> >
>> >> > the size of vmcore is 300M in "-d 31"
>> >> > makedumpfile -l --message-level 1 -d 31:
>> >> > time: 8.6s      page-faults: 2272
>> >> >
>> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
>> >> > time: 28.1s     page-faults: 2359
>> >> >
>> >> >
>> >> > and the size of vmcore is 2.6G in "-d 0".
>> >> > In this machine, I get the same result as yours:
>> >> >
>> >> >
>> >> > makedumpfile -c --message-level 1 -d 0:
>> >> > time: 597s      page-faults: 2287
>> >> >
>> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
>> >> > time: 602s      page-faults: 2361
>> >> >
>> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
>> >> > time: 337s      page-faults: 2397
>> >> >
>> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
>> >> > time: 175s      page-faults: 2461
>> >> >
>> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
>> >> > time: 103s      page-faults: 2611
>> >> >
>> >> >
>> >> > But the machine of my first test is not under my control, should I wait
>> >> > for
>> >> > the first machine to do more tests?
>> >> > If there are still some problems in my tests, please tell me.
>> >> >
>> >> 
>> >> Thanks a lot for your test, it seems that there is nothing wrong.
>> >> And I haven't got any idea about more tests...
>> >> 
>> >> Could you provide the information of your cpu ?
>> >> I will do some further investigation later.
>> >> 
>> > 
>> > OK, of course, here is the information of cpu:
>> > 
>> > # lscpu
>> > Architecture:          x86_64
>> > CPU op-mode(s):        32-bit, 64-bit
>> > Byte Order:            Little Endian
>> > CPU(s):                48
>> > On-line CPU(s) list:   0-47
>> > Thread(s) per core:    1
>> > Core(s) per socket:    6
>> > Socket(s):             8
>> > NUMA node(s):          8
>> > Vendor ID:             AuthenticAMD
>> > CPU family:            16
>> > Model:                 8
>> > Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
>> > Stepping:              0
>> > CPU MHz:               2793.040
>> > BogoMIPS:              5586.22
>> > Virtualization:        AMD-V
>> > L1d cache:             64K
>> > L1i cache:             64K
>> > L2 cache:              512K
>> > L3 cache:              5118K
>> > NUMA node0 CPU(s):     0,8,16,24,32,40
>> > NUMA node1 CPU(s):     1,9,17,25,33,41
>> > NUMA node2 CPU(s):     2,10,18,26,34,42
>> > NUMA node3 CPU(s):     3,11,19,27,35,43
>> > NUMA node4 CPU(s):     4,12,20,28,36,44
>> > NUMA node5 CPU(s):     5,13,21,29,37,45
>> > NUMA node6 CPU(s):     6,14,22,30,38,46
>> > NUMA node7 CPU(s):     7,15,23,31,39,47
>> 
>> This CPU assignment on NUMA nodes looks interesting. Is it possible
>> that this affects performance of makedumpfile? This is just a guess.
>> 
>> Could you check whether the performance gets improved if you run each
>> thread on the same NUMA node? For example:
>> 
>>   # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>>   vmcore-cd0
>> 
> Hi HATAYAMA,
> 
> I think your guess is right, but maybe your command has a little problem.
> 
> From my test, the NUMA did affect the performance, but not too much.
> The average time of cpus in the same NUMA node: 
> # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
> is 314s
> The average time of cpus in different NUMA node:
> # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore vmcore-cd0
> is 354s
>

Hmm, according to some previous discussion, what we should see here is
whether it affects performance of makedumpfile with --num-threads 1
and -d 31. So you need to compare:

    # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31

with:

    # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31

Also, I'm assuming that you've done these benchmarks on the kdump 1st
kernel, not the kdump 2nd kernel. Is this correct?

> But I think if you want to use "--num-threads 4", the --cpu-list numbers
> following "taskset -c" should list at least 5 cpus, otherwise the time will be too
> long.
> 

I see.

--
Thanks.
HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-24  3:22                             ` HATAYAMA Daisuke
@ 2015-12-24  3:31                               ` Chao Fan
  2015-12-24  3:50                                 ` HATAYAMA Daisuke
  0 siblings, 1 reply; 43+ messages in thread
From: Chao Fan @ 2015-12-24  3:31 UTC (permalink / raw)
  To: HATAYAMA Daisuke; +Cc: ats-kumagai, zhouwj-fnst, kexec



----- Original Message -----
> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> To: cfan@redhat.com
> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
> Sent: Thursday, December 24, 2015 11:22:28 AM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> 
> From: Chao Fan <cfan@redhat.com>
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> Date: Wed, 23 Dec 2015 21:20:48 -0500
> 
> > 
> > 
> > ----- Original Message -----
> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> >> To: cfan@redhat.com
> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
> >> kexec@lists.infradead.org
> >> Sent: Tuesday, December 22, 2015 4:32:25 PM
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> 
> >> Chao,
> >> 
> >> From: Chao Fan <cfan@redhat.com>
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> Date: Thu, 10 Dec 2015 05:54:28 -0500
> >> 
> >> > 
> >> > 
> >> > ----- Original Message -----
> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> To: "Chao Fan" <cfan@redhat.com>
> >> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
> >> >> kexec@lists.infradead.org
> >> >> Sent: Thursday, December 10, 2015 6:32:32 PM
> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> 
> >> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
> >> >> >
> >> >> >
> >> >> > ----- Original Message -----
> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> >> >> >> Cc: kexec@lists.infradead.org
> >> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> >>
> >> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >> >> >>>> Hello Kumagai,
> >> >> >>>>
> >> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >> >> >>>>> Hello, Zhou
> >> >> >>>>>
> >> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >> >> >>>>>>> Hi,
> >> >> >>>>>>>
> >> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >> >> >>>>>>>> I think there is no problem if other test results are as
> >> >> >>>>>>>> expected.
> >> >> >>>>>>>>
> >> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >> >> >>>>>>>> So for lzo, it can't help much most of the time.
> >> >> >>>>>>>
> >> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
> >> >> >>>>>>>
> >> >> >>>>>>>       [--num-threads THREADNUM]:
> >> >> >>>>>>>           Using multiple threads to read and compress data of
> >> >> >>>>>>>           each
> >> >> >>>>>>>           page
> >> >> >>>>>>>           in parallel.
> >> >> >>>>>>>           And it will reduces time for saving DUMPFILE.
> >> >> >>>>>>>           This feature only supports creating DUMPFILE in
> >> >> >>>>>>>           kdump-comressed format from
> >> >> >>>>>>>           VMCORE in kdump-compressed format or elf format.
> >> >> >>>>>>>
> >> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
> >> >> >>>>>>> --num-threads only
> >> >> >>>>>>> supports zlib compressed vmcore.
> >> >> >>>>>>>
> >> >> >>>>>>
> >> >> >>>>>> Sorry, it seems that something I said is not so clear.
> >> >> >>>>>> lzo is also supported. Since lzo compresses data at a high
> >> >> >>>>>> speed,
> >> >> >>>>>> the
> >> >> >>>>>> improvement of the performance is not so obvious most of the time.
> >> >> >>>>>>
> >> >> >>>>>>> Also worth to mention about the recommended -d value for this
> >> >> >>>>>>> feature.
> >> >> >>>>>>>
> >> >> >>>>>>
> >> >> >>>>>> Yes, I think it's worth. I forgot it.
> >> >> >>>>>
> >> >> >>>>> I saw your patch, but I think I should confirm what is the
> >> >> >>>>> problem
> >> >> >>>>> first.
> >> >> >>>>>
> >> >> >>>>>> However, when "-d 31" is specified, it will be worse.
> >> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >> >> >>>>>> And even if the page has been filtered, it will also take a buffer.
> >> >> >>>>>> So if "-d 31" is specified, the filtered pages will use a lot
> >> >> >>>>>> of buffers. Then the pages which need to be compressed can't
> >> >> >>>>>> be compressed in parallel.
> >> >> >>>>>
> >> >> >>>>> Could you explain why compression will not be parallel in more
> >> >> >>>>> detail ?
> >> >> >>>>> Actually the buffers are used also for filtered pages, it sounds
> >> >> >>>>> inefficient.
> >> >> >>>>> However, I don't understand why it prevents parallel compression.
> >> >> >>>>>
> >> >> >>>>
> >> >> >>>> Think about this: in a huge memory, most of the pages will be
> >> >> >>>> filtered,
> >> >> >>>> and
> >> >> >>>> we have 5 buffers.
> >> >> >>>>
> >> >> >>>> page1       page2      page3     page4     page5      page6
> >> >> >>>> page7
> >> >> >>>> .....
> >> >> >>>> [buffer1]   [2]        [3]       [4]       [5]
> >> >> >>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered
> >> >> >>>> filtered
> >> >> >>>>
> >> >> >>>> Since a filtered page will take a buffer, when compressing page1,
> >> >> >>>> page6 can't be compressed at the same time.
> >> >> >>>> That's why it prevents parallel compression.
> >> >> >>>
> >> >> >>> Thanks for your explanation, I understand.
> >> >> >>> This is just an issue of the current implementation, there is no
> >> >> >>> reason to stand this restriction.
> >> >> >>>
> >> >> >>>>> Further, according to Chao's benchmark, there is a big
> >> >> >>>>> performance
> >> >> >>>>> degradation even if the number of threads is 1. (58s vs 240s)
> >> >> >>>>> The current implementation seems to have some problems, we should
> >> >> >>>>> solve them.
> >> >> >>>>>
> >> >> >>>>
> >> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >> >> >>>> compressing in
> >> >> >>>> parallel, on the other hand we will introduce some extra work by
> >> >> >>>> adding
> >> >> >>>> "--num-threads". So it is obvious that it will have a performance
> >> >> >>>> degradation.
> >> >> >>>
> >> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
> >> >> >>> exclusive
> >> >> >>> lock),
> >> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still
> >> >> >>> sounds
> >> >> >>> too slow, the degradation is too big to be called "some extra
> >> >> >>> work".
> >> >> >>>
> >> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
> >> >> >>> the above "buffer fairness issue" will not be related to this
> >> >> >>> degradation.
> >> >> >>> What do you think causes this degradation?
> >> >> >>>
> >> >> >>
> >> >> >> I can't get such result at this moment, so I can't do some further
> >> >> >> investigation
> >> >> >> right now. I guess it may be caused by the underlying implementation
> >> >> >> of
> >> >> >> pthread.
> >> >> >> I reviewed the test result of the patch v2 and found in different
> >> >> >> machines,
> >> >> >> the results are quite different.
> >> >> >
> >> >> > Hi Zhou Wenjian,
> >> >> >
> >> >> > I have done more tests in another machine with 128G memory, and get
> >> >> > the
> >> >> > result:
> >> >> >
> >> >> > the size of vmcore is 300M in "-d 31"
> >> >> > makedumpfile -l --message-level 1 -d 31:
> >> >> > time: 8.6s      page-faults: 2272
> >> >> >
> >> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> >> >> > time: 28.1s     page-faults: 2359
> >> >> >
> >> >> >
> >> >> > and the size of vmcore is 2.6G in "-d 0".
> >> >> > In this machine, I get the same result as yours:
> >> >> >
> >> >> >
> >> >> > makedumpfile -c --message-level 1 -d 0:
> >> >> > time: 597s      page-faults: 2287
> >> >> >
> >> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> >> >> > time: 602s      page-faults: 2361
> >> >> >
> >> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> >> >> > time: 337s      page-faults: 2397
> >> >> >
> >> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> >> >> > time: 175s      page-faults: 2461
> >> >> >
> >> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> >> >> > time: 103s      page-faults: 2611
> >> >> >
> >> >> >
> >> >> > But the machine of my first test is not under my control, should I
> >> >> > wait
> >> >> > for
> >> >> > the first machine to do more tests?
> >> >> > If there are still some problems in my tests, please tell me.
> >> >> >
> >> >> 
> >> >> Thanks a lot for your test, it seems that there is nothing wrong.
> >> >> And I haven't got any idea about more tests...
> >> >> 
> >> >> Could you provide the information of your cpu ?
> >> >> I will do some further investigation later.
> >> >> 
> >> > 
> >> > OK, of course, here is the information of cpu:
> >> > 
> >> > # lscpu
> >> > Architecture:          x86_64
> >> > CPU op-mode(s):        32-bit, 64-bit
> >> > Byte Order:            Little Endian
> >> > CPU(s):                48
> >> > On-line CPU(s) list:   0-47
> >> > Thread(s) per core:    1
> >> > Core(s) per socket:    6
> >> > Socket(s):             8
> >> > NUMA node(s):          8
> >> > Vendor ID:             AuthenticAMD
> >> > CPU family:            16
> >> > Model:                 8
> >> > Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
> >> > Stepping:              0
> >> > CPU MHz:               2793.040
> >> > BogoMIPS:              5586.22
> >> > Virtualization:        AMD-V
> >> > L1d cache:             64K
> >> > L1i cache:             64K
> >> > L2 cache:              512K
> >> > L3 cache:              5118K
> >> > NUMA node0 CPU(s):     0,8,16,24,32,40
> >> > NUMA node1 CPU(s):     1,9,17,25,33,41
> >> > NUMA node2 CPU(s):     2,10,18,26,34,42
> >> > NUMA node3 CPU(s):     3,11,19,27,35,43
> >> > NUMA node4 CPU(s):     4,12,20,28,36,44
> >> > NUMA node5 CPU(s):     5,13,21,29,37,45
> >> > NUMA node6 CPU(s):     6,14,22,30,38,46
> >> > NUMA node7 CPU(s):     7,15,23,31,39,47
> >> 
> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
> >> that this affects performance of makedumpfile? This is just a guess.
> >> 
> >> Could you check whether the performance gets improved if you run each
> >> thread on the same NUMA node? For example:
> >> 
> >>   # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
> >>   vmcore-cd0
> >> 
> > Hi HATAYAMA,
> > 
> > I think your guess is right, but maybe your command has a little problem.
> > 
> > From my test, the NUMA did affect the performance, but not too much.
> > The average time of cpus in the same NUMA node:
> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
> > vmcore-cd0
> > is 314s
> > The average time of cpus in different NUMA node:
> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
> > vmcore-cd0
> > is 354s
> >
> 
> Hmm, according to some previous discussion, what we should see here is
> whether it affects performance of makedumpfile with --num-threads 1
> and -d 31. So you need to compare:
> 
>     # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
> 
> with:
> 
>     # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
> 
> Also, I'm assuming that you've done these benchmarks on the kdump 1st
> kernel, not the kdump 2nd kernel. Is this correct?
> 
Hi HATAYAMA,

I tested in the first kernel, not in the kdump second kernel.

Thanks,
Chao Fan
> > But I think if you want to use "--num-threads 4", the --cpu-list numbers
> > following "taskset -c" should list at least 5 cpus, otherwise the time will
> > be too
> > long.
> > 
> 
> I see.
> 
> --
> Thanks.
> HATAYAMA, Daisuke
> 

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-24  3:31                               ` Chao Fan
@ 2015-12-24  3:50                                 ` HATAYAMA Daisuke
  2015-12-24  6:02                                   ` Chao Fan
  0 siblings, 1 reply; 43+ messages in thread
From: HATAYAMA Daisuke @ 2015-12-24  3:50 UTC (permalink / raw)
  To: cfan; +Cc: ats-kumagai, zhouwj-fnst, kexec

From: Chao Fan <cfan@redhat.com>
Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
Date: Wed, 23 Dec 2015 22:31:37 -0500

> 
> 
> ----- Original Message -----
>> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> To: cfan@redhat.com
>> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
>> Sent: Thursday, December 24, 2015 11:22:28 AM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> 
>> From: Chao Fan <cfan@redhat.com>
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> Date: Wed, 23 Dec 2015 21:20:48 -0500
>> 
>> > 
>> > 
>> > ----- Original Message -----
>> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> >> To: cfan@redhat.com
>> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
>> >> kexec@lists.infradead.org
>> >> Sent: Tuesday, December 22, 2015 4:32:25 PM
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> 
>> >> Chao,
>> >> 
>> >> From: Chao Fan <cfan@redhat.com>
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> Date: Thu, 10 Dec 2015 05:54:28 -0500
>> >> 
>> >> > 
>> >> > 
>> >> > ----- Original Message -----
>> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> To: "Chao Fan" <cfan@redhat.com>
>> >> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
>> >> >> kexec@lists.infradead.org
>> >> >> Sent: Thursday, December 10, 2015 6:32:32 PM
>> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> 
>> >> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
>> >> >> >
>> >> >> >
>> >> >> > ----- Original Message -----
>> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> >> >> >> Cc: kexec@lists.infradead.org
>> >> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
>> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> >>
>> >> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> >> >> >>>> Hello Kumagai,
>> >> >> >>>>
>> >> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> >> >> >>>>> Hello, Zhou
>> >> >> >>>>>
>> >> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>> >> >> >>>>>>> Hi,
>> >> >> >>>>>>>
>> >> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> >> >> >>>>>>>> I think there is no problem if other test results are as
>> >> >> >>>>>>>> expected.
>> >> >> >>>>>>>>
>> >> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >> >> >> >>>>>>>> So for lzo, it can't help much most of the time.
>> >> >> >>>>>>>
>> >> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
>> >> >> >>>>>>>
>> >> >> >>>>>>>       [--num-threads THREADNUM]:
>> >> >> >>>>>>>           Using multiple threads to read and compress data of
>> >> >> >>>>>>>           each
>> >> >> >>>>>>>           page
>> >> >> >>>>>>>           in parallel.
>> >> >> >>>>>>>           And it will reduces time for saving DUMPFILE.
>> >> >> >>>>>>>           This feature only supports creating DUMPFILE in
>> >> >> >>>>>>>           kdump-comressed format from
>> >> >> >>>>>>>           VMCORE in kdump-compressed format or elf format.
>> >> >> >>>>>>>
>> >> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
>> >> >> >>>>>>> --num-threads only
>> >> >> >>>>>>> supports zlib compressed vmcore.
>> >> >> >>>>>>>
>> >> >> >>>>>>
>> >> >> >>>>>> Sorry, it seems that something I said is not so clear.
>> >> >> >>>>>> lzo is also supported. Since lzo compresses data at a high
>> >> >> >>>>>> speed,
>> >> >> >>>>>> the
> >> >> >> >>>>>> improvement of the performance is not so obvious most of the time.
>> >> >> >>>>>>
>> >> >> >>>>>>> Also worth to mention about the recommended -d value for this
>> >> >> >>>>>>> feature.
>> >> >> >>>>>>>
>> >> >> >>>>>>
>> >> >> >>>>>> Yes, I think it's worth. I forgot it.
>> >> >> >>>>>
>> >> >> >>>>> I saw your patch, but I think I should confirm what is the
>> >> >> >>>>> problem
>> >> >> >>>>> first.
>> >> >> >>>>>
>> >> >> >>>>>> However, when "-d 31" is specified, it will be worse.
>> >> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >> >> >> >>>>>> And even if the page has been filtered, it will also take a buffer.
> >> >> >> >>>>>> So if "-d 31" is specified, the filtered pages will use a lot
> >> >> >> >>>>>> of buffers. Then the pages which need to be compressed can't
> >> >> >> >>>>>> be compressed in parallel.
>> >> >> >>>>>
>> >> >> >>>>> Could you explain why compression will not be parallel in more
>> >> >> >>>>> detail ?
>> >> >> >>>>> Actually the buffers are used also for filtered pages, it sounds
>> >> >> >>>>> inefficient.
>> >> >> >>>>> However, I don't understand why it prevents parallel compression.
>> >> >> >>>>>
>> >> >> >>>>
> >> >> >> >>>> Think about this: in a huge memory, most of the pages will be
>> >> >> >>>> filtered,
>> >> >> >>>> and
>> >> >> >>>> we have 5 buffers.
>> >> >> >>>>
>> >> >> >>>> page1       page2      page3     page4     page5      page6
>> >> >> >>>> page7
>> >> >> >>>> .....
>> >> >> >>>> [buffer1]   [2]        [3]       [4]       [5]
>> >> >> >>>> unfiltered  filtered   filtered  filtered  filtered   unfiltered
>> >> >> >>>> filtered
>> >> >> >>>>
> >> >> >> >>>> Since a filtered page will take a buffer, when compressing page1,
> >> >> >> >>>> page6 can't be compressed at the same time.
> >> >> >> >>>> That's why it prevents parallel compression.
>> >> >> >>>
>> >> >> >>> Thanks for your explanation, I understand.
>> >> >> >>> This is just an issue of the current implementation, there is no
>> >> >> >>> reason to stand this restriction.
>> >> >> >>>
>> >> >> >>>>> Further, according to Chao's benchmark, there is a big
>> >> >> >>>>> performance
> >> >> >> >>>>> degradation even if the number of threads is 1. (58s vs 240s)
>> >> >> >>>>> The current implementation seems to have some problems, we should
>> >> >> >>>>> solve them.
>> >> >> >>>>>
>> >> >> >>>>
>> >> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >> >> >> >>>> compressing in
>> >> >> >>>> parallel, on the other hand we will introduce some extra work by
>> >> >> >>>> adding
>> >> >> >>>> "--num-threads". So it is obvious that it will have a performance
>> >> >> >>>> degradation.
>> >> >> >>>
>> >> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
>> >> >> >>> exclusive
>> >> >> >>> lock),
>> >> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0" still
>> >> >> >>> sounds
>> >> >> >>> too slow, the degradation is too big to be called "some extra
>> >> >> >>> work".
>> >> >> >>>
>> >> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> >> >> >>> the above "buffer fairness issue" will not be related to this
>> >> >> >>> degradation.
> >> >> >> >>> What do you think causes this degradation?
>> >> >> >>>
>> >> >> >>
>> >> >> >> I can't get such result at this moment, so I can't do some further
>> >> >> >> investigation
>> >> >> >> right now. I guess it may be caused by the underlying implementation
>> >> >> >> of
>> >> >> >> pthread.
>> >> >> >> I reviewed the test result of the patch v2 and found in different
>> >> >> >> machines,
>> >> >> >> the results are quite different.
>> >> >> >
>> >> >> > Hi Zhou Wenjian,
>> >> >> >
>> >> >> > I have done more tests in another machine with 128G memory, and get
>> >> >> > the
>> >> >> > result:
>> >> >> >
>> >> >> > the size of vmcore is 300M in "-d 31"
>> >> >> > makedumpfile -l --message-level 1 -d 31:
>> >> >> > time: 8.6s      page-faults: 2272
>> >> >> >
>> >> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
>> >> >> > time: 28.1s     page-faults: 2359
>> >> >> >
>> >> >> >
>> >> >> > and the size of vmcore is 2.6G in "-d 0".
>> >> >> > In this machine, I get the same result as yours:
>> >> >> >
>> >> >> >
>> >> >> > makedumpfile -c --message-level 1 -d 0:
>> >> >> > time: 597s      page-faults: 2287
>> >> >> >
>> >> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
>> >> >> > time: 602s      page-faults: 2361
>> >> >> >
>> >> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
>> >> >> > time: 337s      page-faults: 2397
>> >> >> >
>> >> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
>> >> >> > time: 175s      page-faults: 2461
>> >> >> >
>> >> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
>> >> >> > time: 103s      page-faults: 2611
>> >> >> >
>> >> >> >
>> >> >> > But the machine of my first test is not under my control, should I
>> >> >> > wait
>> >> >> > for
>> >> >> > the first machine to do more tests?
>> >> >> > If there are still some problems in my tests, please tell me.
>> >> >> >
>> >> >> 
>> >> >> Thanks a lot for your test, it seems that there is nothing wrong.
>> >> >> And I haven't got any idea about more tests...
>> >> >> 
>> >> >> Could you provide the information of your cpu ?
>> >> >> I will do some further investigation later.
>> >> >> 
>> >> > 
>> >> > OK, of course, here is the information of cpu:
>> >> > 
>> >> > # lscpu
>> >> > Architecture:          x86_64
>> >> > CPU op-mode(s):        32-bit, 64-bit
>> >> > Byte Order:            Little Endian
>> >> > CPU(s):                48
>> >> > On-line CPU(s) list:   0-47
>> >> > Thread(s) per core:    1
>> >> > Core(s) per socket:    6
>> >> > Socket(s):             8
>> >> > NUMA node(s):          8
>> >> > Vendor ID:             AuthenticAMD
>> >> > CPU family:            16
>> >> > Model:                 8
>> >> > Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
>> >> > Stepping:              0
>> >> > CPU MHz:               2793.040
>> >> > BogoMIPS:              5586.22
>> >> > Virtualization:        AMD-V
>> >> > L1d cache:             64K
>> >> > L1i cache:             64K
>> >> > L2 cache:              512K
>> >> > L3 cache:              5118K
>> >> > NUMA node0 CPU(s):     0,8,16,24,32,40
>> >> > NUMA node1 CPU(s):     1,9,17,25,33,41
>> >> > NUMA node2 CPU(s):     2,10,18,26,34,42
>> >> > NUMA node3 CPU(s):     3,11,19,27,35,43
>> >> > NUMA node4 CPU(s):     4,12,20,28,36,44
>> >> > NUMA node5 CPU(s):     5,13,21,29,37,45
>> >> > NUMA node6 CPU(s):     6,14,22,30,38,46
>> >> > NUMA node7 CPU(s):     7,15,23,31,39,47
>> >> 
>> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
>> >> that this affects performance of makedumpfile? This is just a guess.
>> >> 
> >> >> Could you check whether the performance gets improved if you run each
>> >> thread on the same NUMA node? For example:
>> >> 
>> >>   # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >>   vmcore-cd0
>> >> 
>> > Hi HATAYAMA,
>> > 
>> > I think your guess is right, but maybe your command has a little problem.
>> > 
>> > From my test, the NUMA did affect the performance, but not too much.
>> > The average time of cpus in the same NUMA node:
>> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
>> > vmcore-cd0
>> > is 314s
>> > The average time of cpus in different NUMA node:
>> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
>> > vmcore-cd0
>> > is 354s
>> >
>> 
>> Hmm, according to some previous discussion, what we should see here is
>> whether it affects performance of makedumpfile with --num-threads 1
> >> and -d 31. So you need to compare:
>> 
>>     # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
>> 
>> with:
>> 
>>     # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31

I removed the -c option by mistake. What I wanted to write is:

    # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31

and:

    # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31

just in case...

>> 
> >> Also, I'm assuming that you've done these benchmarks on the kdump 1st
> >> kernel, not the kdump 2nd kernel. Is this correct?
>> 
> Hi HATAYAMA,
> 
> I tested in the first kernel, not in the kdump second kernel.
>

I see.

--
Thanks.
HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-24  3:50                                 ` HATAYAMA Daisuke
@ 2015-12-24  6:02                                   ` Chao Fan
  2015-12-24  7:22                                     ` HATAYAMA Daisuke
  2015-12-24  8:20                                     ` Atsushi Kumagai
  0 siblings, 2 replies; 43+ messages in thread
From: Chao Fan @ 2015-12-24  6:02 UTC (permalink / raw)
  To: HATAYAMA Daisuke; +Cc: ats-kumagai, zhouwj-fnst, kexec



----- Original Message -----
> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> To: cfan@redhat.com
> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
> Sent: Thursday, December 24, 2015 11:50:08 AM
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> 
> From: Chao Fan <cfan@redhat.com>
> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> Date: Wed, 23 Dec 2015 22:31:37 -0500
> 
> > 
> > 
> > ----- Original Message -----
> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> >> To: cfan@redhat.com
> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
> >> kexec@lists.infradead.org
> >> Sent: Thursday, December 24, 2015 11:22:28 AM
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> 
> >> From: Chao Fan <cfan@redhat.com>
> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> Date: Wed, 23 Dec 2015 21:20:48 -0500
> >> 
> >> > 
> >> > 
> >> > ----- Original Message -----
> >> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
> >> >> To: cfan@redhat.com
> >> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
> >> >> kexec@lists.infradead.org
> >> >> Sent: Tuesday, December 22, 2015 4:32:25 PM
> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> 
> >> >> Chao,
> >> >> 
> >> >> From: Chao Fan <cfan@redhat.com>
> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> Date: Thu, 10 Dec 2015 05:54:28 -0500
> >> >> 
> >> >> > 
> >> >> > 
> >> >> > ----- Original Message -----
> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> >> To: "Chao Fan" <cfan@redhat.com>
> >> >> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
> >> >> >> kexec@lists.infradead.org
> >> >> >> Sent: Thursday, December 10, 2015 6:32:32 PM
> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> >> 
> >> >> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
> >> >> >> >
> >> >> >> >
> >> >> >> > ----- Original Message -----
> >> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
> >> >> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> >> >> >> >> Cc: kexec@lists.infradead.org
> >> >> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
> >> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
> >> >> >> >>
> >> >> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
> >> >> >> >>>> Hello Kumagai,
> >> >> >> >>>>
> >> >> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
> >> >> >> >>>>> Hello, Zhou
> >> >> >> >>>>>
> >> >> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
> >> >> >> >>>>>>> Hi,
> >> >> >> >>>>>>>
> >> >> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
> >> >> >> >>>>>>>> I think there is no problem if other test results are as
> >> >> >> >>>>>>>> expected.
> >> >> >> >>>>>>>>
> >> >> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
> >> >> >> >>>>>>>> So for lzo, it can't help much most of the time.
> >> >> >> >>>>>>>
> >> >> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
> >> >> >> >>>>>>>
> >> >> >> >>>>>>>       [--num-threads THREADNUM]:
> >> >> >> >>>>>>>           Using multiple threads to read and compress data
> >> >> >> >>>>>>>           of
> >> >> >> >>>>>>>           each
> >> >> >> >>>>>>>           page
> >> >> >> >>>>>>>           in parallel.
> >> >> >> >>>>>>>           And it will reduces time for saving DUMPFILE.
> >> >> >> >>>>>>>           This feature only supports creating DUMPFILE in
> >> >> >> >>>>>>>           kdump-comressed format from
> >> >> >> >>>>>>>           VMCORE in kdump-compressed format or elf format.
> >> >> >> >>>>>>>
> >> >> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
> >> >> >> >>>>>>> --num-threads only
> >> >> >> >>>>>>> supports zlib compressed vmcore.
> >> >> >> >>>>>>>
> >> >> >> >>>>>>
> >> >> >> >>>>>> Sorry, it seems that something I said is not so clear.
> >> >> >> >>>>>> lzo is also supported. Since lzo compresses data at a high
> >> >> >> >>>>>> speed,
> >> >> >> >>>>>> the
> >> >> >> >>>>>> improvement of the performance is not so obvious most of the
> >> >> >> >>>>>> time.
> >> >> >> >>>>>>
> >> >> >> >>>>>>> Also worth to mention about the recommended -d value for
> >> >> >> >>>>>>> this
> >> >> >> >>>>>>> feature.
> >> >> >> >>>>>>>
> >> >> >> >>>>>>
> >> >> >> >>>>>> Yes, I think it's worth. I forgot it.
> >> >> >> >>>>>
> >> >> >> >>>>> I saw your patch, but I think I should confirm what is the
> >> >> >> >>>>> problem
> >> >> >> >>>>> first.
> >> >> >> >>>>>
> >> >> >> >>>>>> However, when "-d 31" is specified, it will be worse.
> >> >> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
> >> >> >> >>>>>> And even if the page has been filtered, it will also take a
> >> >> >> >>>>>> buffer.
> >> >> >> >>>>>> So if "-d 31" is specified, the filtered pages will use a lot
> >> >> >> >>>>>> of buffers. Then the pages which need to be compressed can't
> >> >> >> >>>>>> be compressed in parallel.
> >> >> >> >>>>>
> >> >> >> >>>>> Could you explain why compression will not be parallel in more
> >> >> >> >>>>> detail ?
> >> >> >> >>>>> Actually the buffers are used also for filtered pages, it
> >> >> >> >>>>> sounds
> >> >> >> >>>>> inefficient.
> >> >> >> >>>>> However, I don't understand why it prevents parallel
> >> >> >> >>>>> compression.
> >> >> >> >>>>>
> >> >> >> >>>>
> >> >> >> >>>> Think about this: in a huge memory, most of the pages will be
> >> >> >> >>>> filtered,
> >> >> >> >>>> and
> >> >> >> >>>> we have 5 buffers.
> >> >> >> >>>>
> >> >> >> >>>> page1       page2      page3     page4     page5      page6
> >> >> >> >>>> page7
> >> >> >> >>>> .....
> >> >> >> >>>> [buffer1]   [2]        [3]       [4]       [5]
> >> >> >> >>>> unfiltered  filtered   filtered  filtered  filtered
> >> >> >> >>>> unfiltered
> >> >> >> >>>> filtered
> >> >> >> >>>>
> >> >> >> >>>> Since a filtered page will take a buffer, when compressing page1,
> >> >> >> >>>> page6 can't be compressed at the same time.
> >> >> >> >>>> That's why it prevents parallel compression.
> >> >> >> >>>
> >> >> >> >>> Thanks for your explanation, I understand.
> >> >> >> >>> This is just an issue of the current implementation, there is no
> >> >> >> >>> reason to stand this restriction.
> >> >> >> >>>
> >> >> >> >>>>> Further, according to Chao's benchmark, there is a big
> >> >> >> >>>>> performance
> >> >> >> >>>>> degradation even if the number of threads is 1. (58s vs 240s)
> >> >> >> >>>>> The current implementation seems to have some problems, we
> >> >> >> >>>>> should
> >> >> >> >>>>> solve them.
> >> >> >> >>>>>
> >> >> >> >>>>
> >> >> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
> >> >> >> >>>> compressing in
> >> >> >> >>>> parallel, on the other hand we will introduce some extra work
> >> >> >> >>>> by
> >> >> >> >>>> adding
> >> >> >> >>>> "--num-threads". So it is obvious that it will have a
> >> >> >> >>>> performance
> >> >> >> >>>> degradation.
> >> >> >> >>>
> >> >> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
> >> >> >> >>> exclusive
> >> >> >> >>> lock),
> >> >> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0"
> >> >> >> >>> still
> >> >> >> >>> sounds
> >> >> >> >>> too slow, the degradation is too big to be called "some extra
> >> >> >> >>> work".
> >> >> >> >>>
> >> >> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
> >> >> >> >>> the above "buffer fairness issue" will not be related to this
> >> >> >> >>> degradation.
> >> >> >> >>> What do you think causes this degradation?
> >> >> >> >>>
> >> >> >> >>
> >> >> >> >> I can't get such result at this moment, so I can't do some
> >> >> >> >> further
> >> >> >> >> investigation
> >> >> >> >> right now. I guess it may be caused by the underlying
> >> >> >> >> implementation
> >> >> >> >> of
> >> >> >> >> pthread.
> >> >> >> >> I reviewed the test result of the patch v2 and found in different
> >> >> >> >> machines,
> >> >> >> >> the results are quite different.
> >> >> >> >
> >> >> >> > Hi Zhou Wenjian,
> >> >> >> >
> >> >> >> > I have done more tests in another machine with 128G memory, and
> >> >> >> > get
> >> >> >> > the
> >> >> >> > result:
> >> >> >> >
> >> >> >> > the size of vmcore is 300M in "-d 31"
> >> >> >> > makedumpfile -l --message-level 1 -d 31:
> >> >> >> > time: 8.6s      page-faults: 2272
> >> >> >> >
> >> >> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
> >> >> >> > time: 28.1s     page-faults: 2359
> >> >> >> >
> >> >> >> >
> >> >> >> > and the size of vmcore is 2.6G in "-d 0".
> >> >> >> > In this machine, I get the same result as yours:
> >> >> >> >
> >> >> >> >
> >> >> >> > makedumpfile -c --message-level 1 -d 0:
> >> >> >> > time: 597s      page-faults: 2287
> >> >> >> >
> >> >> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
> >> >> >> > time: 602s      page-faults: 2361
> >> >> >> >
> >> >> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
> >> >> >> > time: 337s      page-faults: 2397
> >> >> >> >
> >> >> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
> >> >> >> > time: 175s      page-faults: 2461
> >> >> >> >
> >> >> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
> >> >> >> > time: 103s      page-faults: 2611
> >> >> >> >
> >> >> >> >
> >> >> >> > But the machine of my first test is not under my control, should I
> >> >> >> > wait
> >> >> >> > for
> >> >> >> > the first machine to do more tests?
> >> >> >> > If there are still some problems in my tests, please tell me.
> >> >> >> >
> >> >> >> 
> >> >> >> Thanks a lot for your test, it seems that there is nothing wrong.
> >> >> >> And I haven't got any idea about more tests...
> >> >> >> 
> >> >> >> Could you provide the information of your cpu ?
> >> >> >> I will do some further investigation later.
> >> >> >> 
> >> >> > 
> >> >> > OK, of course, here is the information of cpu:
> >> >> > 
> >> >> > # lscpu
> >> >> > Architecture:          x86_64
> >> >> > CPU op-mode(s):        32-bit, 64-bit
> >> >> > Byte Order:            Little Endian
> >> >> > CPU(s):                48
> >> >> > On-line CPU(s) list:   0-47
> >> >> > Thread(s) per core:    1
> >> >> > Core(s) per socket:    6
> >> >> > Socket(s):             8
> >> >> > NUMA node(s):          8
> >> >> > Vendor ID:             AuthenticAMD
> >> >> > CPU family:            16
> >> >> > Model:                 8
> >> >> > Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
> >> >> > Stepping:              0
> >> >> > CPU MHz:               2793.040
> >> >> > BogoMIPS:              5586.22
> >> >> > Virtualization:        AMD-V
> >> >> > L1d cache:             64K
> >> >> > L1i cache:             64K
> >> >> > L2 cache:              512K
> >> >> > L3 cache:              5118K
> >> >> > NUMA node0 CPU(s):     0,8,16,24,32,40
> >> >> > NUMA node1 CPU(s):     1,9,17,25,33,41
> >> >> > NUMA node2 CPU(s):     2,10,18,26,34,42
> >> >> > NUMA node3 CPU(s):     3,11,19,27,35,43
> >> >> > NUMA node4 CPU(s):     4,12,20,28,36,44
> >> >> > NUMA node5 CPU(s):     5,13,21,29,37,45
> >> >> > NUMA node6 CPU(s):     6,14,22,30,38,46
> >> >> > NUMA node7 CPU(s):     7,15,23,31,39,47
> >> >> 
> >> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
> >> >> that this affects performance of makedumpfile? This is just a guess.
> >> >> 
> >> >> >> Could you check whether the performance gets improved if you run each
> >> >> thread on the same NUMA node? For example:
> >> >> 
> >> >>   # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> >>   vmcore-cd0
> >> >> 
> >> > Hi HATAYAMA,
> >> > 
> >> > I think your guess is right, but maybe your command has a little
> >> > problem.
> >> > 
> >> > From my test, the NUMA did affect the performance, but not too much.
> >> > The average time of cpus in the same NUMA node:
> >> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> > vmcore-cd0
> >> > is 314s
> >> > The average time of cpus in different NUMA node:
> >> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> > vmcore-cd0
> >> > is 354s
> >> >
> >> 
> >> Hmm, according to some previous discussion, what we should see here is
> >> whether it affects performance of makedumpfile with --num-threads 1
> >> and -d 31. So you need to compare:
> >> 
> >>     # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
> >> 
> >> with:
> >> 
> >>     # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
> 
> I removed the -c option by mistake. What I wanted to write is:
> 
>     # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> 
> and:
> 
>     # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> 
> just in case...
> 
Hi HATAYAMA,

the average time of
# taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
is 33s.
the average time of
# taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
is 18s.

My test steps:
1. change /etc/kdump.conf with
"core_collector makedumpfile -l --message-level 1 -d 31"
2. make a crash
3. cd into the directory of the vmcore made by kdump
4. in the directory of vmcore do
# taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
or
# taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31

if there are any problems, please tell me.
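
For anyone reproducing these timings, a minimal C wrapper along these
lines (hypothetical code, not part of the thread; it assumes Linux
sched_setaffinity(2)) pins a command to one CPU the way "taskset -c"
does and reports the wall-clock time:

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/wait.h>
  #include <time.h>
  #include <unistd.h>

  int main(int argc, char *argv[])
  {
      if (argc < 3) {
          fprintf(stderr, "usage: %s CPU command [args...]\n", argv[0]);
          return 1;
      }

      cpu_set_t set;
      CPU_ZERO(&set);
      CPU_SET(atoi(argv[1]), &set);    /* pin like "taskset -c CPU" */

      struct timespec t0, t1;
      clock_gettime(CLOCK_MONOTONIC, &t0);

      pid_t pid = fork();
      if (pid == 0) {
          /* child: apply the CPU mask, then run the command */
          sched_setaffinity(0, sizeof(set), &set);
          execvp(argv[2], &argv[2]);
          perror("execvp");
          _exit(127);
      }
      waitpid(pid, NULL, 0);

      clock_gettime(CLOCK_MONOTONIC, &t1);
      printf("elapsed: %.1fs\n",
             (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
      return 0;
  }

e.g. "./pin 0 makedumpfile -d 31 vmcore vmcore-d31" (the binary name
"pin" is illustrative).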

Thanks,
Chao Fan

> >> 
> >> Also, I'm assuming that you've done these benchmarks on the kdump 1st
> >> kernel, not the kdump 2nd kernel. Is this correct?
> >> 
> > Hi HATAYAMA,
> > 
> > I tested in the first kernel, not in the kdump second kernel.
> >
> 
> I see.
> 
> --
> Thanks.
> HATAYAMA, Daisuke
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-24  6:02                                   ` Chao Fan
@ 2015-12-24  7:22                                     ` HATAYAMA Daisuke
  2015-12-24  8:20                                     ` Atsushi Kumagai
  1 sibling, 0 replies; 43+ messages in thread
From: HATAYAMA Daisuke @ 2015-12-24  7:22 UTC (permalink / raw)
  To: cfan; +Cc: ats-kumagai, zhouwj-fnst, kexec

From: Chao Fan <cfan@redhat.com>
Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
Date: Thu, 24 Dec 2015 01:02:38 -0500

> 
> 
> ----- Original Message -----
>> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> To: cfan@redhat.com
>> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
>> Sent: Thursday, December 24, 2015 11:50:08 AM
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> 
>> From: Chao Fan <cfan@redhat.com>
>> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> Date: Wed, 23 Dec 2015 22:31:37 -0500
>> 
>> > 
>> > 
>> > ----- Original Message -----
>> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> >> To: cfan@redhat.com
>> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
>> >> kexec@lists.infradead.org
>> >> Sent: Thursday, December 24, 2015 11:22:28 AM
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> 
>> >> From: Chao Fan <cfan@redhat.com>
>> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> Date: Wed, 23 Dec 2015 21:20:48 -0500
>> >> 
>> >> > 
>> >> > 
>> >> > ----- Original Message -----
>> >> >> From: "HATAYAMA Daisuke" <d.hatayama@jp.fujitsu.com>
>> >> >> To: cfan@redhat.com
>> >> >> Cc: ats-kumagai@wm.jp.nec.com, zhouwj-fnst@cn.fujitsu.com,
>> >> >> kexec@lists.infradead.org
>> >> >> Sent: Tuesday, December 22, 2015 4:32:25 PM
>> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> 
>> >> >> Chao,
>> >> >> 
>> >> >> From: Chao Fan <cfan@redhat.com>
>> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> Date: Thu, 10 Dec 2015 05:54:28 -0500
>> >> >> 
>> >> >> > 
>> >> >> > 
>> >> >> > ----- Original Message -----
>> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> >> To: "Chao Fan" <cfan@redhat.com>
>> >> >> >> Cc: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>,
>> >> >> >> kexec@lists.infradead.org
>> >> >> >> Sent: Thursday, December 10, 2015 6:32:32 PM
>> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> >> 
>> >> >> >> On 12/10/2015 05:58 PM, Chao Fan wrote:
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > ----- Original Message -----
>> >> >> >> >> From: "Wenjian Zhou/周文剑" <zhouwj-fnst@cn.fujitsu.com>
>> >> >> >> >> To: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
>> >> >> >> >> Cc: kexec@lists.infradead.org
>> >> >> >> >> Sent: Thursday, December 10, 2015 5:36:47 PM
>> >> >> >> >> Subject: Re: [PATCH RFC 00/11] makedumpfile: parallel processing
>> >> >> >> >>
>> >> >> >> >> On 12/10/2015 04:14 PM, Atsushi Kumagai wrote:
>> >> >> >> >>>> Hello Kumagai,
>> >> >> >> >>>>
>> >> >> >> >>>> On 12/04/2015 10:30 AM, Atsushi Kumagai wrote:
>> >> >> >> >>>>> Hello, Zhou
>> >> >> >> >>>>>
>> >> >> >> >>>>>> On 12/02/2015 03:24 PM, Dave Young wrote:
>> >> >> >> >>>>>>> Hi,
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>> On 12/02/15 at 01:29pm, "Zhou, Wenjian/周文剑" wrote:
>> >> >> >> >>>>>>>> I think there is no problem if other test results are as
>> >> >> >> >>>>>>>> expected.
>> >> >> >> >>>>>>>>
>> >> >> >> >>>>>>>> --num-threads mainly reduces the time of compressing.
>> >> >> >> >>>>>>>> So for lzo, it can't help much most of the time.
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>> Seems the help of --num-threads does not say it exactly:
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>>       [--num-threads THREADNUM]:
>> >> >> >> >>>>>>>           Using multiple threads to read and compress data
>> >> >> >> >>>>>>>           of
>> >> >> >> >>>>>>>           each
>> >> >> >> >>>>>>>           page
>> >> >> >> >>>>>>>           in parallel.
>> >> >> >> >>>>>>>           And it will reduces time for saving DUMPFILE.
>> >> >> >> >>>>>>>           This feature only supports creating DUMPFILE in
>> >> >> >> >>>>>>>           kdump-comressed format from
>> >> >> >> >>>>>>>           VMCORE in kdump-compressed format or elf format.
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>> Lzo is also a compress method, it should be mentioned that
>> >> >> >> >>>>>>> --num-threads only
>> >> >> >> >>>>>>> supports zlib compressed vmcore.
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>
>> >> >> >> >>>>>> Sorry, it seems that something I said is not so clear.
>> >> >> >> >>>>>> lzo is also supported. Since lzo compresses data at a high
>> >> >> >> >>>>>> speed,
>> >> >> >> >>>>>> the
>> >> >> >> >>>>>> improvement of the performance is not so obvious most of the
>> >> >> >> >>>>>> time.
>> >> >> >> >>>>>>
>> >> >> >> >>>>>>> Also worth to mention about the recommended -d value for
>> >> >> >> >>>>>>> this
>> >> >> >> >>>>>>> feature.
>> >> >> >> >>>>>>>
>> >> >> >> >>>>>>
>> >> >> >> >>>>>> Yes, I think it's worth. I forgot it.
>> >> >> >> >>>>>
>> >> >> >> >>>>> I saw your patch, but I think I should confirm what is the
>> >> >> >> >>>>> problem
>> >> >> >> >>>>> first.
>> >> >> >> >>>>>
>> >> >> >> >>>>>> However, when "-d 31" is specified, it will be worse.
>> >> >> >> >>>>>> Less than 50 buffers are used to cache the compressed page.
>> >> >> >> >>>>>> And even if the page has been filtered, it will also take a
>> >> >> >> >>>>>> buffer.
>> >> >> >> >>>>>> So if "-d 31" is specified, the filtered pages will use a lot
>> >> >> >> >>>>>> of buffers. Then the pages which need to be compressed can't
>> >> >> >> >>>>>> be compressed in parallel.
>> >> >> >> >>>>>
>> >> >> >> >>>>> Could you explain why compression will not be parallel in more
>> >> >> >> >>>>> detail ?
>> >> >> >> >>>>> Actually the buffers are used also for filtered pages, it
>> >> >> >> >>>>> sounds
>> >> >> >> >>>>> inefficient.
>> >> >> >> >>>>> However, I don't understand why it prevents parallel
>> >> >> >> >>>>> compression.
>> >> >> >> >>>>>
>> >> >> >> >>>>
>> >> >> >> >>>> Think about this: in a huge memory, most of the pages will be
>> >> >> >> >>>> filtered,
>> >> >> >> >>>> and
>> >> >> >> >>>> we have 5 buffers.
>> >> >> >> >>>>
>> >> >> >> >>>> page1       page2      page3     page4     page5      page6
>> >> >> >> >>>> page7
>> >> >> >> >>>> .....
>> >> >> >> >>>> [buffer1]   [2]        [3]       [4]       [5]
>> >> >> >> >>>> unfiltered  filtered   filtered  filtered  filtered
>> >> >> >> >>>> unfiltered
>> >> >> >> >>>> filtered
>> >> >> >> >>>>
>> >> >> >> >>>> Since a filtered page will take a buffer, when compressing page1,
>> >> >> >> >>>> page6 can't be compressed at the same time.
>> >> >> >> >>>> That's why it prevents parallel compression.
>> >> >> >> >>>
>> >> >> >> >>> Thanks for your explanation, I understand.
>> >> >> >> >>> This is just an issue of the current implementation, there is no
>> >> >> >> >>> reason to stand this restriction.
>> >> >> >> >>>
>> >> >> >> >>>>> Further, according to Chao's benchmark, there is a big
>> >> >> >> >>>>> performance
>> >> >> >> >>>>> degradation even if the number of threads is 1. (58s vs 240s)
>> >> >> >> >>>>> The current implementation seems to have some problems, we
>> >> >> >> >>>>> should
>> >> >> >> >>>>> solve them.
>> >> >> >> >>>>>
>> >> >> >> >>>>
>> >> >> >> >>>> If "-d 31" is specified, on the one hand we can't save time by
>> >> >> >> >>>> compressing in
>> >> >> >> >>>> parallel, on the other hand we will introduce some extra work
>> >> >> >> >>>> by
>> >> >> >> >>>> adding
>> >> >> >> >>>> "--num-threads". So it is obvious that it will have a
>> >> >> >> >>>> performance
>> >> >> >> >>>> degradation.
>> >> >> >> >>>
>> >> >> >> >>> Sure, there must be some overhead due to "some extra work"(e.g.
>> >> >> >> >>> exclusive
>> >> >> >> >>> lock),
>> >> >> >> >>> but "--num-threads=1 is 4 times slower than --num-threads=0"
>> >> >> >> >>> still
>> >> >> >> >>> sounds
>> >> >> >> >>> too slow, the degradation is too big to be called "some extra
>> >> >> >> >>> work".
>> >> >> >> >>>
>> >> >> >> >>> Both --num-threads=0 and --num-threads=1 are serial processing,
>> >> >> >> >>> the above "buffer fairness issue" will not be related to this
>> >> >> >> >>> degradation.
>> >> >> >> >>> What do you think causes this degradation?
>> >> >> >> >>>
>> >> >> >> >>
>> >> >> >> >> I can't get such result at this moment, so I can't do some
>> >> >> >> >> further
>> >> >> >> >> investigation
>> >> >> >> >> right now. I guess it may be caused by the underlying
>> >> >> >> >> implementation
>> >> >> >> >> of
>> >> >> >> >> pthread.
>> >> >> >> >> I reviewed the test result of the patch v2 and found in different
>> >> >> >> >> machines,
>> >> >> >> >> the results are quite different.
>> >> >> >> >
>> >> >> >> > Hi Zhou Wenjian,
>> >> >> >> >
>> >> >> >> > I have done more tests in another machine with 128G memory, and
>> >> >> >> > get
>> >> >> >> > the
>> >> >> >> > result:
>> >> >> >> >
>> >> >> >> > the size of vmcore is 300M in "-d 31"
>> >> >> >> > makedumpfile -l --message-level 1 -d 31:
>> >> >> >> > time: 8.6s      page-faults: 2272
>> >> >> >> >
>> >> >> >> > makedumpfile -l --num-threads 1 --message-level 1 -d 31:
>> >> >> >> > time: 28.1s     page-faults: 2359
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > and the size of vmcore is 2.6G in "-d 0".
>> >> >> >> > In this machine, I get the same result as yours:
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > makedumpfile -c --message-level 1 -d 0:
>> >> >> >> > time: 597s      page-faults: 2287
>> >> >> >> >
>> >> >> >> > makedumpfile -c --num-threads 1 --message-level 1 -d 0:
>> >> >> >> > time: 602s      page-faults: 2361
>> >> >> >> >
>> >> >> >> > makedumpfile -c --num-threads 2 --message-level 1 -d 0:
>> >> >> >> > time: 337s      page-faults: 2397
>> >> >> >> >
>> >> >> >> > makedumpfile -c --num-threads 4 --message-level 1 -d 0:
>> >> >> >> > time: 175s      page-faults: 2461
>> >> >> >> >
>> >> >> >> > makedumpfile -c --num-threads 8 --message-level 1 -d 0:
>> >> >> >> > time: 103s      page-faults: 2611
>> >> >> >> >
>> >> >> >> >
>> >> >> >> > But the machine of my first test is not under my control, should I
>> >> >> >> > wait
>> >> >> >> > for
>> >> >> >> > the first machine to do more tests?
>> >> >> >> > If there are still some problems in my tests, please tell me.
>> >> >> >> >
>> >> >> >> 
>> >> >> >> Thanks a lot for your test, it seems that there is nothing wrong.
>> >> >> >> And I haven't got any idea about more tests...
>> >> >> >> 
>> >> >> >> Could you provide the information of your cpu ?
>> >> >> >> I will do some further investigation later.
>> >> >> >> 
>> >> >> > 
>> >> >> > OK, of course, here is the information of cpu:
>> >> >> > 
>> >> >> > # lscpu
>> >> >> > Architecture:          x86_64
>> >> >> > CPU op-mode(s):        32-bit, 64-bit
>> >> >> > Byte Order:            Little Endian
>> >> >> > CPU(s):                48
>> >> >> > On-line CPU(s) list:   0-47
>> >> >> > Thread(s) per core:    1
>> >> >> > Core(s) per socket:    6
>> >> >> > Socket(s):             8
>> >> >> > NUMA node(s):          8
>> >> >> > Vendor ID:             AuthenticAMD
>> >> >> > CPU family:            16
>> >> >> > Model:                 8
>> >> >> > Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
>> >> >> > Stepping:              0
>> >> >> > CPU MHz:               2793.040
>> >> >> > BogoMIPS:              5586.22
>> >> >> > Virtualization:        AMD-V
>> >> >> > L1d cache:             64K
>> >> >> > L1i cache:             64K
>> >> >> > L2 cache:              512K
>> >> >> > L3 cache:              5118K
>> >> >> > NUMA node0 CPU(s):     0,8,16,24,32,40
>> >> >> > NUMA node1 CPU(s):     1,9,17,25,33,41
>> >> >> > NUMA node2 CPU(s):     2,10,18,26,34,42
>> >> >> > NUMA node3 CPU(s):     3,11,19,27,35,43
>> >> >> > NUMA node4 CPU(s):     4,12,20,28,36,44
>> >> >> > NUMA node5 CPU(s):     5,13,21,29,37,45
>> >> >> > NUMA node6 CPU(s):     6,14,22,30,38,46
>> >> >> > NUMA node7 CPU(s):     7,15,23,31,39,47
>> >> >> 
>> >> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
>> >> >> that this affects performance of makedumpfile? This is just a guess.
>> >> >> 
>> >> >> Could you check whether the performance gets improved if you run each
>> >> >> thread on the same NUMA node? For example:
>> >> >> 
>> >> >>   # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> >>   vmcore-cd0
>> >> >> 
>> >> > Hi HATAYAMA,
>> >> > 
>> >> > I think your guess is right, but maybe your command has a little
>> >> > problem.
>> >> > 
>> >> > From my test, the NUMA did affect the performance, but not too much.
>> >> > The average time of cpus in the same NUMA node:
>> >> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> > vmcore-cd0
>> >> > is 314s
>> >> > The average time of cpus in different NUMA node:
>> >> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> > vmcore-cd0
>> >> > is 354s
>> >> >
>> >> 
>> >> Hmm, according to some previous discussion, what we should see here is
>> >> whether it affects performance of makedumpfile with --num-threads 1
>> >> and -d 31. So you need to compare:
>> >> 
>> >>     # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
>> >> 
>> >> with:
>> >> 
>> >>     # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
>> 
>> I removed the -c option by mistake. What I wanted to write is:
>> 
>>     # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>> 
>> and:
>> 
>>     # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>> 
>> just in case...
>> 
> Hi HATAYAMA,
> 
> the average time of
> # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> is 33s.
> the average time of
> # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> is 18s.
> 

Thanks. I found out that NUMA node placement is irrelevant here.

> My test steps:
> 1. change /etc/kdump.conf with
> "core_collector makedumpfile -l --message-level 1 -d 31"
> 2. make a crash
> 3. cd into the directory of the vmcore made by kdump
> 4. in the directory of vmcore do
> # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> or
> # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> 
> if there are any problems, please tell me.
> 

I'll investigate this with Zhou. Please provide us with information
when necessary.

--
Thanks.
HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-24  6:02                                   ` Chao Fan
  2015-12-24  7:22                                     ` HATAYAMA Daisuke
@ 2015-12-24  8:20                                     ` Atsushi Kumagai
  2015-12-24  9:04                                       ` Chao Fan
  1 sibling, 1 reply; 43+ messages in thread
From: Atsushi Kumagai @ 2015-12-24  8:20 UTC (permalink / raw)
  To: HATAYAMA Daisuke (d.hatayama@jp.fujitsu.com), Chao Fan
  Cc: zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org

>> >> >> >> Could you provide the information of your cpu ?
>> >> >> >> I will do some further investigation later.
>> >> >> >>
>> >> >> >
>> >> >> > OK, of course, here is the information of cpu:
>> >> >> >
>> >> >> > # lscpu
>> >> >> > Architecture:          x86_64
>> >> >> > CPU op-mode(s):        32-bit, 64-bit
>> >> >> > Byte Order:            Little Endian
>> >> >> > CPU(s):                48
>> >> >> > On-line CPU(s) list:   0-47
>> >> >> > Thread(s) per core:    1
>> >> >> > Core(s) per socket:    6
>> >> >> > Socket(s):             8
>> >> >> > NUMA node(s):          8
>> >> >> > Vendor ID:             AuthenticAMD
>> >> >> > CPU family:            16
>> >> >> > Model:                 8
>> >> >> > Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
>> >> >> > Stepping:              0
>> >> >> > CPU MHz:               2793.040
>> >> >> > BogoMIPS:              5586.22
>> >> >> > Virtualization:        AMD-V
>> >> >> > L1d cache:             64K
>> >> >> > L1i cache:             64K
>> >> >> > L2 cache:              512K
>> >> >> > L3 cache:              5118K
>> >> >> > NUMA node0 CPU(s):     0,8,16,24,32,40
>> >> >> > NUMA node1 CPU(s):     1,9,17,25,33,41
>> >> >> > NUMA node2 CPU(s):     2,10,18,26,34,42
>> >> >> > NUMA node3 CPU(s):     3,11,19,27,35,43
>> >> >> > NUMA node4 CPU(s):     4,12,20,28,36,44
>> >> >> > NUMA node5 CPU(s):     5,13,21,29,37,45
>> >> >> > NUMA node6 CPU(s):     6,14,22,30,38,46
>> >> >> > NUMA node7 CPU(s):     7,15,23,31,39,47
>> >> >>
>> >> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
>> >> >> that this affects performance of makedumpfile? This is just a guess.
>> >> >>
>> >> >> Could you check whether the performance gets improved if you run each
>> >> >> thread on the same NUMA node? For example:
>> >> >>
>> >> >>   # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> >>   vmcore-cd0
>> >> >>
>> >> > Hi HATAYAMA,
>> >> >
>> >> > I think your guess is right, but there may be a small problem with
>> >> > your command.
>> >> >
>> >> > From my tests, NUMA placement did affect the performance, but not by much.
>> >> > The average time with CPUs on the same NUMA node:
>> >> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> > vmcore-cd0
>> >> > is 314s
>> >> > The average time with CPUs on different NUMA nodes:
>> >> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
>> >> > vmcore-cd0
>> >> > is 354s
>> >> >
>> >>
>> >> Hmm, according to some previous discussion, what we should see here is
>> >> whether it affects performance of makedumpfile with --num-threads 1
>> >> and -d 31. So you need to compare:
>> >>
>> >>     # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore vmcore-d31
>> >>
>> >> with:
>> >>
>> >>     # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
>>
>> I removed the -c option wrongly. What I wanted to write is:
>>
>>     # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>>
>> and:
>>
>>     # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>>
>> just in case...

Why did you remove the -c option from makedumpfile?
We are discussing the performance with compression.
I think the following is correct:

       # taskset -c 0,8 makedumpfile --num-threads 1 [-c|-l|-p] -d 31 vmcore vmcore-d31

and:

       # taskset -c 0 makedumpfile [-c|-l|-p] -d 31 vmcore vmcore-d31
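
A minimal sketch of how the comparison could be scripted; the loop, the
repeat of rm, and the output name are only illustrative, not a fixed
procedure:

       # for opt in -c -l -p; do
             rm -f vmcore-d31
             time taskset -c 0,8 makedumpfile --num-threads 1 $opt \
                 -d 31 vmcore vmcore-d31
         done

and likewise with "taskset -c 0" and without --num-threads for the
single-threaded baseline.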


Thanks,
Atsushi Kumagai

>Hi HATAYAMA,
>
>the average time of
># taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>is 33s.
>the average time of
># taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>is 18s.
>
>My test steps:
>1. change /etc/kdump.conf with
>"core_collector makedumpfile -l --message-level 1 -d 31"
>2. trigger a crash
>3. cd into the directory of the vmcore made by kdump
>4. in the directory of vmcore do
># taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
>or
># taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
>
>if there are any problems, please tell me.
>
>Thanks,
>Chao Fan
>
>> >>
>> >> Also, I'm assuming that you've run these benchmarks on the kdump 1st
>> >> kernel, not the kdump 2nd kernel. Is this correct?
>> >>
>> > Hi HATAYAMA,
>> >
>> > I tested in the first kernel, not in the kdump second kernel.
>> >
>>
>> I see.
>>
>> --
>> Thanks.
>> HATAYAMA, Daisuke
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH RFC 00/11] makedumpfile: parallel processing
  2015-12-24  8:20                                     ` Atsushi Kumagai
@ 2015-12-24  9:04                                       ` Chao Fan
  0 siblings, 0 replies; 43+ messages in thread
From: Chao Fan @ 2015-12-24  9:04 UTC (permalink / raw)
  To: Atsushi Kumagai
  Cc: HATAYAMA Daisuke (d.hatayama@jp.fujitsu.com), zhouwj-fnst, kexec



----- Original Message -----
> From: "Atsushi Kumagai" <ats-kumagai@wm.jp.nec.com>
> To: "HATAYAMA Daisuke (d.hatayama@jp.fujitsu.com)" <d.hatayama@jp.fujitsu.com>, "Chao Fan" <cfan@redhat.com>
> Cc: zhouwj-fnst@cn.fujitsu.com, kexec@lists.infradead.org
> Sent: Thursday, December 24, 2015 4:20:42 PM
> Subject: RE: [PATCH RFC 00/11] makedumpfile: parallel processing
> 
> >> >> >> >> Could you provide the information of your cpu ?
> >> >> >> >> I will do some further investigation later.
> >> >> >> >>
> >> >> >> >
> >> >> >> > OK, of course, here is the information of cpu:
> >> >> >> >
> >> >> >> > # lscpu
> >> >> >> > Architecture:          x86_64
> >> >> >> > CPU op-mode(s):        32-bit, 64-bit
> >> >> >> > Byte Order:            Little Endian
> >> >> >> > CPU(s):                48
> >> >> >> > On-line CPU(s) list:   0-47
> >> >> >> > Thread(s) per core:    1
> >> >> >> > Core(s) per socket:    6
> >> >> >> > Socket(s):             8
> >> >> >> > NUMA node(s):          8
> >> >> >> > Vendor ID:             AuthenticAMD
> >> >> >> > CPU family:            16
> >> >> >> > Model:                 8
> >> >> >> > Model name:            Six-Core AMD Opteron(tm) Processor 8439 SE
> >> >> >> > Stepping:              0
> >> >> >> > CPU MHz:               2793.040
> >> >> >> > BogoMIPS:              5586.22
> >> >> >> > Virtualization:        AMD-V
> >> >> >> > L1d cache:             64K
> >> >> >> > L1i cache:             64K
> >> >> >> > L2 cache:              512K
> >> >> >> > L3 cache:              5118K
> >> >> >> > NUMA node0 CPU(s):     0,8,16,24,32,40
> >> >> >> > NUMA node1 CPU(s):     1,9,17,25,33,41
> >> >> >> > NUMA node2 CPU(s):     2,10,18,26,34,42
> >> >> >> > NUMA node3 CPU(s):     3,11,19,27,35,43
> >> >> >> > NUMA node4 CPU(s):     4,12,20,28,36,44
> >> >> >> > NUMA node5 CPU(s):     5,13,21,29,37,45
> >> >> >> > NUMA node6 CPU(s):     6,14,22,30,38,46
> >> >> >> > NUMA node7 CPU(s):     7,15,23,31,39,47
> >> >> >>
> >> >> >> This CPU assignment on NUMA nodes looks interesting. Is it possible
> >> >> >> that this affects performance of makedumpfile? This is just a guess.
> >> >> >>
> >> >> >> Could you check whether the performance gets improved if you run
> >> >> >> each
> >> >> >> thread on the same NUMA node? For example:
> >> >> >>
> >> >> >>   # taskset -c 0,8,16,24 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> >> >>   vmcore-cd0
> >> >> >>
> >> >> > Hi HATAYAMA,
> >> >> >
> >> >> > I think your guess is right, but there may be a small problem with
> >> >> > your command.
> >> >> >
> >> >> > From my tests, NUMA placement did affect the performance, but not by much.
> >> >> > The average time with CPUs on the same NUMA node:
> >> >> > # taskset -c 0,8,16,24,32 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> >> > vmcore-cd0
> >> >> > is 314s
> >> >> > The average time with CPUs on different NUMA nodes:
> >> >> > # taskset -c 2,3,5,6,7 makedumpfile --num-threads 4 -c -d 0 vmcore
> >> >> > vmcore-cd0
> >> >> > is 354s
> >> >> >
> >> >>
> >> >> Hmm, according to some previous discussion, what we should see here is
> >> >> whether it affects performance of makedumpfile with --num-threads 1
> >> >> and -d 31. So you need to compare:
> >> >>
> >> >>     # taskset 0,8 makedumpfile --num-threads 1 -c -d 31 vmcore
> >> >>     vmcore-d31
> >> >>
> >> >> with:
> >> >>
> >> >>     # taskset 0 makedumpfile -c -d 0 vmcore vmcore-d31
> >>
> >> I removed the -c option wrongly. What I wanted to write is:
> >>
> >>     # taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> >>
> >> and:
> >>
> >>     # taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> >>
> >> just in case...
> 
> Why did you remove the -c option from makedumpfile?
> We are discussing the performance with compression.
> I think the following is correct:
> 
>        # taskset -c 0,8 makedumpfile --num-threads 1 [-c|-l|-p] -d 31 vmcore
>        vmcore-d31
> 
> and:
> 
>        # taskset -c 0 makedumpfile [-c|-l|-p] -d 31 vmcore vmcore-d31
> 

Hi Atsushi Kumagai,

   "taskset -c 0,8 makedumpfile --num-threads 1"   "taskset -c 0 makedumpfile"
-c              52s                                         61s
-l              33s                                         17s
-p              33s                                         18s
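
In case it helps the investigation: one could watch the per-thread CPU
usage during a run to see where the extra time goes. pidstat is from the
sysstat package, and this is only a suggestion, not something I have
measured here:

   # taskset -c 0,8 makedumpfile --num-threads 1 -l -d 31 vmcore vmcore-d31 &
   # pidstat -t -p $(pidof makedumpfile) 1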

Thanks,
Chao Fan

> 
> Thanks,
> Atsushi Kumagai
> 
> >Hi HATAYAMA,
> >
> >the average time of
> ># taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> >is 33s.
> >the average time of
> ># taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> >is 18s.
> >
> >My test steps:
> >1. change /etc/kdump.conf with
> >"core_collector makedumpfile -l --message-level 1 -d 31"
> >2. trigger a crash
> >3. cd into the directory of the vmcore made by kdump
> >4. in the directory of vmcore do
> ># taskset -c 0,8 makedumpfile --num-threads 1 -d 31 vmcore vmcore-d31
> >or
> ># taskset -c 0 makedumpfile -d 31 vmcore vmcore-d31
> >
> >if there are any problems, please tell me.
> >
> >Thanks,
> >Chao Fan
> >
> >> >>
> >> >> Also, I'm assuming that you've run these benchmarks on the kdump 1st
> >> >> kernel, not the kdump 2nd kernel. Is this correct?
> >> >>
> >> > Hi HATAYAMA,
> >> >
> >> > I tested in the first kernel, not in the kdump second kernel.
> >> >
> >>
> >> I see.
> >>
> >> --
> >> Thanks.
> >> HATAYAMA, Daisuke
> 

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2015-12-24  9:05 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-05  7:56 [PATCH RFC 00/11] makedumpfile: parallel processing Zhou Wenjian
2015-06-05  7:56 ` [PATCH RFC 01/11] Add readpage_kdump_compressed_parallel Zhou Wenjian
2015-06-05  7:56 ` [PATCH RFC 02/11] Add mappage_elf_parallel Zhou Wenjian
2015-06-05  7:56 ` [PATCH RFC 03/11] Add readpage_elf_parallel Zhou Wenjian
2015-06-05  7:56 ` [PATCH RFC 04/11] Add read_pfn_parallel Zhou Wenjian
2015-06-05  7:56 ` [PATCH RFC 05/11] Add function to initial bitmap for parallel use Zhou Wenjian
2015-06-05  7:57 ` [PATCH RFC 06/11] Add filter_data_buffer_parallel Zhou Wenjian
2015-06-05  7:57 ` [PATCH RFC 07/11] Add write_kdump_pages_parallel to allow parallel process Zhou Wenjian
2015-06-05  7:57 ` [PATCH RFC 08/11] Add write_kdump_pages_parallel_cyclic to allow parallel process in cyclic_mode Zhou Wenjian
2015-06-05  7:57 ` [PATCH RFC 09/11] Initial and free data used for parallel process Zhou Wenjian
2015-06-05  7:57 ` [PATCH RFC 10/11] Make makedumpfile available to read and compress pages parallelly Zhou Wenjian
2015-06-05  7:57 ` [PATCH RFC 11/11] Add usage and manual about multiple threads process Zhou Wenjian
2015-06-08  3:55 ` [PATCH RFC 00/11] makedumpfile: parallel processing "Zhou, Wenjian/周文剑"
2015-12-01  8:39   ` Chao Fan
2015-12-02  5:29     ` "Zhou, Wenjian/周文剑"
2015-12-02  7:24       ` Dave Young
2015-12-02  7:38         ` "Zhou, Wenjian/周文剑"
2015-12-04  2:30           ` Atsushi Kumagai
2015-12-04  3:33             ` "Zhou, Wenjian/周文剑"
2015-12-04  8:56               ` Chao Fan
2015-12-07  1:09                 ` "Zhou, Wenjian/周文剑"
2015-12-10  8:14               ` Atsushi Kumagai
2015-12-10  9:36                 ` "Zhou, Wenjian/周文剑"
2015-12-10  9:58                   ` Chao Fan
2015-12-10 10:32                     ` "Zhou, Wenjian/周文剑"
2015-12-10 10:54                       ` Chao Fan
2015-12-22  8:32                         ` HATAYAMA Daisuke
2015-12-24  2:20                           ` Chao Fan
2015-12-24  3:22                             ` HATAYAMA Daisuke
2015-12-24  3:31                               ` Chao Fan
2015-12-24  3:50                                 ` HATAYAMA Daisuke
2015-12-24  6:02                                   ` Chao Fan
2015-12-24  7:22                                     ` HATAYAMA Daisuke
2015-12-24  8:20                                     ` Atsushi Kumagai
2015-12-24  9:04                                       ` Chao Fan
2015-12-14  8:26                   ` Atsushi Kumagai
2015-12-14  8:59                     ` "Zhou, Wenjian/周文剑"
2015-06-10  6:06 ` Atsushi Kumagai
2015-06-11  3:47   ` "Zhou, Wenjian/周文剑"
2015-06-15  1:59     ` qiaonuohan
2015-06-15  5:57       ` Atsushi Kumagai
2015-06-15  6:06         ` qiaonuohan
2015-06-15  6:07         ` qiaonuohan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox