All of lore.kernel.org
 help / color / mirror / Atom feed
From: "\"Zhou, Wenjian/周文?\"" <zhouwj-fnst@cn.fujitsu.com>
To: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Cc: "kexec@lists.infradead.org" <kexec@lists.infradead.org>
Subject: Re: [PATCH] makedumpfile: Change num_threads to num_producers
Date: Thu, 4 Aug 2016 09:28:06 +0800	[thread overview]
Message-ID: <57A29A26.5000809@cn.fujitsu.com> (raw)
In-Reply-To: <0910DD04CBD6DE4193FCF86B9C00BE9701E5A696@BPXM01GP.gisp.nec.co.jp>

On 08/04/2016 08:50 AM, Atsushi Kumagai wrote:
> Hello Zhou,
> 
>> Currently, num_threads means the producer number. The main thread
>> is not included. However, num_threads is specified by the option
>> "--num-threads". So this may make user confused why it still has
>> performance degradation when the value by "--num-threads" equals
>> the usable cpu number.
>>
>> I think it will be more nature to make "--num-threads" be producer
>> number + 1. In other words, "--num-threads" should include the
>> main thread.
> 
> The most part of cpu time will be spent for compression while
> the main thread(Consumer) just does I/O work.
> If the usable cpu number is 2 and --num-thread is also 2, your
> patch reserves a whole cpu for the main thread and the compression
> will be done by just one cpu. Is that really the multi thread
> processing which we wanted ?
> 
> If there are 4 usable cpus, I guess 4 producers with the main thread
> is faster than 3 producers with the main thread.
> 
> However, according to my quick test, there were no significant
> differences between the two cases.
> (This was measured on a small VM(5GB) without your patch, the number
> of trials is 3, so just for information.)
> 
> # makedumpfile -c --num-thread [3|4]
> 
>           |               |              time[s]
>      CPUs |  -num-threads |    1st   |    2nd   |    3rd
>    -------+---------------+----------+----------+----------
>       4   |       3       |   29.54      30.19     28.26
>       4   |       4       |   29.80      31.75     28.19
> 
> Then, I noticed that the cpu usage of the main thread is almost 100%
> while it doesn't need a lot of cpu resources in theory. Do you have
> any idea why the main thread require a lot of cpu resources ?

Yes, it shouldn't take so much cpu time.
It really need to be improved.
I'll think about it.

-- 
Thanks
Zhou
> If it's an unintended behavior, I believe the case of --num-thread=4
> can be faster if the cpu usage of the main thread is reduced.
> 
> 
> Thanks,
> Atsushi Kumagai
> 
>> ---
>> makedumpfile.c | 108 ++++++++++++++++++++++++++++-----------------------------
>> makedumpfile.h |   4 +--
>> 2 files changed, 56 insertions(+), 56 deletions(-)
>>
>> diff --git a/makedumpfile.c b/makedumpfile.c
>> index 21784e8..19019a4 100644
>> --- a/makedumpfile.c
>> +++ b/makedumpfile.c
>> @@ -1407,13 +1407,13 @@ open_dump_bitmap(void)
>> 		}
>> 	}
>>
>> -	if (info->num_threads) {
>> +	if (info->num_producers) {
>> 		/*
>> 		 * Reserve file descriptors of bitmap for creating dumpfiles
>> 		 * parallelly, because a bitmap file will be unlinked just after
>> 		 * this and it is not possible to open a bitmap file later.
>> 		 */
>> -		for (i = 0; i < info->num_threads; i++) {
>> +		for (i = 0; i < info->num_producers; i++) {
>> 			if ((fd = open(info->name_bitmap, O_RDONLY)) < 0) {
>> 				ERRMSG("Can't open the bitmap file(%s). %s\n",
>> 				    info->name_bitmap, strerror(errno));
>> @@ -3495,9 +3495,9 @@ initialize_bitmap_memory(void)
>> }
>>
>> void
>> -initialize_bitmap_memory_parallel(struct dump_bitmap *bitmap, int thread_num)
>> +initialize_bitmap_memory_parallel(struct dump_bitmap *bitmap, int producer_num)
>> {
>> -	bitmap->fd = FD_BITMAP_MEMORY_PARALLEL(thread_num);
>> +	bitmap->fd = FD_BITMAP_MEMORY_PARALLEL(producer_num);
>> 	bitmap->file_name = info->name_memory;
>> 	bitmap->no_block = -1;
>> 	memset(bitmap->buf, 0, BUFSIZE_BITMAP);
>> @@ -3529,24 +3529,24 @@ initial_for_parallel()
>> 	/*
>> 	 * allocate memory for threads
>> 	 */
>> -	if ((info->threads = malloc(sizeof(pthread_t *) * info->num_threads))
>> +	if ((info->threads = malloc(sizeof(pthread_t *) * info->num_producers))
>> 	    == NULL) {
>> 		MSG("Can't allocate memory for threads. %s\n",
>> 				strerror(errno));
>> 		return FALSE;
>> 	}
>> -	memset(info->threads, 0, sizeof(pthread_t *) * info->num_threads);
>> +	memset(info->threads, 0, sizeof(pthread_t *) * info->num_producers);
>>
>> 	if ((info->kdump_thread_args =
>> -			malloc(sizeof(struct thread_args) * info->num_threads))
>> +			malloc(sizeof(struct thread_args) * info->num_producers))
>> 	    == NULL) {
>> 		MSG("Can't allocate memory for arguments of threads. %s\n",
>> 				strerror(errno));
>> 		return FALSE;
>> 	}
>> -	memset(info->kdump_thread_args, 0, sizeof(struct thread_args) * info->num_threads);
>> +	memset(info->kdump_thread_args, 0, sizeof(struct thread_args) * info->num_producers);
>>
>> -	for (i = 0; i < info->num_threads; i++) {
>> +	for (i = 0; i < info->num_producers; i++) {
>> 		if ((info->threads[i] = malloc(sizeof(pthread_t))) == NULL) {
>> 			MSG("Can't allocate memory for thread %d. %s",
>> 					i, strerror(errno));
>> @@ -3592,7 +3592,7 @@ initial_for_parallel()
>> #endif
>> 	}
>>
>> -	info->num_buffers = PAGE_DATA_NUM * info->num_threads;
>> +	info->num_buffers = PAGE_DATA_NUM * info->num_producers;
>>
>> 	/*
>> 	 * allocate memory for page_data
>> @@ -3615,17 +3615,17 @@ initial_for_parallel()
>> 	}
>>
>> 	/*
>> -	 * initial page_flag for each thread
>> +	 * initial page_flag for each producer thread
>> 	 */
>> -	if ((info->page_flag_buf = malloc(sizeof(void *) * info->num_threads))
>> +	if ((info->page_flag_buf = malloc(sizeof(void *) * info->num_producers))
>> 	    == NULL) {
>> 		MSG("Can't allocate memory for page_flag_buf. %s\n",
>> 				strerror(errno));
>> 		return FALSE;
>> 	}
>> -	memset(info->page_flag_buf, 0, sizeof(void *) * info->num_threads);
>> +	memset(info->page_flag_buf, 0, sizeof(void *) * info->num_producers);
>>
>> -	for (i = 0; i < info->num_threads; i++) {
>> +	for (i = 0; i < info->num_producers; i++) {
>> 		if ((info->page_flag_buf[i] = calloc(1, sizeof(struct page_flag))) == NULL) {
>> 			MSG("Can't allocate memory for page_flag. %s\n",
>> 				strerror(errno));
>> @@ -3645,9 +3645,9 @@ initial_for_parallel()
>> 	}
>>
>> 	/*
>> -	 * initial fd_memory for threads
>> +	 * initial fd_memory for producer threads
>> 	 */
>> -	for (i = 0; i < info->num_threads; i++) {
>> +	for (i = 0; i < info->num_producers; i++) {
>> 		if ((FD_MEMORY_PARALLEL(i) = open(info->name_memory, O_RDONLY))
>> 									< 0) {
>> 			ERRMSG("Can't open the dump memory(%s). %s\n",
>> @@ -3673,7 +3673,7 @@ free_for_parallel()
>> 	struct page_flag *current;
>>
>> 	if (info->threads != NULL) {
>> -		for (i = 0; i < info->num_threads; i++) {
>> +		for (i = 0; i < info->num_producers; i++) {
>> 			if (info->threads[i] != NULL)
>> 				free(info->threads[i]);
>>
>> @@ -3714,7 +3714,7 @@ free_for_parallel()
>> 	}
>>
>> 	if (info->page_flag_buf != NULL) {
>> -		for (i = 0; i < info->num_threads; i++) {
>> +		for (i = 0; i < info->num_producers; i++) {
>> 			for (j = 0; j < PAGE_FLAG_NUM; j++) {
>> 				if (info->page_flag_buf[i] != NULL) {
>> 					current = info->page_flag_buf[i];
>> @@ -3729,7 +3729,7 @@ free_for_parallel()
>> 	if (info->parallel_info == NULL)
>> 		return;
>>
>> -	for (i = 0; i < info->num_threads; i++) {
>> +	for (i = 0; i < info->num_producers; i++) {
>> 		if (FD_MEMORY_PARALLEL(i) > 0)
>> 			close(FD_MEMORY_PARALLEL(i));
>>
>> @@ -3936,7 +3936,7 @@ out:
>> 		DEBUG_MSG("Buffer size for the cyclic mode: %ld\n", info->bufsize_cyclic);
>> 	}
>>
>> -	if (info->num_threads) {
>> +	if (info->num_producers) {
>> 		if (is_xen_memory()) {
>> 			MSG("'--num-threads' option is disable,\n");
>> 			MSG("because %s is Xen's memory core image.\n",
>> @@ -4066,9 +4066,9 @@ initialize_2nd_bitmap(struct dump_bitmap *bitmap)
>> }
>>
>> void
>> -initialize_2nd_bitmap_parallel(struct dump_bitmap *bitmap, int thread_num)
>> +initialize_2nd_bitmap_parallel(struct dump_bitmap *bitmap, int producer_num)
>> {
>> -	bitmap->fd = FD_BITMAP_PARALLEL(thread_num);
>> +	bitmap->fd = FD_BITMAP_PARALLEL(producer_num);
>> 	bitmap->file_name = info->name_bitmap;
>> 	bitmap->no_block = -1;
>> 	memset(bitmap->buf, 0, BUFSIZE_BITMAP);
>> @@ -7558,7 +7558,7 @@ kdump_thread_function_cyclic(void *arg) {
>> 	volatile struct page_flag *page_flag_buf = kdump_thread_args->page_flag_buf;
>> 	struct cycle *cycle = kdump_thread_args->cycle;
>> 	mdf_pfn_t pfn = cycle->start_pfn;
>> -	int index = kdump_thread_args->thread_num;
>> +	int index = kdump_thread_args->producer_num;
>> 	int buf_ready;
>> 	int dumpable;
>> 	int fd_memory = 0;
>> @@ -7566,21 +7566,21 @@ kdump_thread_function_cyclic(void *arg) {
>> 	struct dump_bitmap bitmap_memory_parallel = {0};
>> 	unsigned char *buf = NULL, *buf_out = NULL;
>> 	struct mmap_cache *mmap_cache =
>> -			MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
>> +			MMAP_CACHE_PARALLEL(kdump_thread_args->producer_num);
>> 	unsigned long size_out;
>> -	z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num);
>> +	z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->producer_num);
>> #ifdef USELZO
>> -	lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
>> +	lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->producer_num);
>> #endif
>> #ifdef USESNAPPY
>> 	unsigned long len_buf_out_snappy =
>> 				snappy_max_compressed_length(info->page_size);
>> #endif
>>
>> -	buf = BUF_PARALLEL(kdump_thread_args->thread_num);
>> -	buf_out = BUF_OUT_PARALLEL(kdump_thread_args->thread_num);
>> +	buf = BUF_PARALLEL(kdump_thread_args->producer_num);
>> +	buf_out = BUF_OUT_PARALLEL(kdump_thread_args->producer_num);
>>
>> -	fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->thread_num);
>> +	fd_memory = FD_MEMORY_PARALLEL(kdump_thread_args->producer_num);
>>
>> 	if (info->fd_bitmap) {
>> 		bitmap_parallel.buf = malloc(BUFSIZE_BITMAP);
>> @@ -7590,7 +7590,7 @@ kdump_thread_function_cyclic(void *arg) {
>> 			goto fail;
>> 		}
>> 		initialize_2nd_bitmap_parallel(&bitmap_parallel,
>> -					kdump_thread_args->thread_num);
>> +					kdump_thread_args->producer_num);
>> 	}
>>
>> 	if (info->flag_refiltering) {
>> @@ -7601,7 +7601,7 @@ kdump_thread_function_cyclic(void *arg) {
>> 			goto fail;
>> 		}
>> 		initialize_bitmap_memory_parallel(&bitmap_memory_parallel,
>> -						kdump_thread_args->thread_num);
>> +						kdump_thread_args->producer_num);
>> 	}
>>
>> 	/*
>> @@ -7802,8 +7802,8 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
>> 	for (i = 0; i < page_buf_num; i++)
>> 		page_data_buf[i].used = FALSE;
>>
>> -	for (i = 0; i < info->num_threads; i++) {
>> -		kdump_thread_args[i].thread_num = i;
>> +	for (i = 0; i < info->num_producers; i++) {
>> +		kdump_thread_args[i].producer_num = i;
>> 		kdump_thread_args[i].len_buf_out = len_buf_out;
>> 		kdump_thread_args[i].page_data_buf = page_data_buf;
>> 		kdump_thread_args[i].page_flag_buf = info->page_flag_buf[i];
>> @@ -7843,13 +7843,13 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
>> 			 * consuming is used for recording in which thread the pfn is the smallest.
>> 			 * current_pfn is used for recording the value of pfn when checking the pfn.
>> 			 */
>> -			for (i = 0; i < info->num_threads; i++) {
>> +			for (i = 0; i < info->num_producers; i++) {
>> 				if (info->page_flag_buf[i]->ready == FLAG_UNUSED)
>> 					continue;
>> 				temp_pfn = info->page_flag_buf[i]->pfn;
>>
>> 				/*
>> -				 * count how many threads have reached the end.
>> +				 * count how many producer threads have reached the end.
>> 				 */
>> 				if (temp_pfn >= end_pfn) {
>> 					info->page_flag_buf[i]->ready = FLAG_UNUSED;
>> @@ -7865,9 +7865,9 @@ write_kdump_pages_parallel_cyclic(struct cache_data *cd_header,
>> 			}
>>
>> 			/*
>> -			 * If all the threads have reached the end, we will finish writing.
>> +			 * If all producer threads have reached the end, we will finish writing.
>> 			 */
>> -			if (end_count >= info->num_threads)
>> +			if (end_count >= info->num_producers)
>> 				goto finish;
>>
>> 			/*
>> @@ -7930,7 +7930,7 @@ finish:
>>
>> out:
>> 	if (threads != NULL) {
>> -		for (i = 0; i < info->num_threads; i++) {
>> +		for (i = 0; i < info->num_producers; i++) {
>> 			if (threads[i] != NULL) {
>> 				res = pthread_cancel(*threads[i]);
>> 				if (res != 0 && res != ESRCH)
>> @@ -7939,7 +7939,7 @@ out:
>> 			}
>> 		}
>>
>> -		for (i = 0; i < info->num_threads; i++) {
>> +		for (i = 0; i < info->num_producers; i++) {
>> 			if (threads[i] != NULL) {
>> 				res = pthread_join(*threads[i], &thread_result);
>> 				if (res != 0)
>> @@ -8553,7 +8553,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d
>> 		if (!write_kdump_bitmap2(&cycle))
>> 			return FALSE;
>>
>> -		if (info->num_threads) {
>> +		if (info->num_producers) {
>> 			if (!write_kdump_pages_parallel_cyclic(cd_header,
>> 							cd_page, &pd_zero,
>> 							&offset_data, &cycle))
>> @@ -10531,7 +10531,7 @@ check_param_for_creating_dumpfile(int argc, char *argv[])
>> 	if (info->flag_sadump_diskset && !sadump_is_supported_arch())
>> 		return FALSE;
>>
>> -	if (info->num_threads) {
>> +	if (info->num_producers) {
>> 		if (info->flag_split) {
>> 			MSG("--num-threads cannot used with --split.\n");
>> 			return FALSE;
>> @@ -10607,16 +10607,16 @@ check_param_for_creating_dumpfile(int argc, char *argv[])
>> 	} else
>> 		return FALSE;
>>
>> -	if (info->num_threads) {
>> +	if (info->num_producers) {
>> 		if ((info->parallel_info =
>> -		     malloc(sizeof(parallel_info_t) * info->num_threads))
>> +		     malloc(sizeof(parallel_info_t) * info->num_producers))
>> 		    == NULL) {
>> 			MSG("Can't allocate memory for parallel_info.\n");
>> 			return FALSE;
>> 		}
>>
>> 		memset(info->parallel_info, 0, sizeof(parallel_info_t)
>> -							* info->num_threads);
>> +							* info->num_producers);
>> 	}
>>
>> 	return TRUE;
>> @@ -10721,20 +10721,20 @@ calculate_cyclic_buffer_size(void) {
>> 	limit_size = get_free_memory_size() * 0.6;
>>
>> 	/*
>> -	 * Recalculate the limit_size according to num_threads.
>> -	 * And reset num_threads if there is not enough memory.
>> +	 * Recalculate the limit_size according to num_producers.
>> +	 * And reset --num-threads if there is not enough memory.
>> 	 */
>> -	if (info->num_threads > 0) {
>> +	if (info->num_producers > 0) {
>> 		if (limit_size <= maximum_size) {
>> 			MSG("There isn't enough memory for multi-threads.\n");
>> -			info->num_threads = 0;
>> +			info->num_producers = 0;
>> 		}
>> -		else if ((limit_size - maximum_size) / info->num_threads < THREAD_REGION) {
>> -			MSG("There isn't enough memory for %d threads.\n", info->num_threads);
>> -			info->num_threads = (limit_size - maximum_size) / THREAD_REGION;
>> -			MSG("--num_threads is set to %d.\n", info->num_threads);
>> +		else if ((limit_size - maximum_size) / info->num_producers < THREAD_REGION) {
>> +			MSG("There isn't enough memory for %d threads.\n", info->num_producers + 1);
>> +			info->num_producers = (limit_size - maximum_size) / THREAD_REGION;
>> +			MSG("--num-threads is set to %d.\n", info->num_producers + 1);
>>
>> -			limit_size = limit_size - THREAD_REGION * info->num_threads;
>> +			limit_size = limit_size - THREAD_REGION * info->num_producers;
>> 		}
>> 	}
>>
>> @@ -11103,7 +11103,7 @@ main(int argc, char *argv[])
>> 			info->working_dir = optarg;
>> 			break;
>> 		case OPT_NUM_THREADS:
>> -			info->num_threads = MAX(atoi(optarg), 0);
>> +			info->num_producers = MAX(atoi(optarg), 1) - 1;
>> 			break;
>> 		case '?':
>> 			MSG("Commandline parameter is invalid.\n");
>> diff --git a/makedumpfile.h b/makedumpfile.h
>> index 533e5b8..6a0f51e 100644
>> --- a/makedumpfile.h
>> +++ b/makedumpfile.h
>> @@ -1004,7 +1004,7 @@ struct page_data
>> };
>>
>> struct thread_args {
>> -	int thread_num;
>> +	int producer_num;
>> 	unsigned long len_buf_out;
>> 	struct cycle *cycle;
>> 	struct page_data *page_data_buf;
>> @@ -1294,7 +1294,7 @@ struct DumpInfo {
>> 	/*
>> 	 * for parallel process
>> 	 */
>> -	int num_threads;
>> +	int num_producers;
>> 	int num_buffers;
>> 	pthread_t **threads;
>> 	struct thread_args *kdump_thread_args;
>> --
>> 1.8.3.1
>>
>>




_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

      reply	other threads:[~2016-08-04  1:29 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-08-01  3:26 [PATCH] makedumpfile: Change num_threads to num_producers Zhou Wenjian
2016-08-04  0:50 ` Atsushi Kumagai
2016-08-04  1:28   ` "Zhou, Wenjian/周文?" [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=57A29A26.5000809@cn.fujitsu.com \
    --to=zhouwj-fnst@cn.fujitsu.com \
    --cc=ats-kumagai@wm.jp.nec.com \
    --cc=kexec@lists.infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.