Re: [RFC PATCH 4/4] x86/Hyper-V: Add memory hot remove function

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Vitaly Kuznetsov <vkuznets@redhat.com>
To: lantianyu1986@gmail.com, kys@microsoft.com,
	haiyangz@microsoft.com, sthemmin@microsoft.com,
	sashal@kernel.org, michael.h.kelley@microsoft.com
Cc: Tianyu Lan <Tianyu.Lan@microsoft.com>,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	eric.devolder@oracle.com
Subject: Re: [RFC PATCH 4/4] x86/Hyper-V: Add memory hot remove function
Date: Wed, 11 Dec 2019 16:06:24 +0100	[thread overview]
Message-ID: <87mubyc367.fsf@vitty.brq.redhat.com> (raw)
In-Reply-To: <20191210154611.10958-5-Tianyu.Lan@microsoft.com>

lantianyu1986@gmail.com writes:

> From: Tianyu Lan <Tianyu.Lan@microsoft.com>
>
> Hyper-V provides dynamic memory hot add/remove function.
> Memory hot-add has already enabled in Hyper-V balloon driver.
> Now add memory hot-remove function.
>
> When driver receives hot-remove msg, it first checks whether
> request remove page number is aligned with hot plug unit(128MB).
> If there are remainder pages(pages%128MB), handle remainder pages
> via balloon way(allocate pages, offline pages and return back to
> Hyper-V).
>
> To remove memory chunks, search memory in the hot add blocks first
> and then other system memory.
>
> Hyper-V has a bug of sending unballoon msg to request memory
> hot-add after doing memory hot-remove. Fix it to handle all
> unballoon msg with memory hot-add operation.
>
> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
> ---
>  drivers/hv/hv_balloon.c | 686 +++++++++++++++++++++++++++++++++++++++++++-----

This patch is too big to review and the logic in it is not trivial at
all. Please try to split this into a series so we can take a look.

>  1 file changed, 616 insertions(+), 70 deletions(-)
>
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 4d1a3b1e2490..015e9e993188 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -19,6 +19,7 @@
>  #include <linux/completion.h>
>  #include <linux/memory_hotplug.h>
>  #include <linux/memory.h>
> +#include <linux/memblock.h>
>  #include <linux/notifier.h>
>  #include <linux/percpu_counter.h>
>  
> @@ -46,12 +47,17 @@
>   * Changes to 0.2 on 2009/05/14
>   * Changes to 0.3 on 2009/12/03
>   * Changed to 1.0 on 2011/04/05
> + * Changed to 2.0 on 2019/12/10
>   */
>  
>  #define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
>  #define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
>  #define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)
>  
> +#define MAX_HOT_REMOVE_ENTRIES						\
> +		((PAGE_SIZE - sizeof(struct dm_hot_remove_response))	\
> +		 / sizeof(union dm_mem_page_range))
> +
>  enum {
>  	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
>  	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
> @@ -91,7 +97,13 @@ enum dm_message_type {
>  	 * Version 1.0.
>  	 */
>  	DM_INFO_MESSAGE			= 12,
> -	DM_VERSION_1_MAX		= 12
> +	DM_VERSION_1_MAX		= 12,
> +
> +	/*
> +	 * Version 2.0
> +	 */
> +	DM_MEM_HOT_REMOVE_REQUEST        = 13,
> +	DM_MEM_HOT_REMOVE_RESPONSE       = 14
>  };
>  
>  
> @@ -120,7 +132,8 @@ union dm_caps {
>  		 * represents an alignment of 2^n in mega bytes.
>  		 */
>  		__u64 hot_add_alignment:4;
> -		__u64 reservedz:58;
> +		__u64 hot_remove:1;
> +		__u64 reservedz:57;
>  	} cap_bits;
>  	__u64 caps;
>  } __packed;
> @@ -231,7 +244,9 @@ struct dm_capabilities {
>  struct dm_capabilities_resp_msg {
>  	struct dm_header hdr;
>  	__u64 is_accepted:1;
> -	__u64 reservedz:63;
> +	__u64 hot_remove:1;
> +	__u64 suppress_pressure_reports:1;
> +	__u64 reservedz:61;
>  } __packed;
>  
>  /*
> @@ -376,6 +391,27 @@ struct dm_hot_add_response {
>  	__u32 result;
>  } __packed;
>  
> +struct dm_hot_remove {
> +	struct dm_header hdr;
> +	__u32 virtual_node;
> +	__u32 page_count;
> +	__u32 qos_flags;
> +	__u32 reservedZ;
> +} __packed;
> +
> +struct dm_hot_remove_response {
> +	struct dm_header hdr;
> +	__u32 result;
> +	__u32 range_count;
> +	__u64 more_pages:1;
> +	__u64 reservedz:63;
> +	union dm_mem_page_range range_array[];
> +} __packed;
> +
> +#define DM_REMOVE_QOS_LARGE	 (1 << 0)
> +#define DM_REMOVE_QOS_LOCAL	 (1 << 1)
> +#define DM_REMOVE_QoS_MASK       (0x3)

Please capitalize 'QoS' (i.e. DM_REMOVE_QOS_MASK) so that it matches the
previous two lines.

> +
>  /*
>   * Types of information sent from host to the guest.
>   */
> @@ -457,6 +493,13 @@ struct hot_add_wrk {
>  	struct work_struct wrk;
>  };
>  
> +struct hot_remove_wrk {
> +	__u32 virtual_node;
> +	__u32 page_count;
> +	__u32 qos_flags;
> +	struct work_struct wrk;
> +};
> +
>  static bool hot_add = true;
>  static bool do_hot_add;
>  /*
> @@ -489,6 +532,7 @@ enum hv_dm_state {
>  	DM_BALLOON_UP,
>  	DM_BALLOON_DOWN,
>  	DM_HOT_ADD,
> +	DM_HOT_REMOVE,
>  	DM_INIT_ERROR
>  };
>  
> @@ -515,11 +559,13 @@ struct hv_dynmem_device {
>  	 * State to manage the ballooning (up) operation.
>  	 */
>  	struct balloon_state balloon_wrk;
> +	struct balloon_state unballoon_wrk;
>  
>  	/*
>  	 * State to execute the "hot-add" operation.

This comment is stale now: the state below it is no longer only for
"hot-add", it covers hot-remove (hr_wrk) as well.

>  	 */
>  	struct hot_add_wrk ha_wrk;
> +	struct hot_remove_wrk hr_wrk;

Do we actually want two work structs and all the problems with their
serialization? Can we get away with one?

>  
>  	/*
>  	 * This state tracks if the host has specified a hot-add
> @@ -569,6 +615,42 @@ static struct hv_dynmem_device dm_device;
>  
>  static void post_status(struct hv_dynmem_device *dm);
>  
> +static int hv_send_hot_remove_response(
> +	       struct dm_hot_remove_response *resp,
> +	       long array_index, bool more_pages)
> +{
> +	struct hv_dynmem_device *dm = &dm_device;
> +	int ret;
> +
> +	resp->hdr.type = DM_MEM_HOT_REMOVE_RESPONSE;
> +	resp->range_count = array_index;
> +	resp->more_pages = more_pages;
> +	resp->hdr.size = sizeof(struct dm_hot_remove_response)
> +			+ sizeof(union dm_mem_page_range) * array_index;
> +
> +	if (array_index)
> +		resp->result = 0;
> +	else
> +		resp->result = 1;
> +
> +	do {
> +		resp->hdr.trans_id = atomic_inc_return(&trans_id);
> +		ret = vmbus_sendpacket(dm->dev->channel, resp,
> +				       resp->hdr.size,
> +				       (unsigned long)NULL,
> +				       VM_PKT_DATA_INBAND, 0);
> +
> +		if (ret == -EAGAIN)
> +			msleep(20);
> +		post_status(&dm_device);
> +	} while (ret == -EAGAIN);
> +
> +	if (ret)
> +		pr_err("Fail to send hot-remove response msg.\n");
> +
> +	return ret;
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
>  				     unsigned long pfn)
> @@ -628,7 +710,9 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
>  			      void *v)
>  {
>  	struct memory_notify *mem = (struct memory_notify *)v;
> -	unsigned long flags, pfn_count;
> +	unsigned long pfn_count;
> +	unsigned long flags = 0;
> +	int unlocked;
>  
>  	switch (val) {
>  	case MEM_ONLINE:
> @@ -640,7 +724,11 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
>  		break;
>  
>  	case MEM_OFFLINE:
> -		spin_lock_irqsave(&dm_device.ha_lock, flags);
> +		if (dm_device.lock_thread != current) {
> +			spin_lock_irqsave(&dm_device.ha_lock, flags);
> +			unlocked = 1;
> +		}
> +
>  		pfn_count = hv_page_offline_check(mem->start_pfn,
>  						  mem->nr_pages);
>  		if (pfn_count <= dm_device.num_pages_onlined) {
> @@ -654,7 +742,10 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
>  			WARN_ON_ONCE(1);
>  			dm_device.num_pages_onlined = 0;
>  		}
> -		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +
> +		if (unlocked)
> +			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +
>  		break;
>  	case MEM_GOING_ONLINE:
>  	case MEM_GOING_OFFLINE:
> @@ -727,9 +818,17 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
>  		init_completion(&dm_device.ol_waitevent);
>  		dm_device.ha_waiting = !memhp_auto_online;
>  
> -		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
> -		ret = add_memory(nid, PFN_PHYS((start_pfn)),
> -				(HA_CHUNK << PAGE_SHIFT));
> +		/*
> +		 * If memory section of hot add region is online,
> +		 * just bring pages online in the region.
> +		 */
> +		if (online_section_nr(pfn_to_section_nr(start_pfn))) {
> +			hv_bring_pgs_online(has, start_pfn, processed_pfn);
> +		} else {
> +			nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
> +			ret = add_memory(nid, PFN_PHYS((start_pfn)),
> +					(HA_CHUNK << PAGE_SHIFT));
> +		}
>  
>  		if (ret) {
>  			pr_err("hot_add memory failed error is %d\n", ret);
> @@ -765,8 +864,8 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
>  static void hv_online_page(struct page *pg, unsigned int order)
>  {
>  	struct hv_hotadd_state *has;
> -	unsigned long flags;
>  	unsigned long pfn = page_to_pfn(pg);
> +	unsigned long flags = 0;

Why is this change needed?

>  	int unlocked;
>  
>  	if (dm_device.lock_thread != current) {
> @@ -806,10 +905,12 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
>  			continue;
>  
>  		/*
> -		 * If the current start pfn is not where the covered_end
> -		 * is, create a gap and update covered_end_pfn.
> +		 * If the current start pfn is great than covered_end_pfn,
> +		 * create a gap and update covered_end_pfn. Start pfn may
> +		 * locate at gap which is created during hot remove. The
> +		 * gap range is less than covered_end_pfn.
>  		 */
> -		if (has->covered_end_pfn != start_pfn) {
> +		if (has->covered_end_pfn < start_pfn) {
>  			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
>  			if (!gap) {
>  				ret = -ENOMEM;
> @@ -848,6 +949,91 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
>  	return ret;
>  }
>  
> +static int handle_hot_add_in_gap(unsigned long start, unsigned long pg_cnt,
> +			  struct hv_hotadd_state *has)
> +{
> +	struct hv_hotadd_gap *gap, *new_gap, *tmp_gap;
> +	unsigned long pfn_cnt = pg_cnt;
> +	unsigned long start_pfn = start;
> +	unsigned long end_pfn;
> +	unsigned long pages;
> +	unsigned long pgs_ol;
> +	unsigned long block_pages = HA_CHUNK;
> +	unsigned long pfn;
> +	int nid;
> +	int ret;
> +
> +	list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
> +
> +		if ((start_pfn < gap->start_pfn)
> +		    || (start_pfn >= gap->end_pfn))
> +			continue;
> +
> +		end_pfn = min(gap->end_pfn, start_pfn + pfn_cnt);
> +		pgs_ol = end_pfn - start_pfn;
> +
> +		/*
> +		 * hv_bring_pgs_online() identifies whether pfn
> +		 * should be online or not via checking pfn is in
> +		 * hot add covered range or gap range(Detail see
> +		 * has_pfn_is_backed()). So adjust gap before bringing
> +		 * online or add memory.
> +		 */
> +		if (gap->end_pfn - gap->start_pfn == pgs_ol) {
> +			list_del(&gap->list);
> +			kfree(gap);
> +		} else if (gap->start_pfn < start && gap->end_pfn == end_pfn) {
> +			gap->end_pfn = start_pfn;
> +		} else if (gap->end_pfn > end_pfn
> +		   && gap->start_pfn == start_pfn) {
> +			gap->start_pfn = end_pfn;
> +		} else {
> +			gap->end_pfn = start_pfn;
> +
> +			new_gap = kzalloc(sizeof(struct hv_hotadd_gap),
> +					GFP_ATOMIC);
> +			if (!new_gap) {
> +				do_hot_add = false;
> +				return -ENOMEM;
> +			}
> +
> +			INIT_LIST_HEAD(&new_gap->list);
> +			new_gap->start_pfn = end_pfn;
> +			new_gap->end_pfn = gap->end_pfn;
> +			list_add_tail(&gap->list, &has->gap_list);
> +		}
> +
> +		/* Bring online or add memmory in gaps. */
> +		for (pfn = start_pfn; pfn < end_pfn;
> +		     pfn = round_up(pfn + 1, block_pages)) {
> +			pages = min(round_up(pfn + 1, block_pages),
> +				    end_pfn) - pfn;
> +
> +			if (online_section_nr(pfn_to_section_nr(pfn))) {
> +				hv_bring_pgs_online(has, pfn, pages);
> +			} else {
> +				nid = memory_add_physaddr_to_nid(PFN_PHYS(pfn));
> +				ret = add_memory(nid, PFN_PHYS(pfn),
> +						 round_up(pages, block_pages)
> +						 << PAGE_SHIFT);
> +				if (ret) {
> +					pr_err("Fail to add memory in gaps(error=%d).\n",
> +					       ret);
> +					do_hot_add = false;
> +					return ret;
> +				}
> +			}
> +		}
> +
> +		start_pfn += pgs_ol;
> +		pfn_cnt -= pgs_ol;
> +		if (!pfn_cnt)
> +			break;
> +	}
> +
> +	return pg_cnt - pfn_cnt;
> +}
> +
>  static unsigned long handle_pg_range(unsigned long pg_start,
>  					unsigned long pg_count)
>  {
> @@ -874,6 +1060,22 @@ static unsigned long handle_pg_range(unsigned long pg_start,
>  
>  		old_covered_state = has->covered_end_pfn;
>  
> +		/*
> +		 * If start_pfn is less than cover_end_pfn, the hot-add memory
> +		 * area is in the gap range.
> +		 */
> +		if (start_pfn < has->covered_end_pfn) {
> +			pgs_ol = handle_hot_add_in_gap(start_pfn, pfn_cnt, has);
> +
> +			pfn_cnt -= pgs_ol;
> +			if (!pfn_cnt) {
> +				res = pgs_ol;
> +				break;
> +			}
> +
> +			start_pfn += pgs_ol;
> +		}
> +
>  		if (start_pfn < has->ha_end_pfn) {
>  			/*
>  			 * This is the case where we are backing pages
> @@ -931,6 +1133,23 @@ static unsigned long handle_pg_range(unsigned long pg_start,
>  	return res;
>  }
>  
> +static void free_allocated_pages(__u64 start_frame, int num_pages)
> +{
> +	struct page *pg;
> +	int i;
> +
> +	for (i = 0; i < num_pages; i++) {
> +		pg = pfn_to_page(i + start_frame);
> +
> +		if (page_private(pfn_to_page(i)))
> +			set_page_private(pfn_to_page(i), 0);
> +
> +		__ClearPageOffline(pg);
> +		__free_page(pg);
> +		dm_device.num_pages_ballooned--;
> +	}
> +}
> +
>  static unsigned long process_hot_add(unsigned long pg_start,
>  					unsigned long pfn_cnt,
>  					unsigned long rg_start,
> @@ -940,18 +1159,40 @@ static unsigned long process_hot_add(unsigned long pg_start,
>  	int covered;
>  	unsigned long flags;
>  
> -	if (pfn_cnt == 0)
> -		return 0;
> +	/*
> +	 * Check whether page is allocated by driver via page private
> +	 * data due to remainder pages.
> +	 */
> +	if (present_section_nr(pfn_to_section_nr(pg_start))
> +	    && page_private(pfn_to_page(pg_start))) {
> +		free_allocated_pages(pg_start, pfn_cnt);
> +		return pfn_cnt;
> +	}
>  
> -	if (!dm_device.host_specified_ha_region) {
> -		covered = pfn_covered(pg_start, pfn_cnt);
> -		if (covered < 0)
> -			return 0;
> +	if ((rg_start == 0) && (!dm_device.host_specified_ha_region)) {
> +		/*
> +		 * The host has not specified the hot-add region.
> +		 * Based on the hot-add page range being specified,
> +		 * compute a hot-add region that can cover the pages
> +		 * that need to be hot-added while ensuring the alignment
> +		 * and size requirements of Linux as it relates to hot-add.
> +		 */
> +		rg_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
> +		if (pfn_cnt % HA_CHUNK)
> +			rg_size += HA_CHUNK;
>  
> -		if (covered)
> -			goto do_pg_range;
> +		rg_start = (pg_start / HA_CHUNK) * HA_CHUNK;
>  	}
>  
> +	if (pfn_cnt == 0)
> +		return 0;
> +
> +	covered = pfn_covered(pg_start, pfn_cnt);
> +	if (covered < 0)
> +		return 0;
> +	else if (covered)
> +		goto do_pg_range;
> +
>  	/*
>  	 * If the host has specified a hot-add range; deal with it first.
>  	 */
> @@ -983,8 +1224,321 @@ static unsigned long process_hot_add(unsigned long pg_start,
>  	return handle_pg_range(pg_start, pfn_cnt);
>  }
>  
> +static int check_memblock_online(struct memory_block *mem, void *arg)
> +{
> +	if (mem->state != MEM_ONLINE)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int change_memblock_state(struct memory_block *mem, void *arg)
> +{
> +	unsigned long state = (unsigned long)arg;
> +
> +	mem->state = state;
> +
> +	return 0;
> +}
> +
> +static bool hv_offline_pages(unsigned long start_pfn, unsigned long nr_pages)
> +{
> +	const unsigned long start = PFN_PHYS(start_pfn);
> +	const unsigned long size = PFN_PHYS(nr_pages);
> +
> +	lock_device_hotplug();
> +
> +	if (walk_memory_blocks(start, size, NULL, check_memblock_online)) {
> +		unlock_device_hotplug();
> +		return false;
> +	}
> +
> +	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
> +			  change_memblock_state);
> +
> +	if (offline_pages(start_pfn, nr_pages)) {
> +		walk_memory_blocks(start_pfn, nr_pages, (void *)MEM_ONLINE,
> +				  change_memblock_state);
> +		unlock_device_hotplug();
> +		return false;
> +	}
> +
> +	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
> +			  change_memblock_state);
> +
> +	unlock_device_hotplug();
> +	return true;
> +}
> +
> +static int hv_hot_remove_range(unsigned int nid, unsigned long start_pfn,
> +			       unsigned long end_pfn, unsigned long nr_pages,
> +			       unsigned long *array_index,
> +			       union dm_mem_page_range *range_array,
> +			       struct hv_hotadd_state *has)
> +{
> +	unsigned long block_pages = HA_CHUNK;
> +	unsigned long rm_pages = nr_pages;
> +	unsigned long pfn;
> +
> +	for (pfn = start_pfn; pfn < end_pfn; pfn += block_pages) {
> +		struct hv_hotadd_gap *gap;
> +		int in_gaps = 0;
> +
> +		if (*array_index >= MAX_HOT_REMOVE_ENTRIES) {
> +			struct dm_hot_remove_response *resp =
> +				(struct dm_hot_remove_response *)
> +					balloon_up_send_buffer;
> +			int ret;
> +
> +			/* Flush out all remove response entries. */
> +			ret = hv_send_hot_remove_response(resp, *array_index,
> +							  true);
> +			if (ret)
> +				return ret;
> +
> +			memset(resp, 0x00, PAGE_SIZE);
> +			*array_index = 0;
> +		}
> +
> +		if (has) {
> +			/*
> +			 * Memory in gaps has been offlined or removed and
> +			 * so skip it if remove range overlap with gap.
> +			 */
> +			list_for_each_entry(gap, &has->gap_list, list)
> +				if (!(pfn >= gap->end_pfn ||
> +				      pfn + block_pages < gap->start_pfn)) {
> +					in_gaps = 1;
> +					break;
> +				}
> +
> +			if (in_gaps)
> +				continue;
> +		}
> +
> +		if (online_section_nr(pfn_to_section_nr(pfn))
> +		    && is_mem_section_removable(pfn, block_pages)
> +		    && hv_offline_pages(pfn, block_pages)) {
> +			remove_memory(nid, pfn << PAGE_SHIFT,
> +				      block_pages << PAGE_SHIFT);
> +
> +			range_array[*array_index].finfo.start_page = pfn;
> +			range_array[*array_index].finfo.page_cnt = block_pages;
> +
> +			(*array_index)++;
> +			nr_pages -= block_pages;
> +
> +			if (!nr_pages)
> +				break;
> +		}
> +	}
> +
> +	return rm_pages - nr_pages;
> +}
> +
> +static int hv_hot_remove_from_ha_list(unsigned int nid, unsigned long nr_pages,
> +				      unsigned long *array_index,
> +				      union dm_mem_page_range *range_array)
> +{
> +	struct hv_hotadd_state *has;
> +	unsigned long start_pfn, end_pfn;
> +	unsigned long flags, rm_pages;
> +	int old_index;
> +	int ret, i;
> +
> +	spin_lock_irqsave(&dm_device.ha_lock, flags);
> +	dm_device.lock_thread = current;
> +	list_for_each_entry(has, &dm_device.ha_region_list, list) {
> +		start_pfn = has->start_pfn;
> +		end_pfn = has->covered_end_pfn;
> +		rm_pages = min(nr_pages, has->covered_end_pfn - has->start_pfn);
> +		old_index = *array_index;
> +
> +		if (!rm_pages || pfn_to_nid(start_pfn) != nid)
> +			continue;
> +
> +		rm_pages = hv_hot_remove_range(nid, start_pfn, end_pfn,
> +				rm_pages, array_index, range_array, has);
> +
> +		if (rm_pages < 0)
> +			return rm_pages;
> +		else if (!rm_pages)
> +			continue;
> +
> +		nr_pages -= rm_pages;
> +		dm_device.num_pages_added -= rm_pages;
> +
> +		/* Create gaps for hot remove regions. */
> +		for (i = old_index; i < *array_index; i++) {
> +			struct hv_hotadd_gap *gap;
> +
> +			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
> +			if (!gap) {
> +				ret = -ENOMEM;
> +				do_hot_add = false;
> +				return ret;
> +			}
> +
> +			INIT_LIST_HEAD(&gap->list);
> +			gap->start_pfn = range_array[i].finfo.start_page;
> +			gap->end_pfn =
> +				gap->start_pfn + range_array[i].finfo.page_cnt;
> +			list_add_tail(&gap->list, &has->gap_list);
> +		}
> +
> +		if (!nr_pages)
> +			break;
> +	}
> +	dm_device.lock_thread = NULL;
> +	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> +
> +	return nr_pages;
> +}
> +
> +static void free_balloon_pages(struct hv_dynmem_device *dm,
> +			 union dm_mem_page_range *range_array)
> +{
> +	int num_pages = range_array->finfo.page_cnt;
> +	__u64 start_frame = range_array->finfo.start_page;
> +
> +	free_allocated_pages(start_frame, num_pages);
> +}
> +
> +static int hv_hot_remove_pages(struct dm_hot_remove_response *resp,
> +			       u64 nr_pages, unsigned long *array_index,
> +			       bool more_pages)
> +{
> +	int i, j, alloc_unit = PAGES_IN_2M;
> +	struct page *pg;
> +	int ret;
> +
> +	for (i = 0; i < nr_pages; i += alloc_unit) {
> +		if (*array_index >= MAX_HOT_REMOVE_ENTRIES) {
> +			/* Flush out all remove response entries. */
> +			ret = hv_send_hot_remove_response(resp,
> +					*array_index, true);
> +			if (ret)
> +				goto free_pages;
> +
> +			/*
> +			 * Continue to allocate memory for hot remove
> +			 * after resetting send buffer and array index.
> +			 */
> +			memset(resp, 0x00, PAGE_SIZE);
> +			*array_index = 0;
> +		}
> +retry:
> +		pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
> +			__GFP_NOMEMALLOC | __GFP_NOWARN,
> +			get_order(alloc_unit << PAGE_SHIFT));
> +		if (!pg) {
> +			if (alloc_unit == 1) {
> +				ret = -ENOMEM;
> +				goto free_pages;
> +			}
> +
> +			alloc_unit = 1;
> +			goto retry;
> +		}
> +
> +		if (alloc_unit != 1)
> +			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
> +
> +		for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT));
> +		    j++) {
> +			__SetPageOffline(pg + j);
> +
> +			/*
> +			 * Set page's private data to non-zero and use it
> +			 * to identify whehter the page is allocated by driver
> +			 * or new hot-add memory in process_hot_add().
> +			 */
> +			set_page_private(pg + j, 1);
> +		}
> +
> +		resp->range_array[*array_index].finfo.start_page
> +				= page_to_pfn(pg);
> +		resp->range_array[*array_index].finfo.page_cnt
> +				= alloc_unit;
> +		(*array_index)++;
> +
> +		dm_device.num_pages_ballooned += alloc_unit;
> +	}
> +
> +	ret = hv_send_hot_remove_response(resp, *array_index, more_pages);
> +	if (ret)
> +		goto free_pages;
> +
> +	return 0;
> +
> +free_pages:
> +	for (i = 0; i < *array_index; i++)
> +		free_balloon_pages(&dm_device, &resp->range_array[i]);
> +
> +	/* Response hot remove failure. */
> +	hv_send_hot_remove_response(resp, 0, false);
> +	return ret;
> +}
> +
> +static void hv_hot_remove_mem_from_node(unsigned int nid, u64 nr_pages)
> +{
> +	struct dm_hot_remove_response *resp
> +		= (struct dm_hot_remove_response *)balloon_up_send_buffer;
> +	unsigned long remainder = nr_pages % HA_CHUNK;
> +	unsigned long start_pfn = node_start_pfn(nid);
> +	unsigned long end_pfn = node_end_pfn(nid);
> +	unsigned long array_index = 0;
> +	int ret;
> +
> +	/*
> +	 * If page number isn't aligned with memory hot plug unit,
> +	 * handle remainder pages via balloon way.
> +	 */
> +	if (remainder) {
> +		memset(resp, 0x00, PAGE_SIZE);
> +		ret = hv_hot_remove_pages(resp, remainder, &array_index,
> +				!!(nr_pages - remainder));
> +		if (ret)
> +			return;
> +
> +		nr_pages -= remainder;
> +		if (!nr_pages)
> +			return;
> +	}
> +
> +	memset(resp, 0x00, PAGE_SIZE);
> +	array_index = 0;
> +	nr_pages = hv_hot_remove_from_ha_list(nid, nr_pages, &array_index,
> +				resp->range_array);
> +	if (nr_pages < 0) {
> +		/* Set array_index to 0 and response failure in resposne msg. */
> +		array_index = 0;
> +	} else if (nr_pages) {
> +		start_pfn = ALIGN(start_pfn, HA_CHUNK);
> +		hv_hot_remove_range(nid, start_pfn, end_pfn, nr_pages,
> +				    &array_index, resp->range_array, NULL);
> +	}
> +
> +	hv_send_hot_remove_response(resp, array_index, false);
> +}
> +
>  #endif
>  
> +static void hot_remove_req(struct work_struct *dummy)
> +{
> +	struct hv_dynmem_device *dm = &dm_device;
> +	unsigned int numa_node = dm->hr_wrk.virtual_node;
> +	unsigned int page_count = dm->hr_wrk.page_count;
> +
> +	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) || do_hot_add)
> +		hv_hot_remove_mem_from_node(numa_node, page_count);
> +	else
> +		hv_send_hot_remove_response((struct dm_hot_remove_response *)
> +				balloon_up_send_buffer, 0, false);
> +
> +	dm->state = DM_INITIALIZED;
> +}
> +
>  static void hot_add_req(struct work_struct *dummy)
>  {
>  	struct dm_hot_add_response resp;
> @@ -1005,28 +1559,6 @@ static void hot_add_req(struct work_struct *dummy)
>  	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
>  	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
>  
> -	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
> -		unsigned long region_size;
> -		unsigned long region_start;
> -
> -		/*
> -		 * The host has not specified the hot-add region.
> -		 * Based on the hot-add page range being specified,
> -		 * compute a hot-add region that can cover the pages
> -		 * that need to be hot-added while ensuring the alignment
> -		 * and size requirements of Linux as it relates to hot-add.
> -		 */
> -		region_start = pg_start;
> -		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
> -		if (pfn_cnt % HA_CHUNK)
> -			region_size += HA_CHUNK;
> -
> -		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
> -
> -		rg_start = region_start;
> -		rg_sz = region_size;
> -	}
> -
>  	if (do_hot_add)
>  		resp.page_count = process_hot_add(pg_start, pfn_cnt,
>  						rg_start, rg_sz);
> @@ -1190,24 +1722,6 @@ static void post_status(struct hv_dynmem_device *dm)
>  
>  }
>  
> -static void free_balloon_pages(struct hv_dynmem_device *dm,
> -			 union dm_mem_page_range *range_array)
> -{
> -	int num_pages = range_array->finfo.page_cnt;
> -	__u64 start_frame = range_array->finfo.start_page;
> -	struct page *pg;
> -	int i;
> -
> -	for (i = 0; i < num_pages; i++) {
> -		pg = pfn_to_page(i + start_frame);
> -		__ClearPageOffline(pg);
> -		__free_page(pg);
> -		dm->num_pages_ballooned--;
> -	}
> -}
> -
> -
> -
>  static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
>  					unsigned int num_pages,
>  					struct dm_balloon_response *bl_resp,
> @@ -1354,22 +1868,38 @@ static void balloon_up(struct work_struct *dummy)
>  
>  }
>  
> -static void balloon_down(struct hv_dynmem_device *dm,
> -			struct dm_unballoon_request *req)
> +static void balloon_down(struct work_struct *dummy)
>  {
> +	struct dm_unballoon_request *req =
> +		(struct dm_unballoon_request *)recv_buffer;
>  	union dm_mem_page_range *range_array = req->range_array;
>  	int range_count = req->range_count;
>  	struct dm_unballoon_response resp;
> -	int i;
> +	struct hv_dynmem_device *dm = &dm_device;
>  	unsigned int prev_pages_ballooned = dm->num_pages_ballooned;
> +	int i;
>  
>  	for (i = 0; i < range_count; i++) {
> -		free_balloon_pages(dm, &range_array[i]);
> -		complete(&dm_device.config_event);
> +		/*
> +		 * Hyper-V has a bug of sending unballoon msg instead
> +		 * of hot add msg when there is no balloon msg sent before
> +		 * Do hot add operation for all unballoon msg If hot add
> +		 * capability is enabled,
> +		 */
> +		if (do_hot_add) {
> +			dm->host_specified_ha_region = false;
> +			dm->num_pages_added +=
> +				process_hot_add(range_array[i].finfo.start_page,
> +				range_array[i].finfo.page_cnt, 0, 0);
> +		} else {
> +			free_balloon_pages(dm, &range_array[i]);
> +		}
>  	}
> +	complete(&dm_device.config_event);
>  
> -	pr_debug("Freed %u ballooned pages.\n",
> -		prev_pages_ballooned - dm->num_pages_ballooned);
> +	if (!do_hot_add)
> +		pr_debug("Freed %u ballooned pages.\n",
> +			prev_pages_ballooned - dm->num_pages_ballooned);
>  
>  	if (req->more_pages == 1)
>  		return;
> @@ -1489,6 +2019,7 @@ static void balloon_onchannelcallback(void *context)
>  	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
>  	struct dm_balloon *bal_msg;
>  	struct dm_hot_add *ha_msg;
> +	struct dm_hot_remove *hr_msg;
>  	union dm_mem_page_range *ha_pg_range;
>  	union dm_mem_page_range *ha_region;
>  
> @@ -1522,8 +2053,7 @@ static void balloon_onchannelcallback(void *context)
>  
>  		case DM_UNBALLOON_REQUEST:
>  			dm->state = DM_BALLOON_DOWN;
> -			balloon_down(dm,
> -				 (struct dm_unballoon_request *)recv_buffer);
> +			schedule_work(&dm_device.unballoon_wrk.wrk);
>  			break;
>  
>  		case DM_MEM_HOT_ADD_REQUEST:
> @@ -1554,6 +2084,19 @@ static void balloon_onchannelcallback(void *context)
>  			}
>  			schedule_work(&dm_device.ha_wrk.wrk);
>  			break;
> +		case DM_MEM_HOT_REMOVE_REQUEST:
> +			if (dm->state == DM_HOT_REMOVE)
> +				pr_warn("Currently hot-removing.\n");
> +
> +			dm->state = DM_HOT_REMOVE;
> +			hr_msg = (struct dm_hot_remove *)recv_buffer;
> +
> +			dm->hr_wrk.virtual_node = hr_msg->virtual_node;
> +			dm->hr_wrk.page_count = hr_msg->page_count;
> +			dm->hr_wrk.qos_flags = hr_msg->qos_flags;
> +
> +			schedule_work(&dm_device.hr_wrk.wrk);
> +			break;
>  
>  		case DM_INFO_MESSAGE:
>  			process_info(dm, (struct dm_info_msg *)dm_msg);
> @@ -1628,6 +2171,7 @@ static int balloon_connect_vsp(struct hv_device *dev)
>  
>  	cap_msg.caps.cap_bits.balloon = 1;
>  	cap_msg.caps.cap_bits.hot_add = 1;
> +	cap_msg.caps.cap_bits.hot_remove = 1;
>  
>  	/*
>  	 * Specify our alignment requirements as it relates
> @@ -1688,7 +2232,9 @@ static int balloon_probe(struct hv_device *dev,
>  	INIT_LIST_HEAD(&dm_device.ha_region_list);
>  	spin_lock_init(&dm_device.ha_lock);
>  	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
> +	INIT_WORK(&dm_device.unballoon_wrk.wrk, balloon_down);
>  	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
> +	INIT_WORK(&dm_device.hr_wrk.wrk, hot_remove_req);
>  	dm_device.host_specified_ha_region = false;
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG

-- 
Vitaly

next prev parent reply	other threads:[~2019-12-11 15:06 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-12-10 15:46 [RFC PATCH 0/4] x86/Hyper-V: Add Dynamic memory hot-remove function lantianyu1986
2019-12-10 15:46 ` [RFC PATCH 1/4] mm/resource: Move child to new resource when release mem region lantianyu1986
2019-12-10 15:46 ` [RFC PATCH 2/4] mm/hotplug: Expose is_mem_section_removable() and offline_pages() lantianyu1986
2019-12-11 12:07   ` David Hildenbrand
2019-12-10 15:46 ` [RFC PATCH 3/4] Hyper-V/Balloon: Call add_memory() with dm_device.ha_lock lantianyu1986
2019-12-11 14:57   ` Vitaly Kuznetsov
2019-12-12  8:24     ` [EXTERNAL] " Tianyu Lan
2019-12-10 15:46 ` [RFC PATCH 4/4] x86/Hyper-V: Add memory hot remove function lantianyu1986
2019-12-11 15:06   ` Vitaly Kuznetsov [this message]
2019-12-12 13:37     ` [EXTERNAL] " Tianyu Lan
2019-12-11 19:52   ` kbuild test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87mubyc367.fsf@vitty.brq.redhat.com \
    --to=vkuznets@redhat.com \
    --cc=Tianyu.Lan@microsoft.com \
    --cc=eric.devolder@oracle.com \
    --cc=haiyangz@microsoft.com \
    --cc=kys@microsoft.com \
    --cc=lantianyu1986@gmail.com \
    --cc=linux-hyperv@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=michael.h.kelley@microsoft.com \
    --cc=sashal@kernel.org \
    --cc=sthemmin@microsoft.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.