From: Tianyu Lan <ltykernel@gmail.com>
To: hch@infradead.org, robin.murphy@arm.com
Cc: parri.andrea@gmail.com, thomas.lendacky@amd.com,
wei.liu@kernel.org, Tianyu Lan <Tianyu.Lan@microsoft.com>,
linux-hyperv@vger.kernel.org, konrad.wilk@oracle.com,
linux-kernel@vger.kernel.org, kirill.shutemov@intel.com,
iommu@lists.linux-foundation.org, michael.h.kelley@microsoft.com,
andi.kleen@intel.com, brijesh.singh@amd.com, vkuznets@redhat.com,
kys@microsoft.com, hch@lst.de
Subject: Re: [RFC PATCH] swiotlb: Add Child IO TLB mem support
Date: Fri, 29 Apr 2022 22:25:53 +0800 [thread overview]
Message-ID: <c0e70b17-cdf1-4fd8-f807-e4b9ccad44fd@gmail.com> (raw)
In-Reply-To: <20220429142147.1725184-1-ltykernel@gmail.com>
On 4/29/2022 10:21 PM, Tianyu Lan wrote:
> From: Tianyu Lan <Tianyu.Lan@microsoft.com>
>
> Traditionally swiotlb was not performance critical because it was only
> used for slow devices. But in some setups, like TDX/SEV confidential
> guests, all IO has to go through swiotlb. Currently swiotlb only has a
> single lock. Under high IO load with multiple CPUs this can lead to
> significant lock contention on the swiotlb lock.
>
> This patch adds child IO TLB mem support to resolve spinlock overhead
> among device's queues. Each device may allocate IO tlb mem and set up
> child IO TLB mem according to queue number. Swiotlb code allocates
> bounce buffer among child IO tlb mem iteratively.
>
Hi Robin and Christoph:
     Following Robin's idea, I drafted this patch. Please have a look
and check whether it is heading in the right direction.
Thanks.
> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
> ---
> include/linux/swiotlb.h | 7 +++
> kernel/dma/swiotlb.c | 96 ++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 93 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 7ed35dd3de6e..4a3f6a7b4b7e 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -89,6 +89,9 @@ extern enum swiotlb_force swiotlb_force;
> * @late_alloc: %true if allocated using the page allocator
> * @force_bounce: %true if swiotlb bouncing is forced
> * @for_alloc: %true if the pool is used for memory allocation
> + * @child_nslot:The number of IO TLB slot in the child IO TLB mem.
> + * @num_child: The child io tlb mem number in the pool.
> + * @child_start:The child index to start searching in the next round.
> */
> struct io_tlb_mem {
> phys_addr_t start;
> @@ -102,6 +105,10 @@ struct io_tlb_mem {
> bool late_alloc;
> bool force_bounce;
> bool for_alloc;
> + unsigned int num_child;
> + unsigned int child_nslot;
> + unsigned int child_start;
> + struct io_tlb_mem *child;
> struct io_tlb_slot {
> phys_addr_t orig_addr;
> size_t alloc_size;
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index e2ef0864eb1e..382fa2288645 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -207,6 +207,25 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
> mem->force_bounce = true;
>
> spin_lock_init(&mem->lock);
> +
> + if (mem->num_child) {
> + mem->child_nslot = nslabs / mem->num_child;
> + mem->child_start = 0;
> +
> + /*
> + * Initialize child IO TLB mem, divide IO TLB pool
> + * into child number. Reuse parent mem->slot in the
> + * child mem->slot.
> + */
> + for (i = 0; i < mem->num_child; i++) {
> + mem->num_child = 0;
> + mem->child[i].slots = mem->slots + i * mem->child_nslot;
> + swiotlb_init_io_tlb_mem(&mem->child[i],
> + start + ((i * mem->child_nslot) << IO_TLB_SHIFT),
> + mem->child_nslot, late_alloc);
> + }
> + }
> +
> for (i = 0; i < mem->nslabs; i++) {
> mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
> mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
> @@ -336,16 +355,18 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
>
> mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
> get_order(array_size(sizeof(*mem->slots), nslabs)));
> - if (!mem->slots) {
> - free_pages((unsigned long)vstart, order);
> - return -ENOMEM;
> - }
> + if (!mem->slots)
> + goto error_slots;
>
> set_memory_decrypted((unsigned long)vstart, bytes >> PAGE_SHIFT);
> swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true);
>
> swiotlb_print_info();
> return 0;
> +
> +error_slots:
> + free_pages((unsigned long)vstart, order);
> + return -ENOMEM;
> }
>
> void __init swiotlb_exit(void)
> @@ -483,10 +504,11 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
> * Find a suitable number of IO TLB entries size that will fit this request and
> * allocate a buffer from that IO TLB pool.
> */
> -static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
> - size_t alloc_size, unsigned int alloc_align_mask)
> +static int swiotlb_do_find_slots(struct io_tlb_mem *mem,
> + struct device *dev, phys_addr_t orig_addr,
> + size_t alloc_size,
> + unsigned int alloc_align_mask)
> {
> - struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> unsigned long boundary_mask = dma_get_seg_boundary(dev);
> dma_addr_t tbl_dma_addr =
> phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
> @@ -565,6 +587,46 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
> return index;
> }
>
> +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
> + size_t alloc_size, unsigned int alloc_align_mask)
> +{
> + struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> + struct io_tlb_mem *child_mem = mem;
> + int start = 0, i = 0, index;
> +
> + if (mem->num_child) {
> + i = start = mem->child_start;
> + mem->child_start = (mem->child_start + 1) % mem->num_child;
> + child_mem = mem->child;
> + }
> +
> + do {
> + index = swiotlb_do_find_slots(child_mem + i, dev, orig_addr,
> + alloc_size, alloc_align_mask);
> + if (index >= 0)
> + return i * mem->child_nslot + index;
> + if (++i >= mem->num_child)
> + i = 0;
> + } while (i != start);
> +
> + return -1;
> +}
> +
> +static unsigned long mem_used(struct io_tlb_mem *mem)
> +{
> + int i;
> + unsigned long used = 0;
> +
> + if (mem->num_child) {
> + for (i = 0; i < mem->num_child; i++)
> + used += mem->child[i].used;
> + } else {
> + used = mem->used;
> + }
> +
> + return used;
> +}
> +
> phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> size_t mapping_size, size_t alloc_size,
> unsigned int alloc_align_mask, enum dma_data_direction dir,
> @@ -594,7 +656,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> if (!(attrs & DMA_ATTR_NO_WARN))
> dev_warn_ratelimited(dev,
> "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
> - alloc_size, mem->nslabs, mem->used);
> + alloc_size, mem->nslabs, mem_used(mem));
> return (phys_addr_t)DMA_MAPPING_ERROR;
> }
>
> @@ -617,9 +679,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> return tlb_addr;
> }
>
> -static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
> +static void swiotlb_do_release_slots(struct io_tlb_mem *mem,
> + struct device *dev, phys_addr_t tlb_addr)
> {
> - struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> unsigned long flags;
> unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
> int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
> @@ -660,6 +722,20 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
> spin_unlock_irqrestore(&mem->lock, flags);
> }
>
> +static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
> +{
> + struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> + int index, offset;
> +
> + if (mem->num_child) {
> + offset = swiotlb_align_offset(dev, tlb_addr);
> + index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
> + mem = &mem->child[index / mem->child_nslot];
> + }
> +
> + swiotlb_do_release_slots(mem, dev, tlb_addr);
> +}
> +
> /*
> * tlb_addr is the physical address of the bounce buffer to unmap.
> */
_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu
next prev parent reply other threads:[~2022-04-29 14:26 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-04-28 14:14 [RFC PATCH 0/2] swiotlb: Introduce swiotlb device allocation function Tianyu Lan
2022-04-28 14:14 ` [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock Tianyu Lan
2022-04-28 14:44 ` Robin Murphy
2022-04-28 14:45 ` Christoph Hellwig
2022-04-28 14:55 ` Andi Kleen
2022-04-28 15:05 ` Christoph Hellwig
2022-04-28 15:16 ` Andi Kleen
2022-04-28 15:07 ` Robin Murphy
2022-04-28 16:02 ` Andi Kleen
2022-04-28 16:59 ` Robin Murphy
2022-04-28 14:56 ` Robin Murphy
2022-04-28 15:54 ` Tianyu Lan
2022-04-29 14:21 ` [RFC PATCH] swiotlb: Add Child IO TLB mem support Tianyu Lan
2022-04-29 14:25 ` Tianyu Lan [this message]
2022-04-28 14:14 ` [RFC PATCH 2/2] Swiotlb: Add device bounce buffer allocation interface Tianyu Lan
2022-04-28 15:50 ` Tianyu Lan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=c0e70b17-cdf1-4fd8-f807-e4b9ccad44fd@gmail.com \
--to=ltykernel@gmail.com \
--cc=Tianyu.Lan@microsoft.com \
--cc=andi.kleen@intel.com \
--cc=brijesh.singh@amd.com \
--cc=hch@infradead.org \
--cc=hch@lst.de \
--cc=iommu@lists.linux-foundation.org \
--cc=kirill.shutemov@intel.com \
--cc=konrad.wilk@oracle.com \
--cc=kys@microsoft.com \
--cc=linux-hyperv@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=michael.h.kelley@microsoft.com \
--cc=parri.andrea@gmail.com \
--cc=robin.murphy@arm.com \
--cc=thomas.lendacky@amd.com \
--cc=vkuznets@redhat.com \
--cc=wei.liu@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox