All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ethan Zhao <haifeng.zhao@linux.intel.com>
To: Eric Dumazet <edumazet@google.com>,
	Joerg Roedel <joro@8bytes.org>,
	Robin Murphy <robin.murphy@arm.com>,
	Will Deacon <will@kernel.org>
Cc: linux-kernel <linux-kernel@vger.kernel.org>,
	netdev@vger.kernel.org, Eric Dumazet <eric.dumazet@gmail.com>,
	iommu@lists.linux.dev
Subject: Re: [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations
Date: Thu, 24 Nov 2022 12:23:28 +0800	[thread overview]
Message-ID: <73b5ba7e-a4e2-ee95-d16b-942343a04d03@linux.intel.com> (raw)
In-Reply-To: <20221112040452.644234-1-edumazet@google.com>

Hi,

On 2022/11/12 12:04, Eric Dumazet wrote:
> Quite often, NIC devices do not need dma_sync operations
> on x86_64 at least.
>
> Indeed, when dev_is_dma_coherent(dev) is true and
> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> and friends do nothing.
>
> However, indirectly calling them when CONFIG_RETPOLINE=y
> consumes about 10% of cycles on a cpu receiving packets
> from softirq at ~100Gbit rate, as shown in [1]
>
> Even if/when CONFIG_RETPOLINE is not set, there
> is a cost of about 3%.
>
> This patch adds a copy of iommu_dma_ops structure,
> where sync_single_for_cpu, sync_single_for_device,
> sync_sg_for_cpu and sync_sg_for_device are unset.
>
> perf profile before the patch:
>
>      18.53%  [kernel]       [k] gq_rx_skb
>      14.77%  [kernel]       [k] napi_reuse_skb
>       8.95%  [kernel]       [k] skb_release_data
>       5.42%  [kernel]       [k] dev_gro_receive
>       5.37%  [kernel]       [k] memcpy
> <*>  5.26%  [kernel]       [k] iommu_dma_sync_sg_for_cpu
>       4.78%  [kernel]       [k] tcp_gro_receive
> <*>  4.42%  [kernel]       [k] iommu_dma_sync_sg_for_device
>       4.12%  [kernel]       [k] ipv6_gro_receive
>       3.65%  [kernel]       [k] gq_pool_get
>       3.25%  [kernel]       [k] skb_gro_receive
>       2.07%  [kernel]       [k] napi_gro_frags
>       1.98%  [kernel]       [k] tcp6_gro_receive
>       1.27%  [kernel]       [k] gq_rx_prep_buffers
>       1.18%  [kernel]       [k] gq_rx_napi_handler
>       0.99%  [kernel]       [k] csum_partial
>       0.74%  [kernel]       [k] csum_ipv6_magic
>       0.72%  [kernel]       [k] free_pcp_prepare
>       0.60%  [kernel]       [k] __napi_poll
>       0.58%  [kernel]       [k] net_rx_action
>       0.56%  [kernel]       [k] read_tsc
> <*>  0.50%  [kernel]       [k] __x86_indirect_thunk_r11
>       0.45%  [kernel]       [k] memset
>
> After patch, lines with <*> no longer show up, and overall
> cpu usage looks much better (~60% instead of ~72%)
>
>      25.56%  [kernel]       [k] gq_rx_skb
>       9.90%  [kernel]       [k] napi_reuse_skb
>       7.39%  [kernel]       [k] dev_gro_receive
>       6.78%  [kernel]       [k] memcpy
>       6.53%  [kernel]       [k] skb_release_data
>       6.39%  [kernel]       [k] tcp_gro_receive
>       5.71%  [kernel]       [k] ipv6_gro_receive
>       4.35%  [kernel]       [k] napi_gro_frags
>       4.34%  [kernel]       [k] skb_gro_receive
>       3.50%  [kernel]       [k] gq_pool_get
>       3.08%  [kernel]       [k] gq_rx_napi_handler
>       2.35%  [kernel]       [k] tcp6_gro_receive
>       2.06%  [kernel]       [k] gq_rx_prep_buffers
>       1.32%  [kernel]       [k] csum_partial
>       0.93%  [kernel]       [k] csum_ipv6_magic
>       0.65%  [kernel]       [k] net_rx_action
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Will Deacon <will@kernel.org>
> Cc: iommu@lists.linux.dev
> ---
>   drivers/iommu/dma-iommu.c | 67 +++++++++++++++++++++++++++------------
>   1 file changed, 47 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 9297b741f5e80e2408e864fc3f779410d6b04d49..976ba20a55eab5fd94e9bec2d38a2a60e0690444 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -522,6 +522,11 @@ static bool dev_use_swiotlb(struct device *dev)
>   	return IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev);
>   }
>   
> +static bool dev_is_dma_sync_needed(struct device *dev)
> +{
> +	return !dev_is_dma_coherent(dev) || dev_use_swiotlb(dev);
> +}
> +
>   /**
>    * iommu_dma_init_domain - Initialise a DMA mapping domain
>    * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
> @@ -914,7 +919,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
>   {
>   	phys_addr_t phys;
>   
> -	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
> +	if (!dev_is_dma_sync_needed(dev))

Seems this function is also called by iommu_dma_map_page()  pair and it 
already checked

if the device is coherent,  so do we need this duplicate 
dev_is_dma_sync_needed(dev) ?

How about we move this checking to iommu_dma_map_page() 
/iommu_dma_unmap_page()

then no need checking here anymore ?


Thanks,

Ethan


>   		return;
>   
>   	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
> @@ -930,7 +935,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
>   {
>   	phys_addr_t phys;
>   
> -	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
> +	if (!dev_is_dma_sync_needed(dev))
>   		return;
>   
>   	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
> @@ -1544,30 +1549,51 @@ static size_t iommu_dma_opt_mapping_size(void)
>   	return iova_rcache_range();
>   }
>   
> +#define iommu_dma_ops_common_fields \
> +	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,		\
> +	.alloc			= iommu_dma_alloc,			\
> +	.free			= iommu_dma_free,			\
> +	.alloc_pages		= dma_common_alloc_pages,		\
> +	.free_pages		= dma_common_free_pages,		\
> +	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,	\
> +	.free_noncontiguous	= iommu_dma_free_noncontiguous,		\
> +	.mmap			= iommu_dma_mmap,			\
> +	.get_sgtable		= iommu_dma_get_sgtable,		\
> +	.map_page		= iommu_dma_map_page,			\
> +	.unmap_page		= iommu_dma_unmap_page,			\
> +	.map_sg			= iommu_dma_map_sg,			\
> +	.unmap_sg		= iommu_dma_unmap_sg,			\
> +	.map_resource		= iommu_dma_map_resource,		\
> +	.unmap_resource		= iommu_dma_unmap_resource,		\
> +	.get_merge_boundary	= iommu_dma_get_merge_boundary,		\
> +	.opt_mapping_size	= iommu_dma_opt_mapping_size,
> +
>   static const struct dma_map_ops iommu_dma_ops = {
> -	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,
> -	.alloc			= iommu_dma_alloc,
> -	.free			= iommu_dma_free,
> -	.alloc_pages		= dma_common_alloc_pages,
> -	.free_pages		= dma_common_free_pages,
> -	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,
> -	.free_noncontiguous	= iommu_dma_free_noncontiguous,
> -	.mmap			= iommu_dma_mmap,
> -	.get_sgtable		= iommu_dma_get_sgtable,
> -	.map_page		= iommu_dma_map_page,
> -	.unmap_page		= iommu_dma_unmap_page,
> -	.map_sg			= iommu_dma_map_sg,
> -	.unmap_sg		= iommu_dma_unmap_sg,
> +	iommu_dma_ops_common_fields
> +
>   	.sync_single_for_cpu	= iommu_dma_sync_single_for_cpu,
>   	.sync_single_for_device	= iommu_dma_sync_single_for_device,
>   	.sync_sg_for_cpu	= iommu_dma_sync_sg_for_cpu,
>   	.sync_sg_for_device	= iommu_dma_sync_sg_for_device,
> -	.map_resource		= iommu_dma_map_resource,
> -	.unmap_resource		= iommu_dma_unmap_resource,
> -	.get_merge_boundary	= iommu_dma_get_merge_boundary,
> -	.opt_mapping_size	= iommu_dma_opt_mapping_size,
>   };
>   
> +/* Special instance of iommu_dma_ops for devices satisfying this condition:
> + *   !dev_is_dma_sync_needed(dev)
> + *
> + * iommu_dma_sync_single_for_cpu(), iommu_dma_sync_single_for_device(),
> + * iommu_dma_sync_sg_for_cpu(), iommu_dma_sync_sg_for_device()
> + * do nothing special and can be avoided, saving indirect calls.
> + */
> +static const struct dma_map_ops iommu_nosync_dma_ops = {
> +	iommu_dma_ops_common_fields
> +
> +	.sync_single_for_cpu	= NULL,
> +	.sync_single_for_device	= NULL,
> +	.sync_sg_for_cpu	= NULL,
> +	.sync_sg_for_device	= NULL,
> +};
> +#undef iommu_dma_ops_common_fields
> +
>   /*
>    * The IOMMU core code allocates the default DMA domain, which the underlying
>    * IOMMU driver needs to support via the dma-iommu layer.
> @@ -1586,7 +1612,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
>   	if (iommu_is_dma_domain(domain)) {
>   		if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
>   			goto out_err;
> -		dev->dma_ops = &iommu_dma_ops;
> +		dev->dma_ops = dev_is_dma_sync_needed(dev) ?
> +				&iommu_dma_ops : &iommu_nosync_dma_ops;
>   	}
>   
>   	return;

      parent reply	other threads:[~2022-11-24  4:23 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-12  4:04 [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations Eric Dumazet
2022-11-14 13:30 ` Robin Murphy
2022-11-14 13:52   ` Robin Murphy
2022-11-22 19:17 ` Michal Kubiak
2022-11-22 22:54   ` Eric Dumazet
2022-11-23 10:15   ` Michal Kubiak
2022-11-24  4:23 ` Ethan Zhao [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=73b5ba7e-a4e2-ee95-d16b-942343a04d03@linux.intel.com \
    --to=haifeng.zhao@linux.intel.com \
    --cc=edumazet@google.com \
    --cc=eric.dumazet@gmail.com \
    --cc=iommu@lists.linux.dev \
    --cc=joro@8bytes.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=robin.murphy@arm.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.