Re: [PATCH v2 3/3] scsi: target: Move LUN stats to per CPU

Linux SCSI subsystem development
 help / color / mirror / Atom feed

From: "Maurizio Lombardi" <mlombard@bsdbackstore.eu>
To: "Mike Christie" <michael.christie@oracle.com>,
	<bvanassche@acm.org>, <martin.petersen@oracle.com>,
	<linux-scsi@vger.kernel.org>, <target-devel@vger.kernel.org>
Subject: Re: [PATCH v2 3/3] scsi: target: Move LUN stats to per CPU
Date: Tue, 09 Sep 2025 17:36:24 +0200	[thread overview]
Message-ID: <DCODPRWQNASG.2WN8KKSW05R13@bsdbackstore.eu> (raw)
In-Reply-To: <20250908231000.10777-4-michael.christie@oracle.com>

On Tue Sep 9, 2025 at 1:05 AM CEST, Mike Christie wrote:
> The atomic use in the main I/O path is causing perf issues when using
> higher performance backend devices and multiple queues (more than
> 10 when using vhost-scsi) like with this fio workload:
>
> [global]
> bs=4K
> iodepth=128
> direct=1
> ioengine=libaio
> group_reporting
> time_based
> runtime=120
> name=standard-iops
> rw=randread
> numjobs=16
> cpus_allowed=0-15
>
> To fix this issue, this moves the LUN stats to per CPU.
>
> Note: I forgot to include this patch with the delayed/ordered per CPU
> tracking and per device/device entry per CPU stats. With this patch you
> get the full 33% improvements when using fast backends, multiple queues
> and multiple IO submiters.
>
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/target/target_core_device.c          |  1 +
>  drivers/target/target_core_fabric_configfs.c |  2 +-
>  drivers/target/target_core_internal.h        |  1 +
>  drivers/target/target_core_stat.c            | 67 +++++++-------------
>  drivers/target/target_core_tpg.c             | 25 +++++++-
>  drivers/target/target_core_transport.c       | 22 +++++--
>  include/target/target_core_base.h            |  8 +--
>  7 files changed, 67 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
> index 7bb711b24c0d..2d4a7c0c69ce 100644
> --- a/drivers/target/target_core_device.c
> +++ b/drivers/target/target_core_device.c
> @@ -814,6 +814,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
>  	dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
>  	dev->dev_attrib.submit_type = TARGET_FABRIC_DEFAULT_SUBMIT;
>  
> +	/* Skip allocating lun_stats since we can't export them. */
>  	xcopy_lun = &dev->xcopy_lun;
>  	rcu_assign_pointer(xcopy_lun->lun_se_dev, dev);
>  	init_completion(&xcopy_lun->lun_shutdown_comp);
> diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
> index 7156a4dc1ca7..13159928e365 100644
> --- a/drivers/target/target_core_fabric_configfs.c
> +++ b/drivers/target/target_core_fabric_configfs.c
> @@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
>  	struct se_lun *lun = container_of(to_config_group(item),
>  					  struct se_lun, lun_group);
>  
> -	kfree_rcu(lun, rcu_head);
> +	call_rcu(&lun->rcu_head, target_tpg_free_lun);
>  }
>  
>  static struct configfs_item_operations target_fabric_port_item_ops = {
> diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
> index 20aab1f50565..763e6d26e187 100644
> --- a/drivers/target/target_core_internal.h
> +++ b/drivers/target/target_core_internal.h
> @@ -125,6 +125,7 @@ void	core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
>  				  struct se_lun *);
>  void	core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
>  struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
> +void	target_tpg_free_lun(struct rcu_head *head);
>  int	core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
>  		bool, struct se_device *);
>  void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
> diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
> index e29d43dacaf7..083205052be2 100644
> --- a/drivers/target/target_core_stat.c
> +++ b/drivers/target/target_core_stat.c
> @@ -606,53 +606,30 @@ static ssize_t target_stat_tgt_port_port_index_show(struct config_item *item,
>  	return ret;
>  }
>  
> -static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
> -		char *page)
> -{
> -	struct se_lun *lun = to_stat_tgt_port(item);
> -	struct se_device *dev;
> -	ssize_t ret = -ENODEV;
> -
> -	rcu_read_lock();
> -	dev = rcu_dereference(lun->lun_se_dev);
> -	if (dev)
> -		ret = snprintf(page, PAGE_SIZE, "%lu\n",
> -			       atomic_long_read(&lun->lun_stats.cmd_pdus));
> -	rcu_read_unlock();
> -	return ret;
> -}
> -
> -static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
> -		char *page)
> -{
> -	struct se_lun *lun = to_stat_tgt_port(item);
> -	struct se_device *dev;
> -	ssize_t ret = -ENODEV;
> -
> -	rcu_read_lock();
> -	dev = rcu_dereference(lun->lun_se_dev);
> -	if (dev)
> -		ret = snprintf(page, PAGE_SIZE, "%u\n",
> -			(u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
> -	rcu_read_unlock();
> -	return ret;
> +#define tgt_port_show_per_cpu_stat(prefix, field, shift)		\
> +per_cpu_stat_snprintf(scsi_port_stats, prefix, field, shift);		\
> +static ssize_t								\
> +target_stat_##prefix##_show(struct config_item *item, char *page)	\
> +{									\
> +	struct se_lun *lun = to_stat_tgt_port(item);			\
> +	struct se_device *dev;						\
> +	int ret;							\
> +									\
> +	rcu_read_lock();						\
> +	dev = rcu_dereference(lun->lun_se_dev);				\
> +	if (!dev) {							\
> +		rcu_read_unlock();					\
> +		return -ENODEV;						\
> +	}								\
> +									\
> +	ret = per_cpu_stat_##prefix##_snprintf(lun->lun_stats, page);	\
> +	rcu_read_unlock();						\
> +	return ret;							\
>  }
>  
> -static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
> -		char *page)
> -{
> -	struct se_lun *lun = to_stat_tgt_port(item);
> -	struct se_device *dev;
> -	ssize_t ret = -ENODEV;
> -
> -	rcu_read_lock();
> -	dev = rcu_dereference(lun->lun_se_dev);
> -	if (dev)
> -		ret = snprintf(page, PAGE_SIZE, "%u\n",
> -				(u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
> -	rcu_read_unlock();
> -	return ret;
> -}
> +tgt_port_show_per_cpu_stat(tgt_port_in_cmds, cmd_pdus, 0);
> +tgt_port_show_per_cpu_stat(tgt_port_write_mbytes, rx_data_octets, 20);
> +tgt_port_show_per_cpu_stat(tgt_port_read_mbytes, tx_data_octets, 20);
>  
>  static ssize_t target_stat_tgt_port_hs_in_cmds_show(struct config_item *item,
>  		char *page)
> diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
> index c0e429e5ef31..540ff15882c9 100644
> --- a/drivers/target/target_core_tpg.c
> +++ b/drivers/target/target_core_tpg.c
> @@ -548,7 +548,7 @@ int core_tpg_register(
>  		ret = core_tpg_add_lun(se_tpg, se_tpg->tpg_virt_lun0,
>  				true, g_lun0_dev);
>  		if (ret < 0) {
> -			kfree(se_tpg->tpg_virt_lun0);
> +			target_tpg_free_lun(&se_tpg->tpg_virt_lun0->rcu_head);
>  			return ret;
>  		}
>  	}
> @@ -595,7 +595,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg)
>  
>  	if (se_tpg->proto_id >= 0) {
>  		core_tpg_remove_lun(se_tpg, se_tpg->tpg_virt_lun0);
> -		kfree_rcu(se_tpg->tpg_virt_lun0, rcu_head);
> +		call_rcu(&se_tpg->tpg_virt_lun0->rcu_head, target_tpg_free_lun);
>  	}
>  
>  	target_tpg_deregister_rtpi(se_tpg);
> @@ -609,12 +609,21 @@ struct se_lun *core_tpg_alloc_lun(
>  	u64 unpacked_lun)
>  {
>  	struct se_lun *lun;
> +	int ret;
>  
>  	lun = kzalloc(sizeof(*lun), GFP_KERNEL);
>  	if (!lun) {
>  		pr_err("Unable to allocate se_lun memory\n");
>  		return ERR_PTR(-ENOMEM);
>  	}
> +
> +	lun->lun_stats = alloc_percpu(struct scsi_port_stats);
> +	if (!lun->lun_stats) {
> +		pr_err("Unable to allocate se_lun stats memory\n");
> +		ret = -ENOMEM;
> +		goto free_lun;
> +	}
> +
>  	lun->unpacked_lun = unpacked_lun;
>  	atomic_set(&lun->lun_acl_count, 0);
>  	init_completion(&lun->lun_shutdown_comp);
> @@ -628,6 +637,18 @@ struct se_lun *core_tpg_alloc_lun(
>  	lun->lun_tpg = tpg;
>  
>  	return lun;
> +
> +free_lun:
> +	kfree(lun);
> +	return ERR_PTR(-ENOMEM);
> +}

nit: "ret" is set to -ENOMEM but never used, it can be removed.

Maurizio

next prev parent reply	other threads:[~2025-09-09 15:36 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-08 23:05 [PATCH v2 0/3] target: RW/num_cmds stats improvements Mike Christie
2025-09-08 23:05 ` [PATCH v2 1/3] scsi: target: Fix lun/device R/W and total command stats Mike Christie
2025-09-17 13:47   ` Dmitry Bogdanov
2025-09-08 23:05 ` [PATCH v2 2/3] scsi: target: Create and use macro helpers for per CPU stats Mike Christie
2025-09-17 13:33   ` Dmitry Bogdanov
2025-09-08 23:05 ` [PATCH v2 3/3] scsi: target: Move LUN stats to per CPU Mike Christie
2025-09-09 15:36   ` Maurizio Lombardi [this message]
2025-09-17 13:38   ` Dmitry Bogdanov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=DCODPRWQNASG.2WN8KKSW05R13@bsdbackstore.eu \
    --to=mlombard@bsdbackstore.eu \
    --cc=bvanassche@acm.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=michael.christie@oracle.com \
    --cc=target-devel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox