[PATCH 1/1] scsi: target: Move LUN stats to per CPU

public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/1] scsi: target: Move LUN stats to per CPU
@ 2025-07-24  0:45 Mike Christie
  2025-07-24 14:26 ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Mike Christie @ 2025-07-24  0:45 UTC (permalink / raw)
  To: martin.petersen, linux-scsi, target-devel; +Cc: Mike Christie

The atomic use in the main I/O path is causing perf issues when using
higher performance backend devices and multiple queues. This moves the
LUN stats to per CPU.

I forgot to include this patch with the delayed/ordered per CPU
tracking and per device/device entry per CPU stats. With this patch
you get the full 33% improvement when using fast backends, multiple
queues and multiple I/O submitters.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/target_core_fabric_configfs.c |  2 +-
 drivers/target/target_core_internal.h        |  1 +
 drivers/target/target_core_stat.c            | 48 ++++++++++++++++----
 drivers/target/target_core_tpg.c             | 21 +++++++++
 drivers/target/target_core_transport.c       | 14 +++---
 include/target/target_core_base.h            |  8 ++--
 6 files changed, 73 insertions(+), 21 deletions(-)

diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
index 7156a4dc1ca7..13159928e365 100644
--- a/drivers/target/target_core_fabric_configfs.c
+++ b/drivers/target/target_core_fabric_configfs.c
@@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
 	struct se_lun *lun = container_of(to_config_group(item),
 					  struct se_lun, lun_group);
 
-	kfree_rcu(lun, rcu_head);
+	call_rcu(&lun->rcu_head, target_tpg_free_lun);
 }
 
 static struct configfs_item_operations target_fabric_port_item_ops = {
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 408be26d2e9b..dfe529e59a29 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -125,6 +125,7 @@ void	core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
 				  struct se_lun *);
 void	core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
 struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
+void	target_tpg_free_lun(struct rcu_head *head);
 int	core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
 		bool, struct se_device *);
 void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 6bdf2d8bd694..88f8be197a68 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -627,14 +627,24 @@ static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
 		char *page)
 {
 	struct se_lun *lun = to_stat_tgt_port(item);
+	struct scsi_port_stats *stats;
 	struct se_device *dev;
 	ssize_t ret = -ENODEV;
+	unsigned int cpu;
+	u32 pdus = 0;
 
 	rcu_read_lock();
 	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%lu\n",
-			       atomic_long_read(&lun->lun_stats.cmd_pdus));
+	if (!dev)
+		goto unlock;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(lun->lun_stats, cpu);
+		pdus += stats->cmd_pdus;
+	}
+
+	ret = snprintf(page, PAGE_SIZE, "%u\n", pdus);
+unlock:
 	rcu_read_unlock();
 	return ret;
 }
@@ -643,14 +653,24 @@ static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
 		char *page)
 {
 	struct se_lun *lun = to_stat_tgt_port(item);
+	struct scsi_port_stats *stats;
 	struct se_device *dev;
 	ssize_t ret = -ENODEV;
+	unsigned int cpu;
+	u32 octets = 0;
 
 	rcu_read_lock();
 	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%u\n",
-			(u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
+	if (!dev)
+		goto unlock;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(lun->lun_stats, cpu);
+		octets += stats->rx_data_octets;
+	}
+
+	ret = snprintf(page, PAGE_SIZE, "%u\n", octets);
+unlock:
 	rcu_read_unlock();
 	return ret;
 }
@@ -659,14 +679,24 @@ static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
 		char *page)
 {
 	struct se_lun *lun = to_stat_tgt_port(item);
+	struct scsi_port_stats *stats;
 	struct se_device *dev;
 	ssize_t ret = -ENODEV;
+	unsigned int cpu;
+	u32 octets = 0;
 
 	rcu_read_lock();
 	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%u\n",
-				(u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
+	if (!dev)
+		goto unlock;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(lun->lun_stats, cpu);
+		octets += stats->tx_data_octets;
+	}
+
+	ret = snprintf(page, PAGE_SIZE, "%u\n", octets);
+unlock:
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index c0e429e5ef31..caa95aa6f502 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -609,12 +609,21 @@ struct se_lun *core_tpg_alloc_lun(
 	u64 unpacked_lun)
 {
 	struct se_lun *lun;
+	int ret;
 
 	lun = kzalloc(sizeof(*lun), GFP_KERNEL);
 	if (!lun) {
 		pr_err("Unable to allocate se_lun memory\n");
 		return ERR_PTR(-ENOMEM);
 	}
+
+	lun->lun_stats = alloc_percpu(struct scsi_port_stats);
+	if (!lun->lun_stats) {
+		pr_err("Unable to allocate se_lun stats memory\n");
+		ret = -ENOMEM;
+		goto free_lun;
+	}
+
 	lun->unpacked_lun = unpacked_lun;
 	atomic_set(&lun->lun_acl_count, 0);
 	init_completion(&lun->lun_shutdown_comp);
@@ -628,6 +637,18 @@ struct se_lun *core_tpg_alloc_lun(
 	lun->lun_tpg = tpg;
 
 	return lun;
+
+free_lun:
+	kfree(lun);
+	return ERR_PTR(-ENOMEM);
+}
+
+void target_tpg_free_lun(struct rcu_head *head)
+{
+	struct se_lun *lun = container_of(head, struct se_lun, rcu_head);
+
+	free_percpu(lun->lun_stats);
+	kfree(lun);
 }
 
 int core_tpg_add_lun(
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 0a76bdfe5528..4ec66ca6c0ca 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1571,7 +1571,7 @@ target_cmd_parse_cdb(struct se_cmd *cmd)
 		return ret;
 
 	cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
-	atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
+	this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus);
 	return 0;
 }
 EXPORT_SYMBOL(target_cmd_parse_cdb);
@@ -2597,8 +2597,8 @@ static void target_complete_ok_work(struct work_struct *work)
 		    !(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL))
 			goto queue_status;
 
-		atomic_long_add(cmd->data_length,
-				&cmd->se_lun->lun_stats.tx_data_octets);
+		this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+			     cmd->data_length);
 		/*
 		 * Perform READ_STRIP of PI using software emulation when
 		 * backend had PI enabled, if the transport will not be
@@ -2621,14 +2621,14 @@ static void target_complete_ok_work(struct work_struct *work)
 			goto queue_full;
 		break;
 	case DMA_TO_DEVICE:
-		atomic_long_add(cmd->data_length,
-				&cmd->se_lun->lun_stats.rx_data_octets);
+		this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets,
+			     cmd->data_length);
 		/*
 		 * Check if we need to send READ payload for BIDI-COMMAND
 		 */
 		if (cmd->se_cmd_flags & SCF_BIDI) {
-			atomic_long_add(cmd->data_length,
-					&cmd->se_lun->lun_stats.tx_data_octets);
+			this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+				     cmd->data_length);
 			ret = cmd->se_tfo->queue_data_in(cmd);
 			if (ret)
 				goto queue_full;
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index c4d9116904aa..e73fb224625d 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -744,9 +744,9 @@ struct se_port_stat_grps {
 };
 
 struct scsi_port_stats {
-	atomic_long_t	cmd_pdus;
-	atomic_long_t	tx_data_octets;
-	atomic_long_t	rx_data_octets;
+	u32			cmd_pdus;
+	u32			tx_data_octets;
+	u32			rx_data_octets;
 };
 
 struct se_lun {
@@ -773,7 +773,7 @@ struct se_lun {
 	spinlock_t		lun_tg_pt_gp_lock;
 
 	struct se_portal_group	*lun_tpg;
-	struct scsi_port_stats	lun_stats;
+	struct scsi_port_stats	__percpu *lun_stats;
 	struct config_group	lun_group;
 	struct se_port_stat_grps port_stat_grps;
 	struct completion	lun_shutdown_comp;
-- 
2.47.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/1] scsi: target: Move LUN stats to per CPU
  2025-07-24  0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie
@ 2025-07-24 14:26 ` kernel test robot
  2025-07-24 15:06 ` Bart Van Assche
  2025-07-28 15:08 ` Dmitry Bogdanov
  2 siblings, 0 replies; 4+ messages in thread
From: kernel test robot @ 2025-07-24 14:26 UTC (permalink / raw)
  To: Mike Christie, martin.petersen, linux-scsi, target-devel
  Cc: oe-kbuild-all, Mike Christie

Hi Mike,

kernel test robot noticed the following build warnings:

[auto build test WARNING on mkp-scsi/for-next]
[also build test WARNING on jejb-scsi/for-next linus/master v6.16-rc7 next-20250724]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Christie/scsi-target-Move-LUN-stats-to-per-CPU/20250724-084852
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git for-next
patch link:    https://lore.kernel.org/r/20250724004558.40993-1-michael.christie%40oracle.com
patch subject: [PATCH 1/1] scsi: target: Move LUN stats to per CPU
config: sparc64-randconfig-001-20250724 (https://download.01.org/0day-ci/archive/20250724/202507242247.rt3StU9U-lkp@intel.com/config)
compiler: sparc64-linux-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250724/202507242247.rt3StU9U-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507242247.rt3StU9U-lkp@intel.com/

All warnings (new ones prefixed by >>):

   drivers/target/target_core_tpg.c: In function 'core_tpg_alloc_lun':
>> drivers/target/target_core_tpg.c:612:6: warning: variable 'ret' set but not used [-Wunused-but-set-variable]
     int ret;
         ^~~


vim +/ret +612 drivers/target/target_core_tpg.c

   606	
   607	struct se_lun *core_tpg_alloc_lun(
   608		struct se_portal_group *tpg,
   609		u64 unpacked_lun)
   610	{
   611		struct se_lun *lun;
 > 612		int ret;
   613	
   614		lun = kzalloc(sizeof(*lun), GFP_KERNEL);
   615		if (!lun) {
   616			pr_err("Unable to allocate se_lun memory\n");
   617			return ERR_PTR(-ENOMEM);
   618		}
   619	
   620		lun->lun_stats = alloc_percpu(struct scsi_port_stats);
   621		if (!lun->lun_stats) {
   622			pr_err("Unable to allocate se_lun stats memory\n");
   623			ret = -ENOMEM;
   624			goto free_lun;
   625		}
   626	
   627		lun->unpacked_lun = unpacked_lun;
   628		atomic_set(&lun->lun_acl_count, 0);
   629		init_completion(&lun->lun_shutdown_comp);
   630		INIT_LIST_HEAD(&lun->lun_deve_list);
   631		INIT_LIST_HEAD(&lun->lun_dev_link);
   632		atomic_set(&lun->lun_tg_pt_secondary_offline, 0);
   633		spin_lock_init(&lun->lun_deve_lock);
   634		mutex_init(&lun->lun_tg_pt_md_mutex);
   635		INIT_LIST_HEAD(&lun->lun_tg_pt_gp_link);
   636		spin_lock_init(&lun->lun_tg_pt_gp_lock);
   637		lun->lun_tpg = tpg;
   638	
   639		return lun;
   640	
   641	free_lun:
   642		kfree(lun);
   643		return ERR_PTR(-ENOMEM);
   644	}
   645	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/1] scsi: target: Move LUN stats to per CPU
  2025-07-24  0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie
  2025-07-24 14:26 ` kernel test robot
@ 2025-07-24 15:06 ` Bart Van Assche
  2025-07-28 15:08 ` Dmitry Bogdanov
  2 siblings, 0 replies; 4+ messages in thread
From: Bart Van Assche @ 2025-07-24 15:06 UTC (permalink / raw)
  To: Mike Christie, martin.petersen, linux-scsi, target-devel

On 7/23/25 5:45 PM, Mike Christie wrote:
> diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
> index c4d9116904aa..e73fb224625d 100644
> --- a/include/target/target_core_base.h
> +++ b/include/target/target_core_base.h
> @@ -744,9 +744,9 @@ struct se_port_stat_grps {
>   };
>   
>   struct scsi_port_stats {
> -	atomic_long_t	cmd_pdus;
> -	atomic_long_t	tx_data_octets;
> -	atomic_long_t	rx_data_octets;
> +	u32			cmd_pdus;
> +	u32			tx_data_octets;
> +	u32			rx_data_octets;
>   };
>   
>   struct se_lun {
> @@ -773,7 +773,7 @@ struct se_lun {
>   	spinlock_t		lun_tg_pt_gp_lock;
>   
>   	struct se_portal_group	*lun_tpg;
> -	struct scsi_port_stats	lun_stats;
> +	struct scsi_port_stats	__percpu *lun_stats;
>   	struct config_group	lun_group;
>   	struct se_port_stat_grps port_stat_grps;
>   	struct completion	lun_shutdown_comp;

Is this perhaps an open-coded implementation of struct percpu_counter? 
Why hasn't struct percpu_counter been used? I think this should be
explained in the patch description.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/1] scsi: target: Move LUN stats to per CPU
  2025-07-24  0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie
  2025-07-24 14:26 ` kernel test robot
  2025-07-24 15:06 ` Bart Van Assche
@ 2025-07-28 15:08 ` Dmitry Bogdanov
  2 siblings, 0 replies; 4+ messages in thread
From: Dmitry Bogdanov @ 2025-07-28 15:08 UTC (permalink / raw)
  To: Mike Christie; +Cc: martin.petersen, linux-scsi, target-devel

On Wed, Jul 23, 2025 at 07:45:57PM -0500, Mike Christie wrote:
> The atomic use in the main I/O path is causing perf issues when using
> higher performance backend devices and multiple queues. This moves the
> LUN stats to per CPU.
> 
> I forgot to include this patch with the delayed/ordered per CPU
> tracking and per device/device entry per CPU stats. With this patch
> you get the full 33% improvement when using fast backends, multiple
> queues and multiple I/O submitters.
> 
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/target/target_core_fabric_configfs.c |  2 +-
>  drivers/target/target_core_internal.h        |  1 +
>  drivers/target/target_core_stat.c            | 48 ++++++++++++++++----
>  drivers/target/target_core_tpg.c             | 21 +++++++++
>  drivers/target/target_core_transport.c       | 14 +++---
>  include/target/target_core_base.h            |  8 ++--
>  6 files changed, 73 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
> index 7156a4dc1ca7..13159928e365 100644
> --- a/drivers/target/target_core_fabric_configfs.c
> +++ b/drivers/target/target_core_fabric_configfs.c
> @@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
>         struct se_lun *lun = container_of(to_config_group(item),
>                                           struct se_lun, lun_group);
> 
> -       kfree_rcu(lun, rcu_head);
> +       call_rcu(&lun->rcu_head, target_tpg_free_lun);

se_tpg->tpg_virt_lun0 is also a LUN object that will have lun_stats
allocated, but tpg_virt_lun0 is freed in other places, not through this
release path. Those places have to free its lun_stats as well, otherwise
it is leaked.

>  }
> 
>  static struct configfs_item_operations target_fabric_port_item_ops = {
> diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
> index 408be26d2e9b..dfe529e59a29 100644
> --- a/drivers/target/target_core_internal.h
> +++ b/drivers/target/target_core_internal.h
> @@ -125,6 +125,7 @@ void        core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
>                                   struct se_lun *);
>  void   core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
>  struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
> +void   target_tpg_free_lun(struct rcu_head *head);
>  int    core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
>                 bool, struct se_device *);
>  void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
> diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
> index 6bdf2d8bd694..88f8be197a68 100644
> --- a/drivers/target/target_core_stat.c
> +++ b/drivers/target/target_core_stat.c
> @@ -627,14 +627,24 @@ static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
>                 char *page)
>  {
>         struct se_lun *lun = to_stat_tgt_port(item);
> +       struct scsi_port_stats *stats;
>         struct se_device *dev;
>         ssize_t ret = -ENODEV;
> +       unsigned int cpu;
> +       u32 pdus = 0;
> 
>         rcu_read_lock();
>         dev = rcu_dereference(lun->lun_se_dev);
> -       if (dev)
> -               ret = snprintf(page, PAGE_SIZE, "%lu\n",
> -                              atomic_long_read(&lun->lun_stats.cmd_pdus));
> +       if (!dev)
> +               goto unlock;
> +
> +       for_each_possible_cpu(cpu) {
> +               stats = per_cpu_ptr(lun->lun_stats, cpu);
> +               pdus += stats->cmd_pdus;
> +       }
> +
> +       ret = snprintf(page, PAGE_SIZE, "%u\n", pdus);
> +unlock:
>         rcu_read_unlock();
>         return ret;
>  }
> @@ -643,14 +653,24 @@ static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
>                 char *page)
>  {
>         struct se_lun *lun = to_stat_tgt_port(item);
> +       struct scsi_port_stats *stats;
>         struct se_device *dev;
>         ssize_t ret = -ENODEV;
> +       unsigned int cpu;
> +       u32 octets = 0;
> 
>         rcu_read_lock();
>         dev = rcu_dereference(lun->lun_se_dev);
> -       if (dev)
> -               ret = snprintf(page, PAGE_SIZE, "%u\n",
> -                       (u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
> +       if (!dev)
> +               goto unlock;
> +
> +       for_each_possible_cpu(cpu) {
> +               stats = per_cpu_ptr(lun->lun_stats, cpu);
> +               octets += stats->rx_data_octets;
> +       }
> +
> +       ret = snprintf(page, PAGE_SIZE, "%u\n", octets);
> +unlock:
>         rcu_read_unlock();
>         return ret;
>  }
> @@ -659,14 +679,24 @@ static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
>                 char *page)
>  {
>         struct se_lun *lun = to_stat_tgt_port(item);
> +       struct scsi_port_stats *stats;
>         struct se_device *dev;
>         ssize_t ret = -ENODEV;
> +       unsigned int cpu;
> +       u32 octets = 0;
> 
>         rcu_read_lock();
>         dev = rcu_dereference(lun->lun_se_dev);
> -       if (dev)
> -               ret = snprintf(page, PAGE_SIZE, "%u\n",
> -                               (u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
> +       if (!dev)
> +               goto unlock;
> +
> +       for_each_possible_cpu(cpu) {
> +               stats = per_cpu_ptr(lun->lun_stats, cpu);
> +               octets += stats->tx_data_octets;
> +       }
> +
> +       ret = snprintf(page, PAGE_SIZE, "%u\n", octets);
> +unlock:
>         rcu_read_unlock();
>         return ret;
>  }

Maybe this is a good time to refactor this file using macros?
Something like:

/*
 * Sum one counter of a LUN's per-CPU port statistics over every possible CPU.
 *
 * @lun:    LUN whose per-CPU lun_stats area is read.
 * @offset: byte offset of the counter inside struct scsi_port_stats.
 *
 * Returns the sum of the per-CPU values. The value at @offset is read as a
 * u64, so this helper is only valid for counters that are declared as u64 in
 * struct scsi_port_stats.
 */
static u64 _target_stat_get_u64_luns_stats(struct se_lun *lun, u64 offset)
{
	u64 res = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		const struct scsi_port_stats *pcpu_stats;

		pcpu_stats = per_cpu_ptr(lun->lun_stats, cpu);
		/* Step to the field bytewise without dropping the const. */
		res += *(const u64 *)((const u8 *)pcpu_stats + offset);
	}

	return res;
}
/* Front end that turns a struct scsi_port_stats field name into its offset. */
#define target_stat_get_u64_luns_stats(LUN, FIELD)			\
	_target_stat_get_u64_luns_stats(LUN,				\
					offsetof(struct scsi_port_stats, FIELD))

/*
 * Generate target_stat_tgt_port_<STAT>_show().
 *
 * @STAT:   attribute name, pasted into the generated function name.
 * @VAR:    struct scsi_port_stats field whose per-CPU values are summed.
 * @LAMBDA: macro applied to the summed value before it is printed.
 *
 * The generated handler returns -ENODEV when lun->lun_se_dev is NULL under
 * rcu_read_lock(); otherwise it returns the snprintf() result for the
 * converted total written into @page.
 */
#define _SYSFS_TGT_PORT_STATS_U64_SHOW(STAT, VAR, LAMBDA)		\
static ssize_t target_stat_tgt_port_##STAT##_show(			\
					struct config_item *item,	\
					char *page)			\
{									\
	struct se_lun *lun = to_stat_tgt_port(item);			\
	ssize_t ret = -ENODEV;						\
	u64 total;							\
									\
	rcu_read_lock();						\
	if (!rcu_dereference(lun->lun_se_dev))				\
		goto unlock;						\
									\
	total = target_stat_get_u64_luns_stats(lun, VAR);		\
	total = LAMBDA(total);						\
	ret = snprintf(page, PAGE_SIZE, "%llu\n", total);		\
unlock:									\
	rcu_read_unlock();						\
	return ret;							\
}

/* Conversions applied to a summed counter before it is printed. */
#define LAMBDA_NOOP(A) (A)
/*
 * Shift right by 20 bits (bytes to MiB). The argument is parenthesized so
 * that a compound expression is shifted as a whole.
 */
#define LAMBDA_MBYTES(A) ((A) >> 20)
/* Show handler that prints the raw summed counter. */
#define SYSFS_TGT_PORT_STATS_U64_SHOW(STAT, VAR)			\
	_SYSFS_TGT_PORT_STATS_U64_SHOW(STAT, VAR, LAMBDA_NOOP)
/* Show handler that prints the summed counter shifted right by 20 bits. */
#define SYSFS_TGT_PORT_STATS_U64_SHOW_MBYTES(STAT, VAR)			\
	_SYSFS_TGT_PORT_STATS_U64_SHOW(STAT, VAR, LAMBDA_MBYTES)


/*
 * Instantiate the show handlers. The first argument is pasted into the
 * function name, so it must be unique per instance.
 */
SYSFS_TGT_PORT_STATS_U64_SHOW(in_cmds, cmd_pdus)
/*
 * NOTE(review): presumably the hs_in_cmds attribute is meant here; using
 * "in_cmds" a second time would define target_stat_tgt_port_in_cmds_show()
 * twice. hs_cmd_pdus is not a member of struct scsi_port_stats as declared
 * in this patch and would have to be added -- confirm.
 */
SYSFS_TGT_PORT_STATS_U64_SHOW(hs_in_cmds, hs_cmd_pdus)
SYSFS_TGT_PORT_STATS_U64_SHOW_MBYTES(write_mbytes, rx_data_octets)
SYSFS_TGT_PORT_STATS_U64_SHOW_MBYTES(read_mbytes, tx_data_octets)


> diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
> index c0e429e5ef31..caa95aa6f502 100644
> --- a/drivers/target/target_core_tpg.c
> +++ b/drivers/target/target_core_tpg.c
> @@ -609,12 +609,21 @@ struct se_lun *core_tpg_alloc_lun(
>         u64 unpacked_lun)
>  {
>         struct se_lun *lun;
> +       int ret;
> 
>         lun = kzalloc(sizeof(*lun), GFP_KERNEL);
>         if (!lun) {
>                 pr_err("Unable to allocate se_lun memory\n");
>                 return ERR_PTR(-ENOMEM);
>         }
> +
> +       lun->lun_stats = alloc_percpu(struct scsi_port_stats);
> +       if (!lun->lun_stats) {
> +               pr_err("Unable to allocate se_lun stats memory\n");
> +               ret = -ENOMEM;
> +               goto free_lun;
> +       }
> +

There is also dev->xcopy_lun, which is used as cmd->se_lun. You have to
allocate and free lun_stats for it as well, to avoid a NULL pointer
dereference on an XCOPY command.

>         lun->unpacked_lun = unpacked_lun;
>         atomic_set(&lun->lun_acl_count, 0);
>         init_completion(&lun->lun_shutdown_comp);
> @@ -628,6 +637,18 @@ struct se_lun *core_tpg_alloc_lun(
>         lun->lun_tpg = tpg;
> 
>         return lun;
> +
> +free_lun:
> +       kfree(lun);
> +       return ERR_PTR(-ENOMEM);
> +}
> +
> +void target_tpg_free_lun(struct rcu_head *head)
> +{
> +       struct se_lun *lun = container_of(head, struct se_lun, rcu_head);
> +
> +       free_percpu(lun->lun_stats);
> +       kfree(lun);
>  }
> 
>  int core_tpg_add_lun(
> diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
> index 0a76bdfe5528..4ec66ca6c0ca 100644
> --- a/drivers/target/target_core_transport.c
> +++ b/drivers/target/target_core_transport.c
> @@ -1571,7 +1571,7 @@ target_cmd_parse_cdb(struct se_cmd *cmd)
>                 return ret;
> 
>         cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
> -       atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
> +       this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus);
>         return 0;
>  }
>  EXPORT_SYMBOL(target_cmd_parse_cdb);
> @@ -2597,8 +2597,8 @@ static void target_complete_ok_work(struct work_struct *work)
>                     !(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL))
>                         goto queue_status;
> 
> -               atomic_long_add(cmd->data_length,
> -                               &cmd->se_lun->lun_stats.tx_data_octets);
> +               this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
> +                            cmd->data_length);
>                 /*
>                  * Perform READ_STRIP of PI using software emulation when
>                  * backend had PI enabled, if the transport will not be
> @@ -2621,14 +2621,14 @@ static void target_complete_ok_work(struct work_struct *work)
>                         goto queue_full;
>                 break;
>         case DMA_TO_DEVICE:
> -               atomic_long_add(cmd->data_length,
> -                               &cmd->se_lun->lun_stats.rx_data_octets);
> +               this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets,
> +                            cmd->data_length);
>                 /*
>                  * Check if we need to send READ payload for BIDI-COMMAND
>                  */
>                 if (cmd->se_cmd_flags & SCF_BIDI) {
> -                       atomic_long_add(cmd->data_length,
> -                                       &cmd->se_lun->lun_stats.tx_data_octets);
> +                       this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
> +                                    cmd->data_length);
>                         ret = cmd->se_tfo->queue_data_in(cmd);
>                         if (ret)
>                                 goto queue_full;
> diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
> index c4d9116904aa..e73fb224625d 100644
> --- a/include/target/target_core_base.h
> +++ b/include/target/target_core_base.h
> @@ -744,9 +744,9 @@ struct se_port_stat_grps {
>  };
> 
>  struct scsi_port_stats {
> -       atomic_long_t   cmd_pdus;
> -       atomic_long_t   tx_data_octets;
> -       atomic_long_t   rx_data_octets;
> +       u32                     cmd_pdus;
> +       u32                     tx_data_octets;
> +       u32                     rx_data_octets;

I believe there is no reason to use 32-bit counters these days.
[SPC-5] 7.3.9.2 General Access Statistics and Performance log parameter
has 64-bit counters.
RFC 4455 (SCSI MIB) states that 64-bit counters are mandatory for systems
with speeds >= 4 Gb/s.

This matters especially for t(r)x_data_octets, which, when reported in
megabytes from a 32-bit counter, has only 12 meaningful bits.

>  };
> 
>  struct se_lun {
> @@ -773,7 +773,7 @@ struct se_lun {
>         spinlock_t              lun_tg_pt_gp_lock;
> 
>         struct se_portal_group  *lun_tpg;
> -       struct scsi_port_stats  lun_stats;
> +       struct scsi_port_stats  __percpu *lun_stats;
>         struct config_group     lun_group;
>         struct se_port_stat_grps port_stat_grps;
>         struct completion       lun_shutdown_comp;
> --
> 2.47.1
> 
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-07-28 15:14 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-24  0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie
2025-07-24 14:26 ` kernel test robot
2025-07-24 15:06 ` Bart Van Assche
2025-07-28 15:08 ` Dmitry Bogdanov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox