* [PATCH v3 1/3] scsi: target: Fix lun/device R/W and total command stats
2025-09-17 22:12 [PATCH v3 0/3] target: RW/num_cmds stats improvements Mike Christie
@ 2025-09-17 22:12 ` Mike Christie
2025-09-17 22:12 ` [PATCH v3 2/3] scsi: target: Create and use macro helpers for per CPU stats Mike Christie
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Mike Christie @ 2025-09-17 22:12 UTC (permalink / raw)
To: mlombard, martin.petersen, d.bogdanov, bvanassche, linux-scsi,
target-devel
Cc: Mike Christie
In commit 9cf2317b795d ("scsi: target: Move I/O path stats to per CPU")
I saw we sometimes used %u and also misread the spec, so I thought all
the stats were supposed to be 32 bits only. However, for the majority of
the cases we currently support, the spec specifies 64-bit stats. This
patch converts the stats changed in the commit above to u64.
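As a rough userspace illustration (not part of this patch) of what the
u32 counters lose: accumulate 5 GiB worth of reads into a u32 and a u64
byte counter and apply the same >> 20 megabyte conversion the show
functions use, and the u32 copy has already wrapped:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t io = 5ULL << 30;		/* 5 GiB of reads */
	uint32_t bytes32 = (uint32_t)io;	/* wraps modulo 2^32 */
	uint64_t bytes64 = io;

	/* same conversion as the scsiLu*MegaBytes show functions */
	printf("u32: %u MB, u64: %llu MB\n",
	       bytes32 >> 20, (unsigned long long)(bytes64 >> 20));
	/* prints: u32: 1024 MB, u64: 5120 MB */
	return 0;
}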
Fixes: 9cf2317b795d ("scsi: target: Move I/O path stats to per CPU")
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
drivers/target/target_core_stat.c | 24 ++++++++++++------------
include/target/target_core_base.h | 12 ++++++------
2 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 6bdf2d8bd694..4fdc307ea38b 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -282,7 +282,7 @@ static ssize_t target_stat_lu_num_cmds_show(struct config_item *item,
struct se_device *dev = to_stat_lu_dev(item);
struct se_dev_io_stats *stats;
unsigned int cpu;
- u32 cmds = 0;
+ u64 cmds = 0;
for_each_possible_cpu(cpu) {
stats = per_cpu_ptr(dev->stats, cpu);
@@ -290,7 +290,7 @@ static ssize_t target_stat_lu_num_cmds_show(struct config_item *item,
}
/* scsiLuNumCommands */
- return snprintf(page, PAGE_SIZE, "%u\n", cmds);
+ return snprintf(page, PAGE_SIZE, "%llu\n", cmds);
}
static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
@@ -299,7 +299,7 @@ static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
struct se_device *dev = to_stat_lu_dev(item);
struct se_dev_io_stats *stats;
unsigned int cpu;
- u32 bytes = 0;
+ u64 bytes = 0;
for_each_possible_cpu(cpu) {
stats = per_cpu_ptr(dev->stats, cpu);
@@ -307,7 +307,7 @@ static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
}
/* scsiLuReadMegaBytes */
- return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
+ return snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
}
static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
@@ -316,7 +316,7 @@ static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
struct se_device *dev = to_stat_lu_dev(item);
struct se_dev_io_stats *stats;
unsigned int cpu;
- u32 bytes = 0;
+ u64 bytes = 0;
for_each_possible_cpu(cpu) {
stats = per_cpu_ptr(dev->stats, cpu);
@@ -324,7 +324,7 @@ static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
}
/* scsiLuWrittenMegaBytes */
- return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
+ return snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
}
static ssize_t target_stat_lu_resets_show(struct config_item *item, char *page)
@@ -1044,7 +1044,7 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
struct se_dev_entry *deve;
unsigned int cpu;
ssize_t ret;
- u32 cmds = 0;
+ u64 cmds = 0;
rcu_read_lock();
deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1059,7 +1059,7 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
}
/* scsiAuthIntrOutCommands */
- ret = snprintf(page, PAGE_SIZE, "%u\n", cmds);
+ ret = snprintf(page, PAGE_SIZE, "%llu\n", cmds);
rcu_read_unlock();
return ret;
}
@@ -1073,7 +1073,7 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
struct se_dev_entry *deve;
unsigned int cpu;
ssize_t ret;
- u32 bytes = 0;
+ u64 bytes = 0;
rcu_read_lock();
deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1088,7 +1088,7 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
}
/* scsiAuthIntrReadMegaBytes */
- ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
+ ret = snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
rcu_read_unlock();
return ret;
}
@@ -1102,7 +1102,7 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
struct se_dev_entry *deve;
unsigned int cpu;
ssize_t ret;
- u32 bytes = 0;
+ u64 bytes = 0;
rcu_read_lock();
deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1117,7 +1117,7 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
}
/* scsiAuthIntrWrittenMegaBytes */
- ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
+ ret = snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
rcu_read_unlock();
return ret;
}
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index c4d9116904aa..27e1f9d5f0c6 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -671,9 +671,9 @@ struct se_lun_acl {
};
struct se_dev_entry_io_stats {
- u32 total_cmds;
- u32 read_bytes;
- u32 write_bytes;
+ u64 total_cmds;
+ u64 read_bytes;
+ u64 write_bytes;
};
struct se_dev_entry {
@@ -806,9 +806,9 @@ struct se_device_queue {
};
struct se_dev_io_stats {
- u32 total_cmds;
- u32 read_bytes;
- u32 write_bytes;
+ u64 total_cmds;
+ u64 read_bytes;
+ u64 write_bytes;
};
struct se_device {
--
2.47.1
* [PATCH v3 2/3] scsi: target: Create and use macro helpers for per CPU stats
2025-09-17 22:12 [PATCH v3 0/3] target: RW/num_cmds stats improvements Mike Christie
2025-09-17 22:12 ` [PATCH v3 1/3] scsi: target: Fix lun/device R/W and total command stats Mike Christie
@ 2025-09-17 22:12 ` Mike Christie
2025-09-17 22:12 ` [PATCH v3 3/3] scsi: target: Move LUN stats to per CPU Mike Christie
2025-11-05 4:02 ` [PATCH v3 0/3] target: RW/num_cmds stats improvements Martin K. Petersen
3 siblings, 0 replies; 9+ messages in thread
From: Mike Christie @ 2025-09-17 22:12 UTC (permalink / raw)
To: mlombard, martin.petersen, d.bogdanov, bvanassche, linux-scsi,
target-devel
Cc: Mike Christie
This creates some macros to reduce code duplication when we handle per
CPU stats. It then converts the existing LUN and auth cases.
Note: This is similar to percpu_counters, but they only support s64
since they are also used for non-stat counters where you need to
handle/prevent rollover more gracefully. Our use is just for stats where
the spec defines u64 use, and we already have some files exporting u64
values, so it's not possible to directly use percpu_counters.
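For reference, a single lu_show_per_cpu_stat(lu_num_cmds, total_cmds, 0)
use below expands to roughly the following (hand-expanded here as a
sketch, not the literal preprocessor output):

static ssize_t
per_cpu_stat_lu_num_cmds_snprintf(struct se_dev_io_stats __percpu *per_cpu_stats,
				  char *page)
{
	struct se_dev_io_stats *stats;
	unsigned int cpu;
	u64 sum = 0;

	for_each_possible_cpu(cpu) {
		stats = per_cpu_ptr(per_cpu_stats, cpu);
		sum += stats->total_cmds;	/* "field" argument */
	}

	return snprintf(page, PAGE_SIZE, "%llu\n", sum >> 0);	/* "shift" */
}

static ssize_t
target_stat_lu_num_cmds_show(struct config_item *item, char *page)
{
	struct se_device *dev = to_stat_lu_dev(item);

	return per_cpu_stat_lu_num_cmds_snprintf(dev->stats, page);
}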
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
drivers/target/target_core_stat.c | 197 +++++++++---------------------
1 file changed, 61 insertions(+), 136 deletions(-)
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 4fdc307ea38b..e29d43dacaf7 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -276,56 +276,39 @@ static ssize_t target_stat_lu_state_bit_show(struct config_item *item,
return snprintf(page, PAGE_SIZE, "exposed\n");
}
-static ssize_t target_stat_lu_num_cmds_show(struct config_item *item,
- char *page)
-{
- struct se_device *dev = to_stat_lu_dev(item);
- struct se_dev_io_stats *stats;
- unsigned int cpu;
- u64 cmds = 0;
-
- for_each_possible_cpu(cpu) {
- stats = per_cpu_ptr(dev->stats, cpu);
- cmds += stats->total_cmds;
- }
-
- /* scsiLuNumCommands */
- return snprintf(page, PAGE_SIZE, "%llu\n", cmds);
-}
-
-static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
- char *page)
-{
- struct se_device *dev = to_stat_lu_dev(item);
- struct se_dev_io_stats *stats;
- unsigned int cpu;
- u64 bytes = 0;
-
- for_each_possible_cpu(cpu) {
- stats = per_cpu_ptr(dev->stats, cpu);
- bytes += stats->read_bytes;
- }
-
- /* scsiLuReadMegaBytes */
- return snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
-}
-
-static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
- char *page)
-{
- struct se_device *dev = to_stat_lu_dev(item);
- struct se_dev_io_stats *stats;
- unsigned int cpu;
- u64 bytes = 0;
-
- for_each_possible_cpu(cpu) {
- stats = per_cpu_ptr(dev->stats, cpu);
- bytes += stats->write_bytes;
- }
-
- /* scsiLuWrittenMegaBytes */
- return snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
-}
+#define per_cpu_stat_snprintf(stats_struct, prefix, field, shift) \
+static ssize_t \
+per_cpu_stat_##prefix##_snprintf(struct stats_struct __percpu *per_cpu_stats, \
+ char *page) \
+{ \
+ struct stats_struct *stats; \
+ unsigned int cpu; \
+ u64 sum = 0; \
+ \
+ for_each_possible_cpu(cpu) { \
+ stats = per_cpu_ptr(per_cpu_stats, cpu); \
+ sum += stats->field; \
+ } \
+ \
+ return snprintf(page, PAGE_SIZE, "%llu\n", sum >> shift); \
+}
+
+#define lu_show_per_cpu_stat(prefix, field, shift) \
+per_cpu_stat_snprintf(se_dev_io_stats, prefix, field, shift); \
+static ssize_t \
+target_stat_##prefix##_show(struct config_item *item, char *page) \
+{ \
+ struct se_device *dev = to_stat_lu_dev(item); \
+ \
+ return per_cpu_stat_##prefix##_snprintf(dev->stats, page); \
+} \
+
+/* scsiLuNumCommands */
+lu_show_per_cpu_stat(lu_num_cmds, total_cmds, 0);
+/* scsiLuReadMegaBytes */
+lu_show_per_cpu_stat(lu_read_mbytes, read_bytes, 20);
+/* scsiLuWrittenMegaBytes */
+lu_show_per_cpu_stat(lu_write_mbytes, write_bytes, 20);
static ssize_t target_stat_lu_resets_show(struct config_item *item, char *page)
{
@@ -1035,92 +1018,34 @@ static ssize_t target_stat_auth_att_count_show(struct config_item *item,
return ret;
}
-static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
- char *page)
-{
- struct se_lun_acl *lacl = auth_to_lacl(item);
- struct se_node_acl *nacl = lacl->se_lun_nacl;
- struct se_dev_entry_io_stats *stats;
- struct se_dev_entry *deve;
- unsigned int cpu;
- ssize_t ret;
- u64 cmds = 0;
-
- rcu_read_lock();
- deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
- if (!deve) {
- rcu_read_unlock();
- return -ENODEV;
- }
-
- for_each_possible_cpu(cpu) {
- stats = per_cpu_ptr(deve->stats, cpu);
- cmds += stats->total_cmds;
- }
-
- /* scsiAuthIntrOutCommands */
- ret = snprintf(page, PAGE_SIZE, "%llu\n", cmds);
- rcu_read_unlock();
- return ret;
-}
-
-static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
- char *page)
-{
- struct se_lun_acl *lacl = auth_to_lacl(item);
- struct se_node_acl *nacl = lacl->se_lun_nacl;
- struct se_dev_entry_io_stats *stats;
- struct se_dev_entry *deve;
- unsigned int cpu;
- ssize_t ret;
- u64 bytes = 0;
-
- rcu_read_lock();
- deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
- if (!deve) {
- rcu_read_unlock();
- return -ENODEV;
- }
-
- for_each_possible_cpu(cpu) {
- stats = per_cpu_ptr(deve->stats, cpu);
- bytes += stats->read_bytes;
- }
-
- /* scsiAuthIntrReadMegaBytes */
- ret = snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
- rcu_read_unlock();
- return ret;
-}
-
-static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
- char *page)
-{
- struct se_lun_acl *lacl = auth_to_lacl(item);
- struct se_node_acl *nacl = lacl->se_lun_nacl;
- struct se_dev_entry_io_stats *stats;
- struct se_dev_entry *deve;
- unsigned int cpu;
- ssize_t ret;
- u64 bytes = 0;
-
- rcu_read_lock();
- deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
- if (!deve) {
- rcu_read_unlock();
- return -ENODEV;
- }
-
- for_each_possible_cpu(cpu) {
- stats = per_cpu_ptr(deve->stats, cpu);
- bytes += stats->write_bytes;
- }
-
- /* scsiAuthIntrWrittenMegaBytes */
- ret = snprintf(page, PAGE_SIZE, "%llu\n", bytes >> 20);
- rcu_read_unlock();
- return ret;
-}
+#define auth_show_per_cpu_stat(prefix, field, shift) \
+per_cpu_stat_snprintf(se_dev_entry_io_stats, prefix, field, shift); \
+static ssize_t \
+target_stat_##prefix##_show(struct config_item *item, char *page) \
+{ \
+ struct se_lun_acl *lacl = auth_to_lacl(item); \
+ struct se_node_acl *nacl = lacl->se_lun_nacl; \
+ struct se_dev_entry *deve; \
+ int ret; \
+ \
+ rcu_read_lock(); \
+ deve = target_nacl_find_deve(nacl, lacl->mapped_lun); \
+ if (!deve) { \
+ rcu_read_unlock(); \
+ return -ENODEV; \
+ } \
+ \
+ ret = per_cpu_stat_##prefix##_snprintf(deve->stats, page); \
+ rcu_read_unlock(); \
+ return ret; \
+}
+
+/* scsiAuthIntrOutCommands */
+auth_show_per_cpu_stat(auth_num_cmds, total_cmds, 0);
+/* scsiAuthIntrReadMegaBytes */
+auth_show_per_cpu_stat(auth_read_mbytes, read_bytes, 20);
+/* scsiAuthIntrWrittenMegaBytes */
+auth_show_per_cpu_stat(auth_write_mbytes, write_bytes, 20);
static ssize_t target_stat_auth_hs_num_cmds_show(struct config_item *item,
char *page)
--
2.47.1
* [PATCH v3 3/3] scsi: target: Move LUN stats to per CPU
2025-09-17 22:12 [PATCH v3 0/3] target: RW/num_cmds stats improvements Mike Christie
2025-09-17 22:12 ` [PATCH v3 1/3] scsi: target: Fix lun/device R/W and total command stats Mike Christie
2025-09-17 22:12 ` [PATCH v3 2/3] scsi: target: Create and use macro helpers for per CPU stats Mike Christie
@ 2025-09-17 22:12 ` Mike Christie
2025-09-18 6:31 ` Hannes Reinecke
2025-11-05 4:02 ` [PATCH v3 0/3] target: RW/num_cmds stats improvements Martin K. Petersen
3 siblings, 1 reply; 9+ messages in thread
From: Mike Christie @ 2025-09-17 22:12 UTC (permalink / raw)
To: mlombard, martin.petersen, d.bogdanov, bvanassche, linux-scsi,
target-devel
Cc: Mike Christie
The atomic use in the main I/O path is causing perf issues when using
higher performance backend devices and multiple queues (more than
10 when using vhost-scsi) like with this fio workload:
[global]
bs=4K
iodepth=128
direct=1
ioengine=libaio
group_reporting
time_based
runtime=120
name=standard-iops
rw=randread
numjobs=16
cpus_allowed=0-15
To fix this issue, this moves the LUN stats to per CPU.
Note: I forgot to include this patch with the delayed/ordered per CPU
tracking and per device/device entry per CPU stats. With this patch you
get the full 33% improvement when using fast backends, multiple queues
and multiple IO submitters.
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
drivers/target/target_core_device.c | 1 +
drivers/target/target_core_fabric_configfs.c | 2 +-
drivers/target/target_core_internal.h | 1 +
drivers/target/target_core_stat.c | 67 +++++++-------------
drivers/target/target_core_tpg.c | 23 ++++++-
drivers/target/target_core_transport.c | 22 +++++--
include/target/target_core_base.h | 8 +--
7 files changed, 65 insertions(+), 59 deletions(-)
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 7bb711b24c0d..2d4a7c0c69ce 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -814,6 +814,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
dev->dev_attrib.submit_type = TARGET_FABRIC_DEFAULT_SUBMIT;
+ /* Skip allocating lun_stats since we can't export them. */
xcopy_lun = &dev->xcopy_lun;
rcu_assign_pointer(xcopy_lun->lun_se_dev, dev);
init_completion(&xcopy_lun->lun_shutdown_comp);
diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
index 7156a4dc1ca7..13159928e365 100644
--- a/drivers/target/target_core_fabric_configfs.c
+++ b/drivers/target/target_core_fabric_configfs.c
@@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
struct se_lun *lun = container_of(to_config_group(item),
struct se_lun, lun_group);
- kfree_rcu(lun, rcu_head);
+ call_rcu(&lun->rcu_head, target_tpg_free_lun);
}
static struct configfs_item_operations target_fabric_port_item_ops = {
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 20aab1f50565..763e6d26e187 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -125,6 +125,7 @@ void core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
struct se_lun *);
void core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
+void target_tpg_free_lun(struct rcu_head *head);
int core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
bool, struct se_device *);
void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index e29d43dacaf7..083205052be2 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -606,53 +606,30 @@ static ssize_t target_stat_tgt_port_port_index_show(struct config_item *item,
return ret;
}
-static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
- char *page)
-{
- struct se_lun *lun = to_stat_tgt_port(item);
- struct se_device *dev;
- ssize_t ret = -ENODEV;
-
- rcu_read_lock();
- dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%lu\n",
- atomic_long_read(&lun->lun_stats.cmd_pdus));
- rcu_read_unlock();
- return ret;
-}
-
-static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
- char *page)
-{
- struct se_lun *lun = to_stat_tgt_port(item);
- struct se_device *dev;
- ssize_t ret = -ENODEV;
-
- rcu_read_lock();
- dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%u\n",
- (u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
- rcu_read_unlock();
- return ret;
+#define tgt_port_show_per_cpu_stat(prefix, field, shift) \
+per_cpu_stat_snprintf(scsi_port_stats, prefix, field, shift); \
+static ssize_t \
+target_stat_##prefix##_show(struct config_item *item, char *page) \
+{ \
+ struct se_lun *lun = to_stat_tgt_port(item); \
+ struct se_device *dev; \
+ int ret; \
+ \
+ rcu_read_lock(); \
+ dev = rcu_dereference(lun->lun_se_dev); \
+ if (!dev) { \
+ rcu_read_unlock(); \
+ return -ENODEV; \
+ } \
+ \
+ ret = per_cpu_stat_##prefix##_snprintf(lun->lun_stats, page); \
+ rcu_read_unlock(); \
+ return ret; \
}
-static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
- char *page)
-{
- struct se_lun *lun = to_stat_tgt_port(item);
- struct se_device *dev;
- ssize_t ret = -ENODEV;
-
- rcu_read_lock();
- dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%u\n",
- (u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
- rcu_read_unlock();
- return ret;
-}
+tgt_port_show_per_cpu_stat(tgt_port_in_cmds, cmd_pdus, 0);
+tgt_port_show_per_cpu_stat(tgt_port_write_mbytes, rx_data_octets, 20);
+tgt_port_show_per_cpu_stat(tgt_port_read_mbytes, tx_data_octets, 20);
static ssize_t target_stat_tgt_port_hs_in_cmds_show(struct config_item *item,
char *page)
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index c0e429e5ef31..8b5ad50baa43 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -548,7 +548,7 @@ int core_tpg_register(
ret = core_tpg_add_lun(se_tpg, se_tpg->tpg_virt_lun0,
true, g_lun0_dev);
if (ret < 0) {
- kfree(se_tpg->tpg_virt_lun0);
+ target_tpg_free_lun(&se_tpg->tpg_virt_lun0->rcu_head);
return ret;
}
}
@@ -595,7 +595,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg)
if (se_tpg->proto_id >= 0) {
core_tpg_remove_lun(se_tpg, se_tpg->tpg_virt_lun0);
- kfree_rcu(se_tpg->tpg_virt_lun0, rcu_head);
+ call_rcu(&se_tpg->tpg_virt_lun0->rcu_head, target_tpg_free_lun);
}
target_tpg_deregister_rtpi(se_tpg);
@@ -615,6 +615,13 @@ struct se_lun *core_tpg_alloc_lun(
pr_err("Unable to allocate se_lun memory\n");
return ERR_PTR(-ENOMEM);
}
+
+ lun->lun_stats = alloc_percpu(struct scsi_port_stats);
+ if (!lun->lun_stats) {
+ pr_err("Unable to allocate se_lun stats memory\n");
+ goto free_lun;
+ }
+
lun->unpacked_lun = unpacked_lun;
atomic_set(&lun->lun_acl_count, 0);
init_completion(&lun->lun_shutdown_comp);
@@ -628,6 +635,18 @@ struct se_lun *core_tpg_alloc_lun(
lun->lun_tpg = tpg;
return lun;
+
+free_lun:
+ kfree(lun);
+ return ERR_PTR(-ENOMEM);
+}
+
+void target_tpg_free_lun(struct rcu_head *head)
+{
+ struct se_lun *lun = container_of(head, struct se_lun, rcu_head);
+
+ free_percpu(lun->lun_stats);
+ kfree(lun);
}
int core_tpg_add_lun(
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 0a76bdfe5528..fca9b44288bc 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1571,7 +1571,12 @@ target_cmd_parse_cdb(struct se_cmd *cmd)
return ret;
cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
- atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
+ /*
+ * If this is the xcopy_lun then we won't have lun_stats since we
+ * can't export them.
+ */
+ if (cmd->se_lun->lun_stats)
+ this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus);
return 0;
}
EXPORT_SYMBOL(target_cmd_parse_cdb);
@@ -2597,8 +2602,9 @@ static void target_complete_ok_work(struct work_struct *work)
!(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL))
goto queue_status;
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.tx_data_octets);
+ if (cmd->se_lun->lun_stats)
+ this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+ cmd->data_length);
/*
* Perform READ_STRIP of PI using software emulation when
* backend had PI enabled, if the transport will not be
@@ -2621,14 +2627,16 @@ static void target_complete_ok_work(struct work_struct *work)
goto queue_full;
break;
case DMA_TO_DEVICE:
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.rx_data_octets);
+ if (cmd->se_lun->lun_stats)
+ this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets,
+ cmd->data_length);
/*
* Check if we need to send READ payload for BIDI-COMMAND
*/
if (cmd->se_cmd_flags & SCF_BIDI) {
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.tx_data_octets);
+ if (cmd->se_lun->lun_stats)
+ this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+ cmd->data_length);
ret = cmd->se_tfo->queue_data_in(cmd);
if (ret)
goto queue_full;
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 27e1f9d5f0c6..372da2eadf54 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -744,9 +744,9 @@ struct se_port_stat_grps {
};
struct scsi_port_stats {
- atomic_long_t cmd_pdus;
- atomic_long_t tx_data_octets;
- atomic_long_t rx_data_octets;
+ u64 cmd_pdus;
+ u64 tx_data_octets;
+ u64 rx_data_octets;
};
struct se_lun {
@@ -773,7 +773,7 @@ struct se_lun {
spinlock_t lun_tg_pt_gp_lock;
struct se_portal_group *lun_tpg;
- struct scsi_port_stats lun_stats;
+ struct scsi_port_stats __percpu *lun_stats;
struct config_group lun_group;
struct se_port_stat_grps port_stat_grps;
struct completion lun_shutdown_comp;
--
2.47.1
* Re: [PATCH v3 3/3] scsi: target: Move LUN stats to per CPU
2025-09-17 22:12 ` [PATCH v3 3/3] scsi: target: Move LUN stats to per CPU Mike Christie
@ 2025-09-18 6:31 ` Hannes Reinecke
2025-09-18 14:50 ` michael.christie
0 siblings, 1 reply; 9+ messages in thread
From: Hannes Reinecke @ 2025-09-18 6:31 UTC (permalink / raw)
To: Mike Christie, mlombard, martin.petersen, d.bogdanov, bvanassche,
linux-scsi, target-devel
On 9/18/25 00:12, Mike Christie wrote:
> The atomic use in the main I/O path is causing perf issues when using
> higher performance backend devices and multiple queues (more than
> 10 when using vhost-scsi) like with this fio workload:
>
> [global]
> bs=4K
> iodepth=128
> direct=1
> ioengine=libaio
> group_reporting
> time_based
> runtime=120
> name=standard-iops
> rw=randread
> numjobs=16
> cpus_allowed=0-15
>
> To fix this issue, this moves the LUN stats to per CPU.
>
> Note: I forgot to include this patch with the delayed/ordered per CPU
> tracking and per device/device entry per CPU stats. With this patch you
> get the full 33% improvements when using fast backends, multiple queues
> and multiple IO submiters.
>
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
> ---
> drivers/target/target_core_device.c | 1 +
> drivers/target/target_core_fabric_configfs.c | 2 +-
> drivers/target/target_core_internal.h | 1 +
> drivers/target/target_core_stat.c | 67 +++++++-------------
> drivers/target/target_core_tpg.c | 23 ++++++-
> drivers/target/target_core_transport.c | 22 +++++--
> include/target/target_core_base.h | 8 +--
> 7 files changed, 65 insertions(+), 59 deletions(-)
>
Ho-hum.
That only works if both submission and completion paths do run on the
_same_ cpu. Are we sure that they do?
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.com +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
* Re: [PATCH v3 3/3] scsi: target: Move LUN stats to per CPU
2025-09-18 6:31 ` Hannes Reinecke
@ 2025-09-18 14:50 ` michael.christie
2025-10-16 6:21 ` Hannes Reinecke
0 siblings, 1 reply; 9+ messages in thread
From: michael.christie @ 2025-09-18 14:50 UTC (permalink / raw)
To: Hannes Reinecke, mlombard, martin.petersen, d.bogdanov,
bvanassche, linux-scsi, target-devel
On 9/18/25 1:31 AM, Hannes Reinecke wrote:
> On 9/18/25 00:12, Mike Christie wrote:
>> The atomic use in the main I/O path is causing perf issues when using
>> higher performance backend devices and multiple queues (more than
>> 10 when using vhost-scsi) like with this fio workload:
>>
>> [global]
>> bs=4K
>> iodepth=128
>> direct=1
>> ioengine=libaio
>> group_reporting
>> time_based
>> runtime=120
>> name=standard-iops
>> rw=randread
>> numjobs=16
>> cpus_allowed=0-15
>>
>> To fix this issue, this moves the LUN stats to per CPU.
>>
>> Note: I forgot to include this patch with the delayed/ordered per CPU
>> tracking and per device/device entry per CPU stats. With this patch you
>> get the full 33% improvements when using fast backends, multiple queues
>> and multiple IO submiters.
>>
>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>> Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
>> ---
>> drivers/target/target_core_device.c | 1 +
>> drivers/target/target_core_fabric_configfs.c | 2 +-
>> drivers/target/target_core_internal.h | 1 +
>> drivers/target/target_core_stat.c | 67 +++++++-------------
>> drivers/target/target_core_tpg.c | 23 ++++++-
>> drivers/target/target_core_transport.c | 22 +++++--
>> include/target/target_core_base.h | 8 +--
>> 7 files changed, 65 insertions(+), 59 deletions(-)
>>
> Ho-hum.
>
> That only works if both submission and completion paths do run on the
> _same_ cpu. Are we sure that they do?
>
What do you mean by it only works if they run on the same CPU? Do you
mean I won't get the perf gains I think I will or is there a crash type
of bug?
The default is for cmds to complete on the submitting CPU. The
user/driver can override it though.
* Re: [PATCH v3 3/3] scsi: target: Move LUN stats to per CPU
2025-09-18 14:50 ` michael.christie
@ 2025-10-16 6:21 ` Hannes Reinecke
2025-10-16 15:21 ` michael.christie
0 siblings, 1 reply; 9+ messages in thread
From: Hannes Reinecke @ 2025-10-16 6:21 UTC (permalink / raw)
To: michael.christie, mlombard, martin.petersen, d.bogdanov,
bvanassche, linux-scsi, target-devel
On 9/18/25 16:50, michael.christie@oracle.com wrote:
> On 9/18/25 1:31 AM, Hannes Reinecke wrote:
>> On 9/18/25 00:12, Mike Christie wrote:
>>> The atomic use in the main I/O path is causing perf issues when using
>>> higher performance backend devices and multiple queues (more than
>>> 10 when using vhost-scsi) like with this fio workload:
>>>
>>> [global]
>>> bs=4K
>>> iodepth=128
>>> direct=1
>>> ioengine=libaio
>>> group_reporting
>>> time_based
>>> runtime=120
>>> name=standard-iops
>>> rw=randread
>>> numjobs=16
>>> cpus_allowed=0-15
>>>
>>> To fix this issue, this moves the LUN stats to per CPU.
>>>
>>> Note: I forgot to include this patch with the delayed/ordered per CPU
>>> tracking and per device/device entry per CPU stats. With this patch you
>>> get the full 33% improvements when using fast backends, multiple queues
>>> and multiple IO submiters.
>>>
>>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>>> Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
>>> ---
>>> drivers/target/target_core_device.c | 1 +
>>> drivers/target/target_core_fabric_configfs.c | 2 +-
>>> drivers/target/target_core_internal.h | 1 +
>>> drivers/target/target_core_stat.c | 67 +++++++-------------
>>> drivers/target/target_core_tpg.c | 23 ++++++-
>>> drivers/target/target_core_transport.c | 22 +++++--
>>> include/target/target_core_base.h | 8 +--
>>> 7 files changed, 65 insertions(+), 59 deletions(-)
>>>
>> Ho-hum.
>>
>> That only works if both submission and completion paths do run on the
>> _same_ cpu. Are we sure that they do?
>>
> What do you mean by it only works if they run on the same CPU? Do you
> mean I won't get the perf gains I think I will or is there a crash type
> of bug?
>
> The default is for cmds to complete on the submitting CPU. The
> user/driver can override it though.
Exactly. And the transport can interfere.
But if they do, the stats become garbled, as the completion statistics
are added to the wrong CPU.
Can't you store the submitting CPU in 'struct se_cmd', and then use that
value on the completion path?
Then we can be sure to correctly update statistics in the correct
per-cpu slot.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.com +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
* Re: [PATCH v3 3/3] scsi: target: Move LUN stats to per CPU
2025-10-16 6:21 ` Hannes Reinecke
@ 2025-10-16 15:21 ` michael.christie
0 siblings, 0 replies; 9+ messages in thread
From: michael.christie @ 2025-10-16 15:21 UTC (permalink / raw)
To: Hannes Reinecke, mlombard, martin.petersen, d.bogdanov,
bvanassche, linux-scsi, target-devel
On 10/16/25 1:21 AM, Hannes Reinecke wrote:
> On 9/18/25 16:50, michael.christie@oracle.com wrote:
>> On 9/18/25 1:31 AM, Hannes Reinecke wrote:
>>> On 9/18/25 00:12, Mike Christie wrote:
>>>> The atomic use in the main I/O path is causing perf issues when using
>>>> higher performance backend devices and multiple queues (more than
>>>> 10 when using vhost-scsi) like with this fio workload:
>>>>
>>>> [global]
>>>> bs=4K
>>>> iodepth=128
>>>> direct=1
>>>> ioengine=libaio
>>>> group_reporting
>>>> time_based
>>>> runtime=120
>>>> name=standard-iops
>>>> rw=randread
>>>> numjobs=16
>>>> cpus_allowed=0-15
>>>>
>>>> To fix this issue, this moves the LUN stats to per CPU.
>>>>
>>>> Note: I forgot to include this patch with the delayed/ordered per CPU
>>>> tracking and per device/device entry per CPU stats. With this patch you
>>>> get the full 33% improvements when using fast backends, multiple queues
>>>> and multiple IO submiters.
>>>>
>>>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>>>> Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
>>>> ---
>>>> drivers/target/target_core_device.c | 1 +
>>>> drivers/target/target_core_fabric_configfs.c | 2 +-
>>>> drivers/target/target_core_internal.h | 1 +
>>>> drivers/target/target_core_stat.c | 67 +++++++-------------
>>>> drivers/target/target_core_tpg.c | 23 ++++++-
>>>> drivers/target/target_core_transport.c | 22 +++++--
>>>> include/target/target_core_base.h | 8 +--
>>>> 7 files changed, 65 insertions(+), 59 deletions(-)
>>>>
>>> Ho-hum.
>>>
>>> That only works if both submission and completion paths do run on the
>>> _same_ cpu. Are we sure that they do?
>>>
>> What do you mean by it only works if they run on the same CPU? Do you
>> mean I won't get the perf gains I think I will or is there a crash type
>> of bug?
>>
>> The default is for cmds to complete on the submitting CPU. The
>> user/driver can override it though.
>
> Exactly. And the transport can interfere.
> But if they do the stats become garbled as the completion statistics
> are added to the wrong CPU.
What is the issue with that though? For this patch, and I think every
place we do per CPU stats like this, when we report the stats we loop
over all cpus and just add up each cpu's count to report the total.
It's how percpu_counter counters work as well.
> Can't you store the submitting CPU in 'struct se_cmd', and then use that
> value on the completion path?
> Then we can be sure to correctly update statistics in the correct
> per-cpu slot.
When we loop over each cpu to calculate the stats, then you get the same
end total though, right?
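To make that concrete, here is a small userspace sketch (an array stands
in for the per CPU slots, plain loops for this_cpu_inc() and
for_each_possible_cpu()): each completion bumps whatever slot belongs to
the CPU it ran on, but the reported value is always the sum over all
slots, so the total is the same no matter which CPU completed each
command:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

static uint64_t cmd_pdus[NR_CPUS];	/* stand-in for the per CPU slots */

static void complete_cmd(int completion_cpu)
{
	cmd_pdus[completion_cpu]++;	/* this_cpu_inc() equivalent */
}

static uint64_t report_cmd_pdus(void)
{
	uint64_t sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)	/* for_each_possible_cpu() */
		sum += cmd_pdus[cpu];
	return sum;
}

int main(void)
{
	/* 1000 commands, all submitted on CPU 0 but completed on CPUs 0-3 */
	for (int i = 0; i < 1000; i++)
		complete_cmd(i % NR_CPUS);

	printf("%llu\n", (unsigned long long)report_cmd_pdus());	/* 1000 */
	return 0;
}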
>
> Cheers,
>
> Hannes
* Re: [PATCH v3 0/3] target: RW/num_cmds stats improvements
2025-09-17 22:12 [PATCH v3 0/3] target: RW/num_cmds stats improvements Mike Christie
` (2 preceding siblings ...)
2025-09-17 22:12 ` [PATCH v3 3/3] scsi: target: Move LUN stats to per CPU Mike Christie
@ 2025-11-05 4:02 ` Martin K. Petersen
3 siblings, 0 replies; 9+ messages in thread
From: Martin K. Petersen @ 2025-11-05 4:02 UTC (permalink / raw)
To: mlombard, d.bogdanov, bvanassche, linux-scsi, target-devel,
Mike Christie
Cc: Martin K . Petersen
On Wed, 17 Sep 2025 17:12:52 -0500, Mike Christie wrote:
> The following patches were made over Linus tree. They fix/improve the
> stats used in the main IO path. The first patch fixes an issue where
> I made some stats u32 when they should have stayed u64. The rest of
> the patches improve the handling of RW/num_cmds stats to reduce code
> duplication and improve performance.
>
> V3:
> - Fix ENOMEM/ret use.
> V2:
> - Research if percpu_counters would work.
> - Add patch to fix u32 use.
> - Fix several issues in last patch: do unsigned 64 bit counters, fix
> remote xcopy handling, fix default lun0 error cleanup path, fix
> byte to mbyte conversion.
>
> [...]
Applied to 6.19/scsi-queue, thanks!
[1/3] scsi: target: Fix lun/device R/W and total command stats
https://git.kernel.org/mkp/scsi/c/95aa2041c654
[2/3] scsi: target: Create and use macro helpers for per CPU stats
https://git.kernel.org/mkp/scsi/c/ed6b97a79577
[3/3] scsi: target: Move LUN stats to per CPU
https://git.kernel.org/mkp/scsi/c/bbb490053173
--
Martin K. Petersen