public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] scsi: target: Move LUN stats to per CPU
@ 2025-07-24  0:45 Mike Christie
  2025-07-24 14:26 ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Mike Christie @ 2025-07-24  0:45 UTC (permalink / raw)
  To: martin.petersen, linux-scsi, target-devel; +Cc: Mike Christie

The atomic use in the main I/O path is causing perf issues when using
higher performance backend devices and multiple queues. This moves the
LUN stats to per CPU.

I forgot to include this patch with the delayed/ordered per CPU
tracking and per device/device entry per CPU stats. With this patch
you get the full 33% improvements when using fast backends, multiple
queues and multiple IO submiters.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/target_core_fabric_configfs.c |  2 +-
 drivers/target/target_core_internal.h        |  1 +
 drivers/target/target_core_stat.c            | 48 ++++++++++++++++----
 drivers/target/target_core_tpg.c             | 21 +++++++++
 drivers/target/target_core_transport.c       | 14 +++---
 include/target/target_core_base.h            |  8 ++--
 6 files changed, 73 insertions(+), 21 deletions(-)

diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
index 7156a4dc1ca7..13159928e365 100644
--- a/drivers/target/target_core_fabric_configfs.c
+++ b/drivers/target/target_core_fabric_configfs.c
@@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
 	struct se_lun *lun = container_of(to_config_group(item),
 					  struct se_lun, lun_group);
 
-	kfree_rcu(lun, rcu_head);
+	call_rcu(&lun->rcu_head, target_tpg_free_lun);
 }
 
 static struct configfs_item_operations target_fabric_port_item_ops = {
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 408be26d2e9b..dfe529e59a29 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -125,6 +125,7 @@ void	core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
 				  struct se_lun *);
 void	core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
 struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
+void	target_tpg_free_lun(struct rcu_head *head);
 int	core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
 		bool, struct se_device *);
 void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 6bdf2d8bd694..88f8be197a68 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -627,14 +627,24 @@ static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
 		char *page)
 {
 	struct se_lun *lun = to_stat_tgt_port(item);
+	struct scsi_port_stats *stats;
 	struct se_device *dev;
 	ssize_t ret = -ENODEV;
+	unsigned int cpu;
+	u32 pdus = 0;
 
 	rcu_read_lock();
 	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%lu\n",
-			       atomic_long_read(&lun->lun_stats.cmd_pdus));
+	if (!dev)
+		goto unlock;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(lun->lun_stats, cpu);
+		pdus += stats->cmd_pdus;
+	}
+
+	ret = snprintf(page, PAGE_SIZE, "%u\n", pdus);
+unlock:
 	rcu_read_unlock();
 	return ret;
 }
@@ -643,14 +653,24 @@ static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
 		char *page)
 {
 	struct se_lun *lun = to_stat_tgt_port(item);
+	struct scsi_port_stats *stats;
 	struct se_device *dev;
 	ssize_t ret = -ENODEV;
+	unsigned int cpu;
+	u32 octets = 0;
 
 	rcu_read_lock();
 	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%u\n",
-			(u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
+	if (!dev)
+		goto unlock;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(lun->lun_stats, cpu);
+		octets += stats->rx_data_octets;
+	}
+
+	ret = snprintf(page, PAGE_SIZE, "%u\n", octets);
+unlock:
 	rcu_read_unlock();
 	return ret;
 }
@@ -659,14 +679,24 @@ static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
 		char *page)
 {
 	struct se_lun *lun = to_stat_tgt_port(item);
+	struct scsi_port_stats *stats;
 	struct se_device *dev;
 	ssize_t ret = -ENODEV;
+	unsigned int cpu;
+	u32 octets = 0;
 
 	rcu_read_lock();
 	dev = rcu_dereference(lun->lun_se_dev);
-	if (dev)
-		ret = snprintf(page, PAGE_SIZE, "%u\n",
-				(u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
+	if (!dev)
+		goto unlock;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(lun->lun_stats, cpu);
+		octets += stats->tx_data_octets;
+	}
+
+	ret = snprintf(page, PAGE_SIZE, "%u\n", octets);
+unlock:
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index c0e429e5ef31..caa95aa6f502 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -609,12 +609,21 @@ struct se_lun *core_tpg_alloc_lun(
 	u64 unpacked_lun)
 {
 	struct se_lun *lun;
+	int ret;
 
 	lun = kzalloc(sizeof(*lun), GFP_KERNEL);
 	if (!lun) {
 		pr_err("Unable to allocate se_lun memory\n");
 		return ERR_PTR(-ENOMEM);
 	}
+
+	lun->lun_stats = alloc_percpu(struct scsi_port_stats);
+	if (!lun->lun_stats) {
+		pr_err("Unable to allocate se_lun stats memory\n");
+		ret = -ENOMEM;
+		goto free_lun;
+	}
+
 	lun->unpacked_lun = unpacked_lun;
 	atomic_set(&lun->lun_acl_count, 0);
 	init_completion(&lun->lun_shutdown_comp);
@@ -628,6 +637,18 @@ struct se_lun *core_tpg_alloc_lun(
 	lun->lun_tpg = tpg;
 
 	return lun;
+
+free_lun:
+	kfree(lun);
+	return ERR_PTR(-ENOMEM);
+}
+
+void target_tpg_free_lun(struct rcu_head *head)
+{
+	struct se_lun *lun = container_of(head, struct se_lun, rcu_head);
+
+	free_percpu(lun->lun_stats);
+	kfree(lun);
 }
 
 int core_tpg_add_lun(
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 0a76bdfe5528..4ec66ca6c0ca 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1571,7 +1571,7 @@ target_cmd_parse_cdb(struct se_cmd *cmd)
 		return ret;
 
 	cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
-	atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
+	this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus);
 	return 0;
 }
 EXPORT_SYMBOL(target_cmd_parse_cdb);
@@ -2597,8 +2597,8 @@ static void target_complete_ok_work(struct work_struct *work)
 		    !(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL))
 			goto queue_status;
 
-		atomic_long_add(cmd->data_length,
-				&cmd->se_lun->lun_stats.tx_data_octets);
+		this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+			     cmd->data_length);
 		/*
 		 * Perform READ_STRIP of PI using software emulation when
 		 * backend had PI enabled, if the transport will not be
@@ -2621,14 +2621,14 @@ static void target_complete_ok_work(struct work_struct *work)
 			goto queue_full;
 		break;
 	case DMA_TO_DEVICE:
-		atomic_long_add(cmd->data_length,
-				&cmd->se_lun->lun_stats.rx_data_octets);
+		this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets,
+			     cmd->data_length);
 		/*
 		 * Check if we need to send READ payload for BIDI-COMMAND
 		 */
 		if (cmd->se_cmd_flags & SCF_BIDI) {
-			atomic_long_add(cmd->data_length,
-					&cmd->se_lun->lun_stats.tx_data_octets);
+			this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+				     cmd->data_length);
 			ret = cmd->se_tfo->queue_data_in(cmd);
 			if (ret)
 				goto queue_full;
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index c4d9116904aa..e73fb224625d 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -744,9 +744,9 @@ struct se_port_stat_grps {
 };
 
 struct scsi_port_stats {
-	atomic_long_t	cmd_pdus;
-	atomic_long_t	tx_data_octets;
-	atomic_long_t	rx_data_octets;
+	u32			cmd_pdus;
+	u32			tx_data_octets;
+	u32			rx_data_octets;
 };
 
 struct se_lun {
@@ -773,7 +773,7 @@ struct se_lun {
 	spinlock_t		lun_tg_pt_gp_lock;
 
 	struct se_portal_group	*lun_tpg;
-	struct scsi_port_stats	lun_stats;
+	struct scsi_port_stats	__percpu *lun_stats;
 	struct config_group	lun_group;
 	struct se_port_stat_grps port_stat_grps;
 	struct completion	lun_shutdown_comp;
-- 
2.47.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-07-28 15:14 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-24  0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie
2025-07-24 14:26 ` kernel test robot
2025-07-24 15:06 ` Bart Van Assche
2025-07-28 15:08 ` Dmitry Bogdanov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox