* [PATCH 1/1] scsi: target: Move LUN stats to per CPU
@ 2025-07-24 0:45 Mike Christie
2025-07-24 14:26 ` kernel test robot
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Mike Christie @ 2025-07-24 0:45 UTC (permalink / raw)
To: martin.petersen, linux-scsi, target-devel; +Cc: Mike Christie
The atomic use in the main I/O path is causing perf issues when using
higher performance backend devices and multiple queues. This moves the
LUN stats to per CPU.
I forgot to include this patch with the delayed/ordered per CPU
tracking and per device/device entry per CPU stats. With this patch
you get the full 33% improvements when using fast backends, multiple
queues and multiple IO submiters.
Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
drivers/target/target_core_fabric_configfs.c | 2 +-
drivers/target/target_core_internal.h | 1 +
drivers/target/target_core_stat.c | 48 ++++++++++++++++----
drivers/target/target_core_tpg.c | 21 +++++++++
drivers/target/target_core_transport.c | 14 +++---
include/target/target_core_base.h | 8 ++--
6 files changed, 73 insertions(+), 21 deletions(-)
diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
index 7156a4dc1ca7..13159928e365 100644
--- a/drivers/target/target_core_fabric_configfs.c
+++ b/drivers/target/target_core_fabric_configfs.c
@@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
struct se_lun *lun = container_of(to_config_group(item),
struct se_lun, lun_group);
- kfree_rcu(lun, rcu_head);
+ call_rcu(&lun->rcu_head, target_tpg_free_lun);
}
static struct configfs_item_operations target_fabric_port_item_ops = {
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 408be26d2e9b..dfe529e59a29 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -125,6 +125,7 @@ void core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
struct se_lun *);
void core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
+void target_tpg_free_lun(struct rcu_head *head);
int core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
bool, struct se_device *);
void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 6bdf2d8bd694..88f8be197a68 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -627,14 +627,24 @@ static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
char *page)
{
struct se_lun *lun = to_stat_tgt_port(item);
+ struct scsi_port_stats *stats;
struct se_device *dev;
ssize_t ret = -ENODEV;
+ unsigned int cpu;
+ u32 pdus = 0;
rcu_read_lock();
dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%lu\n",
- atomic_long_read(&lun->lun_stats.cmd_pdus));
+ if (!dev)
+ goto unlock;
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(lun->lun_stats, cpu);
+ pdus += stats->cmd_pdus;
+ }
+
+ ret = snprintf(page, PAGE_SIZE, "%u\n", pdus);
+unlock:
rcu_read_unlock();
return ret;
}
@@ -643,14 +653,24 @@ static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
char *page)
{
struct se_lun *lun = to_stat_tgt_port(item);
+ struct scsi_port_stats *stats;
struct se_device *dev;
ssize_t ret = -ENODEV;
+ unsigned int cpu;
+ u32 octets = 0;
rcu_read_lock();
dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%u\n",
- (u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
+ if (!dev)
+ goto unlock;
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(lun->lun_stats, cpu);
+ octets += stats->rx_data_octets;
+ }
+
+ ret = snprintf(page, PAGE_SIZE, "%u\n", octets);
+unlock:
rcu_read_unlock();
return ret;
}
@@ -659,14 +679,24 @@ static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
char *page)
{
struct se_lun *lun = to_stat_tgt_port(item);
+ struct scsi_port_stats *stats;
struct se_device *dev;
ssize_t ret = -ENODEV;
+ unsigned int cpu;
+ u32 octets = 0;
rcu_read_lock();
dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%u\n",
- (u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
+ if (!dev)
+ goto unlock;
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(lun->lun_stats, cpu);
+ octets += stats->tx_data_octets;
+ }
+
+ ret = snprintf(page, PAGE_SIZE, "%u\n", octets);
+unlock:
rcu_read_unlock();
return ret;
}
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index c0e429e5ef31..caa95aa6f502 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -609,12 +609,21 @@ struct se_lun *core_tpg_alloc_lun(
u64 unpacked_lun)
{
struct se_lun *lun;
+ int ret;
lun = kzalloc(sizeof(*lun), GFP_KERNEL);
if (!lun) {
pr_err("Unable to allocate se_lun memory\n");
return ERR_PTR(-ENOMEM);
}
+
+ lun->lun_stats = alloc_percpu(struct scsi_port_stats);
+ if (!lun->lun_stats) {
+ pr_err("Unable to allocate se_lun stats memory\n");
+ ret = -ENOMEM;
+ goto free_lun;
+ }
+
lun->unpacked_lun = unpacked_lun;
atomic_set(&lun->lun_acl_count, 0);
init_completion(&lun->lun_shutdown_comp);
@@ -628,6 +637,18 @@ struct se_lun *core_tpg_alloc_lun(
lun->lun_tpg = tpg;
return lun;
+
+free_lun:
+ kfree(lun);
+ return ERR_PTR(-ENOMEM);
+}
+
+void target_tpg_free_lun(struct rcu_head *head)
+{
+ struct se_lun *lun = container_of(head, struct se_lun, rcu_head);
+
+ free_percpu(lun->lun_stats);
+ kfree(lun);
}
int core_tpg_add_lun(
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 0a76bdfe5528..4ec66ca6c0ca 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1571,7 +1571,7 @@ target_cmd_parse_cdb(struct se_cmd *cmd)
return ret;
cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
- atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
+ this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus);
return 0;
}
EXPORT_SYMBOL(target_cmd_parse_cdb);
@@ -2597,8 +2597,8 @@ static void target_complete_ok_work(struct work_struct *work)
!(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL))
goto queue_status;
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.tx_data_octets);
+ this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+ cmd->data_length);
/*
* Perform READ_STRIP of PI using software emulation when
* backend had PI enabled, if the transport will not be
@@ -2621,14 +2621,14 @@ static void target_complete_ok_work(struct work_struct *work)
goto queue_full;
break;
case DMA_TO_DEVICE:
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.rx_data_octets);
+ this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets,
+ cmd->data_length);
/*
* Check if we need to send READ payload for BIDI-COMMAND
*/
if (cmd->se_cmd_flags & SCF_BIDI) {
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.tx_data_octets);
+ this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+ cmd->data_length);
ret = cmd->se_tfo->queue_data_in(cmd);
if (ret)
goto queue_full;
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index c4d9116904aa..e73fb224625d 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -744,9 +744,9 @@ struct se_port_stat_grps {
};
struct scsi_port_stats {
- atomic_long_t cmd_pdus;
- atomic_long_t tx_data_octets;
- atomic_long_t rx_data_octets;
+ u32 cmd_pdus;
+ u32 tx_data_octets;
+ u32 rx_data_octets;
};
struct se_lun {
@@ -773,7 +773,7 @@ struct se_lun {
spinlock_t lun_tg_pt_gp_lock;
struct se_portal_group *lun_tpg;
- struct scsi_port_stats lun_stats;
+ struct scsi_port_stats __percpu *lun_stats;
struct config_group lun_group;
struct se_port_stat_grps port_stat_grps;
struct completion lun_shutdown_comp;
--
2.47.1
^ permalink raw reply related [flat|nested] 4+ messages in thread* Re: [PATCH 1/1] scsi: target: Move LUN stats to per CPU 2025-07-24 0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie @ 2025-07-24 14:26 ` kernel test robot 2025-07-24 15:06 ` Bart Van Assche 2025-07-28 15:08 ` Dmitry Bogdanov 2 siblings, 0 replies; 4+ messages in thread From: kernel test robot @ 2025-07-24 14:26 UTC (permalink / raw) To: Mike Christie, martin.petersen, linux-scsi, target-devel Cc: oe-kbuild-all, Mike Christie Hi Mike, kernel test robot noticed the following build warnings: [auto build test WARNING on mkp-scsi/for-next] [also build test WARNING on jejb-scsi/for-next linus/master v6.16-rc7 next-20250724] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Mike-Christie/scsi-target-Move-LUN-stats-to-per-CPU/20250724-084852 base: https://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git for-next patch link: https://lore.kernel.org/r/20250724004558.40993-1-michael.christie%40oracle.com patch subject: [PATCH 1/1] scsi: target: Move LUN stats to per CPU config: sparc64-randconfig-001-20250724 (https://download.01.org/0day-ci/archive/20250724/202507242247.rt3StU9U-lkp@intel.com/config) compiler: sparc64-linux-gcc (GCC) 8.5.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250724/202507242247.rt3StU9U-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202507242247.rt3StU9U-lkp@intel.com/ All warnings (new ones prefixed by >>): drivers/target/target_core_tpg.c: In function 'core_tpg_alloc_lun': >> drivers/target/target_core_tpg.c:612:6: warning: variable 'ret' set but not used [-Wunused-but-set-variable] int ret; ^~~ vim +/ret +612 drivers/target/target_core_tpg.c 606 607 struct se_lun *core_tpg_alloc_lun( 608 struct se_portal_group *tpg, 609 u64 unpacked_lun) 610 { 611 struct se_lun *lun; > 612 int ret; 613 614 lun = kzalloc(sizeof(*lun), GFP_KERNEL); 615 if (!lun) { 616 pr_err("Unable to allocate se_lun memory\n"); 617 return ERR_PTR(-ENOMEM); 618 } 619 620 lun->lun_stats = alloc_percpu(struct scsi_port_stats); 621 if (!lun->lun_stats) { 622 pr_err("Unable to allocate se_lun stats memory\n"); 623 ret = -ENOMEM; 624 goto free_lun; 625 } 626 627 lun->unpacked_lun = unpacked_lun; 628 atomic_set(&lun->lun_acl_count, 0); 629 init_completion(&lun->lun_shutdown_comp); 630 INIT_LIST_HEAD(&lun->lun_deve_list); 631 INIT_LIST_HEAD(&lun->lun_dev_link); 632 atomic_set(&lun->lun_tg_pt_secondary_offline, 0); 633 spin_lock_init(&lun->lun_deve_lock); 634 mutex_init(&lun->lun_tg_pt_md_mutex); 635 INIT_LIST_HEAD(&lun->lun_tg_pt_gp_link); 636 spin_lock_init(&lun->lun_tg_pt_gp_lock); 637 lun->lun_tpg = tpg; 638 639 return lun; 640 641 free_lun: 642 kfree(lun); 643 return ERR_PTR(-ENOMEM); 644 } 645 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 1/1] scsi: target: Move LUN stats to per CPU 2025-07-24 0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie 2025-07-24 14:26 ` kernel test robot @ 2025-07-24 15:06 ` Bart Van Assche 2025-07-28 15:08 ` Dmitry Bogdanov 2 siblings, 0 replies; 4+ messages in thread From: Bart Van Assche @ 2025-07-24 15:06 UTC (permalink / raw) To: Mike Christie, martin.petersen, linux-scsi, target-devel On 7/23/25 5:45 PM, Mike Christie wrote: > diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h > index c4d9116904aa..e73fb224625d 100644 > --- a/include/target/target_core_base.h > +++ b/include/target/target_core_base.h > @@ -744,9 +744,9 @@ struct se_port_stat_grps { > }; > > struct scsi_port_stats { > - atomic_long_t cmd_pdus; > - atomic_long_t tx_data_octets; > - atomic_long_t rx_data_octets; > + u32 cmd_pdus; > + u32 tx_data_octets; > + u32 rx_data_octets; > }; > > struct se_lun { > @@ -773,7 +773,7 @@ struct se_lun { > spinlock_t lun_tg_pt_gp_lock; > > struct se_portal_group *lun_tpg; > - struct scsi_port_stats lun_stats; > + struct scsi_port_stats __percpu *lun_stats; > struct config_group lun_group; > struct se_port_stat_grps port_stat_grps; > struct completion lun_shutdown_comp; Is this perhaps an open-coded implementation of struct percpu_counter? Why hasn't struct percpu_counter been used? I think this should be explained in the patch description. Thanks, Bart. ^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 1/1] scsi: target: Move LUN stats to per CPU 2025-07-24 0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie 2025-07-24 14:26 ` kernel test robot 2025-07-24 15:06 ` Bart Van Assche @ 2025-07-28 15:08 ` Dmitry Bogdanov 2 siblings, 0 replies; 4+ messages in thread From: Dmitry Bogdanov @ 2025-07-28 15:08 UTC (permalink / raw) To: Mike Christie; +Cc: martin.petersen, linux-scsi, target-devel On Wed, Jul 23, 2025 at 07:45:57PM -0500, Mike Christie wrote: > The atomic use in the main I/O path is causing perf issues when using > higher performance backend devices and multiple queues. This moves the > LUN stats to per CPU. > > I forgot to include this patch with the delayed/ordered per CPU > tracking and per device/device entry per CPU stats. With this patch > you get the full 33% improvements when using fast backends, multiple > queues and multiple IO submiters. > > Signed-off-by: Mike Christie <michael.christie@oracle.com> > --- > drivers/target/target_core_fabric_configfs.c | 2 +- > drivers/target/target_core_internal.h | 1 + > drivers/target/target_core_stat.c | 48 ++++++++++++++++---- > drivers/target/target_core_tpg.c | 21 +++++++++ > drivers/target/target_core_transport.c | 14 +++--- > include/target/target_core_base.h | 8 ++-- > 6 files changed, 73 insertions(+), 21 deletions(-) > > diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c > index 7156a4dc1ca7..13159928e365 100644 > --- a/drivers/target/target_core_fabric_configfs.c > +++ b/drivers/target/target_core_fabric_configfs.c > @@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item) > struct se_lun *lun = container_of(to_config_group(item), > struct se_lun, lun_group); > > - kfree_rcu(lun, rcu_head); > + call_rcu(&lun->rcu_head, target_tpg_free_lun); There is se_tpg->tpg_virt_lun0 that is also the lun object that will have allocated lun_stats, but tpg_virt_lun0 is deallocated in other places. You have to take care of it to not leak its lun_stats. > } > > static struct configfs_item_operations target_fabric_port_item_ops = { > diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h > index 408be26d2e9b..dfe529e59a29 100644 > --- a/drivers/target/target_core_internal.h > +++ b/drivers/target/target_core_internal.h > @@ -125,6 +125,7 @@ void core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *, > struct se_lun *); > void core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *); > struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64); > +void target_tpg_free_lun(struct rcu_head *head); > int core_tpg_add_lun(struct se_portal_group *, struct se_lun *, > bool, struct se_device *); > void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *); > diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c > index 6bdf2d8bd694..88f8be197a68 100644 > --- a/drivers/target/target_core_stat.c > +++ b/drivers/target/target_core_stat.c > @@ -627,14 +627,24 @@ static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item, > char *page) > { > struct se_lun *lun = to_stat_tgt_port(item); > + struct scsi_port_stats *stats; > struct se_device *dev; > ssize_t ret = -ENODEV; > + unsigned int cpu; > + u32 pdus = 0; > > rcu_read_lock(); > dev = rcu_dereference(lun->lun_se_dev); > - if (dev) > - ret = snprintf(page, PAGE_SIZE, "%lu\n", > - atomic_long_read(&lun->lun_stats.cmd_pdus)); > + if (!dev) > + goto unlock; > + > + for_each_possible_cpu(cpu) { > + stats = per_cpu_ptr(lun->lun_stats, cpu); > + pdus += stats->cmd_pdus; > + } > + > + ret = snprintf(page, PAGE_SIZE, "%u\n", pdus); > +unlock: > rcu_read_unlock(); > return ret; > } > @@ -643,14 +653,24 @@ static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item, > char *page) > { > struct se_lun *lun = to_stat_tgt_port(item); > + struct scsi_port_stats *stats; > struct se_device *dev; > ssize_t ret = -ENODEV; > + unsigned int cpu; > + u32 octets = 0; > > rcu_read_lock(); > dev = rcu_dereference(lun->lun_se_dev); > - if (dev) > - ret = snprintf(page, PAGE_SIZE, "%u\n", > - (u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20)); > + if (!dev) > + goto unlock; > + > + for_each_possible_cpu(cpu) { > + stats = per_cpu_ptr(lun->lun_stats, cpu); > + octets += stats->rx_data_octets; > + } > + > + ret = snprintf(page, PAGE_SIZE, "%u\n", octets); > +unlock: > rcu_read_unlock(); > return ret; > } > @@ -659,14 +679,24 @@ static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item, > char *page) > { > struct se_lun *lun = to_stat_tgt_port(item); > + struct scsi_port_stats *stats; > struct se_device *dev; > ssize_t ret = -ENODEV; > + unsigned int cpu; > + u32 octets = 0; > > rcu_read_lock(); > dev = rcu_dereference(lun->lun_se_dev); > - if (dev) > - ret = snprintf(page, PAGE_SIZE, "%u\n", > - (u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20)); > + if (!dev) > + goto unlock; > + > + for_each_possible_cpu(cpu) { > + stats = per_cpu_ptr(lun->lun_stats, cpu); > + octets += stats->tx_data_octets; > + } > + > + ret = snprintf(page, PAGE_SIZE, "%u\n", octets); > +unlock: > rcu_read_unlock(); > return ret; > } May be that is a time to refactor this file using macro magic? Something like: static u64 _target_stat_get_u64_luns_stats(struct se_lun *lun, u64 offset) { int cpu; u64 res = 0; u8 *stats; for_each_possible_cpu(cpu) { const struct scsi_port_stats *pcpu_stats; pcpu_stats = per_cpu_ptr(lun->lun_stats, cpu); stats = (u8 *)pcpu_stats; res += *(u64 *)(stats + offset); } return res; } #define target_stat_get_u64_luns_stats(LUN, FIELD) \ _target_stat_get_u64_luns_stats(LUN, \ offsetof(struct scsi_port_stats, FIELD)) #define _SYSFS_TGT_PORT_STATS_U64_SHOW(STAT, VAR, LAMBDA) \ static ssize_t target_stat_tgt_port_##STAT##_show( \ struct config_item *item, \ char *page) \ { \ struct se_lun *lun = to_stat_tgt_port(item); \ struct se_device *dev; \ ssize_t ret = -ENODEV; \ u64 VAR; \ \ rcu_read_lock(); \ dev = rcu_dereference(lun->lun_se_dev); \ if (dev) { \ VAR = target_stat_get_u64_luns_stats(lun, VAR); \ VAR = LAMBDA(VAR); \ ret = snprintf(page, PAGE_SIZE, "%llu\n", VAR); \ } \ rcu_read_unlock(); \ return ret; \ } #define LAMBDA_NOOP(A) (A) #define LAMBDA_MBYTES(A) (A >> 20) #define SYSFS_TGT_PORT_STATS_U64_SHOW(STAT, VAR) \ _SYSFS_TGT_PORT_STATS_U64_SHOW(STAT, VAR, LAMBDA_NOOP) #define SYSFS_TGT_PORT_STATS_U64_SHOW_MBYTES(STAT, VAR) \ _SYSFS_TGT_PORT_STATS_U64_SHOW(STAT, VAR, LAMBDA_MBYTES) SYSFS_TGT_PORT_STATS_U64_SHOW(in_cmds, cmd_pdus) SYSFS_TGT_PORT_STATS_U64_SHOW(in_cmds, hs_cmd_pdus) SYSFS_TGT_PORT_STATS_U64_SHOW_MBYTES(write_mbytes, rx_data_octets) SYSFS_TGT_PORT_STATS_U64_SHOW_MBYTES(read_mbytes, tx_data_octets) > diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c > index c0e429e5ef31..caa95aa6f502 100644 > --- a/drivers/target/target_core_tpg.c > +++ b/drivers/target/target_core_tpg.c > @@ -609,12 +609,21 @@ struct se_lun *core_tpg_alloc_lun( > u64 unpacked_lun) > { > struct se_lun *lun; > + int ret; > > lun = kzalloc(sizeof(*lun), GFP_KERNEL); > if (!lun) { > pr_err("Unable to allocate se_lun memory\n"); > return ERR_PTR(-ENOMEM); > } > + > + lun->lun_stats = alloc_percpu(struct scsi_port_stats); > + if (!lun->lun_stats) { > + pr_err("Unable to allocate se_lun stats memory\n"); > + ret = -ENOMEM; > + goto free_lun; > + } > + There is dev->xcopy_lun that is used for cmd->se_lun, you have to allocate and deallocate lun_stats for it to avoid NPE on XCOPY command. > lun->unpacked_lun = unpacked_lun; > atomic_set(&lun->lun_acl_count, 0); > init_completion(&lun->lun_shutdown_comp); > @@ -628,6 +637,18 @@ struct se_lun *core_tpg_alloc_lun( > lun->lun_tpg = tpg; > > return lun; > + > +free_lun: > + kfree(lun); > + return ERR_PTR(-ENOMEM); > +} > + > +void target_tpg_free_lun(struct rcu_head *head) > +{ > + struct se_lun *lun = container_of(head, struct se_lun, rcu_head); > + > + free_percpu(lun->lun_stats); > + kfree(lun); > } > > int core_tpg_add_lun( > diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c > index 0a76bdfe5528..4ec66ca6c0ca 100644 > --- a/drivers/target/target_core_transport.c > +++ b/drivers/target/target_core_transport.c > @@ -1571,7 +1571,7 @@ target_cmd_parse_cdb(struct se_cmd *cmd) > return ret; > > cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE; > - atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus); > + this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus); > return 0; > } > EXPORT_SYMBOL(target_cmd_parse_cdb); > @@ -2597,8 +2597,8 @@ static void target_complete_ok_work(struct work_struct *work) > !(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL)) > goto queue_status; > > - atomic_long_add(cmd->data_length, > - &cmd->se_lun->lun_stats.tx_data_octets); > + this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets, > + cmd->data_length); > /* > * Perform READ_STRIP of PI using software emulation when > * backend had PI enabled, if the transport will not be > @@ -2621,14 +2621,14 @@ static void target_complete_ok_work(struct work_struct *work) > goto queue_full; > break; > case DMA_TO_DEVICE: > - atomic_long_add(cmd->data_length, > - &cmd->se_lun->lun_stats.rx_data_octets); > + this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets, > + cmd->data_length); > /* > * Check if we need to send READ payload for BIDI-COMMAND > */ > if (cmd->se_cmd_flags & SCF_BIDI) { > - atomic_long_add(cmd->data_length, > - &cmd->se_lun->lun_stats.tx_data_octets); > + this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets, > + cmd->data_length); > ret = cmd->se_tfo->queue_data_in(cmd); > if (ret) > goto queue_full; > diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h > index c4d9116904aa..e73fb224625d 100644 > --- a/include/target/target_core_base.h > +++ b/include/target/target_core_base.h > @@ -744,9 +744,9 @@ struct se_port_stat_grps { > }; > > struct scsi_port_stats { > - atomic_long_t cmd_pdus; > - atomic_long_t tx_data_octets; > - atomic_long_t rx_data_octets; > + u32 cmd_pdus; > + u32 tx_data_octets; > + u32 rx_data_octets; I belive that there is no reason to have 32-bits counters in our century. [SPC-5] 7.3.9.2 General Access Statistics and Performance log parameter has 64-bits counters. RFC 4455 (MIB SCSI) states that 64bit counters are mandatory for systems with speed >= 4Gbs. Especially for t(r)x_data_octets that in 32-bit counter presentation has just 12 meaning bits actually. > }; > > struct se_lun { > @@ -773,7 +773,7 @@ struct se_lun { > spinlock_t lun_tg_pt_gp_lock; > > struct se_portal_group *lun_tpg; > - struct scsi_port_stats lun_stats; > + struct scsi_port_stats __percpu *lun_stats; > struct config_group lun_group; > struct se_port_stat_grps port_stat_grps; > struct completion lun_shutdown_comp; > -- > 2.47.1 > > ^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2025-07-28 15:14 UTC | newest] Thread overview: 4+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2025-07-24 0:45 [PATCH 1/1] scsi: target: Move LUN stats to per CPU Mike Christie 2025-07-24 14:26 ` kernel test robot 2025-07-24 15:06 ` Bart Van Assche 2025-07-28 15:08 ` Dmitry Bogdanov
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox