Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net 02/11] net/mlx5: Initialize flow steering during driver probe
From: Saeed Mahameed @ 2022-05-18  6:34 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Shay Drory, Mark Bloch, Saeed Mahameed
In-Reply-To: <20220518063427.123758-1-saeed@kernel.org>

From: Shay Drory <shayd@nvidia.com>

Currently, software objects of flow steering are created and destroyed
during reload flow. In case a device is unloaded, the following error
is printed during grace period:

 mlx5_core 0000:00:0b.0: mlx5_fw_fatal_reporter_err_work:690:(pid 95):
    Driver is in error state. Unloading

As a solution to fix use-after-free bugs, where we try to access
these objects, when reading the value of flow_steering_mode devlink
param[1], let's split flow steering creation and destruction into two
routines:
    * init and cleanup: memory, cache, and pools allocation/free.
    * create and destroy: namespaces initialization and cleanup.

While at it, re-order the cleanup function to mirror the init function.

[1]
Kasan trace:

[  385.119849 ] BUG: KASAN: use-after-free in mlx5_devlink_fs_mode_get+0x3b/0xa0
[  385.119849 ] Read of size 4 at addr ffff888104b79308 by task bash/291
[  385.119849 ]
[  385.119849 ] CPU: 1 PID: 291 Comm: bash Not tainted 5.17.0-rc1+ #2
[  385.119849 ] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014
[  385.119849 ] Call Trace:
[  385.119849 ]  <TASK>
[  385.119849 ]  dump_stack_lvl+0x6e/0x91
[  385.119849 ]  print_address_description.constprop.0+0x1f/0x160
[  385.119849 ]  ? mlx5_devlink_fs_mode_get+0x3b/0xa0
[  385.119849 ]  ? mlx5_devlink_fs_mode_get+0x3b/0xa0
[  385.119849 ]  kasan_report.cold+0x83/0xdf
[  385.119849 ]  ? devlink_param_notify+0x20/0x190
[  385.119849 ]  ? mlx5_devlink_fs_mode_get+0x3b/0xa0
[  385.119849 ]  mlx5_devlink_fs_mode_get+0x3b/0xa0
[  385.119849 ]  devlink_nl_param_fill+0x18a/0xa50
[  385.119849 ]  ? _raw_spin_lock_irqsave+0x8d/0xe0
[  385.119849 ]  ? devlink_flash_update_timeout_notify+0xf0/0xf0
[  385.119849 ]  ? __wake_up_common+0x4b/0x1e0
[  385.119849 ]  ? preempt_count_sub+0x14/0xc0
[  385.119849 ]  ? _raw_spin_unlock_irqrestore+0x28/0x40
[  385.119849 ]  ? __wake_up_common_lock+0xe3/0x140
[  385.119849 ]  ? __wake_up_common+0x1e0/0x1e0
[  385.119849 ]  ? __sanitizer_cov_trace_const_cmp8+0x27/0x80
[  385.119849 ]  ? __rcu_read_unlock+0x48/0x70
[  385.119849 ]  ? kasan_unpoison+0x23/0x50
[  385.119849 ]  ? __kasan_slab_alloc+0x2c/0x80
[  385.119849 ]  ? memset+0x20/0x40
[  385.119849 ]  ? __sanitizer_cov_trace_const_cmp4+0x25/0x80
[  385.119849 ]  devlink_param_notify+0xce/0x190
[  385.119849 ]  devlink_unregister+0x92/0x2b0
[  385.119849 ]  remove_one+0x41/0x140
[  385.119849 ]  pci_device_remove+0x68/0x140
[  385.119849 ]  ? pcibios_free_irq+0x10/0x10
[  385.119849 ]  __device_release_driver+0x294/0x3f0
[  385.119849 ]  device_driver_detach+0x82/0x130
[  385.119849 ]  unbind_store+0x193/0x1b0
[  385.119849 ]  ? subsys_interface_unregister+0x270/0x270
[  385.119849 ]  drv_attr_store+0x4e/0x70
[  385.119849 ]  ? drv_attr_show+0x60/0x60
[  385.119849 ]  sysfs_kf_write+0xa7/0xc0
[  385.119849 ]  kernfs_fop_write_iter+0x23a/0x2f0
[  385.119849 ]  ? sysfs_kf_bin_read+0x160/0x160
[  385.119849 ]  new_sync_write+0x311/0x430
[  385.119849 ]  ? new_sync_read+0x480/0x480
[  385.119849 ]  ? _raw_spin_lock+0x87/0xe0
[  385.119849 ]  ? __sanitizer_cov_trace_cmp4+0x25/0x80
[  385.119849 ]  ? security_file_permission+0x94/0xa0
[  385.119849 ]  vfs_write+0x4c7/0x590
[  385.119849 ]  ksys_write+0xf6/0x1e0
[  385.119849 ]  ? __x64_sys_read+0x50/0x50
[  385.119849 ]  ? fpregs_assert_state_consistent+0x99/0xa0
[  385.119849 ]  do_syscall_64+0x3d/0x90
[  385.119849 ]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  385.119849 ] RIP: 0033:0x7fc36ef38504
[  385.119849 ] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f
80 00 00 00 00 48 8d 05 f9 61 0d 00 8b 00 85 c0 75 13 b8 01 00 00 00 0f
05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 41 54 49 89 d4 55 48 89 f5 53
[  385.119849 ] RSP: 002b:00007ffde0ff3d08 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  385.119849 ] RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007fc36ef38504
[  385.119849 ] RDX: 000000000000000c RSI: 00007fc370521040 RDI: 0000000000000001
[  385.119849 ] RBP: 00007fc370521040 R08: 00007fc36f00b8c0 R09: 00007fc36ee4b740
[  385.119849 ] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fc36f00a760
[  385.119849 ] R13: 000000000000000c R14: 00007fc36f005760 R15: 000000000000000c
[  385.119849 ]  </TASK>
[  385.119849 ]
[  385.119849 ] Allocated by task 65:
[  385.119849 ]  kasan_save_stack+0x1e/0x40
[  385.119849 ]  __kasan_kmalloc+0x81/0xa0
[  385.119849 ]  mlx5_init_fs+0x11b/0x1160
[  385.119849 ]  mlx5_load+0x13c/0x220
[  385.119849 ]  mlx5_load_one+0xda/0x160
[  385.119849 ]  mlx5_recover_device+0xb8/0x100
[  385.119849 ]  mlx5_health_try_recover+0x2f9/0x3a1
[  385.119849 ]  devlink_health_reporter_recover+0x75/0x100
[  385.119849 ]  devlink_health_report+0x26c/0x4b0
[  385.275909 ]  mlx5_fw_fatal_reporter_err_work+0x11e/0x1b0
[  385.275909 ]  process_one_work+0x520/0x970
[  385.275909 ]  worker_thread+0x378/0x950
[  385.275909 ]  kthread+0x1bb/0x200
[  385.275909 ]  ret_from_fork+0x1f/0x30
[  385.275909 ]
[  385.275909 ] Freed by task 65:
[  385.275909 ]  kasan_save_stack+0x1e/0x40
[  385.275909 ]  kasan_set_track+0x21/0x30
[  385.275909 ]  kasan_set_free_info+0x20/0x30
[  385.275909 ]  __kasan_slab_free+0xfc/0x140
[  385.275909 ]  kfree+0xa5/0x3b0
[  385.275909 ]  mlx5_unload+0x2e/0xb0
[  385.275909 ]  mlx5_unload_one+0x86/0xb0
[  385.275909 ]  mlx5_fw_fatal_reporter_err_work.cold+0xca/0xcf
[  385.275909 ]  process_one_work+0x520/0x970
[  385.275909 ]  worker_thread+0x378/0x950
[  385.275909 ]  kthread+0x1bb/0x200
[  385.275909 ]  ret_from_fork+0x1f/0x30
[  385.275909 ]
[  385.275909 ] The buggy address belongs to the object at ffff888104b79300
[  385.275909 ]  which belongs to the cache kmalloc-128 of size 128
[  385.275909 ] The buggy address is located 8 bytes inside of
[  385.275909 ]  128-byte region [ffff888104b79300, ffff888104b79380)
[  385.275909 ] The buggy address belongs to the page:
[  385.275909 ] page:00000000de44dd39 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x104b78
[  385.275909 ] head:00000000de44dd39 order:1 compound_mapcount:0
[  385.275909 ] flags: 0x8000000000010200(slab|head|zone=2)
[  385.275909 ] raw: 8000000000010200 0000000000000000 dead000000000122 ffff8881000428c0
[  385.275909 ] raw: 0000000000000000 0000000080200020 00000001ffffffff 0000000000000000
[  385.275909 ] page dumped because: kasan: bad access detected
[  385.275909 ]
[  385.275909 ] Memory state around the buggy address:
[  385.275909 ]  ffff888104b79200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc
[  385.275909 ]  ffff888104b79280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  385.275909 ] >ffff888104b79300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  385.275909 ]                       ^
[  385.275909 ]  ffff888104b79380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  385.275909 ]  ffff888104b79400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  385.275909 ]]

Fixes: e890acd5ff18 ("net/mlx5: Add devlink flow_steering_mode parameter")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/fs_core.c | 131 ++++++++++--------
 .../net/ethernet/mellanox/mlx5/core/fs_core.h |   6 +-
 .../net/ethernet/mellanox/mlx5/core/main.c    |  15 +-
 3 files changed, 91 insertions(+), 61 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 816d991f7621..3ad67e6b5586 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2663,28 +2663,6 @@ static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns)
 	clean_tree(&root_ns->ns.node);
 }
 
-void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
-{
-	struct mlx5_flow_steering *steering = dev->priv.steering;
-
-	cleanup_root_ns(steering->root_ns);
-	cleanup_root_ns(steering->fdb_root_ns);
-	steering->fdb_root_ns = NULL;
-	kfree(steering->fdb_sub_ns);
-	steering->fdb_sub_ns = NULL;
-	cleanup_root_ns(steering->port_sel_root_ns);
-	cleanup_root_ns(steering->sniffer_rx_root_ns);
-	cleanup_root_ns(steering->sniffer_tx_root_ns);
-	cleanup_root_ns(steering->rdma_rx_root_ns);
-	cleanup_root_ns(steering->rdma_tx_root_ns);
-	cleanup_root_ns(steering->egress_root_ns);
-	mlx5_cleanup_fc_stats(dev);
-	kmem_cache_destroy(steering->ftes_cache);
-	kmem_cache_destroy(steering->fgs_cache);
-	mlx5_ft_pool_destroy(dev);
-	kfree(steering);
-}
-
 static int init_sniffer_tx_root_ns(struct mlx5_flow_steering *steering)
 {
 	struct fs_prio *prio;
@@ -3086,42 +3064,27 @@ static int init_egress_root_ns(struct mlx5_flow_steering *steering)
 	return err;
 }
 
-int mlx5_init_fs(struct mlx5_core_dev *dev)
+void mlx5_fs_core_cleanup(struct mlx5_core_dev *dev)
 {
-	struct mlx5_flow_steering *steering;
-	int err = 0;
-
-	err = mlx5_init_fc_stats(dev);
-	if (err)
-		return err;
-
-	err = mlx5_ft_pool_init(dev);
-	if (err)
-		return err;
-
-	steering = kzalloc(sizeof(*steering), GFP_KERNEL);
-	if (!steering) {
-		err = -ENOMEM;
-		goto err;
-	}
-
-	steering->dev = dev;
-	dev->priv.steering = steering;
+	struct mlx5_flow_steering *steering = dev->priv.steering;
 
-	if (mlx5_fs_dr_is_supported(dev))
-		steering->mode = MLX5_FLOW_STEERING_MODE_SMFS;
-	else
-		steering->mode = MLX5_FLOW_STEERING_MODE_DMFS;
+	cleanup_root_ns(steering->root_ns);
+	cleanup_root_ns(steering->fdb_root_ns);
+	steering->fdb_root_ns = NULL;
+	kfree(steering->fdb_sub_ns);
+	steering->fdb_sub_ns = NULL;
+	cleanup_root_ns(steering->port_sel_root_ns);
+	cleanup_root_ns(steering->sniffer_rx_root_ns);
+	cleanup_root_ns(steering->sniffer_tx_root_ns);
+	cleanup_root_ns(steering->rdma_rx_root_ns);
+	cleanup_root_ns(steering->rdma_tx_root_ns);
+	cleanup_root_ns(steering->egress_root_ns);
+}
 
-	steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs",
-						sizeof(struct mlx5_flow_group), 0,
-						0, NULL);
-	steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0,
-						 0, NULL);
-	if (!steering->ftes_cache || !steering->fgs_cache) {
-		err = -ENOMEM;
-		goto err;
-	}
+int mlx5_fs_core_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int err = 0;
 
 	if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
 	      (MLX5_CAP_GEN(dev, nic_flow_table))) ||
@@ -3180,8 +3143,64 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 	}
 
 	return 0;
+
+err:
+	mlx5_fs_core_cleanup(dev);
+	return err;
+}
+
+void mlx5_fs_core_free(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+
+	kmem_cache_destroy(steering->ftes_cache);
+	kmem_cache_destroy(steering->fgs_cache);
+	kfree(steering);
+	mlx5_ft_pool_destroy(dev);
+	mlx5_cleanup_fc_stats(dev);
+}
+
+int mlx5_fs_core_alloc(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering;
+	int err = 0;
+
+	err = mlx5_init_fc_stats(dev);
+	if (err)
+		return err;
+
+	err = mlx5_ft_pool_init(dev);
+	if (err)
+		goto err;
+
+	steering = kzalloc(sizeof(*steering), GFP_KERNEL);
+	if (!steering) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	steering->dev = dev;
+	dev->priv.steering = steering;
+
+	if (mlx5_fs_dr_is_supported(dev))
+		steering->mode = MLX5_FLOW_STEERING_MODE_SMFS;
+	else
+		steering->mode = MLX5_FLOW_STEERING_MODE_DMFS;
+
+	steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs",
+						sizeof(struct mlx5_flow_group), 0,
+						0, NULL);
+	steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0,
+						 0, NULL);
+	if (!steering->ftes_cache || !steering->fgs_cache) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	return 0;
+
 err:
-	mlx5_cleanup_fs(dev);
+	mlx5_fs_core_free(dev);
 	return err;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index c488a7c5b07e..3f20523e514f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -298,8 +298,10 @@ int mlx5_flow_namespace_set_peer(struct mlx5_flow_root_namespace *ns,
 int mlx5_flow_namespace_set_mode(struct mlx5_flow_namespace *ns,
 				 enum mlx5_flow_steering_mode mode);
 
-int mlx5_init_fs(struct mlx5_core_dev *dev);
-void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
+int mlx5_fs_core_alloc(struct mlx5_core_dev *dev);
+void mlx5_fs_core_free(struct mlx5_core_dev *dev);
+int mlx5_fs_core_init(struct mlx5_core_dev *dev);
+void mlx5_fs_core_cleanup(struct mlx5_core_dev *dev);
 
 int mlx5_fs_egress_acls_init(struct mlx5_core_dev *dev, int total_vports);
 void mlx5_fs_egress_acls_cleanup(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 2589e39eb9c7..c6737720bf3a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -938,6 +938,12 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 		goto err_sf_table_cleanup;
 	}
 
+	err = mlx5_fs_core_alloc(dev);
+	if (err) {
+		mlx5_core_err(dev, "Failed to alloc flow steering\n");
+		goto err_fs;
+	}
+
 	dev->dm = mlx5_dm_create(dev);
 	if (IS_ERR(dev->dm))
 		mlx5_core_warn(dev, "Failed to init device memory%d\n", err);
@@ -948,6 +954,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 
 	return 0;
 
+err_fs:
+	mlx5_sf_table_cleanup(dev);
 err_sf_table_cleanup:
 	mlx5_sf_hw_table_cleanup(dev);
 err_sf_hw_table_cleanup:
@@ -985,6 +993,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_hv_vhca_destroy(dev->hv_vhca);
 	mlx5_fw_tracer_destroy(dev->tracer);
 	mlx5_dm_cleanup(dev);
+	mlx5_fs_core_free(dev);
 	mlx5_sf_table_cleanup(dev);
 	mlx5_sf_hw_table_cleanup(dev);
 	mlx5_vhca_event_cleanup(dev);
@@ -1191,7 +1200,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 		goto err_tls_start;
 	}
 
-	err = mlx5_init_fs(dev);
+	err = mlx5_fs_core_init(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to init flow steering\n");
 		goto err_fs;
@@ -1236,7 +1245,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 err_vhca:
 	mlx5_vhca_event_stop(dev);
 err_set_hca:
-	mlx5_cleanup_fs(dev);
+	mlx5_fs_core_cleanup(dev);
 err_fs:
 	mlx5_accel_tls_cleanup(dev);
 err_tls_start:
@@ -1265,7 +1274,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 	mlx5_ec_cleanup(dev);
 	mlx5_sf_hw_table_destroy(dev);
 	mlx5_vhca_event_stop(dev);
-	mlx5_cleanup_fs(dev);
+	mlx5_fs_core_cleanup(dev);
 	mlx5_accel_ipsec_cleanup(dev);
 	mlx5_accel_tls_cleanup(dev);
 	mlx5_fpga_device_stop(dev);
-- 
2.36.1


^ permalink raw reply related

* [net 03/11] net/mlx5: DR, Ignore modify TTL on RX if device doesn't support it
From: Saeed Mahameed @ 2022-05-18  6:34 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Yevgeny Kliteynik, Alex Vesker, Saeed Mahameed
In-Reply-To: <20220518063427.123758-1-saeed@kernel.org>

From: Yevgeny Kliteynik <kliteyn@nvidia.com>

When modifying TTL, packet's csum has to be recalculated.
Due to HW issue in ConnectX-5, csum recalculation for modify
TTL on RX is supported through a work-around that is specifically
enabled by configuration.
If the work-around isn't enabled, rather than adding an unsupported
action the modify TTL action on RX should be ignored.
Ignoring modify TTL action might result in zero actions, so in such
cases we will not convert the match STE to modify STE, as it is done
by FW in DMFS.

This patch fixes an issue where modify TTL action was ignored both
on RX and TX instead of only on RX.

Fixes: 4ff725e1d4ad ("net/mlx5: DR, Ignore modify TTL if device doesn't support it")
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../mellanox/mlx5/core/steering/dr_action.c   | 65 +++++++++++++------
 .../mellanox/mlx5/core/steering/dr_ste_v0.c   |  4 +-
 2 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
index b52b539c8d2c..1383550f44c1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
@@ -530,6 +530,37 @@ static int dr_action_handle_cs_recalc(struct mlx5dr_domain *dmn,
 	return 0;
 }
 
+static void dr_action_modify_ttl_adjust(struct mlx5dr_domain *dmn,
+					struct mlx5dr_ste_actions_attr *attr,
+					bool rx_rule,
+					bool *recalc_cs_required)
+{
+	*recalc_cs_required = false;
+
+	/* if device supports csum recalculation - no adjustment needed */
+	if (mlx5dr_ste_supp_ttl_cs_recalc(&dmn->info.caps))
+		return;
+
+	/* no adjustment needed on TX rules */
+	if (!rx_rule)
+		return;
+
+	if (!MLX5_CAP_ESW_FLOWTABLE(dmn->mdev, fdb_ipv4_ttl_modify)) {
+		/* Ignore the modify TTL action.
+		 * It is always kept as last HW action.
+		 */
+		attr->modify_actions--;
+		return;
+	}
+
+	if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB)
+		/* Due to a HW bug on some devices, modifying TTL on RX flows
+		 * will cause an incorrect checksum calculation. In such cases
+		 * we will use a FW table to recalculate the checksum.
+		 */
+		*recalc_cs_required = true;
+}
+
 static void dr_action_print_sequence(struct mlx5dr_domain *dmn,
 				     struct mlx5dr_action *actions[],
 				     int last_idx)
@@ -650,8 +681,9 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
 		case DR_ACTION_TYP_MODIFY_HDR:
 			attr.modify_index = action->rewrite->index;
 			attr.modify_actions = action->rewrite->num_of_actions;
-			recalc_cs_required = action->rewrite->modify_ttl &&
-					     !mlx5dr_ste_supp_ttl_cs_recalc(&dmn->info.caps);
+			if (action->rewrite->modify_ttl)
+				dr_action_modify_ttl_adjust(dmn, &attr, rx_rule,
+							    &recalc_cs_required);
 			break;
 		case DR_ACTION_TYP_L2_TO_TNL_L2:
 		case DR_ACTION_TYP_L2_TO_TNL_L3:
@@ -732,12 +764,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
 	*new_hw_ste_arr_sz = nic_matcher->num_of_builders;
 	last_ste = ste_arr + DR_STE_SIZE * (nic_matcher->num_of_builders - 1);
 
-	/* Due to a HW bug in some devices, modifying TTL on RX flows will
-	 * cause an incorrect checksum calculation. In this case we will
-	 * use a FW table to recalculate.
-	 */
-	if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB &&
-	    rx_rule && recalc_cs_required && dest_action) {
+	if (recalc_cs_required && dest_action) {
 		ret = dr_action_handle_cs_recalc(dmn, dest_action, &attr.final_icm_addr);
 		if (ret) {
 			mlx5dr_err(dmn,
@@ -1558,12 +1585,6 @@ dr_action_modify_check_is_ttl_modify(const void *sw_action)
 	return sw_field == MLX5_ACTION_IN_FIELD_OUT_IP_TTL;
 }
 
-static bool dr_action_modify_ttl_ignore(struct mlx5dr_domain *dmn)
-{
-	return !mlx5dr_ste_supp_ttl_cs_recalc(&dmn->info.caps) &&
-	       !MLX5_CAP_ESW_FLOWTABLE(dmn->mdev, fdb_ipv4_ttl_modify);
-}
-
 static int dr_actions_convert_modify_header(struct mlx5dr_action *action,
 					    u32 max_hw_actions,
 					    u32 num_sw_actions,
@@ -1575,6 +1596,7 @@ static int dr_actions_convert_modify_header(struct mlx5dr_action *action,
 	const struct mlx5dr_ste_action_modify_field *hw_dst_action_info;
 	const struct mlx5dr_ste_action_modify_field *hw_src_action_info;
 	struct mlx5dr_domain *dmn = action->rewrite->dmn;
+	__be64 *modify_ttl_sw_action = NULL;
 	int ret, i, hw_idx = 0;
 	__be64 *sw_action;
 	__be64 hw_action;
@@ -1587,8 +1609,14 @@ static int dr_actions_convert_modify_header(struct mlx5dr_action *action,
 	action->rewrite->allow_rx = 1;
 	action->rewrite->allow_tx = 1;
 
-	for (i = 0; i < num_sw_actions; i++) {
-		sw_action = &sw_actions[i];
+	for (i = 0; i < num_sw_actions || modify_ttl_sw_action; i++) {
+		/* modify TTL is handled separately, as a last action */
+		if (i == num_sw_actions) {
+			sw_action = modify_ttl_sw_action;
+			modify_ttl_sw_action = NULL;
+		} else {
+			sw_action = &sw_actions[i];
+		}
 
 		ret = dr_action_modify_check_field_limitation(action,
 							      sw_action);
@@ -1597,10 +1625,9 @@ static int dr_actions_convert_modify_header(struct mlx5dr_action *action,
 
 		if (!(*modify_ttl) &&
 		    dr_action_modify_check_is_ttl_modify(sw_action)) {
-			if (dr_action_modify_ttl_ignore(dmn))
-				continue;
-
+			modify_ttl_sw_action = sw_action;
 			*modify_ttl = true;
+			continue;
 		}
 
 		/* Convert SW action to HW action */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c
index 5a322335f204..2010d4ac6519 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c
@@ -420,7 +420,7 @@ dr_ste_v0_set_actions_tx(struct mlx5dr_domain *dmn,
 	 * encapsulation. The reason for that is that we support
 	 * modify headers for outer headers only
 	 */
-	if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) {
+	if (action_type_set[DR_ACTION_TYP_MODIFY_HDR] && attr->modify_actions) {
 		dr_ste_v0_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT);
 		dr_ste_v0_set_rewrite_actions(last_ste,
 					      attr->modify_actions,
@@ -513,7 +513,7 @@ dr_ste_v0_set_actions_rx(struct mlx5dr_domain *dmn,
 		}
 	}
 
-	if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) {
+	if (action_type_set[DR_ACTION_TYP_MODIFY_HDR] && attr->modify_actions) {
 		if (dr_ste_v0_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT)
 			dr_ste_v0_arr_init_next(&last_ste,
 						added_stes,
-- 
2.36.1


^ permalink raw reply related

* [net 01/11] net/mlx5: DR, Fix missing flow_source when creating multi-destination FW table
From: Saeed Mahameed @ 2022-05-18  6:34 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Maor Dickman, Yevgeny Kliteynik, Saeed Mahameed
In-Reply-To: <20220518063427.123758-1-saeed@kernel.org>

From: Maor Dickman <maord@nvidia.com>

In order to support multiple destination FTEs with SW steering
FW table is created with single FTE with multiple actions and
SW steering rule forward to it. When creating this table, flow
source isn't set according to the original FTE.

Fix this by passing the original FTE flow source to the created
FW table.

Fixes: 34583beea4b7 ("net/mlx5: DR, Create multi-destination table for SW-steering use")
Signed-off-by: Maor Dickman <maord@nvidia.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/steering/dr_action.c    | 6 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c    | 4 +++-
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c    | 4 +++-
 drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h   | 3 ++-
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
index 850937cd8bf9..b52b539c8d2c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
@@ -842,7 +842,8 @@ struct mlx5dr_action *
 mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn,
 				   struct mlx5dr_action_dest *dests,
 				   u32 num_of_dests,
-				   bool ignore_flow_level)
+				   bool ignore_flow_level,
+				   u32 flow_source)
 {
 	struct mlx5dr_cmd_flow_destination_hw_info *hw_dests;
 	struct mlx5dr_action **ref_actions;
@@ -914,7 +915,8 @@ mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn,
 				      reformat_req,
 				      &action->dest_tbl->fw_tbl.id,
 				      &action->dest_tbl->fw_tbl.group_id,
-				      ignore_flow_level);
+				      ignore_flow_level,
+				      flow_source);
 	if (ret)
 		goto free_action;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c
index 68a4c32d5f34..f05ef0cd54ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c
@@ -104,7 +104,8 @@ int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn,
 			    bool reformat_req,
 			    u32 *tbl_id,
 			    u32 *group_id,
-			    bool ignore_flow_level)
+			    bool ignore_flow_level,
+			    u32 flow_source)
 {
 	struct mlx5dr_cmd_create_flow_table_attr ft_attr = {};
 	struct mlx5dr_cmd_fte_info fte_info = {};
@@ -139,6 +140,7 @@ int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn,
 	fte_info.val = val;
 	fte_info.dest_arr = dest;
 	fte_info.ignore_flow_level = ignore_flow_level;
+	fte_info.flow_context.flow_source = flow_source;
 
 	ret = mlx5dr_cmd_set_fte(dmn->mdev, 0, 0, &ft_info, *group_id, &fte_info);
 	if (ret) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 46866a5fc5ca..98320e3945ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -1461,7 +1461,8 @@ int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn,
 			    bool reformat_req,
 			    u32 *tbl_id,
 			    u32 *group_id,
-			    bool ignore_flow_level);
+			    bool ignore_flow_level,
+			    u32 flow_source);
 void mlx5dr_fw_destroy_md_tbl(struct mlx5dr_domain *dmn, u32 tbl_id,
 			      u32 group_id);
 #endif  /* _DR_TYPES_H_ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
index 045b0cf90063..728f81882589 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
@@ -520,6 +520,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns,
 	} else if (num_term_actions > 1) {
 		bool ignore_flow_level =
 			!!(fte->action.flags & FLOW_ACT_IGNORE_FLOW_LEVEL);
+		u32 flow_source = fte->flow_context.flow_source;
 
 		if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX ||
 		    fs_dr_num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) {
@@ -529,7 +530,8 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns,
 		tmp_action = mlx5dr_action_create_mult_dest_tbl(domain,
 								term_actions,
 								num_term_actions,
-								ignore_flow_level);
+								ignore_flow_level,
+								flow_source);
 		if (!tmp_action) {
 			err = -EOPNOTSUPP;
 			goto free_actions;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
index ec5cbec0d455..7626c85643b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
@@ -99,7 +99,8 @@ struct mlx5dr_action *
 mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn,
 				   struct mlx5dr_action_dest *dests,
 				   u32 num_of_dests,
-				   bool ignore_flow_level);
+				   bool ignore_flow_level,
+				   u32 flow_source);
 
 struct mlx5dr_action *mlx5dr_action_create_drop(void);
 
-- 
2.36.1


^ permalink raw reply related

* [pull request][net 00/11] mlx5 fixes 2022-05-17
From: Saeed Mahameed @ 2022-05-18  6:34 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski, Paolo Abeni; +Cc: netdev, Saeed Mahameed

From: Saeed Mahameed <saeedm@nvidia.com>

This series provides bug fixes to mlx5 driver.
Please pull and let me know if there is any problem.

Thanks,
Saeed.


The following changes since commit 23dd4581350d4ffa23d58976ec46408f8f4c1e16:

  NFC: nci: fix sleep in atomic context bugs caused by nci_skb_alloc (2022-05-17 17:55:53 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5-fixes-2022-05-17

for you to fetch changes up to 16d42d313350946f4b9a8b74a13c99f0461a6572:

  net/mlx5: Drain fw_reset when removing device (2022-05-17 23:03:57 -0700)

----------------------------------------------------------------
mlx5-fixes-2022-05-17

----------------------------------------------------------------
Aya Levin (1):
      net/mlx5e: Block rx-gro-hw feature in switchdev mode

Gal Pressman (1):
      net/mlx5e: Remove HW-GRO from reported features

Maor Dickman (1):
      net/mlx5: DR, Fix missing flow_source when creating multi-destination FW table

Maxim Mikityanskiy (3):
      net/mlx5e: Wrap mlx5e_trap_napi_poll into rcu_read_lock
      net/mlx5e: Properly block LRO when XDP is enabled
      net/mlx5e: Properly block HW GRO when XDP is enabled

Paul Blakey (2):
      net/mlx5e: CT: Fix support for GRE tuples
      net/mlx5e: CT: Fix setting flow_source for smfs ct tuples

Shay Drory (2):
      net/mlx5: Initialize flow steering during driver probe
      net/mlx5: Drain fw_reset when removing device

Yevgeny Kliteynik (1):
      net/mlx5: DR, Ignore modify TTL on RX if device doesn't support it

 .../ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c |  58 +++++----
 drivers/net/ethernet/mellanox/mlx5/core/en/trap.c  |  13 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  27 ++++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 131 ++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c |  25 +++-
 drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  19 ++-
 .../mellanox/mlx5/core/steering/dr_action.c        |  71 +++++++----
 .../ethernet/mellanox/mlx5/core/steering/dr_fw.c   |   4 +-
 .../mellanox/mlx5/core/steering/dr_ste_v0.c        |   4 +-
 .../mellanox/mlx5/core/steering/dr_types.h         |   3 +-
 .../ethernet/mellanox/mlx5/core/steering/fs_dr.c   |   4 +-
 .../ethernet/mellanox/mlx5/core/steering/mlx5dr.h  |   3 +-
 14 files changed, 246 insertions(+), 123 deletions(-)

^ permalink raw reply

* Re: [PATCH] bpf: avoid grabbing spin_locks of all cpus when no free elems
From: Alexei Starovoitov @ 2022-05-18  6:32 UTC (permalink / raw)
  To: Feng zhou
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Network Development, bpf, LKML, Xiongchun Duan,
	Muchun Song, Dongdong Wang, Cong Wang, Chengming Zhou
In-Reply-To: <20220518062715.27809-1-zhoufeng.zf@bytedance.com>

On Tue, May 17, 2022 at 11:27 PM Feng zhou <zhoufeng.zf@bytedance.com> wrote:
>
> From: Feng Zhou <zhoufeng.zf@bytedance.com>
>
> We encountered bad case on big system with 96 CPUs that
> alloc_htab_elem() would last for 1ms. The reason is that after the
> prealloc hashtab has no free elems, when trying to update, it will still
> grab spin_locks of all cpus. If there are multiple update users, the
> competition is very serious.
>
> So this patch add is_empty in pcpu_freelist_head to check freelist
> having free or not. If having, grab spin_lock, or check next cpu's
> freelist.
>
> Before patch: hash_map performance
> ./map_perf_test 1
> 0:hash_map_perf pre-alloc 975345 events per sec
> 4:hash_map_perf pre-alloc 855367 events per sec
> 12:hash_map_perf pre-alloc 860862 events per sec
> 8:hash_map_perf pre-alloc 849561 events per sec
> 3:hash_map_perf pre-alloc 849074 events per sec
> 6:hash_map_perf pre-alloc 847120 events per sec
> 10:hash_map_perf pre-alloc 845047 events per sec
> 5:hash_map_perf pre-alloc 841266 events per sec
> 14:hash_map_perf pre-alloc 849740 events per sec
> 2:hash_map_perf pre-alloc 839598 events per sec
> 9:hash_map_perf pre-alloc 838695 events per sec
> 11:hash_map_perf pre-alloc 845390 events per sec
> 7:hash_map_perf pre-alloc 834865 events per sec
> 13:hash_map_perf pre-alloc 842619 events per sec
> 1:hash_map_perf pre-alloc 804231 events per sec
> 15:hash_map_perf pre-alloc 795314 events per sec
>
> hash_map the worst: no free
> ./map_perf_test 2048
> 6:worse hash_map_perf pre-alloc 28628 events per sec
> 5:worse hash_map_perf pre-alloc 28553 events per sec
> 11:worse hash_map_perf pre-alloc 28543 events per sec
> 3:worse hash_map_perf pre-alloc 28444 events per sec
> 1:worse hash_map_perf pre-alloc 28418 events per sec
> 7:worse hash_map_perf pre-alloc 28427 events per sec
> 13:worse hash_map_perf pre-alloc 28330 events per sec
> 14:worse hash_map_perf pre-alloc 28263 events per sec
> 9:worse hash_map_perf pre-alloc 28211 events per sec
> 15:worse hash_map_perf pre-alloc 28193 events per sec
> 12:worse hash_map_perf pre-alloc 28190 events per sec
> 10:worse hash_map_perf pre-alloc 28129 events per sec
> 8:worse hash_map_perf pre-alloc 28116 events per sec
> 4:worse hash_map_perf pre-alloc 27906 events per sec
> 2:worse hash_map_perf pre-alloc 27801 events per sec
> 0:worse hash_map_perf pre-alloc 27416 events per sec
> 3:worse hash_map_perf pre-alloc 28188 events per sec
>
> ftrace trace
>
> 0)               |  htab_map_update_elem() {
> 0)   0.198 us    |    migrate_disable();
> 0)               |    _raw_spin_lock_irqsave() {
> 0)   0.157 us    |      preempt_count_add();
> 0)   0.538 us    |    }
> 0)   0.260 us    |    lookup_elem_raw();
> 0)               |    alloc_htab_elem() {
> 0)               |      __pcpu_freelist_pop() {
> 0)               |        _raw_spin_lock() {
> 0)   0.152 us    |          preempt_count_add();
> 0)   0.352 us    |          native_queued_spin_lock_slowpath();
> 0)   1.065 us    |        }
>                  |        ...
> 0)               |        _raw_spin_unlock() {
> 0)   0.254 us    |          preempt_count_sub();
> 0)   0.555 us    |        }
> 0) + 25.188 us   |      }
> 0) + 25.486 us   |    }
> 0)               |    _raw_spin_unlock_irqrestore() {
> 0)   0.155 us    |      preempt_count_sub();
> 0)   0.454 us    |    }
> 0)   0.148 us    |    migrate_enable();
> 0) + 28.439 us   |  }
>
> The test machine is 16C, trying to get spin_lock 17 times, in addition
> to 16c, there is an extralist.

Is this with small max_entries and a large number of cpus?

If so, probably better to fix would be to artificially
bump max_entries to be 4x of num_cpus.
Racy is_empty check still wastes the loop.

^ permalink raw reply

* [Patch] net: af_key: check encryption module availability consistency
From: Thomas Bartschies @ 2022-05-18  6:32 UTC (permalink / raw)
  To: davem; +Cc: steffen.klassert, herbert, kuba, pabeni, netdev, linux-kernel

Since the recent introduction supporting the SM3 and SM4 hash algos for IPsec, the kernel 
produces invalid pfkey acquire messages, when these encryption modules are disabled. This 
happens because the availability of the algos wasn't checked in all necessary functions. 
This patch adds these checks.

Signed-off-by: Thomas Bartschies <thomas.bartschies@cvk.de>

diff -uprN a/net/key/af_key.c b/net/key/af_key.c
--- a/net/key/af_key.c	2022-05-09 09:16:33.000000000 +0200
+++ b/net/key/af_key.c	2022-05-13 13:51:58.286250337 +0200
@@ -2898,7 +2898,7 @@ static int count_ah_combs(const struct x
 			break;
 		if (!aalg->pfkey_supported)
 			continue;
-		if (aalg_tmpl_set(t, aalg))
+		if (aalg_tmpl_set(t, aalg) && aalg->available)
 			sz += sizeof(struct sadb_comb);
 	}
 	return sz + sizeof(struct sadb_prop);
@@ -2916,7 +2916,7 @@ static int count_esp_combs(const struct
 		if (!ealg->pfkey_supported)
 			continue;
 
-		if (!(ealg_tmpl_set(t, ealg)))
+		if (!(ealg_tmpl_set(t, ealg) && ealg->available))
 			continue;
 
 		for (k = 1; ; k++) {
@@ -2927,7 +2927,7 @@ static int count_esp_combs(const struct
 			if (!aalg->pfkey_supported)
 				continue;
 
-			if (aalg_tmpl_set(t, aalg))
+			if (aalg_tmpl_set(t, aalg) && aalg->available)
 				sz += sizeof(struct sadb_comb);
 		}
 	}

^ permalink raw reply

* [PATCH] bpf: avoid grabbing spin_locks of all cpus when no free elems
From: Feng zhou @ 2022-05-18  6:27 UTC (permalink / raw)
  To: ast, daniel, andrii, kafai, songliubraving, yhs, john.fastabend,
	kpsingh
  Cc: netdev, bpf, linux-kernel, duanxiongchun, songmuchun,
	wangdongdong.6, cong.wang, zhouchengming, zhoufeng.zf

From: Feng Zhou <zhoufeng.zf@bytedance.com>

We encountered bad case on big system with 96 CPUs that
alloc_htab_elem() would last for 1ms. The reason is that after the
prealloc hashtab has no free elems, when trying to update, it will still
grab spin_locks of all cpus. If there are multiple update users, the
competition is very serious.

So this patch add is_empty in pcpu_freelist_head to check freelist
having free or not. If having, grab spin_lock, or check next cpu's
freelist.

Before patch: hash_map performance
./map_perf_test 1
0:hash_map_perf pre-alloc 975345 events per sec
4:hash_map_perf pre-alloc 855367 events per sec
12:hash_map_perf pre-alloc 860862 events per sec
8:hash_map_perf pre-alloc 849561 events per sec
3:hash_map_perf pre-alloc 849074 events per sec
6:hash_map_perf pre-alloc 847120 events per sec
10:hash_map_perf pre-alloc 845047 events per sec
5:hash_map_perf pre-alloc 841266 events per sec
14:hash_map_perf pre-alloc 849740 events per sec
2:hash_map_perf pre-alloc 839598 events per sec
9:hash_map_perf pre-alloc 838695 events per sec
11:hash_map_perf pre-alloc 845390 events per sec
7:hash_map_perf pre-alloc 834865 events per sec
13:hash_map_perf pre-alloc 842619 events per sec
1:hash_map_perf pre-alloc 804231 events per sec
15:hash_map_perf pre-alloc 795314 events per sec

hash_map the worst: no free
./map_perf_test 2048
6:worse hash_map_perf pre-alloc 28628 events per sec
5:worse hash_map_perf pre-alloc 28553 events per sec
11:worse hash_map_perf pre-alloc 28543 events per sec
3:worse hash_map_perf pre-alloc 28444 events per sec
1:worse hash_map_perf pre-alloc 28418 events per sec
7:worse hash_map_perf pre-alloc 28427 events per sec
13:worse hash_map_perf pre-alloc 28330 events per sec
14:worse hash_map_perf pre-alloc 28263 events per sec
9:worse hash_map_perf pre-alloc 28211 events per sec
15:worse hash_map_perf pre-alloc 28193 events per sec
12:worse hash_map_perf pre-alloc 28190 events per sec
10:worse hash_map_perf pre-alloc 28129 events per sec
8:worse hash_map_perf pre-alloc 28116 events per sec
4:worse hash_map_perf pre-alloc 27906 events per sec
2:worse hash_map_perf pre-alloc 27801 events per sec
0:worse hash_map_perf pre-alloc 27416 events per sec
3:worse hash_map_perf pre-alloc 28188 events per sec

ftrace trace

0)               |  htab_map_update_elem() {
0)   0.198 us    |    migrate_disable();
0)               |    _raw_spin_lock_irqsave() {
0)   0.157 us    |      preempt_count_add();
0)   0.538 us    |    }
0)   0.260 us    |    lookup_elem_raw();
0)               |    alloc_htab_elem() {
0)               |      __pcpu_freelist_pop() {
0)               |        _raw_spin_lock() {
0)   0.152 us    |          preempt_count_add();
0)   0.352 us    |          native_queued_spin_lock_slowpath();
0)   1.065 us    |        }
		 |	  ...
0)               |        _raw_spin_unlock() {
0)   0.254 us    |          preempt_count_sub();
0)   0.555 us    |        }
0) + 25.188 us   |      }
0) + 25.486 us   |    }
0)               |    _raw_spin_unlock_irqrestore() {
0)   0.155 us    |      preempt_count_sub();
0)   0.454 us    |    }
0)   0.148 us    |    migrate_enable();
0) + 28.439 us   |  }

The test machine is 16C, trying to get spin_lock 17 times, in addition
to 16c, there is an extralist.

after patch: hash_map performance
./map_perf_test 1
0:hash_map_perf pre-alloc 969348 events per sec
10:hash_map_perf pre-alloc 906526 events per sec
11:hash_map_perf pre-alloc 904557 events per sec
9:hash_map_perf pre-alloc 902384 events per sec
15:hash_map_perf pre-alloc 912287 events per sec
14:hash_map_perf pre-alloc 905689 events per sec
12:hash_map_perf pre-alloc 903680 events per sec
13:hash_map_perf pre-alloc 902631 events per sec
8:hash_map_perf pre-alloc 875369 events per sec
4:hash_map_perf pre-alloc 862808 events per sec
1:hash_map_perf pre-alloc 857218 events per sec
2:hash_map_perf pre-alloc 852875 events per sec
5:hash_map_perf pre-alloc 846497 events per sec
6:hash_map_perf pre-alloc 828467 events per sec
3:hash_map_perf pre-alloc 812542 events per sec
7:hash_map_perf pre-alloc 805336 events per sec

hash_map worst: no free
./map_perf_test 2048
7:worse hash_map_perf pre-alloc 391104 events per sec
4:worse hash_map_perf pre-alloc 388073 events per sec
5:worse hash_map_perf pre-alloc 387038 events per sec
1:worse hash_map_perf pre-alloc 386546 events per sec
0:worse hash_map_perf pre-alloc 384590 events per sec
11:worse hash_map_perf pre-alloc 379378 events per sec
10:worse hash_map_perf pre-alloc 375480 events per sec
12:worse hash_map_perf pre-alloc 372394 events per sec
6:worse hash_map_perf pre-alloc 367692 events per sec
3:worse hash_map_perf pre-alloc 363970 events per sec
9:worse hash_map_perf pre-alloc 364008 events per sec
8:worse hash_map_perf pre-alloc 363759 events per sec
2:worse hash_map_perf pre-alloc 360743 events per sec
14:worse hash_map_perf pre-alloc 361195 events per sec
13:worse hash_map_perf pre-alloc 360276 events per sec
15:worse hash_map_perf pre-alloc 360057 events per sec
0:worse hash_map_perf pre-alloc 378177 events per sec

ftrace trace
0)               |  htab_map_update_elem() {
0)   0.317 us    |    migrate_disable();
0)               |    _raw_spin_lock_irqsave() {
0)   0.260 us    |      preempt_count_add();
0)   1.803 us    |    }
0)   0.276 us    |    lookup_elem_raw();
0)               |    alloc_htab_elem() {
0)   0.586 us    |      __pcpu_freelist_pop();
0)   0.945 us    |    }
0)               |    _raw_spin_unlock_irqrestore() {
0)   0.160 us    |      preempt_count_sub();
0)   0.972 us    |    }
0)   0.657 us    |    migrate_enable();
0)   8.669 us    |  }

It can be seen that after adding this patch, the map performance is
almost not degraded, and when free=0, first check is_empty instead of
directly acquiring spin_lock.

As for why to add is_empty instead of directly judging head->first, my
understanding is this, head->first is frequently modified during updating
map, which will lead to invalid other cpus's cache, and is_empty is after
freelist having no free elems will be changed, the performance will be
better.

Co-developed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Feng Zhou <zhoufeng.zf@bytedance.com>
---
 kernel/bpf/percpu_freelist.c | 68 ++++++++++++++++++++++++------------
 kernel/bpf/percpu_freelist.h |  1 +
 2 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 3d897de89061..abc148c8cae1 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -16,9 +16,11 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
 
 		raw_spin_lock_init(&head->lock);
 		head->first = NULL;
+		head->is_empty = true;
 	}
 	raw_spin_lock_init(&s->extralist.lock);
 	s->extralist.first = NULL;
+	s->extralist.is_empty = true;
 	return 0;
 }
 
@@ -32,6 +34,8 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
 {
 	node->next = head->first;
 	head->first = node;
+	if (head->is_empty)
+		head->is_empty = false;
 }
 
 static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -130,14 +134,18 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
-		raw_spin_lock(&head->lock);
-		node = head->first;
-		if (node) {
-			head->first = node->next;
+		if (!head->is_empty) {
+			raw_spin_lock(&head->lock);
+			node = head->first;
+			if (node) {
+				head->first = node->next;
+				if (!head->first)
+					head->is_empty = true;
+				raw_spin_unlock(&head->lock);
+				return node;
+			}
 			raw_spin_unlock(&head->lock);
-			return node;
 		}
-		raw_spin_unlock(&head->lock);
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
@@ -146,11 +154,16 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 	}
 
 	/* per cpu lists are all empty, try extralist */
-	raw_spin_lock(&s->extralist.lock);
-	node = s->extralist.first;
-	if (node)
-		s->extralist.first = node->next;
-	raw_spin_unlock(&s->extralist.lock);
+	if (!s->extralist.is_empty) {
+		raw_spin_lock(&s->extralist.lock);
+		node = s->extralist.first;
+		if (node) {
+			s->extralist.first = node->next;
+			if (!s->extralist.first)
+				s->extralist.is_empty = true;
+		}
+		raw_spin_unlock(&s->extralist.lock);
+	}
 	return node;
 }
 
@@ -164,14 +177,18 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
-		if (raw_spin_trylock(&head->lock)) {
-			node = head->first;
-			if (node) {
-				head->first = node->next;
+		if (!head->is_empty) {
+			if (raw_spin_trylock(&head->lock)) {
+				node = head->first;
+				if (node) {
+					head->first = node->next;
+					if (!head->first)
+						head->is_empty = true;
+					raw_spin_unlock(&head->lock);
+					return node;
+				}
 				raw_spin_unlock(&head->lock);
-				return node;
 			}
-			raw_spin_unlock(&head->lock);
 		}
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
@@ -181,12 +198,17 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
 	}
 
 	/* cannot pop from per cpu lists, try extralist */
-	if (!raw_spin_trylock(&s->extralist.lock))
-		return NULL;
-	node = s->extralist.first;
-	if (node)
-		s->extralist.first = node->next;
-	raw_spin_unlock(&s->extralist.lock);
+	if (!s->extralist.is_empty) {
+		if (!raw_spin_trylock(&s->extralist.lock))
+			return NULL;
+		node = s->extralist.first;
+		if (node) {
+			s->extralist.first = node->next;
+			if (!s->extralist.first)
+				s->extralist.is_empty = true;
+		}
+		raw_spin_unlock(&s->extralist.lock);
+	}
 	return node;
 }
 
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3c76553cfe57..9e4545631ed5 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -9,6 +9,7 @@
 struct pcpu_freelist_head {
 	struct pcpu_freelist_node *first;
 	raw_spinlock_t lock;
+	bool is_empty;
 };
 
 struct pcpu_freelist {
-- 
2.20.1


^ permalink raw reply related

* [PATCH] net: fec: Avoid allocating rx buffer using ATOMIC in ndo_open
From: Michael Trimarchi @ 2022-05-18  6:20 UTC (permalink / raw)
  To: Joakim Zhang, Liam Girdwood, Jakub Kicinski
  Cc: Eric Dumazet, Paolo Abeni, netdev, linux-kernel

Make ndo_open less sensitive to memory pressure.

Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
---
Change:
	- Adjust the commit message addressing the comments in RFC version
---
 drivers/net/ethernet/freescale/fec_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 9f33ec838b52..09eb6ea9a584 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3076,7 +3076,7 @@ fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
 	rxq = fep->rx_queue[queue];
 	bdp = rxq->bd.base;
 	for (i = 0; i < rxq->bd.ring_size; i++) {
-		skb = netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE);
+		skb = __netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE, GFP_KERNEL);
 		if (!skb)
 			goto err_alloc;
 
-- 
2.25.1


^ permalink raw reply related

* RE: [EXTERNAL] Re: [PATCH 05/12] net: mana: Set the DMA device max page size
From: Ajay Sharma @ 2022-05-18  5:59 UTC (permalink / raw)
  To: Jason Gunthorpe, Long Li
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Wei Liu,
	Dexuan Cui, David S. Miller, Jakub Kicinski, Paolo Abeni,
	Leon Romanovsky, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-rdma@vger.kernel.org, Ajay Sharma
In-Reply-To: <20220518000356.GO63055@ziepe.ca>

Thanks Long. 
Hello Jason,
I am the author of the patch.
To your comment below : 
" As I've already said, you are supposed to set the value that limits to ib_sge and *NOT* the value that is related to ib_umem_find_best_pgsz. It is usually 2G because the ib_sge's typically work on a 32 bit length."

The ib_sge is limited by the __sg_alloc_table_from_pages() which uses ib_dma_max_seg_size() which is what is set by the eth driver using dma_set_max_seg_size() . Currently our hw does not support PTEs larger than 2M. 

So ib_umem_find_best_pgsz()  takes as an input PG_SZ_BITMAP .  The bitmap has all the bits set for the page sizes supported by the HW.

#define PAGE_SZ_BM (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K \
		    | SZ_256K | SZ_512K | SZ_1M | SZ_2M)

 Are you suggesting we are too restrictive in the bitmap  we are passing ? or that we should not set this bitmap let the function choose default ?

Regards,
Ajay

-----Original Message-----
From: Jason Gunthorpe <jgg@ziepe.ca> 
Sent: Tuesday, May 17, 2022 5:04 PM
To: Long Li <longli@microsoft.com>
Cc: Ajay Sharma <sharmaajay@microsoft.com>; KY Srinivasan <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Stephen Hemminger <sthemmin@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui <decui@microsoft.com>; David S. Miller <davem@davemloft.net>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Leon Romanovsky <leon@kernel.org>; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org
Subject: [EXTERNAL] Re: [PATCH 05/12] net: mana: Set the DMA device max page size

[You don't often get email from jgg@ziepe.ca. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification.]

On Tue, May 17, 2022 at 08:04:58PM +0000, Long Li wrote:
> > Subject: Re: [PATCH 05/12] net: mana: Set the DMA device max page 
> > size
> >
> > On Tue, May 17, 2022 at 07:32:51PM +0000, Long Li wrote:
> > > > Subject: Re: [PATCH 05/12] net: mana: Set the DMA device max 
> > > > page size
> > > >
> > > > On Tue, May 17, 2022 at 02:04:29AM -0700, 
> > > > longli@linuxonhyperv.com
> > wrote:
> > > > > From: Long Li <longli@microsoft.com>
> > > > >
> > > > > The system chooses default 64K page size if the device does 
> > > > > not specify the max page size the device can handle for DMA. 
> > > > > This do not work well when device is registering large chunk 
> > > > > of memory in that a large page size is more efficient.
> > > > >
> > > > > Set it to the maximum hardware supported page size.
> > > >
> > > > For RDMA devices this should be set to the largest segment size 
> > > > an ib_sge can take in when posting work. It should not be the 
> > > > page size of MR. 2M is a weird number for that, are you sure it is right?
> > >
> > > Yes, this is the maximum page size used in hardware page tables.
> >
> > As I said, it should be the size of the sge in the WQE, not the 
> > "hardware page tables"
>
> This driver uses the following code to figure out the largest page 
> size for memory registration with hardware:
>
> page_sz = ib_umem_find_best_pgsz(mr->umem, PAGE_SZ_BM, iova);
>
> In this function, mr->umem is created with ib_dma_max_seg_size() as 
> its max segment size when creating its sgtable.
>
> The purpose of setting DMA page size to 2M is to make sure this 
> function returns the largest possible MR size that the hardware can 
> take. Otherwise, this function will return 64k: the default DMA size.

As I've already said, you are supposed to set the value that limits to ib_sge and *NOT* the value that is related to ib_umem_find_best_pgsz. It is usually 2G because the ib_sge's typically work on a 32 bit length.

Jason

^ permalink raw reply

* Re: [PATCH net 0/2] selftests: net: add missing tests to Makefile
From: Hangbin Liu @ 2022-05-18  6:01 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, David S. Miller, Paolo Abeni, Shuah Khan, linux-kselftest
In-Reply-To: <20220517124517.363445f4@kernel.org>

On Tue, May 17, 2022 at 12:45:17PM -0700, Jakub Kicinski wrote:
> Yes, we don't have the auto-reply. There's too much noise in some of
> the tests, but mostly it's because we don't want to encourage people
> posting patches just to build them. If it's a machine replying rather
> than a human some may think that it's okay. We already have
> jaw-droppingly expensive VM instance to keep up with the build volume.
> And the list is very busy. So we can't afford "post to run the CI"
> development model.

OK, I just afraid the developer doesn't check patchwork status.

> > +files=$(git show --name-status --oneline | grep -P '^A\ttools/testing/selftests/net/' | grep '\.sh$' | sed 's@A\ttools/testing/selftests/net/@@')
> > +for file in $files; do
> > +	if echo $file | grep forwarding; then
> > +		file=$(echo $file | sed 's/forwarding\///')
> > +		if ! grep -P "[\t| ]$file" tools/testing/selftests/net/forwarding/Makefile;then
> > +			echo "new test $file not in selftests/net/forwarding/Makefile" >&$DESC_FD
> > +			rc=1
> > +		fi
> > +	else
> > +		if ! grep -P "[\t| ]$file" tools/testing/selftests/net/Makefile;then
> > +			echo "new test $file not in selftests/net/Makefile" >&$DESC_FD
> > +			rc=1
> > +		fi
> 
> Does it matter which exact selftest makefile the changes are?

I only checked the tools/testing/selftests/net/Makefile and
tools/testing/selftests/net/forwarding/Makefile at present.
Maybe mptcp should also added?

> Maybe as a first stab we should just check if there are changes 
> to anything in tools/testing/selftests/.*/Makefile?

In my checking only shell scripts are checked, as most net net/forwarding tests
using shell script for testing. But other sub-component may use c binary or
python for testing. So I think there is no need to check all
tools/testing/selftests/.*/Makefile. WDYT?

Thanks
Hangbin

^ permalink raw reply

* Re: [PATCH v2 0/2] octeon_ep: Fix the error handling path of octep_request_irqs()
From: Dan Carpenter @ 2022-05-18  5:33 UTC (permalink / raw)
  To: Christophe JAILLET
  Cc: vburru, aayarekar, davem, edumazet, kuba, pabeni, sburla, netdev,
	linux-kernel, kernel-janitors
In-Reply-To: <cover.1652819974.git.christophe.jaillet@wanadoo.fr>

Thanks!

Reviewed-by: Dan Carpenter <dan.carpenter@oracle.com>

regards,
dan carpenter


^ permalink raw reply

* [PATCH ipsec-next v1 6/6] xfrm: enforce separation between priorities of HW/SW policies
From: Leon Romanovsky @ 2022-05-18  5:30 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Leon Romanovsky, David S . Miller, Herbert Xu, netdev, Raed Salem,
	ipsec-devel
In-Reply-To: <cover.1652851393.git.leonro@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Devices that implement IPsec full offload mode offload policies too.
In RX path, it causes to the situation that HW can't effectively handle
mixed SW and HW priorities unless users make sure that HW offloaded
policies have higher priorities.

In order to make sure that users have coherent picture, let's require to
make sure that HW offloaded policies have always (both RX and TX) higher
priorities than SW ones.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/net/netns/xfrm.h |   8 ++-
 net/xfrm/xfrm_policy.c   | 113 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index bd7c3be4af5d..f0cfa0faf611 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -29,6 +29,11 @@ struct xfrm_policy_hthresh {
 	u8			rbits6;
 };
 
+struct xfrm_policy_prio {
+	u32 max_sw_prio;
+	u32 min_hw_prio;
+};
+
 struct netns_xfrm {
 	struct list_head	state_all;
 	/*
@@ -52,7 +57,7 @@ struct netns_xfrm {
 	unsigned int		policy_idx_hmask;
 	struct hlist_head	policy_inexact[XFRM_POLICY_MAX];
 	struct xfrm_policy_hash	policy_bydst[XFRM_POLICY_MAX];
-	unsigned int		policy_count[XFRM_POLICY_MAX * 2];
+	unsigned int		policy_count[XFRM_POLICY_MAX * 3];
 	struct work_struct	policy_hash_work;
 	struct xfrm_policy_hthresh policy_hthresh;
 	struct list_head	inexact_bins;
@@ -67,6 +72,7 @@ struct netns_xfrm {
 	u32			sysctl_acq_expires;
 
 	u8			policy_default[XFRM_POLICY_MAX];
+	struct xfrm_policy_prio	policy_prio[XFRM_POLICY_MAX];
 
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*sysctl_hdr;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 681670f7d50f..9b6f89618ab4 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1570,13 +1570,70 @@ static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
 	return delpol;
 }
 
+static int __xfrm_policy_check_hw_priority(struct net *net,
+					   struct xfrm_policy *policy, int dir)
+{
+	int left, right;
+
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	if (!net->xfrm.policy_count[dir])
+		/* Adding first policy */
+		return 0;
+
+	if (policy->xdo.type != XFRM_DEV_OFFLOAD_FULL) {
+		/* SW priority */
+		if (!net->xfrm.policy_count[2 * XFRM_POLICY_MAX + dir])
+			/* Special case to allow reuse maximum priority
+			 * (U32_MAX) for SW policies, when no HW policy exist.
+			 */
+			return 0;
+
+		left = policy->priority;
+		right = net->xfrm.policy_prio[dir].min_hw_prio;
+	} else {
+		/* HW priority */
+		left = net->xfrm.policy_prio[dir].max_sw_prio;
+		right = policy->priority;
+	}
+	if (left >= right)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void __xfrm_policy_update_hw_priority(struct net *net,
+					     struct xfrm_policy *policy,
+					     int dir)
+{
+	u32 *hw_prio, *sw_prio;
+
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	if (policy->xdo.type != XFRM_DEV_OFFLOAD_FULL) {
+		sw_prio = &net->xfrm.policy_prio[dir].max_sw_prio;
+		*sw_prio = max(*sw_prio, policy->priority);
+		return;
+	}
+
+	hw_prio = &net->xfrm.policy_prio[dir].min_hw_prio;
+	*hw_prio = min(*hw_prio, policy->priority);
+}
+
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 {
 	struct net *net = xp_net(policy);
 	struct xfrm_policy *delpol;
 	struct hlist_head *chain;
+	int ret;
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+	ret = __xfrm_policy_check_hw_priority(net, policy, dir);
+	if (ret) {
+		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+		return ret;
+	}
+
 	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
 	if (chain)
 		delpol = xfrm_policy_insert_list(chain, policy, excl);
@@ -1606,6 +1663,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	policy->curlft.use_time = 0;
 	if (!mod_timer(&policy->timer, jiffies + HZ))
 		xfrm_pol_hold(policy);
+	__xfrm_policy_update_hw_priority(net, policy, dir);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	if (delpol)
@@ -2205,6 +2263,8 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
 
 	list_add(&pol->walk.all, &net->xfrm.policy_all);
 	net->xfrm.policy_count[dir]++;
+	if (pol->xdo.type == XFRM_DEV_OFFLOAD_FULL)
+		net->xfrm.policy_count[2 * XFRM_POLICY_MAX + dir]++;
 	xfrm_pol_hold(pol);
 }
 
@@ -2224,6 +2284,8 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 	}
 
 	list_del_init(&pol->walk.all);
+	if (pol->xdo.type == XFRM_DEV_OFFLOAD_FULL)
+		net->xfrm.policy_count[2 * XFRM_POLICY_MAX + dir]--;
 	net->xfrm.policy_count[dir]--;
 
 	return pol;
@@ -2239,12 +2301,58 @@ static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir)
 	__xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir);
 }
 
+static void __xfrm_policy_delete_prio(struct net *net,
+				      struct xfrm_policy *policy, int dir)
+{
+	struct xfrm_policy *pol;
+	u32 sw_prio = 0;
+
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	if (!net->xfrm.policy_count[dir]) {
+		net->xfrm.policy_prio[dir].max_sw_prio = sw_prio;
+		net->xfrm.policy_prio[dir].min_hw_prio = U32_MAX;
+		return;
+	}
+
+	if (policy->xdo.type == XFRM_DEV_OFFLOAD_FULL &&
+	    !net->xfrm.policy_count[2 * XFRM_POLICY_MAX + dir]) {
+		net->xfrm.policy_prio[dir].min_hw_prio = U32_MAX;
+		return;
+	}
+
+	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+		if (pol->walk.dead)
+			continue;
+
+		if (policy->xdo.type != XFRM_DEV_OFFLOAD_FULL) {
+			/* SW priority */
+			if (pol->xdo.type == XFRM_DEV_OFFLOAD_FULL) {
+				net->xfrm.policy_prio[dir].max_sw_prio = sw_prio;
+				return;
+			}
+			sw_prio = pol->priority;
+			continue;
+		}
+		/* HW priority */
+		if (pol->xdo.type != XFRM_DEV_OFFLOAD_FULL)
+			continue;
+
+		net->xfrm.policy_prio[dir].min_hw_prio = pol->priority;
+		return;
+	}
+
+	net->xfrm.policy_prio[dir].max_sw_prio = sw_prio;
+}
+
 int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 {
 	struct net *net = xp_net(pol);
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	pol = __xfrm_policy_unlink(pol, dir);
+	if (pol)
+		__xfrm_policy_delete_prio(net, pol, dir);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 	if (pol) {
 		xfrm_dev_policy_delete(pol);
@@ -4042,6 +4150,7 @@ static int __net_init xfrm_policy_init(struct net *net)
 
 		net->xfrm.policy_count[dir] = 0;
 		net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;
+		net->xfrm.policy_count[2 * XFRM_POLICY_MAX + dir] = 0;
 		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
 
 		htab = &net->xfrm.policy_bydst[dir];
@@ -4127,6 +4236,10 @@ static int __net_init xfrm_net_init(struct net *net)
 	net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT;
 	net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT;
 
+	net->xfrm.policy_prio[XFRM_POLICY_IN].min_hw_prio = U32_MAX;
+	net->xfrm.policy_prio[XFRM_POLICY_FWD].min_hw_prio = U32_MAX;
+	net->xfrm.policy_prio[XFRM_POLICY_OUT].min_hw_prio = U32_MAX;
+
 	rv = xfrm_statistics_init(net);
 	if (rv < 0)
 		goto out_statistics;
-- 
2.36.1


^ permalink raw reply related

* [PATCH ipsec-next v1 5/6] xfrm: add RX datapath protection for IPsec full offload mode
From: Leon Romanovsky @ 2022-05-18  5:30 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Leon Romanovsky, David S . Miller, Herbert Xu, netdev, Raed Salem,
	ipsec-devel
In-Reply-To: <cover.1652851393.git.leonro@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Traffic received by device with enabled IPsec full offload should be
forwarded to the stack only after decryption, packet headers and
trailers removed.

Such packets are expected to be seen as normal (non-XFRM) ones, while
not-supported packets should be dropped by the HW.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/net/xfrm.h | 55 +++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 21be19ece4f7..9f9250fe1c4d 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1094,6 +1094,29 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un
 	return !0;
 }
 
+#ifdef CONFIG_XFRM
+static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
+{
+	struct sec_path *sp = skb_sec_path(skb);
+
+	return sp->xvec[sp->len - 1];
+}
+#endif
+
+static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
+{
+#ifdef CONFIG_XFRM
+	struct sec_path *sp = skb_sec_path(skb);
+
+	if (!sp || !sp->olen || sp->len != sp->olen)
+		return NULL;
+
+	return &sp->ovec[sp->olen - 1];
+#else
+	return NULL;
+#endif
+}
+
 #ifdef CONFIG_XFRM
 int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb,
 			unsigned short family);
@@ -1113,6 +1136,15 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir,
 {
 	struct net *net = dev_net(skb->dev);
 	int ndir = dir | (reverse ? XFRM_POLICY_MASK + 1 : 0);
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct xfrm_state *x;
+
+	if (xo) {
+		x = xfrm_input_state(skb);
+		if (x->xso.type == XFRM_DEV_OFFLOAD_FULL)
+			return (xo->flags & CRYPTO_DONE) &&
+			       (xo->status & CRYPTO_SUCCESS);
+	}
 
 	if (sk && sk->sk_policy[XFRM_POLICY_IN])
 		return __xfrm_policy_check(sk, ndir, skb, family);
@@ -1845,29 +1877,6 @@ static inline void xfrm_states_delete(struct xfrm_state **states, int n)
 }
 #endif
 
-#ifdef CONFIG_XFRM
-static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
-{
-	struct sec_path *sp = skb_sec_path(skb);
-
-	return sp->xvec[sp->len - 1];
-}
-#endif
-
-static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
-{
-#ifdef CONFIG_XFRM
-	struct sec_path *sp = skb_sec_path(skb);
-
-	if (!sp || !sp->olen || sp->len != sp->olen)
-		return NULL;
-
-	return &sp->ovec[sp->olen - 1];
-#else
-	return NULL;
-#endif
-}
-
 void __init xfrm_dev_init(void);
 
 #ifdef CONFIG_XFRM_OFFLOAD
-- 
2.36.1


^ permalink raw reply related

* [PATCH ipsec-next v1 4/6] xfrm: add TX datapath support for IPsec full offload mode
From: Leon Romanovsky @ 2022-05-18  5:30 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Leon Romanovsky, David S . Miller, Herbert Xu, netdev, Raed Salem,
	ipsec-devel
In-Reply-To: <cover.1652851393.git.leonro@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

In IPsec full mode, the device is going to encrypt and encapsulate
packets that are associated with offloaded policy. After successful
policy lookup to indicate if packets should be offloaded or not,
the stack forwards packets to the device to do the magic.

Signed-off-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Huy Nguyen <huyn@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 net/xfrm/xfrm_output.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index d4935b3b9983..2599f3dbac08 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -718,6 +718,25 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
 		break;
 	}
 
+	if (x->xso.type == XFRM_DEV_OFFLOAD_FULL) {
+		struct dst_entry *dst = skb_dst_pop(skb);
+
+		if (!dst) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+			return -EHOSTUNREACH;
+		}
+
+		skb_dst_set(skb, dst);
+		err = skb_dst(skb)->ops->local_out(net, skb->sk, skb);
+		if (unlikely(err != 1))
+			return err;
+
+		if (!skb_dst(skb)->xfrm)
+			return dst_output(net, skb->sk, skb);
+
+		return 0;
+	}
+
 	secpath_reset(skb);
 
 	if (xfrm_dev_offload_ok(skb, x)) {
-- 
2.36.1


^ permalink raw reply related

* [PATCH ipsec-next v1 3/6] xfrm: add an interface to offload policy
From: Leon Romanovsky @ 2022-05-18  5:30 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Leon Romanovsky, David S . Miller, Herbert Xu, netdev, Raed Salem,
	ipsec-devel
In-Reply-To: <cover.1652851393.git.leonro@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Extend netlink interface to add and delete XFRM policy from the device.
This functionality is a first step to implement full IPsec offload solution.

Reviewed-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/netdevice.h |  3 +++
 include/net/xfrm.h        | 40 +++++++++++++++++++++++++++
 net/xfrm/xfrm_device.c    | 57 +++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_policy.c    |  2 ++
 net/xfrm/xfrm_user.c      |  9 +++++++
 5 files changed, 111 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cbaf312e365b..e6fefcdb4dcc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1012,6 +1012,9 @@ struct xfrmdev_ops {
 	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
 				       struct xfrm_state *x);
 	void	(*xdo_dev_state_advance_esn) (struct xfrm_state *x);
+	int	(*xdo_dev_policy_add) (struct xfrm_policy *x);
+	void	(*xdo_dev_policy_delete) (struct xfrm_policy *x);
+	void	(*xdo_dev_policy_free) (struct xfrm_policy *x);
 };
 #endif
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 77e06e8208d8..21be19ece4f7 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -129,6 +129,7 @@ struct xfrm_state_walk {
 enum {
 	XFRM_DEV_OFFLOAD_IN = 1,
 	XFRM_DEV_OFFLOAD_OUT,
+	XFRM_DEV_OFFLOAD_FWD,
 };
 
 enum {
@@ -534,6 +535,8 @@ struct xfrm_policy {
 	struct xfrm_tmpl       	xfrm_vec[XFRM_MAX_DEPTH];
 	struct hlist_node	bydst_inexact_list;
 	struct rcu_head		rcu;
+
+	struct xfrm_dev_offload xdo;
 };
 
 static inline struct net *xp_net(const struct xfrm_policy *xp)
@@ -1873,6 +1876,8 @@ void xfrm_dev_backlog(struct softnet_data *sd);
 struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again);
 int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		       struct xfrm_user_offload *xuo);
+int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
+		       struct xfrm_user_offload *xuo, u8 dir);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
 
 static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
@@ -1921,6 +1926,27 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x)
 		dev_put_track(dev, &xso->dev_tracker);
 	}
 }
+
+static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
+{
+	struct xfrm_dev_offload *xdo = &x->xdo;
+
+	if (xdo->dev)
+		xdo->dev->xfrmdev_ops->xdo_dev_policy_delete(x);
+}
+
+static inline void xfrm_dev_policy_free(struct xfrm_policy *x)
+{
+	struct xfrm_dev_offload *xdo = &x->xdo;
+	struct net_device *dev = xdo->dev;
+
+	if (dev && dev->xfrmdev_ops) {
+		if (dev->xfrmdev_ops->xdo_dev_policy_free)
+			dev->xfrmdev_ops->xdo_dev_policy_free(x);
+		xdo->dev = NULL;
+		dev_put_track(dev, &xdo->dev_tracker);
+	}
+}
 #else
 static inline void xfrm_dev_resume(struct sk_buff *skb)
 {
@@ -1948,6 +1974,20 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x)
 {
 }
 
+static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
+				      struct xfrm_user_offload *xuo, u8 dir)
+{
+	return 0;
+}
+
+static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
+{
+}
+
+static inline void xfrm_dev_policy_free(struct xfrm_policy *x)
+{
+}
+
 static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 {
 	return false;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index c4c469a85663..d7792a195c4e 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -302,6 +302,63 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 }
 EXPORT_SYMBOL_GPL(xfrm_dev_state_add);
 
+int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
+			struct xfrm_user_offload *xuo, u8 dir)
+{
+	struct xfrm_dev_offload *xdo = &xp->xdo;
+	struct net_device *dev;
+	int err;
+
+	/* We support only Full offload mode and it means
+	 * that user must set XFRM_OFFLOAD_FULL bit.
+	 */
+	if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_FULL)
+		return -EINVAL;
+
+	dev = dev_get_by_index(net, xuo->ifindex);
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) {
+		xdo->dev = NULL;
+		dev_put(dev);
+		return -EINVAL;
+	}
+
+	xdo->dev = dev;
+	netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC);
+	xdo->real_dev = dev;
+	xdo->type = XFRM_DEV_OFFLOAD_FULL;
+	switch (dir) {
+	case XFRM_POLICY_IN:
+		xdo->dir = XFRM_DEV_OFFLOAD_IN;
+		break;
+	case XFRM_POLICY_OUT:
+		xdo->dir = XFRM_DEV_OFFLOAD_OUT;
+		break;
+	case XFRM_POLICY_FWD:
+		xdo->dir = XFRM_DEV_OFFLOAD_FWD;
+		break;
+	default:
+		xdo->dev = NULL;
+		dev_put(dev);
+		return -EINVAL;
+	}
+
+	err = dev->xfrmdev_ops->xdo_dev_policy_add(xp);
+	if (err) {
+		xdo->dev = NULL;
+		xdo->real_dev = NULL;
+		xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+		xdo->dir = 0;
+		dev_put_track(dev, &xdo->dev_tracker);
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xfrm_dev_policy_add);
+
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 {
 	int mtu;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 00bd0ecff5a1..681670f7d50f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -425,6 +425,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
 		BUG();
 
+	xfrm_dev_policy_free(policy);
 	call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
 }
 EXPORT_SYMBOL(xfrm_policy_destroy);
@@ -2246,6 +2247,7 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 	if (pol) {
+		xfrm_dev_policy_delete(pol);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0df71449abe9..00474559d079 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1747,6 +1747,14 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us
 	if (attrs[XFRMA_IF_ID])
 		xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
 
+	/* configure the hardware if offload is requested */
+	if (attrs[XFRMA_OFFLOAD_DEV]) {
+		err = xfrm_dev_policy_add(net, xp,
+				nla_data(attrs[XFRMA_OFFLOAD_DEV]), p->dir);
+		if (err)
+			goto error;
+	}
+
 	return xp;
  error:
 	*errp = err;
@@ -1785,6 +1793,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	xfrm_audit_policy_add(xp, err ? 0 : 1, true);
 
 	if (err) {
+		xfrm_dev_policy_delete(xp);
 		security_xfrm_policy_free(xp->security);
 		kfree(xp);
 		return err;
-- 
2.36.1


^ permalink raw reply related

* [PATCH ipsec-next v1 2/6] xfrm: allow state full offload mode
From: Leon Romanovsky @ 2022-05-18  5:30 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Leon Romanovsky, David S . Miller, Herbert Xu, netdev, Raed Salem,
	ipsec-devel
In-Reply-To: <cover.1652851393.git.leonro@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Allow users to configure xfrm states with full offload mode.
The full mode must be requested both for policy and state, and
such requires us to do not implement fallback.

We explicitly return an error if requested full mode can't
be configured.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 .../inline_crypto/ch_ipsec/chcr_ipsec.c       |  4 ++++
 .../net/ethernet/intel/ixgbe/ixgbe_ipsec.c    |  5 ++++
 drivers/net/ethernet/intel/ixgbevf/ipsec.c    |  5 ++++
 .../mellanox/mlx5/core/en_accel/ipsec.c       |  4 ++++
 drivers/net/netdevsim/ipsec.c                 |  5 ++++
 net/xfrm/xfrm_device.c                        | 24 +++++++++++++++----
 6 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c
index 585590520076..ca21794281d6 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c
@@ -283,6 +283,10 @@ static int ch_ipsec_xfrm_add_state(struct xfrm_state *x)
 		pr_debug("Cannot offload xfrm states with geniv other than seqiv\n");
 		return -EINVAL;
 	}
+	if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+		pr_debug("Unsupported xfrm offload\n");
+		return -EINVAL;
+	}
 
 	sa_entry = kzalloc(sizeof(*sa_entry), GFP_KERNEL);
 	if (!sa_entry) {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 774de63dd93a..53a969e34883 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -585,6 +585,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
 		return -EINVAL;
 	}
 
+	if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+		netdev_err(dev, "Unsupported ipsec offload type\n");
+		return -EINVAL;
+	}
+
 	if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) {
 		struct rx_sa rsa;
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
index 9984ebc62d78..c1cf540d162a 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
@@ -280,6 +280,11 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs)
 		return -EINVAL;
 	}
 
+	if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+		netdev_err(dev, "Unsupported ipsec offload type\n");
+		return -EINVAL;
+	}
+
 	if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) {
 		struct rx_sa rsa;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
index 2a8fd7020622..c182b640b80d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
@@ -256,6 +256,10 @@ static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x)
 		netdev_info(netdev, "Cannot offload xfrm states with geniv other than seqiv\n");
 		return -EINVAL;
 	}
+	if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+		netdev_info(netdev, "Unsupported xfrm offload type\n");
+		return -EINVAL;
+	}
 	return 0;
 }
 
diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
index 386336a38f34..b93baf5c8bee 100644
--- a/drivers/net/netdevsim/ipsec.c
+++ b/drivers/net/netdevsim/ipsec.c
@@ -149,6 +149,11 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs)
 		return -EINVAL;
 	}
 
+	if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+		netdev_err(dev, "Unsupported ipsec offload type\n");
+		return -EINVAL;
+	}
+
 	/* find the first unused index */
 	ret = nsim_ipsec_find_empty_idx(ipsec);
 	if (ret < 0) {
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 8eb100162863..c4c469a85663 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -215,6 +215,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	struct xfrm_dev_offload *xso = &x->xso;
 	xfrm_address_t *saddr;
 	xfrm_address_t *daddr;
+	bool is_full_offload;
 
 	if (!x->type_offload)
 		return -EINVAL;
@@ -223,9 +224,11 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	if (x->encap || x->tfcpad)
 		return -EINVAL;
 
-	if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND))
+	if (xuo->flags &
+	    ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_FULL))
 		return -EINVAL;
 
+	is_full_offload = xuo->flags & XFRM_OFFLOAD_FULL;
 	dev = dev_get_by_index(net, xuo->ifindex);
 	if (!dev) {
 		if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) {
@@ -240,7 +243,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 					x->props.family,
 					xfrm_smark_get(0, x));
 		if (IS_ERR(dst))
-			return 0;
+			return (is_full_offload) ? -EINVAL : 0;
 
 		dev = dst->dev;
 
@@ -251,7 +254,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) {
 		xso->dev = NULL;
 		dev_put(dev);
-		return 0;
+		return (is_full_offload) ? -EINVAL : 0;
 	}
 
 	if (x->props.flags & XFRM_STATE_ESN &&
@@ -270,7 +273,10 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	else
 		xso->dir = XFRM_DEV_OFFLOAD_OUT;
 
-	xso->type = XFRM_DEV_OFFLOAD_CRYPTO;
+	if (is_full_offload)
+		xso->type = XFRM_DEV_OFFLOAD_FULL;
+	else
+		xso->type = XFRM_DEV_OFFLOAD_CRYPTO;
 
 	err = dev->xfrmdev_ops->xdo_dev_state_add(x);
 	if (err) {
@@ -280,7 +286,15 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		dev_put_track(dev, &xso->dev_tracker);
 		xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
 
-		if (err != -EOPNOTSUPP)
+		/* User explicitly requested full offload mode and configured
+		 * policy in addition to the XFRM state. So be civil to users,
+		 * and return an error instead of taking fallback path.
+		 *
+		 * This WARN_ON() can be seen as a documentation for driver
+		 * authors to do not return -EOPNOTSUPP in full offload mode.
+		 */
+		WARN_ON(err == -EOPNOTSUPP && is_full_offload);
+		if (err != -EOPNOTSUPP || is_full_offload)
 			return err;
 	}
 
-- 
2.36.1


^ permalink raw reply related

* [PATCH ipsec-next v1 1/6] xfrm: add new full offload flag
From: Leon Romanovsky @ 2022-05-18  5:30 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Leon Romanovsky, David S . Miller, Herbert Xu, netdev, Raed Salem,
	ipsec-devel
In-Reply-To: <cover.1652851393.git.leonro@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

In the next patches, the xfrm core code will be extended to support
new type of offload - full offload. In that mode, both policy and state
should be specially configured in order to perform whole offloaded data
path.

Full offload takes care of encryption, decryption, encapsulation and
other operations with headers.

As this mode is new for XFRM policy flow, we can "start fresh" with flag
bits and release first and second bit for future use.

Reviewed-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/net/xfrm.h        | 7 +++++++
 include/uapi/linux/xfrm.h | 6 ++++++
 net/xfrm/xfrm_device.c    | 3 +++
 net/xfrm/xfrm_user.c      | 2 ++
 4 files changed, 18 insertions(+)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 736c349de8bf..77e06e8208d8 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -131,12 +131,19 @@ enum {
 	XFRM_DEV_OFFLOAD_OUT,
 };
 
+enum {
+	XFRM_DEV_OFFLOAD_UNSPECIFIED,
+	XFRM_DEV_OFFLOAD_CRYPTO,
+	XFRM_DEV_OFFLOAD_FULL,
+};
+
 struct xfrm_dev_offload {
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
 	struct net_device	*real_dev;
 	unsigned long		offload_handle;
 	u8			dir : 2;
+	u8			type : 2;
 };
 
 struct xfrm_mode {
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 65e13a099b1a..9caec9b562e1 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -519,6 +519,12 @@ struct xfrm_user_offload {
  */
 #define XFRM_OFFLOAD_IPV6	1
 #define XFRM_OFFLOAD_INBOUND	2
+/* Two bits above are relevant for state path only, while
+ * offload is used for both policy and state flows.
+ *
+ * In policy offload mode, they are free and can be safely reused.
+ */
+#define XFRM_OFFLOAD_FULL	4
 
 struct xfrm_userpolicy_default {
 #define XFRM_USERPOLICY_UNSPEC	0
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 35c7e89b2e7d..8eb100162863 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -270,12 +270,15 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	else
 		xso->dir = XFRM_DEV_OFFLOAD_OUT;
 
+	xso->type = XFRM_DEV_OFFLOAD_CRYPTO;
+
 	err = dev->xfrmdev_ops->xdo_dev_state_add(x);
 	if (err) {
 		xso->dev = NULL;
 		xso->dir = 0;
 		xso->real_dev = NULL;
 		dev_put_track(dev, &xso->dev_tracker);
+		xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
 
 		if (err != -EOPNOTSUPP)
 			return err;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 6a58fec6a1fb..0df71449abe9 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -854,6 +854,8 @@ static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb)
 	xuo->ifindex = xso->dev->ifindex;
 	if (xso->dir == XFRM_DEV_OFFLOAD_IN)
 		xuo->flags = XFRM_OFFLOAD_INBOUND;
+	if (xso->type == XFRM_DEV_OFFLOAD_FULL)
+		xuo->flags |= XFRM_OFFLOAD_FULL;
 
 	return 0;
 }
-- 
2.36.1


^ permalink raw reply related

* [PATCH ipsec-next v1 0/6] Extend XFRM core to allow full offload configuration
From: Leon Romanovsky @ 2022-05-18  5:30 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Leon Romanovsky, David S . Miller, Herbert Xu, netdev, Raed Salem,
	ipsec-devel

From: Leon Romanovsky <leonro@nvidia.com>

Changelog:
v1: Moved comment to be before if (...) in third patch.
v0: https://lore.kernel.org/all/cover.1652176932.git.leonro@nvidia.com
-----------------------------------------------------------------------

The following series extends XFRM core code to handle new type of IPsec
offload - full offload.

In this mode, the HW is going to be responsible for whole data path, so
both policy and state should be offloaded.

Thanks

Leon Romanovsky (6):
  xfrm: add new full offload flag
  xfrm: allow state full offload mode
  xfrm: add an interface to offload policy
  xfrm: add TX datapath support for IPsec full offload mode
  xfrm: add RX datapath protection for IPsec full offload mode
  xfrm: enforce separation between priorities of HW/SW policies

 .../inline_crypto/ch_ipsec/chcr_ipsec.c       |   4 +
 .../net/ethernet/intel/ixgbe/ixgbe_ipsec.c    |   5 +
 drivers/net/ethernet/intel/ixgbevf/ipsec.c    |   5 +
 .../mellanox/mlx5/core/en_accel/ipsec.c       |   4 +
 drivers/net/netdevsim/ipsec.c                 |   5 +
 include/linux/netdevice.h                     |   3 +
 include/net/netns/xfrm.h                      |   8 +-
 include/net/xfrm.h                            | 102 ++++++++++++----
 include/uapi/linux/xfrm.h                     |   6 +
 net/xfrm/xfrm_device.c                        |  84 ++++++++++++-
 net/xfrm/xfrm_output.c                        |  19 +++
 net/xfrm/xfrm_policy.c                        | 115 ++++++++++++++++++
 net/xfrm/xfrm_user.c                          |  11 ++
 13 files changed, 342 insertions(+), 29 deletions(-)

-- 
2.36.1


^ permalink raw reply

* Re: [PATCH] vhost_net: fix double fget()
From: Michael S. Tsirkin @ 2022-05-18  5:22 UTC (permalink / raw)
  To: Al Viro
  Cc: Jason Wang, kvm, virtualization, netdev, linux-kernel,
	linux-fsdevel, ebiggers, davem
In-Reply-To: <YoQa4wzy9jSwDY7E@zeniv-ca.linux.org.uk>

On Tue, May 17, 2022 at 10:00:03PM +0000, Al Viro wrote:
> On Mon, May 16, 2022 at 04:44:19AM -0400, Michael S. Tsirkin wrote:
> > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > 
> > Acked-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > and this is stable material I guess.
> 
> It is, except that commit message ought to be cleaned up.  Something
> along the lines of
> 
> ----
> Fix double fget() in vhost_net_set_backend()
> 
> Descriptor table is a shared resource; two fget() on the same descriptor
> may return different struct file references.  get_tap_ptr_ring() is
> called after we'd found (and pinned) the socket we'll be using and it
> tries to find the private tun/tap data structures associated with it.
> Redoing the lookup by the same file descriptor we'd used to get the
> socket is racy - we need to same struct file.
> 
> Thanks to Jason for spotting a braino in the original variant of patch -
> I'd missed the use of fd == -1 for disabling backend, and in that case
> we can end up with sock == NULL and sock != oldsock.
> ----
> 
> Does the above sound sane for commit message?  And which tree would you
> prefer it to go through?  I can take it in vfs.git#fixes, or you could
> take it into your tree...

Acked-by: Michael S. Tsirkin <mst@redhat.com>
for the new message and merging through your tree.

-- 
MST


^ permalink raw reply

* Re: [PATCH] mediatek/mt76: cleanup the code a bit
From: Kalle Valo @ 2022-05-18  5:10 UTC (permalink / raw)
  To: Bernard Zhao
  Cc: Felix Fietkau, Lorenzo Bianconi, Ryder Lee, Shayne Chen,
	Sean Wang, David S. Miller, Jakub Kicinski, Paolo Abeni,
	Matthias Brugger, linux-wireless, netdev, linux-arm-kernel,
	linux-mediatek, linux-kernel, zhaojunkui2008
In-Reply-To: <20220517062913.473920-1-bernard@vivo.com>

Bernard Zhao <bernard@vivo.com> writes:

> Function mt76_register_debugfs just call mt76_register_debugfs_fops
> with NULL op parameter.
> This change is to cleanup the code a bit, elete the meaningless
> mt76_register_debugfs, and all call mt76_register_debugfs_fops.
>
> Signed-off-by: Bernard Zhao <bernard@vivo.com>

Please make the title more informative and don't use mediatek in the
title, for example something like this:

mt76: remove simple mt76_register_debugfs() function

More info:

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches#commit_title_is_wrong

-- 
https://patchwork.kernel.org/project/linux-wireless/list/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH 1/3] net: macb: Fix PTP one step sync support
From: Jakub Kicinski @ 2022-05-18  5:06 UTC (permalink / raw)
  To: Harini Katakam
  Cc: Harini Katakam, Nicolas Ferre, David Miller, Richard Cochran,
	Claudiu Beznea, Paolo Abeni, netdev, Linux Kernel Mailing List,
	Michal Simek, Radhey Shyam Pandey
In-Reply-To: <CAFcVEC+qdouQ+tJdBG_Vv8QsaUX99uFtjKnB5WwQawA1fxmgEQ@mail.gmail.com>

On Wed, 18 May 2022 09:53:29 +0530 Harini Katakam wrote:
> On Wed, May 18, 2022 at 8:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Tue, 17 May 2022 13:02:57 +0530 Harini Katakam wrote:  
> > > PTP one step sync packets cannot have CSUM padding and insertion in
> > > SW since time stamp is inserted on the fly by HW.
> > > In addition, ptp4l version 3.0 and above report an error when skb
> > > timestamps are reported for packets that not processed for TX TS
> > > after transmission.
> > > Add a helper to identify PTP one step sync and fix the above two
> > > errors.
> > > Also reset ptp OSS bit when one step is not selected.
> > >
> > > Fixes: ab91f0a9b5f4 ("net: macb: Add hardware PTP support")
> > > Fixes: 653e92a9175e ("net: macb: add support for padding and fcs computation")  
> >
> > Please make sure to CC authors of the patches under fixes.
> > ./scripts/get_maintainer should point them out.  
> 
> Thanks for the review.
> Rafal Ozieblo <rafalo@cadence.com> is the author of the first Fixes
> patch but that
> address hasn't worked in the last ~4 yrs.
> I have cced Claudiu and everyone else from the maintainers
> (Eric Dumazet <edumazet@google.com> also doesn't work)

I see, thanks, added Rafal's email to the ignore list, 
I'm quite sure Eric's email address works.

> > > @@ -1158,13 +1192,14 @@ static int macb_tx_complete(struct macb_queue *queue, int budget)
> > >
> > >                       /* First, update TX stats if needed */
> > >                       if (skb) {
> > > -                             if (unlikely(skb_shinfo(skb)->tx_flags &
> > > -                                          SKBTX_HW_TSTAMP) &&
> > > -                                 gem_ptp_do_txstamp(queue, skb, desc) == 0) {
> > > -                                     /* skb now belongs to timestamp buffer
> > > -                                      * and will be removed later
> > > -                                      */
> > > -                                     tx_skb->skb = NULL;
> > > +                             if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&  
> >
> > ptp_oss already checks if HW_TSTAMP is set.  
> 
> The check for SKBTX_HW_TSTAMP is required here universally and not
> just inside ptp_oss.
> I will remove the redundant check in ptp_oss instead. Please see the
> reply below.

But then you need to add this check in the padding/fcs call site and
the place where NOCRC is set. If you wrap the check for SKBTX_HW_TSTAMP
in the helper with likely() and remove the inline - will the compiler
not split the function and inline just that check? And leave the rest
as a functionname.part... thing?

^ permalink raw reply

* Re: [PATCH net-next v4 1/2] net: Add a second bind table hashed by port and address
From: Joanne Koong @ 2022-05-18  4:57 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Martin KaFai Lau, Jakub Kicinski, David Miller
In-Reply-To: <CANn89iKryk30MM=XuwDmdZ7T_rDJUe+zZrZMsaPXQG2mghe6tQ@mail.gmail.com>

On Tue, May 17, 2022 at 9:59 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Thu, May 12, 2022 at 1:51 PM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > We currently have one tcp bind table (bhash) which hashes by port
> > number only. In the socket bind path, we check for bind conflicts by
> > traversing the specified port's inet_bind2_bucket while holding the
> > bucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()).
> >
> > In instances where there are tons of sockets hashed to the same port
> > at different addresses, checking for a bind conflict is time-intensive
> > and can cause softirq cpu lockups, as well as stops new tcp connections
> > since __inet_inherit_port() also contests for the spinlock.
> >
> > This patch proposes adding a second bind table, bhash2, that hashes by
> > port and ip address. Searching the bhash2 table leads to significantly
> > faster conflict resolution and less time holding the spinlock.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
>
> Scary patch, but I could not find obvious issues with it.
> Let's give it a try, thanks !
>
> Reviewed-by: Eric Dumazet <edumazet@google.com>
Thanks for reviewing this code, Eric.

I will submit a v5 that tidies up some of the things flagged by
patchworks [1] (removing the inline in the "static inline bool
check_bind2_bucket_match(...)" function in net/ipv4/inet_hashtables.c
and adding line breaks to keep the lengths to 80 columns)

[1] https://patchwork.kernel.org/project/netdevbpf/patch/20220512205041.1208962-2-joannelkoong@gmail.com/

^ permalink raw reply

* Re: [PATCH -next v2] net: ethernet: sunplus: add missing of_node_put() in spl2sw_mdio_init()
From: 呂芳騰 @ 2022-05-18  4:57 UTC (permalink / raw)
  To: Yang Yingliang
  Cc: linux-kernel, netdev, andrew, pabeni, David Miller,
	Jakub Kicinski
In-Reply-To: <20220518020812.2626293-1-yangyingliang@huawei.com>

Hi Yingliang,

Thanks a lot for fixing the bug.

Reviewed-by: Wells Lu <wellslutw@gmail.com>

Best regards,
Wells

Yang Yingliang <yangyingliang@huawei.com> 於 2022年5月18日 週三 上午9:56寫道：
>
> of_get_child_by_name() returns device node pointer with refcount
> incremented. The refcount should be decremented before returning
> from spl2sw_mdio_init().
>
> Fixes: fd3040b9394c ("net: ethernet: Add driver for Sunplus SP7021")
> Reported-by: Hulk Robot <hulkci@huawei.com>
> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
> ---
> v2:
>   add fix tag.
> ---
>  drivers/net/ethernet/sunplus/spl2sw_mdio.c | 11 ++++++++---
>  1 file changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/sunplus/spl2sw_mdio.c b/drivers/net/ethernet/sunplus/spl2sw_mdio.c
> index 139ac8f2685e..733ae1704269 100644
> --- a/drivers/net/ethernet/sunplus/spl2sw_mdio.c
> +++ b/drivers/net/ethernet/sunplus/spl2sw_mdio.c
> @@ -97,8 +97,10 @@ u32 spl2sw_mdio_init(struct spl2sw_common *comm)
>
>         /* Allocate and register mdio bus. */
>         mii_bus = devm_mdiobus_alloc(&comm->pdev->dev);
> -       if (!mii_bus)
> -               return -ENOMEM;
> +       if (!mii_bus) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
>
>         mii_bus->name = "sunplus_mii_bus";
>         mii_bus->parent = &comm->pdev->dev;
> @@ -110,10 +112,13 @@ u32 spl2sw_mdio_init(struct spl2sw_common *comm)
>         ret = of_mdiobus_register(mii_bus, mdio_np);
>         if (ret) {
>                 dev_err(&comm->pdev->dev, "Failed to register mdiobus!\n");
> -               return ret;
> +               goto out;
>         }
>
>         comm->mii_bus = mii_bus;
> +
> +out:
> +       of_node_put(mdio_np);
>         return ret;
>  }
>
> --
> 2.25.1
>

^ permalink raw reply

* Re: [PATCHv2 0/3] perf tools: Fix prologue generation
From: Ian Rogers @ 2022-05-18  4:45 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Jiri Olsa,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	linux-perf-use., Networking, bpf, Ingo Molnar, Namhyung Kim,
	Alexander Shishkin, Peter Zijlstra, Martin KaFai Lau, Song Liu,
	Yonghong Song, John Fastabend
In-Reply-To: <CAEf4BzaQsF31f3WuU32wDCzo6bw7eY8E9zF6Lo218jfw-VQmcA@mail.gmail.com>

On Tue, May 17, 2022 at 3:03 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Wed, May 11, 2022 at 11:22 AM Arnaldo Carvalho de Melo
> <acme@kernel.org> wrote:
> >
> > Em Wed, May 11, 2022 at 09:35:34AM +0200, Jiri Olsa escreveu:
> > > On Tue, May 10, 2022 at 04:48:55PM -0700, Andrii Nakryiko wrote:
> > > > On Tue, May 10, 2022 at 12:47 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > > > >
> > > > > hi,
> > > > > sending change we discussed some time ago [1] to get rid of
> > > > > some deprecated functions we use in perf prologue code.
> > > > >
> > > > > Despite the gloomy discussion I think the final code does
> > > > > not look that bad ;-)
> > > > >
> > > > > This patchset removes following libbpf functions from perf:
> > > > >   bpf_program__set_prep
> > > > >   bpf_program__nth_fd
> > > > >   struct bpf_prog_prep_result
> > > > >
> > > > > v2 changes:
> > > > >   - use fallback section prog handler, so we don't need to
> > > > >     use section prefix [Andrii]
> > > > >   - realloc prog->insns array in bpf_program__set_insns [Andrii]
> > > > >   - squash patch 1 from previous version with
> > > > >     bpf_program__set_insns change [Daniel]
> > > > >   - patch 3 already merged [Arnaldo]
> > > > >   - added more comments
> > > > >
> > > > >   meanwhile.. perf/core and bpf-next diverged, so:
> > > > >     - libbpf bpf_program__set_insns change is based on bpf-next/master
> > > > >     - perf changes do not apply on bpf-next/master so they are based on
> > > > >       perf/core ... however they can be merged only after we release
> > > > >       libbpf 0.8.0 with bpf_program__set_insns change, so we don't break
> > > > >       the dynamic linking
> > > > >       I'm sending perf changes now just for review, I'll resend them
> > > > >       once libbpf 0.8.0 is released
> > > > >
> > > > > thanks,
> > > > > jirka
> > > > >
> > > > >
> > > > > [1] https://lore.kernel.org/bpf/CAEf4BzaiBO3_617kkXZdYJ8hS8YF--ZLgapNbgeeEJ-pY0H88g@mail.gmail.com/
> > > > > ---
> > > > > Jiri Olsa (1):
> > > > >       libbpf: Add bpf_program__set_insns function
> > > > >
> > > >
> > > > The first patch looks good to me. The rest I can't really review and
> > > > test properly, so I'll leave it up to Arnaldo.
> > > >
> > > > Arnaldo, how do we coordinate these patches? Should they go through
> > > > bpf-next (after you Ack them) or you want them in your tree?
> > > >
> > > > I'd like to get the bpf_program__set_insns() patch into bpf-next so
> > > > that I can do libbpf v0.8 release, having it in a separate tree is
> > > > extremely inconvenient. Please let me know how you think we should
> > > > proceed?
> > >
> > > we need to wait with perf changes after the libbpf is merged and
> > > libbpf 0.8.0 is released.. so we don't break dynamic linking for
> > > perf
> > >
> > > at the moment please just take libbpf change and I'll resend the
> > > perf change later if needed
> >
> > Ok.
> >
>
> Jiri, libbpf v0.8 is out, can you please re-send your perf patches?

We still haven't done as Andrii suggested in:
https://lore.kernel.org/lkml/CAEf4BzbbOHQZUAe6iWaehKCPQAf3VC=hq657buqe2_yRKxaK-A@mail.gmail.com/
so for LIBBPF_DYNAMIC I believe we're compiling against the header
files in tools/lib rather than the installed libbpf's header files.
This may have some unexpected consequences.

Thanks,
Ian

>
> > - Arnaldo

^ permalink raw reply

* Re: Re: [PATCH net v2] NFC: hci: fix sleep in atomic context bugs in nfc_hci_hcp_message_tx
From: duoming @ 2022-05-18  4:39 UTC (permalink / raw)
  To: Krzysztof Kozlowski
  Cc: linux-kernel, kuba, davem, edumazet, pabeni, gregkh,
	alexander.deucher, broonie, netdev
In-Reply-To: <68ccef70-ef30-8f53-6ec5-17ce5815089c@linaro.org>

Hello,

On Tue, 17 May 2022 17:28:51 +0200 Krzysztof wrote:

> >>> There are sleep in atomic context bugs when the request to secure
> >>> element of st21nfca is timeout. The root cause is that kzalloc and
> >>> alloc_skb with GFP_KERNEL parameter and mutex_lock are called in
> >>> st21nfca_se_wt_timeout which is a timer handler. The call tree shows
> >>> the execution paths that could lead to bugs:
> >>>
> >>>    (Interrupt context)
> >>> st21nfca_se_wt_timeout
> >>>   nfc_hci_send_event
> >>>     nfc_hci_hcp_message_tx
> >>>       kzalloc(..., GFP_KERNEL) //may sleep
> >>>       alloc_skb(..., GFP_KERNEL) //may sleep
> >>>       mutex_lock() //may sleep
> >>>
> >>> This patch changes allocation mode of kzalloc and alloc_skb from
> >>> GFP_KERNEL to GFP_ATOMIC and changes mutex_lock to spin_lock in
> >>> order to prevent atomic context from sleeping.
> >>>
> >>> Fixes: 2130fb97fecf ("NFC: st21nfca: Adding support for secure element")
> >>> Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>

> > The nfc_hci_hcp_message_tx() is called by both process context(hci_dev_up and so on)
> > and interrupt context(st21nfca_se_wt_timeout()). The process context(hci_dev_up and so on)
> > calls device_lock, but I think calling spin_lock() within device_lock() is ok. There is
> > no device_lock() called within spin_lock(). 
> 
> There is.
> 
> nfc_hci_failure -> spin lock -> nfc_driver_failure -> nfc_targets_found
> -> device_lock
> 
> I found it just by a very quick look, so I suspect there are several
> other places, not really checked.

I agree with you, the spin_lock is not a good solution to this problem. There is another solution:

We could put the nfc_hci_send_event() of st21nfca_se_wt_timeout() in a work item, then, using
schedule_work() in st21nfca_se_wt_timeout() to execute the work item. The schedule_work() will
wake up another kernel thread which is in process context to execute the bottom half of the interrupt, 
so it allows sleep.

The following is the details.

diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c
index c922f10d0d7..1e98467dbf7 100644
--- a/drivers/nfc/st21nfca/se.c
+++ b/drivers/nfc/st21nfca/se.c
@@ -241,7 +241,7 @@ int st21nfca_hci_se_io(struct nfc_hci_dev *hdev, u32 se_idx,
 }
 EXPORT_SYMBOL(st21nfca_hci_se_io);

-static void st21nfca_se_wt_timeout(struct timer_list *t)
+static void st21nfca_se_wt_work(struct work_struct *work)
 {
        /* 
         * No answer from the secure element
@@ -254,8 +254,9 @@ static void st21nfca_se_wt_timeout(struct timer_list *t)
         */
        /* hardware reset managed through VCC_UICC_OUT power supply */
        u8 param = 0x01;
-       struct st21nfca_hci_info *info = from_timer(info, t,
-                                                   se_info.bwi_timer);
+       struct st21nfca_hci_info *info = container_of(work,
+                                               struct st21nfca_hci_info,
+                                               se_info.timeout_work);

        info->se_info.bwi_active = false;
 
@@ -271,6 +272,13 @@ static void st21nfca_se_wt_timeout(struct timer_list *t)
        info->se_info.cb(info->se_info.cb_context, NULL, 0, -ETIME);
 }

+static void st21nfca_se_wt_timeout(struct timer_list *t)
+{
+       struct st21nfca_hci_info *info = from_timer(info, t, se_info.bwi_timer);
+
+       schedule_work(&info->se_info.timeout_work);
+}
+
 static void st21nfca_se_activation_timeout(struct timer_list *t)
 {
        struct st21nfca_hci_info *info = from_timer(info, t,
@@ -389,6 +397,7 @@ void st21nfca_se_init(struct nfc_hci_dev *hdev)
        struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev);

        init_completion(&info->se_info.req_completion);
+       INIT_WORK(&info->se_info.timeout_work, st21nfca_se_wt_work);
        /* initialize timers */
        timer_setup(&info->se_info.bwi_timer, st21nfca_se_wt_timeout, 0);
        info->se_info.bwi_active = false;
@@ -416,6 +425,7 @@ void st21nfca_se_deinit(struct nfc_hci_dev *hdev)
        if (info->se_info.se_active)
                del_timer_sync(&info->se_info.se_active_timer);

+       cancel_work_sync(&info->se_info.timeout_work);
        info->se_info.bwi_active = false;
        info->se_info.se_active = false;
 }
diff --git a/drivers/nfc/st21nfca/st21nfca.h b/drivers/nfc/st21nfca/st21nfca.h
index cb6ad916be9..ae6771cc989 100644
--- a/drivers/nfc/st21nfca/st21nfca.h
+++ b/drivers/nfc/st21nfca/st21nfca.h
@@ -141,6 +141,7 @@ struct st21nfca_se_info {

        se_io_cb_t cb;
        void *cb_context;
+       struct work_struct timeout_work;
 };

 struct st21nfca_hci_info {

Do you think this solution is better?

Best regards,
Duoming Zhou

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox