* [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding
@ 2025-09-13 9:06 Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 1/8] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev Junxian Huang
` (7 more replies)
0 siblings, 8 replies; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
This series adds support for RoCE bonding. The bond mode is active
when multiple PF netdevs are enslaved to a bond master and all of the
following rules are met:
* All the slaves are on the same card, i.e., they share the same
bus number.
* The bond mode is set to mode 1 (active-backup), 2 (XOR) or
4 (802.3ad).
* None of the slaves has any VFs enabled.
In bond mode, a bond ibdev "hns_bond_*" is registered instead of the
regular PF ibdev "hns_*". For RoCE traffic, HW chooses the same active
port as netdev bonding in mode 1, while in mode 2/4, the port selection
is determined by the hash algorithm.
Junxian Huang (8):
RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev
RDMA/hns: Initialize bonding resources
RDMA/hns: Add bonding event handler
RDMA/hns: Add bonding cmds
RDMA/hns: Implement bonding init/uninit process
RDMA/hns: Add delayed work for bonding
RDMA/hns: Support link state reporting for bond
RDMA/hns: Support reset recovery for bond
drivers/infiniband/hw/hns/Makefile | 4 +-
drivers/infiniband/hw/hns/hns_roce_ah.c | 1 -
drivers/infiniband/hw/hns/hns_roce_bond.c | 996 ++++++++++++++++++++
drivers/infiniband/hw/hns/hns_roce_bond.h | 95 ++
drivers/infiniband/hw/hns/hns_roce_device.h | 16 +-
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 175 +++-
drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 20 +
drivers/infiniband/hw/hns/hns_roce_main.c | 176 +++-
drivers/infiniband/hw/hns/hns_roce_pd.c | 1 -
drivers/infiniband/hw/hns/hns_roce_qp.c | 5 +-
drivers/infiniband/hw/hns/hns_roce_srq.c | 1 -
11 files changed, 1438 insertions(+), 52 deletions(-)
create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.c
create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.h
--
2.33.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH for-next 1/8] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
@ 2025-09-13 9:06 ` Junxian Huang
2025-09-24 14:00 ` Jason Gunthorpe
2025-09-13 9:06 ` [PATCH for-next 2/8] RDMA/hns: Initialize bonding resources Junxian Huang
` (6 subsequent siblings)
7 siblings, 1 reply; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
Add helpers to obtain netdev and bus_num from hr_dev.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_ah.c | 1 -
drivers/infiniband/hw/hns/hns_roce_device.h | 12 ++++++++++++
drivers/infiniband/hw/hns/hns_roce_main.c | 19 ++++++++++---------
drivers/infiniband/hw/hns/hns_roce_pd.c | 1 -
drivers/infiniband/hw/hns/hns_roce_qp.c | 5 +++--
drivers/infiniband/hw/hns/hns_roce_srq.c | 1 -
6 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 307c35888b30..0c1c32d23c88 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include "hns_roce_device.h"
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 78ee04a48a74..5ae37832059f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -33,6 +33,7 @@
#ifndef _HNS_ROCE_DEVICE_H
#define _HNS_ROCE_DEVICE_H
+#include <linux/pci.h>
#include <rdma/ib_verbs.h>
#include <rdma/hns-abi.h>
#include "hns_roce_debugfs.h"
@@ -1165,6 +1166,17 @@ static inline u8 get_tclass(const struct ib_global_route *grh)
grh->traffic_class >> DSCP_SHIFT : grh->traffic_class;
}
+static inline struct net_device *get_hr_netdev(struct hns_roce_dev *hr_dev,
+ u8 port)
+{
+ return hr_dev->iboe.netdevs[port];
+}
+
+static inline u8 get_hr_bus_num(struct hns_roce_dev *hr_dev)
+{
+ return hr_dev->pci_dev->bus->number;
+}
+
void hns_roce_init_uar_table(struct hns_roce_dev *dev);
int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar);
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index d50f36f8a110..8bca0b10c69e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -32,7 +32,6 @@
*/
#include <linux/acpi.h>
#include <linux/module.h>
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
@@ -148,12 +147,13 @@ static int hns_roce_netdev_event(struct notifier_block *self,
static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev)
{
+ struct net_device *net_dev;
int ret;
u8 i;
for (i = 0; i < hr_dev->caps.num_ports; i++) {
- ret = hns_roce_set_mac(hr_dev, i,
- hr_dev->iboe.netdevs[i]->dev_addr);
+ net_dev = get_hr_netdev(hr_dev, i);
+ ret = hns_roce_set_mac(hr_dev, i, net_dev->dev_addr);
if (ret)
return ret;
}
@@ -246,7 +246,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
spin_lock_irqsave(&hr_dev->iboe.lock, flags);
- net_dev = hr_dev->iboe.netdevs[port];
+ net_dev = get_hr_netdev(hr_dev, port);
if (!net_dev) {
spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
dev_err(dev, "find netdev %u failed!\n", port);
@@ -704,11 +704,12 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = {
static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
{
- int ret;
struct hns_roce_ib_iboe *iboe = NULL;
- struct ib_device *ib_dev = NULL;
struct device *dev = hr_dev->dev;
+ struct ib_device *ib_dev = NULL;
+ struct net_device *net_dev;
unsigned int i;
+ int ret;
iboe = &hr_dev->iboe;
spin_lock_init(&iboe->lock);
@@ -744,11 +745,11 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops);
for (i = 0; i < hr_dev->caps.num_ports; i++) {
- if (!hr_dev->iboe.netdevs[i])
+ net_dev = get_hr_netdev(hr_dev, i);
+ if (!net_dev)
continue;
- ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i],
- i + 1);
+ ret = ib_device_set_netdev(ib_dev, net_dev, i + 1);
if (ret)
return ret;
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index d35cf59d0f43..225c3e328e0e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include "hns_roce_device.h"
void hns_roce_init_pd_table(struct hns_roce_dev *hr_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 6ff1b8ce580c..e0e28c4ff1ca 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -31,7 +31,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
@@ -1350,11 +1349,13 @@ static int check_mtu_validate(struct hns_roce_dev *hr_dev,
struct hns_roce_qp *hr_qp,
struct ib_qp_attr *attr, int attr_mask)
{
+ struct net_device *net_dev;
enum ib_mtu active_mtu;
int p;
p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
- active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu);
+ net_dev = get_hr_netdev(hr_dev, p);
+ active_mtu = iboe_get_mtu(net_dev->mtu);
if ((hr_dev->caps.max_mtu >= IB_MTU_2048 &&
attr->path_mtu > hr_dev->caps.max_mtu) ||
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 1090051f493b..8a6efb6b9c9e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -3,7 +3,6 @@
* Copyright (c) 2018 Hisilicon Limited.
*/
-#include <linux/pci.h>
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
#include "hns_roce_device.h"
--
2.33.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH for-next 2/8] RDMA/hns: Initialize bonding resources
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 1/8] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev Junxian Huang
@ 2025-09-13 9:06 ` Junxian Huang
2025-09-24 14:04 ` Jason Gunthorpe
2025-09-13 9:06 ` [PATCH for-next 3/8] RDMA/hns: Add bonding event handler Junxian Huang
` (5 subsequent siblings)
7 siblings, 1 reply; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
Allocate bond_grp resources for each card when the first device on
that card is registered. Block the initialization of a VF when its PF
is a bonded slave, as VFs are not supported in this case due to HW
constraints.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/Makefile | 4 +-
drivers/infiniband/hw/hns/hns_roce_bond.c | 185 ++++++++++++++++++++
drivers/infiniband/hw/hns/hns_roce_bond.h | 38 ++++
drivers/infiniband/hw/hns/hns_roce_device.h | 1 +
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 38 ++++
drivers/infiniband/hw/hns/hns_roce_main.c | 11 ++
6 files changed, 276 insertions(+), 1 deletion(-)
create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.c
create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.h
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile
index baf592e6f21b..d07ef02c5231 100644
--- a/drivers/infiniband/hw/hns/Makefile
+++ b/drivers/infiniband/hw/hns/Makefile
@@ -4,11 +4,13 @@
#
ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
+ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3pf
+ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3_common
ccflags-y += -I $(src)
hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \
- hns_roce_debugfs.o hns_roce_hw_v2.o
+ hns_roce_debugfs.o hns_roce_hw_v2.o hns_roce_bond.o
obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
new file mode 100644
index 000000000000..859da5af5e09
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#include "hns_roce_device.h"
+#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
+
+static DEFINE_XARRAY(roce_bond_xa);
+
+static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev)
+{
+ struct net_device *upper_dev;
+
+ rcu_read_lock();
+ upper_dev = netdev_master_upper_dev_get_rcu(net_dev);
+ rcu_read_unlock();
+
+ return upper_dev;
+}
+
+static int get_netdev_bond_slave_id(struct net_device *net_dev,
+ struct hns_roce_bond_group *bond_grp)
+{
+ int i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++)
+ if (net_dev == bond_grp->bond_func_info[i].net_dev)
+ return i;
+
+ return -ENOENT;
+}
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+ u8 bus_num)
+{
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+ struct hns_roce_bond_group *bond_grp;
+ int i;
+
+ if (!die_info)
+ return NULL;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ return bond_grp;
+ if (bond_grp->upper_dev &&
+ bond_grp->upper_dev == get_upper_dev_from_ndev(net_dev))
+ return bond_grp;
+ }
+
+ return NULL;
+}
+
+static struct hns_roce_die_info *alloc_die_info(int bus_num)
+{
+ struct hns_roce_die_info *die_info;
+ int ret;
+
+ die_info = kzalloc(sizeof(*die_info), GFP_KERNEL);
+ if (!die_info)
+ return NULL;
+
+ ret = xa_err(xa_store(&roce_bond_xa, bus_num, die_info, GFP_KERNEL));
+ if (ret) {
+ kfree(die_info);
+ return NULL;
+ }
+
+ return die_info;
+}
+
+static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num)
+{
+ xa_erase(&roce_bond_xa, bus_num);
+ kfree(die_info);
+}
+
+static int alloc_bond_id(struct hns_roce_bond_group *bond_grp)
+{
+ u8 bus_num = bond_grp->bus_num;
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+ int i;
+
+ if (!die_info) {
+ die_info = alloc_die_info(bus_num);
+ if (!die_info)
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ if (die_info->bond_id_mask & BOND_ID(i))
+ continue;
+
+ die_info->bond_id_mask |= BOND_ID(i);
+ die_info->bgrps[i] = bond_grp;
+ bond_grp->bond_id = i;
+
+ return 0;
+ }
+
+ return -ENOSPC;
+}
+
+static int remove_bond_id(int bus_num, u8 bond_id)
+{
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+
+ if (bond_id >= ROCE_BOND_NUM_MAX)
+ return -EINVAL;
+
+ if (!die_info)
+ return -ENODEV;
+
+ die_info->bond_id_mask &= ~BOND_ID(bond_id);
+ die_info->bgrps[bond_id] = NULL;
+ if (!die_info->bond_id_mask)
+ dealloc_die_info(die_info, bus_num);
+
+ return 0;
+}
+
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ int ret;
+ int i;
+
+ if (xa_load(&roce_bond_xa, bus_num))
+ return 0;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = kvzalloc(sizeof(*bond_grp), GFP_KERNEL);
+ if (!bond_grp) {
+ ret = -ENOMEM;
+ goto mem_err;
+ }
+
+ bond_grp->bus_num = bus_num;
+
+ ret = alloc_bond_id(bond_grp);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "failed to alloc bond ID, ret = %d.\n", ret);
+ goto alloc_id_err;
+ }
+
+ bgrps[i] = bond_grp;
+ }
+
+ return 0;
+
+alloc_id_err:
+ kvfree(bond_grp);
+mem_err:
+ for (i--; i >= 0; i--) {
+ remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id);
+ kvfree(bgrps[i]);
+ }
+ return ret;
+}
+
+void hns_roce_dealloc_bond_grp(void)
+{
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ unsigned long id;
+ int i;
+
+ xa_for_each(&roce_bond_xa, id, die_info) {
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
+ kvfree(bond_grp);
+ }
+ }
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
new file mode 100644
index 000000000000..61c52135588e
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#ifndef _HNS_ROCE_BOND_H
+#define _HNS_ROCE_BOND_H
+
+#include <linux/netdevice.h>
+#include <net/bonding.h>
+
+#define ROCE_BOND_FUNC_MAX 4
+#define ROCE_BOND_NUM_MAX 2
+
+#define BOND_ID(id) BIT(id)
+
+struct hns_roce_func_info {
+ struct net_device *net_dev;
+};
+
+struct hns_roce_bond_group {
+ struct net_device *upper_dev;
+ u8 bond_id;
+ u8 bus_num;
+ struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX];
+};
+
+struct hns_roce_die_info {
+ u8 bond_id_mask;
+ struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+};
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+ u8 bus_num);
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev);
+void hns_roce_dealloc_bond_grp(void);
+
+#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 5ae37832059f..cc1402fc8943 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -154,6 +154,7 @@ enum {
HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14),
HNS_ROCE_CAP_FLAG_STASH = BIT(17),
HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19),
+ HNS_ROCE_CAP_FLAG_BOND = BIT(21),
HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22),
};
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 64bca08f3f1a..e918c1c99d17 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -43,11 +43,13 @@
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
+#include "hclge_main.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_cmd.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
#define CREATE_TRACE_POINTS
#include "hns_roce_trace.h"
@@ -2270,6 +2272,9 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) <<
HNS_ROCE_CAP_FLAGS_EX_SHIFT;
+ if (hr_dev->is_vf)
+ caps->flags &= ~HNS_ROCE_CAP_FLAG_BOND;
+
caps->num_cqs = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_CQS);
caps->gid_table_len[0] = hr_reg_read(resp_c, PF_CAPS_C_MAX_GID);
caps->max_cqes = 1 << hr_reg_read(resp_c, PF_CAPS_C_CQ_DEPTH);
@@ -7025,6 +7030,33 @@ static void hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
priv->handle = handle;
}
+static bool check_vf_support(struct pci_dev *vf)
+{
+ struct hns_roce_bond_group *bond_grp;
+ struct pci_dev *pf = pci_physfn(vf);
+ struct hnae3_ae_dev *ae_dev;
+ struct hnae3_handle *handle;
+ struct hns_roce_dev *hr_dev;
+ struct hclge_dev *hdev;
+
+ if (pf == vf)
+ return true;
+
+ ae_dev = pci_get_drvdata(pf);
+ hdev = ae_dev->priv;
+ handle = &hdev->vport[0].roce;
+ hr_dev = handle->priv;
+ if (!hr_dev)
+ return false;
+
+ bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0),
+ pf->bus->number);
+ if (bond_grp)
+ return false;
+
+ return true;
+}
+
static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
{
struct hns_roce_dev *hr_dev;
@@ -7042,6 +7074,11 @@ static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
hns_roce_hw_v2_get_cfg(hr_dev, handle);
+ if (hr_dev->is_vf && !check_vf_support(hr_dev->pci_dev)) {
+ ret = -EOPNOTSUPP;
+ goto error_failed_roce_init;
+ }
+
ret = hns_roce_init(hr_dev);
if (ret) {
dev_err(hr_dev->dev, "RoCE Engine init failed!\n");
@@ -7260,6 +7297,7 @@ static int __init hns_roce_hw_v2_init(void)
static void __exit hns_roce_hw_v2_exit(void)
{
+ hns_roce_dealloc_bond_grp();
hnae3_unregister_client(&hns_roce_hw_v2_client);
hns_roce_cleanup_debugfs();
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 8bca0b10c69e..7fa25586ccd8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -40,6 +40,7 @@
#include "hns_roce_device.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port,
const u8 *addr)
@@ -744,6 +745,16 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops);
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ ret = hns_roce_alloc_bond_grp(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to alloc bond_grp for bus %u, ret = %d\n",
+ get_hr_bus_num(hr_dev), ret);
+ return ret;
+ }
+ }
+
for (i = 0; i < hr_dev->caps.num_ports; i++) {
net_dev = get_hr_netdev(hr_dev, i);
if (!net_dev)
--
2.33.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH for-next 3/8] RDMA/hns: Add bonding event handler
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 1/8] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 2/8] RDMA/hns: Initialize bonding resources Junxian Huang
@ 2025-09-13 9:06 ` Junxian Huang
2025-09-24 14:05 ` Jason Gunthorpe
2025-09-13 9:06 ` [PATCH for-next 4/8] RDMA/hns: Add bonding cmds Junxian Huang
` (4 subsequent siblings)
7 siblings, 1 reply; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
Register netdev notifier for two bonding events NETDEV_CHANGEUPPER
and NETDEV_CHANGELOWERSTATE.
In the NETDEV_CHANGEUPPER event handler, check some rules about the HW
constraints when trying to link a new slave to the master, and
store some bonding information from the notifier. In the unlinking case,
simply check the number of remaining slaves to decide whether the
bond is still supported.
In the NETDEV_CHANGELOWERSTATE event handler, not much is done. It
simply updates the bond state when the bond is ready; this state will
be used later.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_bond.c | 306 ++++++++++++++++++++++
drivers/infiniband/hw/hns/hns_roce_bond.h | 26 ++
2 files changed, 332 insertions(+)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
index 859da5af5e09..5fee44bcf81d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.c
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -9,6 +9,21 @@
static DEFINE_XARRAY(roce_bond_xa);
+static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev)
+{
+ struct ib_device *ibdev =
+ ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS);
+ struct hns_roce_dev *hr_dev;
+
+ if (!ibdev)
+ return NULL;
+
+ hr_dev = container_of(ibdev, struct hns_roce_dev, ib_dev);
+ ib_device_put(ibdev);
+
+ return hr_dev;
+}
+
static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev)
{
struct net_device *upper_dev;
@@ -124,6 +139,279 @@ static int remove_bond_id(int bus_num, u8 bond_id)
return 0;
}
+static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct hns_roce_dev *hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+
+ if (!hr_dev) {
+ if (bond_grp &&
+ get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ return true;
+ else
+ return false;
+ }
+
+ if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND))
+ return false;
+
+ if (hr_dev->is_vf || pci_num_vf(hr_dev->pci_dev) > 0)
+ return false;
+
+ if (bond_grp->bus_num != get_hr_bus_num(hr_dev))
+ return false;
+
+ return true;
+}
+
+static bool check_slave_support(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ struct net_device *net_dev;
+ u8 slave_num = 0;
+
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+ if (is_dev_bond_supported(bond_grp, net_dev)) {
+ slave_num++;
+ continue;
+ }
+ rcu_read_unlock();
+ return false;
+ }
+ rcu_read_unlock();
+
+ return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX);
+}
+
+static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev,
+ struct net_device *upper_dev)
+{
+ bond_grp->upper_dev = upper_dev;
+ bond_grp->main_hr_dev = hr_dev;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ bond_grp->bond_ready = false;
+}
+
+static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct hns_roce_bond_group *bond_grp_tmp;
+
+ bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bond_grp->bus_num);
+ return bond_grp_tmp == bond_grp;
+}
+
+static void lowerstate_event_setting(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changelowerstate_info *info)
+{
+ mutex_lock(&bond_grp->bond_mutex);
+
+ if (bond_grp->bond_ready &&
+ bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED)
+ bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGESTATE;
+
+ mutex_unlock(&bond_grp->bond_mutex);
+}
+
+static bool hns_roce_bond_lowerstate_event(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changelowerstate_info *info)
+{
+ struct net_device *net_dev =
+ netdev_notifier_info_to_dev((struct netdev_notifier_info *)info);
+
+ if (!netif_is_lag_port(net_dev))
+ return false;
+
+ if (!lowerstate_event_filter(bond_grp, net_dev))
+ return false;
+
+ lowerstate_event_setting(bond_grp, info);
+
+ return true;
+}
+
+static bool is_bond_setting_supported(struct netdev_lag_upper_info *bond_info)
+{
+ if (!bond_info)
+ return false;
+
+ if (bond_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
+ bond_info->tx_type != NETDEV_LAG_TX_TYPE_HASH)
+ return false;
+
+ if (bond_info->tx_type == NETDEV_LAG_TX_TYPE_HASH &&
+ bond_info->hash_type > NETDEV_LAG_HASH_L23)
+ return false;
+
+ return true;
+}
+
+static void upper_event_setting(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netdev_lag_upper_info *bond_upper_info = NULL;
+ bool slave_inc = info->linking;
+
+ if (slave_inc)
+ bond_upper_info = info->upper_info;
+
+ if (bond_upper_info) {
+ bond_grp->tx_type = bond_upper_info->tx_type;
+ bond_grp->hash_type = bond_upper_info->hash_type;
+ }
+}
+
+static bool check_unlinking_bond_support(struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ u8 slave_num = 0;
+
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(bond_grp->upper_dev, net_dev) {
+ if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ slave_num++;
+ }
+ rcu_read_unlock();
+
+ return (slave_num > 1);
+}
+
+static bool check_linking_bond_support(struct netdev_lag_upper_info *bond_info,
+ struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ if (!is_bond_setting_supported(bond_info))
+ return false;
+
+ return check_slave_support(bond_grp, upper_dev);
+}
+
+static enum bond_support_type
+ check_bond_support(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ bool bond_grp_exist = false;
+ bool support;
+
+ if (upper_dev == bond_grp->upper_dev)
+ bond_grp_exist = true;
+
+ if (!info->linking && !bond_grp_exist)
+ return BOND_NOT_SUPPORT;
+
+ if (info->linking)
+ support = check_linking_bond_support(info->upper_info, bond_grp,
+ upper_dev);
+ else
+ support = check_unlinking_bond_support(bond_grp);
+
+ if (support)
+ return BOND_SUPPORT;
+
+ return bond_grp_exist ? BOND_EXISTING_NOT_SUPPORT : BOND_NOT_SUPPORT;
+}
+
+static bool upper_event_filter(struct netdev_notifier_changeupper_info *info,
+ struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct net_device *upper_dev = info->upper_dev;
+ struct hns_roce_bond_group *bond_grp_tmp;
+ struct hns_roce_dev *hr_dev;
+ u8 bus_num;
+
+ if (!info->linking ||
+ bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED)
+ return bond_grp->upper_dev == upper_dev;
+
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev)
+ return false;
+
+ bus_num = get_hr_bus_num(hr_dev);
+ if (bond_grp->bus_num != bus_num)
+ return false;
+
+ bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp_tmp && bond_grp_tmp != bond_grp)
+ return false;
+
+ return true;
+}
+
+static bool hns_roce_bond_upper_event(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct net_device *net_dev =
+ netdev_notifier_info_to_dev((struct netdev_notifier_info *)info);
+ struct net_device *upper_dev = info->upper_dev;
+ enum bond_support_type support = BOND_SUPPORT;
+ struct hns_roce_dev *hr_dev;
+ int slave_id;
+
+ if (!upper_dev || !netif_is_lag_master(upper_dev))
+ return false;
+
+ if (!upper_event_filter(info, bond_grp, net_dev))
+ return false;
+
+ mutex_lock(&bond_grp->bond_mutex);
+ support = check_bond_support(bond_grp, upper_dev, info);
+ if (support == BOND_NOT_SUPPORT) {
+ mutex_unlock(&bond_grp->bond_mutex);
+ return false;
+ }
+
+ if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_ATTACHED) {
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev) {
+ mutex_unlock(&bond_grp->bond_mutex);
+ return false;
+ }
+ hns_roce_attach_bond_grp(bond_grp, hr_dev, upper_dev);
+ }
+
+ /* In the case of netdev being unregistered, the roce
+ * instance shouldn't be inited.
+ */
+ if (net_dev->reg_state >= NETREG_UNREGISTERING) {
+ slave_id = get_netdev_bond_slave_id(net_dev, bond_grp);
+ if (slave_id >= 0) {
+ bond_grp->bond_func_info[slave_id].net_dev = NULL;
+ bond_grp->bond_func_info[slave_id].handle = NULL;
+ }
+ }
+
+ if (support == BOND_SUPPORT) {
+ bond_grp->bond_ready = true;
+ if (bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED)
+ bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGE_NUM;
+ }
+ mutex_unlock(&bond_grp->bond_mutex);
+ if (support == BOND_SUPPORT)
+ upper_event_setting(bond_grp, info);
+
+ return true;
+}
+
+static int hns_roce_bond_event(struct notifier_block *self,
+ unsigned long event, void *ptr)
+{
+ struct hns_roce_bond_group *bond_grp =
+ container_of(self, struct hns_roce_bond_group, bond_nb);
+ bool changed = false;
+
+ if (event == NETDEV_CHANGEUPPER)
+ changed = hns_roce_bond_upper_event(bond_grp, ptr);
+ if (event == NETDEV_CHANGELOWERSTATE)
+ changed = hns_roce_bond_lowerstate_event(bond_grp, ptr);
+
+ return NOTIFY_DONE;
+}
+
int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
{
struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
@@ -142,6 +430,10 @@ int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
goto mem_err;
}
+ mutex_init(&bond_grp->bond_mutex);
+
+ bond_grp->bond_ready = false;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
bond_grp->bus_num = bus_num;
ret = alloc_bond_id(bond_grp);
@@ -151,16 +443,28 @@ int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
goto alloc_id_err;
}
+ bond_grp->bond_nb.notifier_call = hns_roce_bond_event;
+ ret = register_netdevice_notifier(&bond_grp->bond_nb);
+ if (ret) {
+ ibdev_err(&hr_dev->ib_dev,
+ "failed to register bond nb, ret = %d.\n", ret);
+ goto register_nb_err;
+ }
bgrps[i] = bond_grp;
}
return 0;
+register_nb_err:
+ remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
alloc_id_err:
+ mutex_destroy(&bond_grp->bond_mutex);
kvfree(bond_grp);
mem_err:
for (i--; i >= 0; i--) {
+ unregister_netdevice_notifier(&bgrps[i]->bond_nb);
remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id);
+ mutex_destroy(&bgrps[i]->bond_mutex);
kvfree(bgrps[i]);
}
return ret;
@@ -178,7 +482,9 @@ void hns_roce_dealloc_bond_grp(void)
bond_grp = die_info->bgrps[i];
if (!bond_grp)
continue;
+ unregister_netdevice_notifier(&bond_grp->bond_nb);
remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
+ mutex_destroy(&bond_grp->bond_mutex);
kvfree(bond_grp);
}
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
index 61c52135588e..a11de04c42e9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.h
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -14,15 +14,41 @@
#define BOND_ID(id) BIT(id)
+enum bond_support_type {
+ BOND_NOT_SUPPORT,
+ /*
+ * bond_grp already exists, but in the current
+ * conditions it's no longer supported
+ */
+ BOND_EXISTING_NOT_SUPPORT,
+ BOND_SUPPORT,
+};
+
+enum hns_roce_bond_state {
+ HNS_ROCE_BOND_NOT_ATTACHED,
+ HNS_ROCE_BOND_NOT_BONDED,
+ HNS_ROCE_BOND_IS_BONDED,
+ HNS_ROCE_BOND_SLAVE_CHANGE_NUM,
+ HNS_ROCE_BOND_SLAVE_CHANGESTATE,
+};
+
struct hns_roce_func_info {
struct net_device *net_dev;
+ struct hnae3_handle *handle;
};
struct hns_roce_bond_group {
struct net_device *upper_dev;
+ struct hns_roce_dev *main_hr_dev;
u8 bond_id;
u8 bus_num;
struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX];
+ bool bond_ready;
+ enum hns_roce_bond_state bond_state;
+ enum netdev_lag_tx_type tx_type;
+ enum netdev_lag_hash hash_type;
+ struct mutex bond_mutex;
+ struct notifier_block bond_nb;
};
struct hns_roce_die_info {
--
2.33.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH for-next 4/8] RDMA/hns: Add bonding cmds
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
` (2 preceding siblings ...)
2025-09-13 9:06 ` [PATCH for-next 3/8] RDMA/hns: Add bonding event handler Junxian Huang
@ 2025-09-13 9:06 ` Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 5/8] RDMA/hns: Implement bonding init/uninit process Junxian Huang
` (3 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
Add three bonding cmds to configure bonding settings to HW.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_bond.h | 20 ++++++
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 73 ++++++++++++++++++++++
drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 15 +++++
3 files changed, 108 insertions(+)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
index a11de04c42e9..84c94cbc397d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.h
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -14,6 +14,17 @@
#define BOND_ID(id) BIT(id)
+enum {
+ BOND_MODE_1,
+ BOND_MODE_2_4,
+};
+
+enum hns_roce_bond_hashtype {
+ BOND_HASH_L2,
+ BOND_HASH_L34,
+ BOND_HASH_L23,
+};
+
enum bond_support_type {
BOND_NOT_SUPPORT,
/*
@@ -32,6 +43,12 @@ enum hns_roce_bond_state {
HNS_ROCE_BOND_SLAVE_CHANGESTATE,
};
+enum hns_roce_bond_cmd_type {
+ HNS_ROCE_SET_BOND,
+ HNS_ROCE_CHANGE_BOND,
+ HNS_ROCE_CLEAR_BOND,
+};
+
struct hns_roce_func_info {
struct net_device *net_dev;
struct hnae3_handle *handle;
@@ -40,6 +57,9 @@ struct hns_roce_func_info {
struct hns_roce_bond_group {
struct net_device *upper_dev;
struct hns_roce_dev *main_hr_dev;
+ u8 active_slave_num;
+ u32 slave_map;
+ u32 active_slave_map;
u8 bond_id;
u8 bus_num;
struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX];
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index e918c1c99d17..d3c1ad04afd7 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1431,6 +1431,79 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
return ret;
}
+/*
+ * Map a bond command type to the CMQ opcode used to send it.
+ * An unrecognized type triggers a WARN and falls back to
+ * HNS_ROCE_OPC_SET_BOND_INFO.
+ */
+static enum hns_roce_opcode_type
+ get_bond_opcode(enum hns_roce_bond_cmd_type bond_type)
+{
+ switch (bond_type) {
+ case HNS_ROCE_SET_BOND:
+ return HNS_ROCE_OPC_SET_BOND_INFO;
+ case HNS_ROCE_CHANGE_BOND:
+ return HNS_ROCE_OPC_CHANGE_ACTIVE_PORT;
+ case HNS_ROCE_CLEAR_BOND:
+ return HNS_ROCE_OPC_CLEAR_BOND_INFO;
+ default:
+ WARN(true, "Invalid bond type %d!\n", bond_type);
+ return HNS_ROCE_OPC_SET_BOND_INFO;
+ }
+}
+
+/*
+ * Map a netdev LAG hash type to the driver's bond hash type.
+ * An unrecognized type triggers a WARN and falls back to BOND_HASH_L2.
+ */
+static enum hns_roce_bond_hashtype
+ get_bond_hashtype(enum netdev_lag_hash netdev_hashtype)
+{
+ switch (netdev_hashtype) {
+ case NETDEV_LAG_HASH_L2:
+ return BOND_HASH_L2;
+ case NETDEV_LAG_HASH_L34:
+ return BOND_HASH_L34;
+ case NETDEV_LAG_HASH_L23:
+ return BOND_HASH_L23;
+ default:
+ WARN(true, "Invalid hash type %d!\n", netdev_hashtype);
+ return BOND_HASH_L2;
+ }
+}
+
+/*
+ * Build one bond command descriptor for @bond_grp and send it through
+ * bond_grp->main_hr_dev.
+ *
+ * For HNS_ROCE_CLEAR_BOND only the bond id is filled in. For the other
+ * command types the bond mode, the hash policy (non active-backup only)
+ * and the slave count/masks are filled in as well.
+ *
+ * Return: the result of hns_roce_cmq_send().
+ */
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+ enum hns_roce_bond_cmd_type bond_type)
+{
+ enum hns_roce_opcode_type opcode = get_bond_opcode(bond_type);
+ struct hns_roce_bond_info *slave_info;
+ struct hns_roce_cmq_desc desc = {};
+ int ret;
+
+ /* The bond payload is laid over the descriptor's data area. */
+ slave_info = (struct hns_roce_bond_info *)desc.data;
+ hns_roce_cmq_setup_basic_desc(&desc, opcode, false);
+
+ slave_info->bond_id = cpu_to_le32(bond_grp->bond_id);
+ /* Clearing a bond needs nothing but the bond id. */
+ if (bond_type == HNS_ROCE_CLEAR_BOND)
+ goto out;
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ slave_info->bond_mode = cpu_to_le32(BOND_MODE_1);
+ /* Only warn; the command is still sent with the current count. */
+ if (bond_grp->active_slave_num != 1)
+ ibdev_warn(&bond_grp->main_hr_dev->ib_dev,
+ "active slave cnt(%u) in Mode 1 is invalid.\n",
+ bond_grp->active_slave_num);
+ } else {
+ slave_info->bond_mode = cpu_to_le32(BOND_MODE_2_4);
+ slave_info->hash_policy =
+ cpu_to_le32(get_bond_hashtype(bond_grp->hash_type));
+ }
+
+ slave_info->active_slave_cnt = cpu_to_le32(bond_grp->active_slave_num);
+ slave_info->active_slave_mask = cpu_to_le32(bond_grp->active_slave_map);
+ slave_info->slave_mask = cpu_to_le32(bond_grp->slave_map);
+
+out:
+ ret = hns_roce_cmq_send(bond_grp->main_hr_dev, &desc, 1);
+ if (ret)
+ ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+ "cmq bond type(%d) failed, ret = %d.\n",
+ bond_type, ret);
+
+ return ret;
+}
+
static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev,
dma_addr_t base_addr, u8 cmd, unsigned long tag)
{
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index e64a04d6f85b..82cec4b38c92 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -35,6 +35,7 @@
#include <linux/bitops.h>
#include "hnae3.h"
+#include "hns_roce_bond.h"
#define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32
#define HNS_ROCE_V2_MTT_ENTRY_SZ 64
@@ -228,6 +229,9 @@ enum hns_roce_opcode_type {
HNS_ROCE_OPC_CFG_GMV_BT = 0x8510,
HNS_ROCE_QUERY_RAM_ECC = 0x8513,
HNS_SWITCH_PARAMETER_CFG = 0x1033,
+ HNS_ROCE_OPC_SET_BOND_INFO = 0x8601,
+ HNS_ROCE_OPC_CLEAR_BOND_INFO = 0x8602,
+ HNS_ROCE_OPC_CHANGE_ACTIVE_PORT = 0x8603,
};
#define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000
@@ -1465,7 +1469,18 @@ struct hns_roce_sccc_clr_done {
__le32 rsv[5];
};
+/*
+ * Payload of a bond command descriptor, filled in by hns_roce_cmd_bond().
+ * All fields are little-endian.
+ */
+struct hns_roce_bond_info {
+ __le32 bond_id;
+ __le32 bond_mode;
+ __le32 active_slave_cnt;
+ __le32 active_slave_mask;
+ __le32 slave_mask;
+ __le32 hash_policy;
+};
+
int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+ enum hns_roce_bond_cmd_type bond_type);
static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
void __iomem *dest)
--
2.33.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH for-next 5/8] RDMA/hns: Implement bonding init/uninit process
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
` (3 preceding siblings ...)
2025-09-13 9:06 ` [PATCH for-next 4/8] RDMA/hns: Add bonding cmds Junxian Huang
@ 2025-09-13 9:06 ` Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 6/8] RDMA/hns: Add delayed work for bonding Junxian Huang
` (2 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
Implement hns_roce_slave_init() and hns_roce_slave_uninit() for device
init/uninit in bonding cases. The former is used to initialize a slave
ibdev (when the slave is unlinked from a bond) or a bond ibdev, while
the latter does the opposite. Most of the process is the same as
regular device init/uninit, while some bonding‑specific steps below are
also added.
In bond device init flow, choose one slave to re-initialize as the
main_hr_dev of the bond, and it will be the only device presented for
multiple slaves. During registration, set an active netdev to the
ibdev based on the link state of the slaves. When this main_hr_dev
slave is being unlinked while the bond is still valid, choose a new
slave from the rest and initialize it as the new bond device.
In uninit flow, add a bond cleanup process, restore all the other
slaves and clean up bond resources. This is only for the case where
the port of main_hr_dev is directly removed without unlinking it
from bond.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_bond.c | 178 ++++++++++++++++++++
drivers/infiniband/hw/hns/hns_roce_bond.h | 6 +
drivers/infiniband/hw/hns/hns_roce_device.h | 3 +-
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 41 ++++-
drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 +
drivers/infiniband/hw/hns/hns_roce_main.c | 67 ++++++--
6 files changed, 283 insertions(+), 17 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
index 5fee44bcf81d..d6fce23501b4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.c
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -3,6 +3,7 @@
* Copyright (c) 2025 Hisilicon Limited.
*/
+#include <net/bonding.h>
#include "hns_roce_device.h"
#include "hns_roce_hw_v2.h"
#include "hns_roce_bond.h"
@@ -71,6 +72,143 @@ struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
return NULL;
}
+/*
+ * Select the netdev to associate with port 1 of @hr_dev's ibdev and set it.
+ *
+ * In active-backup mode the bonding driver's current active slave is used.
+ * In the other modes the first slave whose port state is IB_PORT_ACTIVE is
+ * used. If no such netdev is found, fall back to @hr_dev's own netdev.
+ *
+ * Return: 0 on success or when the netdev is unchanged, otherwise the error
+ * returned by ib_device_set_netdev().
+ */
+static int hns_roce_set_bond_netdev(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev)
+{
+ struct net_device *active_dev;
+ struct net_device *old_dev;
+ /* i is only advanced by the slave search in the non active-backup
+ * branch, so it must start out defined for the
+ * i == ROCE_BOND_FUNC_MAX test below to be valid in active-backup mode.
+ */
+ int i = 0, ret = 0;
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ rcu_read_lock();
+ active_dev =
+ bond_option_active_slave_get_rcu(netdev_priv(bond_grp->upper_dev));
+ rcu_read_unlock();
+ } else {
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ active_dev = bond_grp->bond_func_info[i].net_dev;
+ if (active_dev &&
+ ib_get_curr_port_state(active_dev) == IB_PORT_ACTIVE)
+ break;
+ }
+ }
+
+ /* No active slave found: fall back to this device's own netdev. */
+ if (!active_dev || i == ROCE_BOND_FUNC_MAX)
+ active_dev = get_hr_netdev(hr_dev, 0);
+
+ old_dev = ib_device_get_netdev(&hr_dev->ib_dev, 1);
+ if (old_dev == active_dev)
+ goto out;
+
+ ret = ib_device_set_netdev(&hr_dev->ib_dev, active_dev, 1);
+ if (ret) {
+ dev_err(hr_dev->dev, "failed to set netdev for bond.\n");
+ goto out;
+ }
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ if (old_dev)
+ roce_del_all_netdev_gids(&hr_dev->ib_dev, 1, old_dev);
+ rdma_roce_rescan_port(&hr_dev->ib_dev, 1);
+ }
+out:
+ /* Drop the reference obtained from ib_device_get_netdev(). */
+ dev_put(old_dev);
+ return ret;
+}
+
+/*
+ * Return true if a bond group is found for @hr_dev's netdev and bus number
+ * and its state is neither HNS_ROCE_BOND_NOT_BONDED nor
+ * HNS_ROCE_BOND_NOT_ATTACHED; false otherwise.
+ */
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED &&
+ bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED)
+ return true;
+
+ return false;
+}
+
+/*
+ * Uninitialize the RoCE instance of slave @func_idx if it currently has
+ * one (handle->priv is set). A slot without a handle is skipped, matching
+ * the NULL check done on the init side in hns_roce_slave_init().
+ */
+static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx)
+{
+ struct hnae3_handle *handle;
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (handle && handle->priv)
+ hns_roce_bond_uninit_client(bond_grp, func_idx);
+}
+
+static struct hns_roce_dev
+ *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx, bool need_switch);
+
+/*
+ * Uninitialize the current main device (function @main_func_idx) and pick
+ * a replacement: the first slave that is set in slave_map, has a netdev and
+ * can be (re-)initialized becomes bond_grp->main_hr_dev.
+ *
+ * Return: 0 on success, -ENODEV if no slave could be initialized.
+ */
+static int switch_main_dev(struct hns_roce_bond_group *bond_grp,
+ u8 main_func_idx)
+{
+ struct hns_roce_dev *hr_dev;
+ struct net_device *net_dev;
+ u8 i;
+
+ bond_grp->main_hr_dev = NULL;
+ hns_roce_bond_uninit_client(bond_grp, main_func_idx);
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if ((bond_grp->slave_map & (1U << i)) && net_dev) {
+ /* In case this slave is still being registered as
+ * a non-bonded PF, uninit it first and then re-init
+ * it as the main device.
+ */
+ hns_roce_slave_uninit(bond_grp, i);
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev) {
+ bond_grp->main_hr_dev = hr_dev;
+ break;
+ }
+ }
+ }
+
+ if (!bond_grp->main_hr_dev)
+ return -ENODEV;
+
+ return 0;
+}
+
+/*
+ * Initialize the RoCE instance of slave @func_idx.
+ *
+ * When @need_switch is true and @func_idx is the function of the current
+ * main_hr_dev, a new main device is selected first via switch_main_dev().
+ * With @need_switch set, the slot's net_dev is also cleared before the
+ * instance is initialized.
+ *
+ * Return: handle->priv if the instance already exists, the newly
+ * initialized device, or NULL if the slot has no handle, initialization
+ * failed, or switching the main device returned -ENODEV.
+ */
+static struct hns_roce_dev
+ *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx, bool need_switch)
+{
+ struct hns_roce_dev *hr_dev = NULL;
+ struct hnae3_handle *handle;
+ u8 main_func_idx;
+ int ret;
+
+ if (need_switch) {
+ main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
+ if (func_idx == main_func_idx) {
+ ret = switch_main_dev(bond_grp, main_func_idx);
+ if (ret == -ENODEV)
+ return NULL;
+ }
+ }
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (handle) {
+ /* Already initialized: hand back the existing device. */
+ if (handle->priv)
+ return handle->priv;
+ /* Prevent this device from being initialized as a bond device */
+ if (need_switch)
+ bond_grp->bond_func_info[func_idx].net_dev = NULL;
+ hr_dev = hns_roce_bond_init_client(bond_grp, func_idx);
+ if (!hr_dev)
+ BOND_ERR_LOG("failed to init slave %u.\n", func_idx);
+ }
+
+ return hr_dev;
+}
+
static struct hns_roce_die_info *alloc_die_info(int bus_num)
{
struct hns_roce_die_info *die_info;
@@ -194,6 +332,35 @@ static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp,
bond_grp->bond_ready = false;
}
+/*
+ * Reset @bond_grp to the not-attached state under bond_mutex: drop the
+ * upper and main device pointers, clear the ready flag, the slave map and
+ * all per-function info, and set the state to HNS_ROCE_BOND_NOT_ATTACHED.
+ */
+static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp)
+{
+ mutex_lock(&bond_grp->bond_mutex);
+
+ bond_grp->upper_dev = NULL;
+ bond_grp->main_hr_dev = NULL;
+ bond_grp->bond_ready = false;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
+ bond_grp->slave_map = 0;
+ memset(bond_grp->bond_func_info, 0, sizeof(bond_grp->bond_func_info));
+
+ mutex_unlock(&bond_grp->bond_mutex);
+}
+
+/*
+ * Send HNS_ROCE_CLEAR_BOND for @bond_grp if it still has a main device
+ * (a missing main device is reported as -EIO), log the outcome, and then
+ * detach the bond group regardless of the command result.
+ */
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+
+ ret = bond_grp->main_hr_dev ?
+ hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO;
+ if (ret)
+ BOND_ERR_LOG("failed to clear RoCE bond, ret = %d.\n", ret);
+ else
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE clear bond finished!\n");
+
+ hns_roce_detach_bond_grp(bond_grp);
+}
+
static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp,
struct net_device *net_dev)
{
@@ -489,3 +656,14 @@ void hns_roce_dealloc_bond_grp(void)
}
}
}
+
+/*
+ * Look up the bond group for @hr_dev's netdev and bus number and set the
+ * bond netdev on @hr_dev's ibdev.
+ *
+ * Return: -ENODEV if no bond group is found, otherwise the result of
+ * hns_roce_set_bond_netdev().
+ */
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ /* The lookup can return NULL, and hns_roce_set_bond_netdev()
+ * dereferences bond_grp unconditionally.
+ */
+ if (!bond_grp)
+ return -ENODEV;
+
+ return hns_roce_set_bond_netdev(bond_grp, hr_dev);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
index 84c94cbc397d..3ef7d28379cc 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.h
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -14,6 +14,9 @@
#define BOND_ID(id) BIT(id)
+#define BOND_ERR_LOG(fmt, ...) \
+ pr_err("HNS RoCE Bonding: " fmt, ##__VA_ARGS__)
+
enum {
BOND_MODE_1,
BOND_MODE_2_4,
@@ -80,5 +83,8 @@ struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
u8 bus_num);
int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev);
void hns_roce_dealloc_bond_grp(void);
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp);
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev);
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev);
#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index cc1402fc8943..0add49d9664b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -179,6 +179,7 @@ enum hns_roce_instance_state {
HNS_ROCE_STATE_INIT,
HNS_ROCE_STATE_INITED,
HNS_ROCE_STATE_UNINIT,
+ HNS_ROCE_STATE_BOND_UNINIT,
};
enum {
@@ -1304,7 +1305,7 @@ void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn);
void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
int hns_roce_init(struct hns_roce_dev *hr_dev);
-void hns_roce_exit(struct hns_roce_dev *hr_dev);
+void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup);
int hns_roce_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq);
int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq);
int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index d3c1ad04afd7..4c43e930e0d0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -7173,7 +7173,7 @@ static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
}
static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
- bool reset)
+ bool reset, bool bond_cleanup)
{
struct hns_roce_dev *hr_dev = handle->priv;
@@ -7185,7 +7185,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT;
hns_roce_handle_device_err(hr_dev);
- hns_roce_exit(hr_dev);
+ hns_roce_exit(hr_dev, bond_cleanup);
kfree(hr_dev->priv);
ib_dealloc_device(&hr_dev->ib_dev);
}
@@ -7241,7 +7241,40 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
- __hns_roce_hw_v2_uninit_instance(handle, reset);
+ __hns_roce_hw_v2_uninit_instance(handle, reset, true);
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+}
+
+/*
+ * Initialize the RoCE instance behind slot @func_idx of @bond_grp.
+ *
+ * Return: the new device (handle->priv) on success, or NULL if the slot has
+ * no handle, the handle has no client, or instance init failed.
+ */
+struct hns_roce_dev
+ *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx)
+{
+ struct hnae3_handle *handle;
+ int ret;
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (!handle || !handle->client)
+ return NULL;
+
+ ret = hns_roce_hw_v2_init_instance(handle);
+ if (ret)
+ return NULL;
+
+ return handle->priv;
+}
+
+/*
+ * Uninitialize the RoCE instance behind slot @func_idx of @bond_grp without
+ * running the bond cleanup. Nothing is done if the slot has no handle or
+ * the instance is not in HNS_ROCE_STATE_INITED.
+ */
+void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx)
+{
+ struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle;
+
+ /* Mirror the NULL check done by hns_roce_bond_init_client(). */
+ if (!handle || handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
+ return;
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT;
+
+ __hns_roce_hw_v2_uninit_instance(handle, false, false);
handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
}
@@ -7310,7 +7343,7 @@ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle)
handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT;
dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n");
msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY);
- __hns_roce_hw_v2_uninit_instance(handle, false);
+ __hns_roce_hw_v2_uninit_instance(handle, false, false);
return 0;
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 82cec4b38c92..285fe0875fac 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -1478,6 +1478,11 @@ struct hns_roce_bond_info {
__le32 hash_policy;
};
+struct hns_roce_dev
+ *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx);
+void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx);
int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
enum hns_roce_bond_cmd_type bond_type);
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 7fa25586ccd8..f7ef563d8239 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -614,9 +614,41 @@ static int hns_roce_get_hw_stats(struct ib_device *device,
return num_counters;
}
-static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
+/*
+ * Bond teardown used when @hr_dev is being unregistered: mark the group as
+ * not bonded, re-initialize every other slave that has a netdev, then clean
+ * up the bond group.
+ */
+static void
+ hns_roce_unregister_bond_cleanup(struct hns_roce_dev *hr_dev,
+ struct hns_roce_bond_group *bond_grp)
{
+ struct net_device *net_dev;
+ int i;
+
+ /* To avoid the loss of other slave devices when main_hr_dev
+ * is unregistered, re-initialize the remaining slaves before
+ * the bond resources cleanup.
+ */
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ /* Skip empty slots and the device being unregistered itself. */
+ if (net_dev && net_dev != get_hr_netdev(hr_dev, 0))
+ hns_roce_bond_init_client(bond_grp, i);
+ }
+
+ hns_roce_cleanup_bond(bond_grp);
+}
+
+static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev,
+ bool bond_cleanup)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
struct hns_roce_ib_iboe *iboe = &hr_dev->iboe;
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ int i;
+
+ if (bond_cleanup && hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp)
+ hns_roce_unregister_bond_cleanup(hr_dev, bond_grp);
+ }
hr_dev->active = false;
unregister_netdevice_notifier(&iboe->nb);
@@ -746,6 +778,8 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops);
+ dma_set_max_seg_size(dev, SZ_2G);
+
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
ret = hns_roce_alloc_bond_grp(hr_dev);
if (ret) {
@@ -755,17 +789,26 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
}
}
- for (i = 0; i < hr_dev->caps.num_ports; i++) {
- net_dev = get_hr_netdev(hr_dev, i);
- if (!net_dev)
- continue;
-
- ret = ib_device_set_netdev(ib_dev, net_dev, i + 1);
- if (ret)
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND &&
+ hns_roce_bond_is_active(hr_dev)) {
+ ret = hns_roce_bond_init(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to init bond!\n");
return ret;
+ }
+ ret = ib_register_device(ib_dev, "hns_bond_%d", dev);
+ } else {
+ for (i = 0; i < hr_dev->caps.num_ports; i++) {
+ net_dev = get_hr_netdev(hr_dev, i);
+ if (!net_dev)
+ continue;
+
+ ret = ib_device_set_netdev(ib_dev, net_dev, i + 1);
+ if (ret)
+ return ret;
+ }
+ ret = ib_register_device(ib_dev, "hns_%d", dev);
}
- dma_set_max_seg_size(dev, SZ_2G);
- ret = ib_register_device(ib_dev, "hns_%d", dev);
if (ret) {
dev_err(dev, "ib_register_device failed!\n");
return ret;
@@ -1165,10 +1208,10 @@ int hns_roce_init(struct hns_roce_dev *hr_dev)
return ret;
}
-void hns_roce_exit(struct hns_roce_dev *hr_dev)
+void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup)
{
hns_roce_unregister_debugfs(hr_dev);
- hns_roce_unregister_device(hr_dev);
+ hns_roce_unregister_device(hr_dev, bond_cleanup);
if (hr_dev->hw->hw_exit)
hr_dev->hw->hw_exit(hr_dev);
--
2.33.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH for-next 6/8] RDMA/hns: Add delayed work for bonding
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
` (4 preceding siblings ...)
2025-09-13 9:06 ` [PATCH for-next 5/8] RDMA/hns: Implement bonding init/uninit process Junxian Huang
@ 2025-09-13 9:06 ` Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 7/8] RDMA/hns: Support link state reporting for bond Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 8/8] RDMA/hns: Support reset recovery " Junxian Huang
7 siblings, 0 replies; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
When conditions are met, schedule a delayed work in bond event handler
to perform bonding operation according to the bond state. In the case
of changing slave number or link state, re-set the netdev for the bond
ibdev after the modification is complete, since these two operations
may not call hns_roce_set_bond_netdev() in hns_roce_init().
The delayed work will be paused when there is a driver reset or exit
to avoid concurrency.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_bond.c | 307 +++++++++++++++++++++
drivers/infiniband/hw/hns/hns_roce_bond.h | 5 +
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 13 +-
3 files changed, 324 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
index d6fce23501b4..dcafb8d9bfff 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.c
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -3,6 +3,7 @@
* Copyright (c) 2025 Hisilicon Limited.
*/
+#include <net/lag.h>
#include <net/bonding.h>
#include "hns_roce_device.h"
#include "hns_roce_hw_v2.h"
@@ -130,6 +131,32 @@ bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev)
return false;
}
+/*
+ * Recompute bond_grp->active_slave_num and bond_grp->active_slave_map.
+ *
+ * Only slots that have a netdev and are set in slave_map are considered.
+ * In active-backup mode a slave counts as active when
+ * net_lag_port_dev_txable() is true; in the other modes when its port state
+ * is IB_PORT_ACTIVE.
+ */
+static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ u32 active_slave_map = 0;
+ u8 active_slave_num = 0;
+ bool active;
+ u8 i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if (!net_dev || !(bond_grp->slave_map & (1U << i)))
+ continue;
+
+ active = (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ?
+ net_lag_port_dev_txable(net_dev) :
+ (ib_get_curr_port_state(net_dev) == IB_PORT_ACTIVE);
+ if (active) {
+ active_slave_num++;
+ active_slave_map |= (1U << i);
+ }
+ }
+
+ bond_grp->active_slave_num = active_slave_num;
+ bond_grp->active_slave_map = active_slave_map;
+}
+
static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp,
u8 func_idx)
{
@@ -224,11 +251,14 @@ static struct hns_roce_die_info *alloc_die_info(int bus_num)
return NULL;
}
+ mutex_init(&die_info->die_mutex);
+
return die_info;
}
static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num)
{
+ mutex_destroy(&die_info->die_mutex);
xa_erase(&roce_bond_xa, bus_num);
kfree(die_info);
}
@@ -277,6 +307,167 @@ static int remove_bond_id(int bus_num, u8 bond_id)
return 0;
}
+/*
+ * Establish the bond: uninitialize every slave in slave_map, switch the
+ * group state to HNS_ROCE_BOND_IS_BONDED, re-initialize the first slave
+ * that succeeds as main_hr_dev, and send HNS_ROCE_SET_BOND.
+ * On any failure the bond group is cleaned up.
+ */
+static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp)
+{
+ struct hns_roce_dev *hr_dev;
+ int ret;
+ int i;
+
+ for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) {
+ if (bond_grp->slave_map & (1 << i))
+ hns_roce_slave_uninit(bond_grp, i);
+ }
+
+ /* NOTE(review): the state is switched before re-init, presumably so
+ * that the re-initialized device sees the bond as active while it
+ * registers; confirm against hns_roce_register_device().
+ */
+ mutex_lock(&bond_grp->bond_mutex);
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+ bond_grp->main_hr_dev = NULL;
+
+ /* Only one slave is initialized: the first success becomes main. */
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ if (bond_grp->slave_map & (1 << i)) {
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev) {
+ bond_grp->main_hr_dev = hr_dev;
+ break;
+ }
+ }
+ }
+
+ if (!bond_grp->main_hr_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+
+out:
+ if (ret) {
+ BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret);
+ hns_roce_cleanup_bond(bond_grp);
+ } else {
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE set bond finished!\n");
+ }
+}
+
+/*
+ * Dissolve the bond: unless the group is already in
+ * HNS_ROCE_BOND_NOT_BONDED, mark it so, uninitialize the main device and
+ * initialize every slot again. In all cases finish with
+ * hns_roce_cleanup_bond().
+ */
+static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp)
+{
+ u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
+ struct hns_roce_dev *hr_dev;
+ u8 i;
+
+ if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED)
+ goto out;
+
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ bond_grp->main_hr_dev = NULL;
+
+ hns_roce_slave_uninit(bond_grp, main_func_idx);
+
+ /* main_hr_dev ends up as the last slot that initialized successfully;
+ * hns_roce_cleanup_bond() uses it to send the clear command.
+ */
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev)
+ bond_grp->main_hr_dev = hr_dev;
+ }
+
+out:
+ hns_roce_cleanup_bond(bond_grp);
+}
+
+/*
+ * Handle a slave link-state change: recompute the active slaves, send
+ * HNS_ROCE_CHANGE_BOND, and move the group back from
+ * HNS_ROCE_BOND_SLAVE_CHANGESTATE to HNS_ROCE_BOND_IS_BONDED. The state is
+ * restored whether or not the command succeeded; the result is only logged.
+ */
+static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
+
+ /* Only reset the state if nothing else changed it in the meantime. */
+ mutex_lock(&bond_grp->bond_mutex);
+ if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGESTATE)
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+
+ if (ret)
+ ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+ "failed to change RoCE bond slave state, ret = %d.\n",
+ ret);
+ else
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE slave changestate finished!\n");
+}
+
+/*
+ * Handle a change in the set of slaves.
+ *
+ * Slaves still in slave_map (other than the main device) are
+ * uninitialized. Slots no longer in slave_map are initialized again with
+ * need_switch set, and their slot info is cleared. Afterwards the active
+ * slaves are recomputed and HNS_ROCE_CHANGE_BOND is sent.
+ *
+ * On failure the bond group is cleaned up; on success the state moves from
+ * HNS_ROCE_BOND_SLAVE_CHANGE_NUM back to HNS_ROCE_BOND_IS_BONDED.
+ */
+static void hns_roce_slave_change_num(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+ u8 i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ if (bond_grp->slave_map & (1U << i)) {
+ if (i == PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn))
+ continue;
+ hns_roce_slave_uninit(bond_grp, i);
+ } else {
+ hns_roce_slave_init(bond_grp, i, true);
+ /* Switching the main device may have left none. */
+ if (!bond_grp->main_hr_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ bond_grp->bond_func_info[i].net_dev = NULL;
+ bond_grp->bond_func_info[i].handle = NULL;
+ }
+ }
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
+
+out:
+ if (ret) {
+ BOND_ERR_LOG("failed to change RoCE bond slave num, ret = %d.\n", ret);
+ hns_roce_cleanup_bond(bond_grp);
+ } else {
+ mutex_lock(&bond_grp->bond_mutex);
+ if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGE_NUM)
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE slave change num finished!\n");
+ }
+}
+
+/*
+ * Rebuild bond_grp->slave_map from the netdevs currently under @upper_dev.
+ *
+ * A netdev that is not yet recorded in the group is looked up by its
+ * hr_dev; netdevs without one are skipped. Its slot, indexed by PCI
+ * function number, is filled with the netdev and handle if still empty.
+ * As the name indicates, the function takes no lock itself.
+ */
+static void hns_roce_bond_info_update_nolock(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ struct hns_roce_v2_priv *priv;
+ struct hns_roce_dev *hr_dev;
+ struct net_device *net_dev;
+ int func_idx;
+
+ bond_grp->slave_map = 0;
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+ func_idx = get_netdev_bond_slave_id(net_dev, bond_grp);
+ /* Negative index: this netdev is not recorded in the group yet. */
+ if (func_idx < 0) {
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev)
+ continue;
+ func_idx = PCI_FUNC(hr_dev->pci_dev->devfn);
+ if (!bond_grp->bond_func_info[func_idx].net_dev) {
+ priv = hr_dev->priv;
+ bond_grp->bond_func_info[func_idx].net_dev =
+ net_dev;
+ bond_grp->bond_func_info[func_idx].handle =
+ priv->handle;
+ }
+ }
+
+ bond_grp->slave_map |= (1 << func_idx);
+ }
+ rcu_read_unlock();
+}
+
static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp,
struct net_device *net_dev)
{
@@ -322,6 +513,50 @@ static bool check_slave_support(struct hns_roce_bond_group *bond_grp,
return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX);
}
+/*
+ * Delayed work that applies a pending bond change.
+ *
+ * Under bond_mutex it re-evaluates whether the bond is supported, refreshes
+ * the slave info and samples the bond state. If the bond is no longer
+ * supported it is cleared; otherwise the handler matching the sampled state
+ * is run. After a slave state or slave number change the bond netdev of the
+ * main device is refreshed.
+ */
+static void hns_roce_bond_work(struct work_struct *work)
+{
+ struct delayed_work *delayed_work = to_delayed_work(work);
+ struct hns_roce_bond_group *bond_grp =
+ container_of(delayed_work, struct hns_roce_bond_group,
+ bond_work);
+ enum hns_roce_bond_state bond_state;
+ bool bond_ready;
+
+ mutex_lock(&bond_grp->bond_mutex);
+ bond_ready = check_slave_support(bond_grp, bond_grp->upper_dev);
+ hns_roce_bond_info_update_nolock(bond_grp, bond_grp->upper_dev);
+ bond_state = bond_grp->bond_state;
+ bond_grp->bond_ready = bond_ready;
+ mutex_unlock(&bond_grp->bond_mutex);
+
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "bond work: bond_ready - %d, bond_state - %d.\n",
+ bond_ready, bond_state);
+
+ if (!bond_ready) {
+ hns_roce_clear_bond(bond_grp);
+ return;
+ }
+
+ switch (bond_state) {
+ case HNS_ROCE_BOND_NOT_BONDED:
+ hns_roce_set_bond(bond_grp);
+ /* In set_bond flow, we don't need to set bond netdev here as
+ * it has been done when bond_grp->main_hr_dev is registered.
+ */
+ return;
+ case HNS_ROCE_BOND_SLAVE_CHANGESTATE:
+ hns_roce_slave_changestate(bond_grp);
+ break;
+ case HNS_ROCE_BOND_SLAVE_CHANGE_NUM:
+ hns_roce_slave_change_num(bond_grp);
+ break;
+ default:
+ return;
+ }
+
+ /* hns_roce_slave_change_num() cleans the bond up on failure, which
+ * clears main_hr_dev and upper_dev. There is then no ibdev left to
+ * update, and hns_roce_set_bond_netdev() would dereference NULL.
+ */
+ if (bond_grp->main_hr_dev)
+ hns_roce_set_bond_netdev(bond_grp, bond_grp->main_hr_dev);
+}
+
static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp,
struct hns_roce_dev *hr_dev,
struct net_device *upper_dev)
@@ -336,6 +571,7 @@ static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp)
{
mutex_lock(&bond_grp->bond_mutex);
+ cancel_delayed_work(&bond_grp->bond_work);
bond_grp->upper_dev = NULL;
bond_grp->main_hr_dev = NULL;
bond_grp->bond_ready = false;
@@ -576,6 +812,9 @@ static int hns_roce_bond_event(struct notifier_block *self,
if (event == NETDEV_CHANGELOWERSTATE)
changed = hns_roce_bond_lowerstate_event(bond_grp, ptr);
+ if (changed)
+ schedule_delayed_work(&bond_grp->bond_work, HZ);
+
return NOTIFY_DONE;
}
@@ -598,6 +837,7 @@ int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
}
mutex_init(&bond_grp->bond_mutex);
+ INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_bond_work);
bond_grp->bond_ready = false;
bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
@@ -630,6 +870,7 @@ int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
mem_err:
for (i--; i >= 0; i--) {
unregister_netdevice_notifier(&bgrps[i]->bond_nb);
+ cancel_delayed_work_sync(&bgrps[i]->bond_work);
remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id);
mutex_destroy(&bgrps[i]->bond_mutex);
kvfree(bgrps[i]);
@@ -650,6 +891,7 @@ void hns_roce_dealloc_bond_grp(void)
if (!bond_grp)
continue;
unregister_netdevice_notifier(&bond_grp->bond_nb);
+ cancel_delayed_work_sync(&bond_grp->bond_work);
remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
mutex_destroy(&bond_grp->bond_mutex);
kvfree(bond_grp);
@@ -667,3 +909,68 @@ int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
return hns_roce_set_bond_netdev(bond_grp, hr_dev);
}
+
+void hns_roce_bond_suspend(struct hnae3_handle *handle)
+{
+ u8 bus_num = handle->pdev->bus->number;
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ int i;
+
+ die_info = xa_load(&roce_bond_xa, bus_num);
+ if (!die_info)
+ return;
+
+ mutex_lock(&die_info->die_mutex);
+
+ /*
+ * Avoid duplicated processing when calling this function
+ * multiple times.
+ */
+ if (die_info->suspend_cnt)
+ goto out;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ unregister_netdevice_notifier(&bond_grp->bond_nb);
+ cancel_delayed_work_sync(&bond_grp->bond_work);
+ }
+
+out:
+ die_info->suspend_cnt++;
+ mutex_unlock(&die_info->die_mutex);
+}
+
+void hns_roce_bond_resume(struct hnae3_handle *handle)
+{
+ u8 bus_num = handle->pdev->bus->number;
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ int i, ret;
+
+ die_info = xa_load(&roce_bond_xa, bus_num);
+ if (!die_info)
+ return;
+
+ mutex_lock(&die_info->die_mutex);
+
+ die_info->suspend_cnt--;
+ if (die_info->suspend_cnt)
+ goto out;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ ret = register_netdevice_notifier(&bond_grp->bond_nb);
+ if (ret)
+ dev_err(&handle->pdev->dev,
+ "failed to resume bond notifier(bus_num = %u, id = %u), ret = %d.\n",
+ bus_num, bond_grp->bond_id, ret);
+ }
+
+out:
+ mutex_unlock(&die_info->die_mutex);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
index 3ef7d28379cc..98c295d78ca1 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.h
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -72,11 +72,14 @@ struct hns_roce_bond_group {
enum netdev_lag_hash hash_type;
struct mutex bond_mutex;
struct notifier_block bond_nb;
+ struct delayed_work bond_work;
};
struct hns_roce_die_info {
u8 bond_id_mask;
struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+ struct mutex die_mutex;
+ u8 suspend_cnt;
};
struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
@@ -86,5 +89,7 @@ void hns_roce_dealloc_bond_grp(void);
void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp);
bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev);
int hns_roce_bond_init(struct hns_roce_dev *hr_dev);
+void hns_roce_bond_suspend(struct hnae3_handle *handle);
+void hns_roce_bond_resume(struct hnae3_handle *handle);
#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 4c43e930e0d0..f1145f57bb3a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -7236,14 +7236,20 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
bool reset)
{
+ /* Suspend bond to avoid concurrency */
+ hns_roce_bond_suspend(handle);
+
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
- return;
+ goto out;
handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
__hns_roce_hw_v2_uninit_instance(handle, reset, true);
handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+
+out:
+ hns_roce_bond_resume(handle);
}
struct hns_roce_dev
@@ -7283,6 +7289,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
{
struct hns_roce_dev *hr_dev;
+ /* Suspend bond to avoid concurrency */
+ hns_roce_bond_suspend(handle);
+
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
return 0;
@@ -7313,6 +7322,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN,
&handle->rinfo.state)) {
handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+ hns_roce_bond_resume(handle);
return 0;
}
@@ -7332,6 +7342,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
dev_info(dev, "reset done, RoCE client reinit finished.\n");
}
+ hns_roce_bond_resume(handle);
return ret;
}
--
2.33.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH for-next 7/8] RDMA/hns: Support link state reporting for bond
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
` (5 preceding siblings ...)
2025-09-13 9:06 ` [PATCH for-next 6/8] RDMA/hns: Add delayed work for bonding Junxian Huang
@ 2025-09-13 9:06 ` Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 8/8] RDMA/hns: Support reset recovery " Junxian Huang
7 siblings, 0 replies; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
The link state of a bond depends on the upper device. Adapt the current
link state querying flow and ib_event dispatching flow to report the
correct link state of the bond.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 10 +++
drivers/infiniband/hw/hns/hns_roce_main.c | 89 ++++++++++++++++------
2 files changed, 75 insertions(+), 24 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index f1145f57bb3a..ebd0c5f38bc2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -7386,10 +7386,20 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle,
{
struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
struct net_device *netdev = handle->rinfo.netdev;
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num;
if (linkup || !hr_dev)
return;
+ /* For bond device, the link status depends on the upper netdev,
+ * and the upper device's link status depends on all the slaves'
+ * netdev but not only one. So bond device cannot get a correct
+ * link status from this path.
+ */
+ if (hns_roce_get_bond_grp(netdev, get_hr_bus_num(hr_dev)))
+ return;
+
ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev);
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index f7ef563d8239..eeb8c4bdae32 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -89,30 +89,66 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context)
return ret;
}
-static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port,
- unsigned long event)
+static int get_upper_port_state(struct hns_roce_dev *hr_dev,
+ enum ib_port_state *state)
{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (!bond_grp)
+ return -ENODEV;
+
+ *state = ib_get_curr_port_state(bond_grp->upper_dev);
+
+ return 0;
+}
+
+static int handle_en_event(struct net_device *netdev,
+ struct hns_roce_dev *hr_dev,
+ u32 port, unsigned long event)
+{
+ struct ib_device *ibdev = &hr_dev->ib_dev;
struct device *dev = hr_dev->dev;
- struct net_device *netdev;
+ enum ib_port_state curr_state;
+ struct ib_event ibevent;
int ret = 0;
- netdev = hr_dev->iboe.netdevs[port];
if (!netdev) {
dev_err(dev, "can't find netdev on port(%u)!\n", port);
return -ENODEV;
}
switch (event) {
- case NETDEV_UP:
- case NETDEV_CHANGE:
case NETDEV_REGISTER:
case NETDEV_CHANGEADDR:
ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
break;
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
+ if (ret)
+ return ret;
+ fallthrough;
case NETDEV_DOWN:
- /*
- * In v1 engine, only support all ports closed together.
- */
+ if (!netif_is_lag_master(netdev))
+ break;
+ curr_state = ib_get_curr_port_state(netdev);
+
+ write_lock_irq(&ibdev->cache_lock);
+ if (ibdev->port_data[port].cache.last_port_state == curr_state) {
+ write_unlock_irq(&ibdev->cache_lock);
+ return 0;
+ }
+ ibdev->port_data[port].cache.last_port_state = curr_state;
+ write_unlock_irq(&ibdev->cache_lock);
+
+ ibevent.event = (curr_state == IB_PORT_DOWN) ?
+ IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE;
+ ibevent.device = ibdev;
+ ibevent.element.port_num = port + 1;
+ ib_dispatch_event(&ibevent);
break;
default:
dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event));
@@ -126,17 +162,25 @@ static int hns_roce_netdev_event(struct notifier_block *self,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct hns_roce_bond_group *bond_grp;
struct hns_roce_ib_iboe *iboe = NULL;
struct hns_roce_dev *hr_dev = NULL;
+ struct net_device *upper = NULL;
int ret;
u32 port;
hr_dev = container_of(self, struct hns_roce_dev, iboe.nb);
iboe = &hr_dev->iboe;
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0),
+ get_hr_bus_num(hr_dev));
+ upper = bond_grp ? bond_grp->upper_dev : NULL;
+ }
for (port = 0; port < hr_dev->caps.num_ports; port++) {
- if (dev == iboe->netdevs[port]) {
- ret = handle_en_event(hr_dev, port, event);
+ if ((!upper && dev == iboe->netdevs[port]) ||
+ (upper && dev == upper)) {
+ ret = handle_en_event(dev, hr_dev, port, event);
if (ret)
return NOTIFY_DONE;
break;
@@ -222,9 +266,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
struct ib_port_attr *props)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
- struct device *dev = hr_dev->dev;
struct net_device *net_dev;
- unsigned long flags;
enum ib_mtu mtu;
u32 port;
int ret;
@@ -245,25 +287,24 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
if (ret)
ibdev_warn(ib_dev, "failed to get speed, ret = %d.\n", ret);
- spin_lock_irqsave(&hr_dev->iboe.lock, flags);
-
- net_dev = get_hr_netdev(hr_dev, port);
+ net_dev = ib_device_get_netdev(ib_dev, port_num);
if (!net_dev) {
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
- dev_err(dev, "find netdev %u failed!\n", port);
- return -EINVAL;
+ ibdev_err(ib_dev, "failed to get net_dev.\n");
+ return -ENODEV;
}
mtu = iboe_get_mtu(net_dev->mtu);
props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256;
- props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ?
- IB_PORT_ACTIVE :
- IB_PORT_DOWN;
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)
+ ret = get_upper_port_state(hr_dev, &props->state);
+ if (ret)
+ props->state = ib_get_curr_port_state(net_dev);
+
props->phys_state = props->state == IB_PORT_ACTIVE ?
IB_PORT_PHYS_STATE_LINK_UP :
IB_PORT_PHYS_STATE_DISABLED;
-
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
+ dev_put(net_dev);
return 0;
}
--
2.33.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH for-next 8/8] RDMA/hns: Support reset recovery for bond
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
` (6 preceding siblings ...)
2025-09-13 9:06 ` [PATCH for-next 7/8] RDMA/hns: Support link state reporting for bond Junxian Huang
@ 2025-09-13 9:06 ` Junxian Huang
7 siblings, 0 replies; 14+ messages in thread
From: Junxian Huang @ 2025-09-13 9:06 UTC (permalink / raw)
To: jgg, leon; +Cc: linux-rdma, linuxarm, huangjunxian6, tangchengchang
Re-set bond configuration to HW after HW reset.
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_bond.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
index dcafb8d9bfff..08e78d016574 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.c
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -157,6 +157,15 @@ static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp)
bond_grp->active_slave_map = active_slave_map;
}
+static int hns_roce_recover_bond(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev)
+{
+ bond_grp->main_hr_dev = hr_dev;
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ return hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+}
+
static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp,
u8 func_idx)
{
@@ -902,11 +911,22 @@ void hns_roce_dealloc_bond_grp(void)
int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
{
struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_v2_priv *priv = hr_dev->priv;
struct hns_roce_bond_group *bond_grp;
u8 bus_num = get_hr_bus_num(hr_dev);
+ int ret;
bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT) {
+ ret = hns_roce_recover_bond(bond_grp, hr_dev);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "failed to recover RoCE bond, ret = %d.\n", ret);
+ return ret;
+ }
+ }
+
return hns_roce_set_bond_netdev(bond_grp, hr_dev);
}
--
2.33.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH for-next 1/8] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev
2025-09-13 9:06 ` [PATCH for-next 1/8] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev Junxian Huang
@ 2025-09-24 14:00 ` Jason Gunthorpe
2025-09-28 10:12 ` Junxian Huang
0 siblings, 1 reply; 14+ messages in thread
From: Jason Gunthorpe @ 2025-09-24 14:00 UTC (permalink / raw)
To: Junxian Huang; +Cc: leon, linux-rdma, linuxarm, tangchengchang
On Sat, Sep 13, 2025 at 05:06:08PM +0800, Junxian Huang wrote:
> Add helpers to obtain netdev and bus_num from hr_dev.
bus number seems like a strange way to do this? Aren't your PFs a PCI
multi-function-device? Shouldn't it check for same-function instead?
Jason
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH for-next 2/8] RDMA/hns: Initialize bonding resources
2025-09-13 9:06 ` [PATCH for-next 2/8] RDMA/hns: Initialize bonding resources Junxian Huang
@ 2025-09-24 14:04 ` Jason Gunthorpe
2025-10-13 8:25 ` Junxian Huang
0 siblings, 1 reply; 14+ messages in thread
From: Jason Gunthorpe @ 2025-09-24 14:04 UTC (permalink / raw)
To: Junxian Huang; +Cc: leon, linux-rdma, linuxarm, tangchengchang
On Sat, Sep 13, 2025 at 05:06:09PM +0800, Junxian Huang wrote:
> +static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev)
> +{
> + struct net_device *upper_dev;
> +
> + rcu_read_lock();
> + upper_dev = netdev_master_upper_dev_get_rcu(net_dev);
> + rcu_read_unlock();
> + return upper_dev;
upper_dev cannot leave the RCU without refcounting it.
> +static bool check_vf_support(struct pci_dev *vf)
> +{
> + struct hns_roce_bond_group *bond_grp;
> + struct pci_dev *pf = pci_physfn(vf);
> + struct hnae3_ae_dev *ae_dev;
> + struct hnae3_handle *handle;
> + struct hns_roce_dev *hr_dev;
> + struct hclge_dev *hdev;
> +
> + if (pf == vf)
> + return true;
> +
> + ae_dev = pci_get_drvdata(pf);
This isn't how you get a drv data of a PF.. Use
pci_iov_get_pf_drvdata()
Jason
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH for-next 3/8] RDMA/hns: Add bonding event handler
2025-09-13 9:06 ` [PATCH for-next 3/8] RDMA/hns: Add bonding event handler Junxian Huang
@ 2025-09-24 14:05 ` Jason Gunthorpe
0 siblings, 0 replies; 14+ messages in thread
From: Jason Gunthorpe @ 2025-09-24 14:05 UTC (permalink / raw)
To: Junxian Huang; +Cc: leon, linux-rdma, linuxarm, tangchengchang
On Sat, Sep 13, 2025 at 05:06:10PM +0800, Junxian Huang wrote:
> +static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev)
> +{
> + struct ib_device *ibdev =
> + ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS);
> + struct hns_roce_dev *hr_dev;
> +
> + if (!ibdev)
> + return NULL;
> +
> + hr_dev = container_of(ibdev, struct hns_roce_dev, ib_dev);
> + ib_device_put(ibdev);
> + return hr_dev;
Huh? Put the refcount and then return a dangling pointer?
Jason
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH for-next 1/8] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev
2025-09-24 14:00 ` Jason Gunthorpe
@ 2025-09-28 10:12 ` Junxian Huang
0 siblings, 0 replies; 14+ messages in thread
From: Junxian Huang @ 2025-09-28 10:12 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: leon, linux-rdma, linuxarm, tangchengchang
On 2025/9/24 22:00, Jason Gunthorpe wrote:
> On Sat, Sep 13, 2025 at 05:06:08PM +0800, Junxian Huang wrote:
>> Add helpers to obtain netdev and bus_num from hr_dev.
>
> bus number seems like a strange way to do this? Aren't your PFs a PCI
> multi-function-device?
No, each of our PFs is an independent PCI function corresponding to one
physical port. We only support bonding ports on the same NIC, which
share the same bus number but are different functions. I think
checking the bus number should be enough.
Junxian
> Shouldn't it check for same-function instead?
>
> Jason
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH for-next 2/8] RDMA/hns: Initialize bonding resources
2025-09-24 14:04 ` Jason Gunthorpe
@ 2025-10-13 8:25 ` Junxian Huang
0 siblings, 0 replies; 14+ messages in thread
From: Junxian Huang @ 2025-10-13 8:25 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: leon, linux-rdma, linuxarm, tangchengchang
On 2025/9/24 22:04, Jason Gunthorpe wrote:
> On Sat, Sep 13, 2025 at 05:06:09PM +0800, Junxian Huang wrote:
>> +static bool check_vf_support(struct pci_dev *vf)
>> +{
>> + struct hns_roce_bond_group *bond_grp;
>> + struct pci_dev *pf = pci_physfn(vf);
>> + struct hnae3_ae_dev *ae_dev;
>> + struct hnae3_handle *handle;
>> + struct hns_roce_dev *hr_dev;
>> + struct hclge_dev *hdev;
>> +
>> + if (pf == vf)
>> + return true;
>> +
>> + ae_dev = pci_get_drvdata(pf);
>
> This isn't how you get a drv data of a PF.. Use
> pci_iov_get_pf_drvdata()
>
> Jason
>
Hi Jason, sorry for the late response.
After discussion, we decided to move this check into the FW
instead of the driver. I'll remove this code in v2.
Junxian
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2025-10-13 8:25 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-13 9:06 [PATCH for-next 0/8] RDMA/hns: Support RoCE bonding Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 1/8] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev Junxian Huang
2025-09-24 14:00 ` Jason Gunthorpe
2025-09-28 10:12 ` Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 2/8] RDMA/hns: Initialize bonding resources Junxian Huang
2025-09-24 14:04 ` Jason Gunthorpe
2025-10-13 8:25 ` Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 3/8] RDMA/hns: Add bonding event handler Junxian Huang
2025-09-24 14:05 ` Jason Gunthorpe
2025-09-13 9:06 ` [PATCH for-next 4/8] RDMA/hns: Add bonding cmds Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 5/8] RDMA/hns: Implement bonding init/uninit process Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 6/8] RDMA/hns: Add delayed work for bonding Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 7/8] RDMA/hns: Support link state reporting for bond Junxian Huang
2025-09-13 9:06 ` [PATCH for-next 8/8] RDMA/hns: Support reset recovery " Junxian Huang
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).