* [PATCHv5 for-rc1 v5 1/8] RDMA/rxe: Creating listening sock in newlink function
2023-04-28 9:39 [PATCHv5 for-rc1 v5 0/8] Fix the problem that rxe can not work in net namespace Zhu Yanjun
@ 2023-04-28 9:39 ` Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 2/8] RDMA/rxe: Support more rdma links in init_net Zhu Yanjun
` (6 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Zhu Yanjun @ 2023-04-28 9:39 UTC (permalink / raw)
To: zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer; +Cc: Zhu Yanjun, Rain River
From: Zhu Yanjun <yanjun.zhu@linux.dev>
Originally when the module rdma_rxe is loaded, the sock listening on udp
port 4791 is created. Currently moving the creating listening port to
newlink function.
So when running "rdma link add" command, the sock listening on udp port
4791 is created.
Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
drivers/infiniband/sw/rxe/rxe.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 136c2efe3466..64644cb0bb38 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -192,6 +192,10 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
goto err;
}
+ err = rxe_net_init();
+ if (err)
+ return err;
+
err = rxe_net_add(ibdev_name, ndev);
if (err) {
rxe_dbg(exists, "failed to add %s\n", ndev->name);
@@ -208,12 +212,6 @@ static struct rdma_link_ops rxe_link_ops = {
static int __init rxe_module_init(void)
{
- int err;
-
- err = rxe_net_init();
- if (err)
- return err;
-
rdma_link_register(&rxe_link_ops);
pr_info("loaded\n");
return 0;
--
2.27.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCHv5 for-rc1 v5 2/8] RDMA/rxe: Support more rdma links in init_net
2023-04-28 9:39 [PATCHv5 for-rc1 v5 0/8] Fix the problem that rxe can not work in net namespace Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 1/8] RDMA/rxe: Creating listening sock in newlink function Zhu Yanjun
@ 2023-04-28 9:39 ` Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 3/8] RDMA/nldev: Add dellink function pointer Zhu Yanjun
` (5 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Zhu Yanjun @ 2023-04-28 9:39 UTC (permalink / raw)
To: zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer; +Cc: Zhu Yanjun, Rain River
From: Zhu Yanjun <yanjun.zhu@linux.dev>
In init_net, when several rdma links are created with the command "rdma
link add", newlink will check whether the udp port 4791 is listening or
not.
If not, creating a sock listening on udp port 4791. If yes, increasing the
reference count of the sock.
Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
drivers/infiniband/sw/rxe/rxe.c | 12 ++++++-
drivers/infiniband/sw/rxe/rxe_net.c | 55 +++++++++++++++++++++--------
drivers/infiniband/sw/rxe/rxe_net.h | 1 +
3 files changed, 52 insertions(+), 16 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 64644cb0bb38..0ce6adb43cfc 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -8,6 +8,7 @@
#include <net/addrconf.h>
#include "rxe.h"
#include "rxe_loc.h"
+#include "rxe_net.h"
MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib");
MODULE_DESCRIPTION("Soft RDMA transport");
@@ -205,14 +206,23 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
return err;
}
-static struct rdma_link_ops rxe_link_ops = {
+struct rdma_link_ops rxe_link_ops = {
.type = "rxe",
.newlink = rxe_newlink,
};
static int __init rxe_module_init(void)
{
+ int err;
+
rdma_link_register(&rxe_link_ops);
+
+ err = rxe_register_notifier();
+ if (err) {
+ pr_err("Failed to register netdev notifier\n");
+ return -1;
+ }
+
pr_info("loaded\n");
return 0;
}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index e02e1624bcf4..3ca92e062800 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -623,13 +623,23 @@ static struct notifier_block rxe_net_notifier = {
static int rxe_net_ipv4_init(void)
{
- recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net,
- htons(ROCE_V2_UDP_DPORT), false);
- if (IS_ERR(recv_sockets.sk4)) {
- recv_sockets.sk4 = NULL;
+ struct sock *sk;
+ struct socket *sock;
+
+ rcu_read_lock();
+ sk = udp4_lib_lookup(&init_net, 0, 0, htonl(INADDR_ANY),
+ htons(ROCE_V2_UDP_DPORT), 0);
+ rcu_read_unlock();
+ if (sk)
+ return 0;
+
+ sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), false);
+ if (IS_ERR(sock)) {
pr_err("Failed to create IPv4 UDP tunnel\n");
+ recv_sockets.sk4 = NULL;
return -1;
}
+ recv_sockets.sk4 = sock;
return 0;
}
@@ -637,24 +647,46 @@ static int rxe_net_ipv4_init(void)
static int rxe_net_ipv6_init(void)
{
#if IS_ENABLED(CONFIG_IPV6)
+ struct sock *sk;
+ struct socket *sock;
+
+ rcu_read_lock();
+ sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any,
+ htons(ROCE_V2_UDP_DPORT), 0);
+ rcu_read_unlock();
+ if (sk)
+ return 0;
- recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
- htons(ROCE_V2_UDP_DPORT), true);
- if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) {
+ sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), true);
+ if (PTR_ERR(sock) == -EAFNOSUPPORT) {
recv_sockets.sk6 = NULL;
pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
return 0;
}
- if (IS_ERR(recv_sockets.sk6)) {
+ if (IS_ERR(sock)) {
recv_sockets.sk6 = NULL;
pr_err("Failed to create IPv6 UDP tunnel\n");
return -1;
}
+ recv_sockets.sk6 = sock;
#endif
return 0;
}
+int rxe_register_notifier(void)
+{
+ int err;
+
+ err = register_netdevice_notifier(&rxe_net_notifier);
+ if (err) {
+ pr_err("Failed to register netdev notifier\n");
+ return -1;
+ }
+
+ return 0;
+}
+
void rxe_net_exit(void)
{
rxe_release_udp_tunnel(recv_sockets.sk6);
@@ -666,19 +698,12 @@ int rxe_net_init(void)
{
int err;
- recv_sockets.sk6 = NULL;
-
err = rxe_net_ipv4_init();
if (err)
return err;
err = rxe_net_ipv6_init();
if (err)
goto err_out;
- err = register_netdevice_notifier(&rxe_net_notifier);
- if (err) {
- pr_err("Failed to register netdev notifier\n");
- goto err_out;
- }
return 0;
err_out:
rxe_net_exit();
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index 45d80d00f86b..a222c3eeae12 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -18,6 +18,7 @@ struct rxe_recv_sockets {
int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
+int rxe_register_notifier(void);
int rxe_net_init(void);
void rxe_net_exit(void);
--
2.27.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCHv5 for-rc1 v5 3/8] RDMA/nldev: Add dellink function pointer
2023-04-28 9:39 [PATCHv5 for-rc1 v5 0/8] Fix the problem that rxe can not work in net namespace Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 1/8] RDMA/rxe: Creating listening sock in newlink function Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 2/8] RDMA/rxe: Support more rdma links in init_net Zhu Yanjun
@ 2023-04-28 9:39 ` Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe Zhu Yanjun
` (4 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Zhu Yanjun @ 2023-04-28 9:39 UTC (permalink / raw)
To: zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer; +Cc: Zhu Yanjun, Rain River
From: Zhu Yanjun <yanjun.zhu@linux.dev>
The newlink function pointer is added. And the sock listening on port 4791
is added in the newlink function. So the dellink function is needed to
remove the sock.
Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
drivers/infiniband/core/nldev.c | 6 ++++++
include/rdma/rdma_netlink.h | 2 ++
2 files changed, 8 insertions(+)
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index d5d3e4f0de77..97a62685ed5b 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1758,6 +1758,12 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
+ if (device->link_ops) {
+ err = device->link_ops->dellink(device);
+ if (err)
+ return err;
+ }
+
ib_unregister_device_and_put(device);
return 0;
}
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index c2a79aeee113..bf9df004061f 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -5,6 +5,7 @@
#include <linux/netlink.h>
#include <uapi/rdma/rdma_netlink.h>
+#include <rdma/ib_verbs.h>
enum {
RDMA_NLDEV_ATTR_EMPTY_STRING = 1,
@@ -114,6 +115,7 @@ struct rdma_link_ops {
struct list_head list;
const char *type;
int (*newlink)(const char *ibdev_name, struct net_device *ndev);
+ int (*dellink)(struct ib_device *dev);
};
void rdma_link_register(struct rdma_link_ops *ops);
--
2.27.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe
2023-04-28 9:39 [PATCHv5 for-rc1 v5 0/8] Fix the problem that rxe can not work in net namespace Zhu Yanjun
` (2 preceding siblings ...)
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 3/8] RDMA/nldev: Add dellink function pointer Zhu Yanjun
@ 2023-04-28 9:39 ` Zhu Yanjun
2023-06-20 20:21 ` Bob Pearson
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 5/8] RDMA/rxe: Replace global variable with sock lookup functions Zhu Yanjun
` (3 subsequent siblings)
7 siblings, 1 reply; 17+ messages in thread
From: Zhu Yanjun @ 2023-04-28 9:39 UTC (permalink / raw)
To: zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer; +Cc: Zhu Yanjun, Rain River
From: Zhu Yanjun <yanjun.zhu@linux.dev>
When running "rdma link del" command, dellink function will be called.
If the sock refcnt is greater than the refcnt needed for udp tunnel,
the sock refcnt will be decreased by 1.
If equal, the last rdma link is removed. The udp tunnel will be
destroyed.
Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++-
drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
drivers/infiniband/sw/rxe/rxe_net.h | 1 +
3 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 0ce6adb43cfc..ebfabc6d6b76 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
/* called by ifc layer to create new rxe device.
* The caller should allocate memory for rxe by calling ib_alloc_device.
*/
+static struct rdma_link_ops rxe_link_ops;
int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
{
rxe_init(rxe);
rxe_set_mtu(rxe, mtu);
+ rxe->ib_dev.link_ops = &rxe_link_ops;
return rxe_register_device(rxe, ibdev_name);
}
@@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
return err;
}
-struct rdma_link_ops rxe_link_ops = {
+static int rxe_dellink(struct ib_device *dev)
+{
+ rxe_net_del(dev);
+
+ return 0;
+}
+
+static struct rdma_link_ops rxe_link_ops = {
.type = "rxe",
.newlink = rxe_newlink,
+ .dellink = rxe_dellink,
};
static int __init rxe_module_init(void)
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 3ca92e062800..4cc7de7b115b 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
return 0;
}
+#define SK_REF_FOR_TUNNEL 2
+void rxe_net_del(struct ib_device *dev)
+{
+ if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
+ __sock_put(recv_sockets.sk6->sk);
+ else
+ rxe_release_udp_tunnel(recv_sockets.sk6);
+
+ if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
+ __sock_put(recv_sockets.sk4->sk);
+ else
+ rxe_release_udp_tunnel(recv_sockets.sk4);
+}
+#undef SK_REF_FOR_TUNNEL
+
static void rxe_port_event(struct rxe_dev *rxe,
enum ib_event_type event)
{
@@ -689,8 +704,6 @@ int rxe_register_notifier(void)
void rxe_net_exit(void)
{
- rxe_release_udp_tunnel(recv_sockets.sk6);
- rxe_release_udp_tunnel(recv_sockets.sk4);
unregister_netdevice_notifier(&rxe_net_notifier);
}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index a222c3eeae12..f48f22f3353b 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -17,6 +17,7 @@ struct rxe_recv_sockets {
};
int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
+void rxe_net_del(struct ib_device *dev);
int rxe_register_notifier(void);
int rxe_net_init(void);
--
2.27.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* Re: [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe Zhu Yanjun
@ 2023-06-20 20:21 ` Bob Pearson
2023-06-21 2:13 ` Zhu Yanjun
0 siblings, 1 reply; 17+ messages in thread
From: Bob Pearson @ 2023-06-20 20:21 UTC (permalink / raw)
To: Zhu Yanjun, zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer
Cc: Zhu Yanjun, Rain River
On 4/28/23 04:39, Zhu Yanjun wrote:
> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>
> When running "rdma link del" command, dellink function will be called.
> If the sock refcnt is greater than the refcnt needed for udp tunnel,
> the sock refcnt will be decreased by 1.
>
> If equal, the last rdma link is removed. The udp tunnel will be
> destroyed.
>
> Tested-by: Rain River <rain.1986.08.12@gmail.com>
> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
> ---
> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++-
> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
> drivers/infiniband/sw/rxe/rxe_net.h | 1 +
> 3 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 0ce6adb43cfc..ebfabc6d6b76 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
> /* called by ifc layer to create new rxe device.
> * The caller should allocate memory for rxe by calling ib_alloc_device.
> */
> +static struct rdma_link_ops rxe_link_ops;
> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
> {
> rxe_init(rxe);
> rxe_set_mtu(rxe, mtu);
> + rxe->ib_dev.link_ops = &rxe_link_ops;
>
> return rxe_register_device(rxe, ibdev_name);
> }
> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
> return err;
> }
>
> -struct rdma_link_ops rxe_link_ops = {
> +static int rxe_dellink(struct ib_device *dev)
> +{
> + rxe_net_del(dev);
> +
> + return 0;
> +}
> +
> +static struct rdma_link_ops rxe_link_ops = {
> .type = "rxe",
> .newlink = rxe_newlink,
> + .dellink = rxe_dellink,
> };
>
> static int __init rxe_module_init(void)
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index 3ca92e062800..4cc7de7b115b 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
> return 0;
> }
>
> +#define SK_REF_FOR_TUNNEL 2
> +void rxe_net_del(struct ib_device *dev)
> +{
> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
> + __sock_put(recv_sockets.sk6->sk);
> + else
> + rxe_release_udp_tunnel(recv_sockets.sk6);
> +
> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
> + __sock_put(recv_sockets.sk4->sk);
> + else
> + rxe_release_udp_tunnel(recv_sockets.sk4);
> +}
> +#undef SK_REF_FOR_TUNNEL
> +
> static void rxe_port_event(struct rxe_dev *rxe,
> enum ib_event_type event)
> {
> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>
> void rxe_net_exit(void)
> {
> - rxe_release_udp_tunnel(recv_sockets.sk6);
> - rxe_release_udp_tunnel(recv_sockets.sk4);
> unregister_netdevice_notifier(&rxe_net_notifier);
> }
These calls are moved to rxe_net_del which is called by an explicit unlink command.
But if rxe_net_init fails and returns an error code this will never happen.
This will result in leaking resources.
Bob
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
> index a222c3eeae12..f48f22f3353b 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.h
> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
> };
>
> int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
> +void rxe_net_del(struct ib_device *dev);
>
> int rxe_register_notifier(void);
> int rxe_net_init(void);
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe
2023-06-20 20:21 ` Bob Pearson
@ 2023-06-21 2:13 ` Zhu Yanjun
2023-06-21 3:23 ` Bob Pearson
0 siblings, 1 reply; 17+ messages in thread
From: Zhu Yanjun @ 2023-06-21 2:13 UTC (permalink / raw)
To: Bob Pearson, Zhu Yanjun, zyjzyj2000, jgg, leon, linux-rdma, parav,
lehrer
Cc: Rain River
在 2023/6/21 4:21, Bob Pearson 写道:
> On 4/28/23 04:39, Zhu Yanjun wrote:
>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>
>> When running "rdma link del" command, dellink function will be called.
>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>> the sock refcnt will be decreased by 1.
>>
>> If equal, the last rdma link is removed. The udp tunnel will be
>> destroyed.
>>
>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>> ---
>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++-
>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>> drivers/infiniband/sw/rxe/rxe_net.h | 1 +
>> 3 files changed, 27 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>> --- a/drivers/infiniband/sw/rxe/rxe.c
>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>> /* called by ifc layer to create new rxe device.
>> * The caller should allocate memory for rxe by calling ib_alloc_device.
>> */
>> +static struct rdma_link_ops rxe_link_ops;
>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>> {
>> rxe_init(rxe);
>> rxe_set_mtu(rxe, mtu);
>> + rxe->ib_dev.link_ops = &rxe_link_ops;
>>
>> return rxe_register_device(rxe, ibdev_name);
>> }
>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>> return err;
>> }
>>
>> -struct rdma_link_ops rxe_link_ops = {
>> +static int rxe_dellink(struct ib_device *dev)
>> +{
>> + rxe_net_del(dev);
>> +
>> + return 0;
>> +}
>> +
>> +static struct rdma_link_ops rxe_link_ops = {
>> .type = "rxe",
>> .newlink = rxe_newlink,
>> + .dellink = rxe_dellink,
>> };
>>
>> static int __init rxe_module_init(void)
>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>> index 3ca92e062800..4cc7de7b115b 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>> return 0;
>> }
>>
>> +#define SK_REF_FOR_TUNNEL 2
>> +void rxe_net_del(struct ib_device *dev)
>> +{
>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>> + __sock_put(recv_sockets.sk6->sk);
>> + else
>> + rxe_release_udp_tunnel(recv_sockets.sk6);
>> +
>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>> + __sock_put(recv_sockets.sk4->sk);
>> + else
>> + rxe_release_udp_tunnel(recv_sockets.sk4);
>> +}
>> +#undef SK_REF_FOR_TUNNEL
>> +
>> static void rxe_port_event(struct rxe_dev *rxe,
>> enum ib_event_type event)
>> {
>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>
>> void rxe_net_exit(void)
>> {
>> - rxe_release_udp_tunnel(recv_sockets.sk6);
>> - rxe_release_udp_tunnel(recv_sockets.sk4);
>> unregister_netdevice_notifier(&rxe_net_notifier);
>> }
> These calls are moved to rxe_net_del which is called by an explicit unlink command.
> But if rxe_net_init fails and returns an error code this will never happen.
> This will result in leaking resources.
Thanks a lot. Bob.
Sure, if ipv6 tunnel fails to be created, the resource related with ipv4
should be released.
I will fix it in the latest version.
Zhu Yanjun
>
> Bob
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>> index a222c3eeae12..f48f22f3353b 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>> };
>>
>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>> +void rxe_net_del(struct ib_device *dev);
>>
>> int rxe_register_notifier(void);
>> int rxe_net_init(void);
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe
2023-06-21 2:13 ` Zhu Yanjun
@ 2023-06-21 3:23 ` Bob Pearson
2023-06-21 6:17 ` Zhu Yanjun
0 siblings, 1 reply; 17+ messages in thread
From: Bob Pearson @ 2023-06-21 3:23 UTC (permalink / raw)
To: Zhu Yanjun, Zhu Yanjun, zyjzyj2000, jgg, leon, linux-rdma, parav,
lehrer
Cc: Rain River
On 6/20/23 21:13, Zhu Yanjun wrote:
>
> 在 2023/6/21 4:21, Bob Pearson 写道:
>> On 4/28/23 04:39, Zhu Yanjun wrote:
>>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>
>>> When running "rdma link del" command, dellink function will be called.
>>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>>> the sock refcnt will be decreased by 1.
>>>
>>> If equal, the last rdma link is removed. The udp tunnel will be
>>> destroyed.
>>>
>>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>> ---
>>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++-
>>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>> drivers/infiniband/sw/rxe/rxe_net.h | 1 +
>>> 3 files changed, 27 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>> /* called by ifc layer to create new rxe device.
>>> * The caller should allocate memory for rxe by calling ib_alloc_device.
>>> */
>>> +static struct rdma_link_ops rxe_link_ops;
>>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>> {
>>> rxe_init(rxe);
>>> rxe_set_mtu(rxe, mtu);
>>> + rxe->ib_dev.link_ops = &rxe_link_ops;
>>> return rxe_register_device(rxe, ibdev_name);
>>> }
>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>> return err;
>>> }
>>> -struct rdma_link_ops rxe_link_ops = {
>>> +static int rxe_dellink(struct ib_device *dev)
>>> +{
>>> + rxe_net_del(dev);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static struct rdma_link_ops rxe_link_ops = {
>>> .type = "rxe",
>>> .newlink = rxe_newlink,
>>> + .dellink = rxe_dellink,
>>> };
>>> static int __init rxe_module_init(void)
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>> index 3ca92e062800..4cc7de7b115b 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>> return 0;
>>> }
>>> +#define SK_REF_FOR_TUNNEL 2
>>> +void rxe_net_del(struct ib_device *dev)
>>> +{
>>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>> + __sock_put(recv_sockets.sk6->sk);
>>> + else
>>> + rxe_release_udp_tunnel(recv_sockets.sk6);
>>> +
>>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>> + __sock_put(recv_sockets.sk4->sk);
>>> + else
>>> + rxe_release_udp_tunnel(recv_sockets.sk4);
>>> +}
>>> +#undef SK_REF_FOR_TUNNEL
>>> +
>>> static void rxe_port_event(struct rxe_dev *rxe,
>>> enum ib_event_type event)
>>> {
>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>> void rxe_net_exit(void)
>>> {
>>> - rxe_release_udp_tunnel(recv_sockets.sk6);
>>> - rxe_release_udp_tunnel(recv_sockets.sk4);
>>> unregister_netdevice_notifier(&rxe_net_notifier);
>>> }
>> These calls are moved to rxe_net_del which is called by an explicit unlink command.
>> But if rxe_net_init fails and returns an error code this will never happen.
>> This will result in leaking resources.
>
> Thanks a lot. Bob.
>
> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released.
>
> I will fix it in the latest version.
>
> Zhu Yanjun
I haven't had a chance to test netns yet. I am sure it works but I will test it.
The only other thing I noticed are some stylistic differences with the rest of
the rxe driver. You use
struct rxe_dev *rdev;
elsewhere it is
struct rxe_dev *rxe;
Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device.
rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver
specific ones. It's less typing that way.
With a couple of exceptions all the printk's are now in the form
rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to.
where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically
adapted from the siw driver.
Regards,
Bob
>
>>
>> Bob
>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>>> index a222c3eeae12..f48f22f3353b 100644
>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>> };
>>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>>> +void rxe_net_del(struct ib_device *dev);
>>> int rxe_register_notifier(void);
>>> int rxe_net_init(void);
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe
2023-06-21 3:23 ` Bob Pearson
@ 2023-06-21 6:17 ` Zhu Yanjun
2023-06-21 16:24 ` Bob Pearson
0 siblings, 1 reply; 17+ messages in thread
From: Zhu Yanjun @ 2023-06-21 6:17 UTC (permalink / raw)
To: Bob Pearson, Zhu Yanjun, zyjzyj2000, jgg, leon, linux-rdma, parav,
lehrer
Cc: Rain River
在 2023/6/21 11:23, Bob Pearson 写道:
> On 6/20/23 21:13, Zhu Yanjun wrote:
>> 在 2023/6/21 4:21, Bob Pearson 写道:
>>> On 4/28/23 04:39, Zhu Yanjun wrote:
>>>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>
>>>> When running "rdma link del" command, dellink function will be called.
>>>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>>>> the sock refcnt will be decreased by 1.
>>>>
>>>> If equal, the last rdma link is removed. The udp tunnel will be
>>>> destroyed.
>>>>
>>>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>> ---
>>>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++-
>>>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>>> drivers/infiniband/sw/rxe/rxe_net.h | 1 +
>>>> 3 files changed, 27 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>>> /* called by ifc layer to create new rxe device.
>>>> * The caller should allocate memory for rxe by calling ib_alloc_device.
>>>> */
>>>> +static struct rdma_link_ops rxe_link_ops;
>>>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>>> {
>>>> rxe_init(rxe);
>>>> rxe_set_mtu(rxe, mtu);
>>>> + rxe->ib_dev.link_ops = &rxe_link_ops;
>>>> return rxe_register_device(rxe, ibdev_name);
>>>> }
>>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>>> return err;
>>>> }
>>>> -struct rdma_link_ops rxe_link_ops = {
>>>> +static int rxe_dellink(struct ib_device *dev)
>>>> +{
>>>> + rxe_net_del(dev);
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static struct rdma_link_ops rxe_link_ops = {
>>>> .type = "rxe",
>>>> .newlink = rxe_newlink,
>>>> + .dellink = rxe_dellink,
>>>> };
>>>> static int __init rxe_module_init(void)
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>>> index 3ca92e062800..4cc7de7b115b 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>>> return 0;
>>>> }
>>>> +#define SK_REF_FOR_TUNNEL 2
>>>> +void rxe_net_del(struct ib_device *dev)
>>>> +{
>>>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>> + __sock_put(recv_sockets.sk6->sk);
>>>> + else
>>>> + rxe_release_udp_tunnel(recv_sockets.sk6);
>>>> +
>>>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>> + __sock_put(recv_sockets.sk4->sk);
>>>> + else
>>>> + rxe_release_udp_tunnel(recv_sockets.sk4);
>>>> +}
>>>> +#undef SK_REF_FOR_TUNNEL
>>>> +
>>>> static void rxe_port_event(struct rxe_dev *rxe,
>>>> enum ib_event_type event)
>>>> {
>>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>>> void rxe_net_exit(void)
>>>> {
>>>> - rxe_release_udp_tunnel(recv_sockets.sk6);
>>>> - rxe_release_udp_tunnel(recv_sockets.sk4);
>>>> unregister_netdevice_notifier(&rxe_net_notifier);
>>>> }
>>> These calls are moved to rxe_net_del which is called by an explicit unlink command.
>>> But if rxe_net_init fails and returns an error code this will never happen.
>>> This will result in leaking resources.
>> Thanks a lot. Bob.
>>
>> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released.
>>
>> I will fix it in the latest version.
>>
>> Zhu Yanjun
> I haven't had a chance to test netns yet. I am sure it works but I will test it.
Yes. Please. It is an interesting feature.
> The only other thing I noticed are some stylistic differences with the rest of
> the rxe driver. You use
>
> struct rxe_dev *rdev;
>
> elsewhere it is
>
> struct rxe_dev *rxe;
>
> Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device.
> rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver
> specific ones. It's less typing that way.
Got you. I think we should use rxe instead of rdev. I will fix it in the
latest commits.
>
> With a couple of exceptions all the printk's are now in the form
>
> rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to.
>
> where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically
> adapted from the siw driver.
If I can get you correctly, you mean that we should use rxe_dbg_qp, ....
to replace pr_err ....
I have questions:
1). What benefit will this bring?
2). If the log is in module __init or module __exit functions, we
should use pr_xxx? Because obj does not exist in these __init and __exit
functions.
Best Regards,
Zhu Yanjun
>
> Regards,
>
> Bob
>
>>> Bob
>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>>>> index a222c3eeae12..f48f22f3353b 100644
>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>>> };
>>>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>>>> +void rxe_net_del(struct ib_device *dev);
>>>> int rxe_register_notifier(void);
>>>> int rxe_net_init(void);
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe
2023-06-21 6:17 ` Zhu Yanjun
@ 2023-06-21 16:24 ` Bob Pearson
2023-06-23 7:19 ` Zhu Yanjun
0 siblings, 1 reply; 17+ messages in thread
From: Bob Pearson @ 2023-06-21 16:24 UTC (permalink / raw)
To: Zhu Yanjun, Zhu Yanjun, zyjzyj2000, jgg, leon, linux-rdma, parav,
lehrer
Cc: Rain River
On 6/21/23 01:17, Zhu Yanjun wrote:
>
> 在 2023/6/21 11:23, Bob Pearson 写道:
>> On 6/20/23 21:13, Zhu Yanjun wrote:
>>> 在 2023/6/21 4:21, Bob Pearson 写道:
>>>> On 4/28/23 04:39, Zhu Yanjun wrote:
>>>>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>>
>>>>> When running "rdma link del" command, dellink function will be called.
>>>>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>>>>> the sock refcnt will be decreased by 1.
>>>>>
>>>>> If equal, the last rdma link is removed. The udp tunnel will be
>>>>> destroyed.
>>>>>
>>>>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>> ---
>>>>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++-
>>>>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>>>> drivers/infiniband/sw/rxe/rxe_net.h | 1 +
>>>>> 3 files changed, 27 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>>>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>>>> /* called by ifc layer to create new rxe device.
>>>>> * The caller should allocate memory for rxe by calling ib_alloc_device.
>>>>> */
>>>>> +static struct rdma_link_ops rxe_link_ops;
>>>>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>>>> {
>>>>> rxe_init(rxe);
>>>>> rxe_set_mtu(rxe, mtu);
>>>>> + rxe->ib_dev.link_ops = &rxe_link_ops;
>>>>> return rxe_register_device(rxe, ibdev_name);
>>>>> }
>>>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>>>> return err;
>>>>> }
>>>>> -struct rdma_link_ops rxe_link_ops = {
>>>>> +static int rxe_dellink(struct ib_device *dev)
>>>>> +{
>>>>> + rxe_net_del(dev);
>>>>> +
>>>>> + return 0;
>>>>> +}
>>>>> +
>>>>> +static struct rdma_link_ops rxe_link_ops = {
>>>>> .type = "rxe",
>>>>> .newlink = rxe_newlink,
>>>>> + .dellink = rxe_dellink,
>>>>> };
>>>>> static int __init rxe_module_init(void)
>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>>>> index 3ca92e062800..4cc7de7b115b 100644
>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>>>> return 0;
>>>>> }
>>>>> +#define SK_REF_FOR_TUNNEL 2
>>>>> +void rxe_net_del(struct ib_device *dev)
>>>>> +{
>>>>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>>> + __sock_put(recv_sockets.sk6->sk);
>>>>> + else
>>>>> + rxe_release_udp_tunnel(recv_sockets.sk6);
>>>>> +
>>>>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>>> + __sock_put(recv_sockets.sk4->sk);
>>>>> + else
>>>>> + rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>> +}
>>>>> +#undef SK_REF_FOR_TUNNEL
>>>>> +
>>>>> static void rxe_port_event(struct rxe_dev *rxe,
>>>>> enum ib_event_type event)
>>>>> {
>>>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>>>> void rxe_net_exit(void)
>>>>> {
>>>>> - rxe_release_udp_tunnel(recv_sockets.sk6);
>>>>> - rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>> unregister_netdevice_notifier(&rxe_net_notifier);
>>>>> }
>>>> These calls are moved to rxe_net_del which is called by an explicit unlink command.
>>>> But if rxe_net_init fails and returns an error code this will never happen.
>>>> This will result in leaking resources.
>>> Thanks a lot. Bob.
>>>
>>> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released.
>>>
>>> I will fix it in the latest version.
>>>
>>> Zhu Yanjun
>> I haven't had a chance to test netns yet. I am sure it works but I will test it.
> Yes. Please. It is an interesting feature.
>> The only other thing I noticed are some stylistic differences with the rest of
>> the rxe driver. You use
>>
>> struct rxe_dev *rdev;
>>
>> elsewhere it is
>>
>> struct rxe_dev *rxe;
>>
>> Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device.
>> rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver
>> specific ones. It's less typing that way.
>
>
> Got you. I think we should use rxe instead of rdev. I will fix it in the latest commits.
>
>
>>
>> With a couple of exceptions all the printk's are now in the form
>>
>> rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to.
>>
>> where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically
>> adapted from the siw driver.
>
> If I can get you correctly, you mean that we should use rxe_dbg_qp, .... to replace pr_err ....
>
> I have questions:
>
> 1). What benefit will this bring?
>
> 2). If the log is in module __init or module __exit functions, we should use pr_xxx? Because obj does not exist in these __init and __exit functions.
I think that is the way the driver is now. Go ahead.
>
> Best Regards,
>
> Zhu Yanjun
>
>>
>> Regards,
>>
>> Bob
>>
>>>> Bob
>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>>>>> index a222c3eeae12..f48f22f3353b 100644
>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>>>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>>>> };
>>>>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>>>>> +void rxe_net_del(struct ib_device *dev);
>>>>> int rxe_register_notifier(void);
>>>>> int rxe_net_init(void);
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe
2023-06-21 16:24 ` Bob Pearson
@ 2023-06-23 7:19 ` Zhu Yanjun
0 siblings, 0 replies; 17+ messages in thread
From: Zhu Yanjun @ 2023-06-23 7:19 UTC (permalink / raw)
To: Bob Pearson, Zhu Yanjun, zyjzyj2000, jgg, leon, linux-rdma, parav,
lehrer
Cc: Rain River
在 2023/6/22 0:24, Bob Pearson 写道:
> On 6/21/23 01:17, Zhu Yanjun wrote:
>>
>> 在 2023/6/21 11:23, Bob Pearson 写道:
>>> On 6/20/23 21:13, Zhu Yanjun wrote:
>>>> 在 2023/6/21 4:21, Bob Pearson 写道:
>>>>> On 4/28/23 04:39, Zhu Yanjun wrote:
>>>>>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>>>
>>>>>> When running "rdma link del" command, dellink function will be called.
>>>>>> If the sock refcnt is greater than the refcnt needed for udp tunnel,
>>>>>> the sock refcnt will be decreased by 1.
>>>>>>
>>>>>> If equal, the last rdma link is removed. The udp tunnel will be
>>>>>> destroyed.
>>>>>>
>>>>>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>>>>>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>>>>>> ---
>>>>>> drivers/infiniband/sw/rxe/rxe.c | 12 +++++++++++-
>>>>>> drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++++++++++++--
>>>>>> drivers/infiniband/sw/rxe/rxe_net.h | 1 +
>>>>>> 3 files changed, 27 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>>>>>> index 0ce6adb43cfc..ebfabc6d6b76 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe.c
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>>>>>> @@ -166,10 +166,12 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
>>>>>> /* called by ifc layer to create new rxe device.
>>>>>> * The caller should allocate memory for rxe by calling ib_alloc_device.
>>>>>> */
>>>>>> +static struct rdma_link_ops rxe_link_ops;
>>>>>> int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
>>>>>> {
>>>>>> rxe_init(rxe);
>>>>>> rxe_set_mtu(rxe, mtu);
>>>>>> + rxe->ib_dev.link_ops = &rxe_link_ops;
>>>>>> return rxe_register_device(rxe, ibdev_name);
>>>>>> }
>>>>>> @@ -206,9 +208,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
>>>>>> return err;
>>>>>> }
>>>>>> -struct rdma_link_ops rxe_link_ops = {
>>>>>> +static int rxe_dellink(struct ib_device *dev)
>>>>>> +{
>>>>>> + rxe_net_del(dev);
>>>>>> +
>>>>>> + return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static struct rdma_link_ops rxe_link_ops = {
>>>>>> .type = "rxe",
>>>>>> .newlink = rxe_newlink,
>>>>>> + .dellink = rxe_dellink,
>>>>>> };
>>>>>> static int __init rxe_module_init(void)
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>>>>>> index 3ca92e062800..4cc7de7b115b 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>>>>>> @@ -530,6 +530,21 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>>>>>> return 0;
>>>>>> }
>>>>>> +#define SK_REF_FOR_TUNNEL 2
>>>>>> +void rxe_net_del(struct ib_device *dev)
>>>>>> +{
>>>>>> + if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>>>> + __sock_put(recv_sockets.sk6->sk);
>>>>>> + else
>>>>>> + rxe_release_udp_tunnel(recv_sockets.sk6);
>>>>>> +
>>>>>> + if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>>>>>> + __sock_put(recv_sockets.sk4->sk);
>>>>>> + else
>>>>>> + rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>>> +}
>>>>>> +#undef SK_REF_FOR_TUNNEL
>>>>>> +
>>>>>> static void rxe_port_event(struct rxe_dev *rxe,
>>>>>> enum ib_event_type event)
>>>>>> {
>>>>>> @@ -689,8 +704,6 @@ int rxe_register_notifier(void)
>>>>>> void rxe_net_exit(void)
>>>>>> {
>>>>>> - rxe_release_udp_tunnel(recv_sockets.sk6);
>>>>>> - rxe_release_udp_tunnel(recv_sockets.sk4);
>>>>>> unregister_netdevice_notifier(&rxe_net_notifier);
>>>>>> }
>>>>> These calls are moved to rxe_net_del which is called by an explicit unlink command.
>>>>> But if rxe_net_init fails and returns an error code this will never happen.
>>>>> This will result in leaking resources.
>>>> Thanks a lot. Bob.
>>>>
>>>> Sure, if ipv6 tunnel fails to be created, the resource related with ipv4 should be released.
>>>>
>>>> I will fix it in the latest version.
>>>>
>>>> Zhu Yanjun
>>> I haven't had a chance to test netns yet. I am sure it works but I will test it.
>> Yes. Please. It is an interesting feature.
>>> The only other thing I noticed are some stylistic differences with the rest of
>>> the rxe driver. You use
>>>
>>> struct rxe_dev *rdev;
>>>
>>> elsewhere it is
>>>
>>> struct rxe_dev *rxe;
>>>
>>> Yours is more like the mlx drivers where they use dev for ib_device and mdev for mlx_device.
>>> rxe tries to use ibdev ibqp, ibmr, etc for the ib objects and no prefix for the driver
>>> specific ones. It's less typing that way.
>>
>>
>> Got you. I think we should use rxe instead of rdev. I will fix it in the latest commits.
>>
>>
>>>
>>> With a couple of exceptions all the printk's are now in the form
>>>
>>> rxe_[type]_[obj](obj, "message", ...) or rxe_[type] if there isn't an obj to refer to.
>>>
>>> where type = info, err, warn, or dbg and obj = rxe, ah, pd, qp, cq, etc. These are basically
>>> adapted from the siw driver.
>>
>> If I can get you correctly, you mean that we should use rxe_dbg_qp, .... to replace pr_err ....
>>
>> I have questions:
>>
>> 1). What benefit will this bring?
>>
>> 2). If the log is in module __init or module __exit functions, we should use pr_xxx? Because obj does not exist in these __init and __exit functions.
> I think that is the way the driver is now. Go ahead.
OK. The latest commits will be sent out very soon.
Zhu Yanjun
>>
>> Best Regards,
>>
>> Zhu Yanjun
>>
>>>
>>> Regards,
>>>
>>> Bob
>>>
>>>>> Bob
>>>>>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>>>>>> index a222c3eeae12..f48f22f3353b 100644
>>>>>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>>>>>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>>>>>> @@ -17,6 +17,7 @@ struct rxe_recv_sockets {
>>>>>> };
>>>>>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>>>>>> +void rxe_net_del(struct ib_device *dev);
>>>>>> int rxe_register_notifier(void);
>>>>>> int rxe_net_init(void);
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCHv5 for-rc1 v5 5/8] RDMA/rxe: Replace global variable with sock lookup functions
2023-04-28 9:39 [PATCHv5 for-rc1 v5 0/8] Fix the problem that rxe can not work in net namespace Zhu Yanjun
` (3 preceding siblings ...)
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 4/8] RDMA/rxe: Implement dellink in rxe Zhu Yanjun
@ 2023-04-28 9:39 ` Zhu Yanjun
2023-06-20 20:28 ` Bob Pearson
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 6/8] RDMA/rxe: add the support of net namespace Zhu Yanjun
` (2 subsequent siblings)
7 siblings, 1 reply; 17+ messages in thread
From: Zhu Yanjun @ 2023-04-28 9:39 UTC (permalink / raw)
To: zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer; +Cc: Zhu Yanjun, Rain River
From: Zhu Yanjun <yanjun.zhu@linux.dev>
Originally a global variable is to keep the sock of udp listening
on port 4791. In fact, sock lookup functions can be used to get
the sock.
Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
drivers/infiniband/sw/rxe/rxe.c | 1 +
drivers/infiniband/sw/rxe/rxe_net.c | 58 ++++++++++++++++++++-------
drivers/infiniband/sw/rxe/rxe_net.h | 5 ---
drivers/infiniband/sw/rxe/rxe_verbs.h | 1 +
4 files changed, 45 insertions(+), 20 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index ebfabc6d6b76..e81c2164d77f 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -74,6 +74,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
rxe->ndev->dev_addr);
rxe->max_ucontext = RXE_MAX_UCONTEXT;
+ rxe->l_sk6 = NULL;
}
/* initialize port attributes */
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 4cc7de7b115b..b56e2c32fbf7 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -18,8 +18,6 @@
#include "rxe_net.h"
#include "rxe_loc.h"
-static struct rxe_recv_sockets recv_sockets;
-
static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
struct net_device *ndev,
struct in_addr *saddr,
@@ -51,6 +49,23 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
{
struct dst_entry *ndst;
struct flowi6 fl6 = { { 0 } };
+ struct rxe_dev *rdev;
+
+ rdev = rxe_get_dev_from_net(ndev);
+ if (!rdev->l_sk6) {
+ struct sock *sk;
+
+ rcu_read_lock();
+ sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any, htons(ROCE_V2_UDP_DPORT), 0);
+ rcu_read_unlock();
+ if (!sk) {
+ pr_info("file: %s +%d, error\n", __FILE__, __LINE__);
+ return (struct dst_entry *)sk;
+ }
+ __sock_put(sk);
+ rdev->l_sk6 = sk->sk_socket;
+ }
+
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_oif = ndev->ifindex;
@@ -58,8 +73,8 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
memcpy(&fl6.daddr, daddr, sizeof(*daddr));
fl6.flowi6_proto = IPPROTO_UDP;
- ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk),
- recv_sockets.sk6->sk, &fl6,
+ ndst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(ndev),
+ rdev->l_sk6->sk, &fl6,
NULL);
if (IS_ERR(ndst)) {
rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
@@ -533,15 +548,33 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
#define SK_REF_FOR_TUNNEL 2
void rxe_net_del(struct ib_device *dev)
{
- if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
- __sock_put(recv_sockets.sk6->sk);
+ struct sock *sk;
+
+ rcu_read_lock();
+ sk = udp4_lib_lookup(&init_net, 0, 0, htonl(INADDR_ANY), htons(ROCE_V2_UDP_DPORT), 0);
+ rcu_read_unlock();
+ if (!sk)
+ return;
+
+ __sock_put(sk);
+
+ if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
+ __sock_put(sk);
else
- rxe_release_udp_tunnel(recv_sockets.sk6);
+ rxe_release_udp_tunnel(sk->sk_socket);
+
+ rcu_read_lock();
+ sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any, htons(ROCE_V2_UDP_DPORT), 0);
+ rcu_read_unlock();
+ if (!sk)
+ return;
+
+ __sock_put(sk);
- if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
- __sock_put(recv_sockets.sk4->sk);
+ if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
+ __sock_put(sk);
else
- rxe_release_udp_tunnel(recv_sockets.sk4);
+ rxe_release_udp_tunnel(sk->sk_socket);
}
#undef SK_REF_FOR_TUNNEL
@@ -651,10 +684,8 @@ static int rxe_net_ipv4_init(void)
sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), false);
if (IS_ERR(sock)) {
pr_err("Failed to create IPv4 UDP tunnel\n");
- recv_sockets.sk4 = NULL;
return -1;
}
- recv_sockets.sk4 = sock;
return 0;
}
@@ -674,17 +705,14 @@ static int rxe_net_ipv6_init(void)
sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), true);
if (PTR_ERR(sock) == -EAFNOSUPPORT) {
- recv_sockets.sk6 = NULL;
pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
return 0;
}
if (IS_ERR(sock)) {
- recv_sockets.sk6 = NULL;
pr_err("Failed to create IPv6 UDP tunnel\n");
return -1;
}
- recv_sockets.sk6 = sock;
#endif
return 0;
}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index f48f22f3353b..027b20e1bab6 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -11,11 +11,6 @@
#include <net/if_inet6.h>
#include <linux/module.h>
-struct rxe_recv_sockets {
- struct socket *sk4;
- struct socket *sk6;
-};
-
int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
void rxe_net_del(struct ib_device *dev);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index c269ae2a3224..ac9bb55820a2 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -396,6 +396,7 @@ struct rxe_dev {
struct rxe_port port;
struct crypto_shash *tfm;
+ struct socket *l_sk6;
};
static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
--
2.27.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* Re: [PATCHv5 for-rc1 v5 5/8] RDMA/rxe: Replace global variable with sock lookup functions
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 5/8] RDMA/rxe: Replace global variable with sock lookup functions Zhu Yanjun
@ 2023-06-20 20:28 ` Bob Pearson
2023-06-21 1:30 ` Zhu Yanjun
0 siblings, 1 reply; 17+ messages in thread
From: Bob Pearson @ 2023-06-20 20:28 UTC (permalink / raw)
To: Zhu Yanjun, zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer
Cc: Zhu Yanjun, Rain River
On 4/28/23 04:39, Zhu Yanjun wrote:
> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>
> Originally a global variable is to keep the sock of udp listening
> on port 4791. In fact, sock lookup functions can be used to get
> the sock.
>
> Tested-by: Rain River <rain.1986.08.12@gmail.com>
> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
> ---
> drivers/infiniband/sw/rxe/rxe.c | 1 +
> drivers/infiniband/sw/rxe/rxe_net.c | 58 ++++++++++++++++++++-------
> drivers/infiniband/sw/rxe/rxe_net.h | 5 ---
> drivers/infiniband/sw/rxe/rxe_verbs.h | 1 +
> 4 files changed, 45 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index ebfabc6d6b76..e81c2164d77f 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -74,6 +74,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
> rxe->ndev->dev_addr);
>
> rxe->max_ucontext = RXE_MAX_UCONTEXT;
> + rxe->l_sk6 = NULL;
> }
>
> /* initialize port attributes */
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index 4cc7de7b115b..b56e2c32fbf7 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -18,8 +18,6 @@
> #include "rxe_net.h"
> #include "rxe_loc.h"
>
> -static struct rxe_recv_sockets recv_sockets;
> -
> static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
> struct net_device *ndev,
> struct in_addr *saddr,
> @@ -51,6 +49,23 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
> {
> struct dst_entry *ndst;
> struct flowi6 fl6 = { { 0 } };
> + struct rxe_dev *rdev;
> +
> + rdev = rxe_get_dev_from_net(ndev);
> + if (!rdev->l_sk6) {
> + struct sock *sk;
> +
> + rcu_read_lock();
> + sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any, htons(ROCE_V2_UDP_DPORT), 0);
This subroutine (and similar udp4) is currently called for every UD packet. This just adds a lot of
code to packet processing for UD packets. All to save a global variable. Not sure it's worth it.
Bob
> + rcu_read_unlock();
> + if (!sk) {
> + pr_info("file: %s +%d, error\n", __FILE__, __LINE__);
> + return (struct dst_entry *)sk;
> + }
> + __sock_put(sk);
> + rdev->l_sk6 = sk->sk_socket;
> + }
> +
>
> memset(&fl6, 0, sizeof(fl6));
> fl6.flowi6_oif = ndev->ifindex;
> @@ -58,8 +73,8 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
> memcpy(&fl6.daddr, daddr, sizeof(*daddr));
> fl6.flowi6_proto = IPPROTO_UDP;
>
> - ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk),
> - recv_sockets.sk6->sk, &fl6,
> + ndst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(ndev),
> + rdev->l_sk6->sk, &fl6,
> NULL);
> if (IS_ERR(ndst)) {
> rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
> @@ -533,15 +548,33 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
> #define SK_REF_FOR_TUNNEL 2
> void rxe_net_del(struct ib_device *dev)
> {
> - if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
> - __sock_put(recv_sockets.sk6->sk);
> + struct sock *sk;
> +
> + rcu_read_lock();
> + sk = udp4_lib_lookup(&init_net, 0, 0, htonl(INADDR_ANY), htons(ROCE_V2_UDP_DPORT), 0);
> + rcu_read_unlock();
> + if (!sk)
> + return;
> +
> + __sock_put(sk);
> +
> + if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
> + __sock_put(sk);
> else
> - rxe_release_udp_tunnel(recv_sockets.sk6);
> + rxe_release_udp_tunnel(sk->sk_socket);
> +
> + rcu_read_lock();
> + sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any, htons(ROCE_V2_UDP_DPORT), 0);
> + rcu_read_unlock();
> + if (!sk)
> + return;
> +
> + __sock_put(sk);
>
> - if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
> - __sock_put(recv_sockets.sk4->sk);
> + if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
> + __sock_put(sk);
> else
> - rxe_release_udp_tunnel(recv_sockets.sk4);
> + rxe_release_udp_tunnel(sk->sk_socket);
> }
> #undef SK_REF_FOR_TUNNEL
>
> @@ -651,10 +684,8 @@ static int rxe_net_ipv4_init(void)
> sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), false);
> if (IS_ERR(sock)) {
> pr_err("Failed to create IPv4 UDP tunnel\n");
> - recv_sockets.sk4 = NULL;
> return -1;
> }
> - recv_sockets.sk4 = sock;
>
> return 0;
> }
> @@ -674,17 +705,14 @@ static int rxe_net_ipv6_init(void)
>
> sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), true);
> if (PTR_ERR(sock) == -EAFNOSUPPORT) {
> - recv_sockets.sk6 = NULL;
> pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
> return 0;
> }
>
> if (IS_ERR(sock)) {
> - recv_sockets.sk6 = NULL;
> pr_err("Failed to create IPv6 UDP tunnel\n");
> return -1;
> }
> - recv_sockets.sk6 = sock;
> #endif
> return 0;
> }
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
> index f48f22f3353b..027b20e1bab6 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.h
> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
> @@ -11,11 +11,6 @@
> #include <net/if_inet6.h>
> #include <linux/module.h>
>
> -struct rxe_recv_sockets {
> - struct socket *sk4;
> - struct socket *sk6;
> -};
> -
> int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
> void rxe_net_del(struct ib_device *dev);
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index c269ae2a3224..ac9bb55820a2 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -396,6 +396,7 @@ struct rxe_dev {
>
> struct rxe_port port;
> struct crypto_shash *tfm;
> + struct socket *l_sk6;
> };
>
> static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCHv5 for-rc1 v5 5/8] RDMA/rxe: Replace global variable with sock lookup functions
2023-06-20 20:28 ` Bob Pearson
@ 2023-06-21 1:30 ` Zhu Yanjun
0 siblings, 0 replies; 17+ messages in thread
From: Zhu Yanjun @ 2023-06-21 1:30 UTC (permalink / raw)
To: Bob Pearson, Zhu Yanjun, zyjzyj2000, jgg, leon, linux-rdma, parav,
lehrer
Cc: Rain River
在 2023/6/21 4:28, Bob Pearson 写道:
> On 4/28/23 04:39, Zhu Yanjun wrote:
>> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>>
>> Originally a global variable is to keep the sock of udp listening
>> on port 4791. In fact, sock lookup functions can be used to get
>> the sock.
>>
>> Tested-by: Rain River <rain.1986.08.12@gmail.com>
>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>> ---
>> drivers/infiniband/sw/rxe/rxe.c | 1 +
>> drivers/infiniband/sw/rxe/rxe_net.c | 58 ++++++++++++++++++++-------
>> drivers/infiniband/sw/rxe/rxe_net.h | 5 ---
>> drivers/infiniband/sw/rxe/rxe_verbs.h | 1 +
>> 4 files changed, 45 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
>> index ebfabc6d6b76..e81c2164d77f 100644
>> --- a/drivers/infiniband/sw/rxe/rxe.c
>> +++ b/drivers/infiniband/sw/rxe/rxe.c
>> @@ -74,6 +74,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>> rxe->ndev->dev_addr);
>>
>> rxe->max_ucontext = RXE_MAX_UCONTEXT;
>> + rxe->l_sk6 = NULL;
>> }
>>
>> /* initialize port attributes */
>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
>> index 4cc7de7b115b..b56e2c32fbf7 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_net.c
>> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
>> @@ -18,8 +18,6 @@
>> #include "rxe_net.h"
>> #include "rxe_loc.h"
>>
>> -static struct rxe_recv_sockets recv_sockets;
>> -
>> static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
>> struct net_device *ndev,
>> struct in_addr *saddr,
>> @@ -51,6 +49,23 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
>> {
>> struct dst_entry *ndst;
>> struct flowi6 fl6 = { { 0 } };
>> + struct rxe_dev *rdev;
>> +
>> + rdev = rxe_get_dev_from_net(ndev);
>> + if (!rdev->l_sk6) {
>> + struct sock *sk;
>> +
>> + rcu_read_lock();
>> + sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any, htons(ROCE_V2_UDP_DPORT), 0);
> This subroutine (and similar udp4) is currently called for every UD packet. This just adds a lot of
> code to packet processing for UD packets. All to save a global variable. Not sure it's worth it.
Thanks, Bob. I also noticed this problem. 2 variables rxe_sk4 and
rxe_sk6 are added.
The 2 functions udp6_lib_lookup and udp4_lib_lookup are removed.
Zhu Yanjun
>
> Bob
>> + rcu_read_unlock();
>> + if (!sk) {
>> + pr_info("file: %s +%d, error\n", __FILE__, __LINE__);
>> + return (struct dst_entry *)sk;
>> + }
>> + __sock_put(sk);
>> + rdev->l_sk6 = sk->sk_socket;
>> + }
>> +
>>
>> memset(&fl6, 0, sizeof(fl6));
>> fl6.flowi6_oif = ndev->ifindex;
>> @@ -58,8 +73,8 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
>> memcpy(&fl6.daddr, daddr, sizeof(*daddr));
>> fl6.flowi6_proto = IPPROTO_UDP;
>>
>> - ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk),
>> - recv_sockets.sk6->sk, &fl6,
>> + ndst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(ndev),
>> + rdev->l_sk6->sk, &fl6,
>> NULL);
>> if (IS_ERR(ndst)) {
>> rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
>> @@ -533,15 +548,33 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
>> #define SK_REF_FOR_TUNNEL 2
>> void rxe_net_del(struct ib_device *dev)
>> {
>> - if (refcount_read(&recv_sockets.sk6->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>> - __sock_put(recv_sockets.sk6->sk);
>> + struct sock *sk;
>> +
>> + rcu_read_lock();
>> + sk = udp4_lib_lookup(&init_net, 0, 0, htonl(INADDR_ANY), htons(ROCE_V2_UDP_DPORT), 0);
>> + rcu_read_unlock();
>> + if (!sk)
>> + return;
>> +
>> + __sock_put(sk);
>> +
>> + if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>> + __sock_put(sk);
>> else
>> - rxe_release_udp_tunnel(recv_sockets.sk6);
>> + rxe_release_udp_tunnel(sk->sk_socket);
>> +
>> + rcu_read_lock();
>> + sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any, htons(ROCE_V2_UDP_DPORT), 0);
>> + rcu_read_unlock();
>> + if (!sk)
>> + return;
>> +
>> + __sock_put(sk);
>>
>> - if (refcount_read(&recv_sockets.sk4->sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>> - __sock_put(recv_sockets.sk4->sk);
>> + if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
>> + __sock_put(sk);
>> else
>> - rxe_release_udp_tunnel(recv_sockets.sk4);
>> + rxe_release_udp_tunnel(sk->sk_socket);
>> }
>> #undef SK_REF_FOR_TUNNEL
>>
>> @@ -651,10 +684,8 @@ static int rxe_net_ipv4_init(void)
>> sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), false);
>> if (IS_ERR(sock)) {
>> pr_err("Failed to create IPv4 UDP tunnel\n");
>> - recv_sockets.sk4 = NULL;
>> return -1;
>> }
>> - recv_sockets.sk4 = sock;
>>
>> return 0;
>> }
>> @@ -674,17 +705,14 @@ static int rxe_net_ipv6_init(void)
>>
>> sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), true);
>> if (PTR_ERR(sock) == -EAFNOSUPPORT) {
>> - recv_sockets.sk6 = NULL;
>> pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
>> return 0;
>> }
>>
>> if (IS_ERR(sock)) {
>> - recv_sockets.sk6 = NULL;
>> pr_err("Failed to create IPv6 UDP tunnel\n");
>> return -1;
>> }
>> - recv_sockets.sk6 = sock;
>> #endif
>> return 0;
>> }
>> diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
>> index f48f22f3353b..027b20e1bab6 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_net.h
>> +++ b/drivers/infiniband/sw/rxe/rxe_net.h
>> @@ -11,11 +11,6 @@
>> #include <net/if_inet6.h>
>> #include <linux/module.h>
>>
>> -struct rxe_recv_sockets {
>> - struct socket *sk4;
>> - struct socket *sk6;
>> -};
>> -
>> int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
>> void rxe_net_del(struct ib_device *dev);
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
>> index c269ae2a3224..ac9bb55820a2 100644
>> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
>> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
>> @@ -396,6 +396,7 @@ struct rxe_dev {
>>
>> struct rxe_port port;
>> struct crypto_shash *tfm;
>> + struct socket *l_sk6;
>> };
>>
>> static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCHv5 for-rc1 v5 6/8] RDMA/rxe: add the support of net namespace
2023-04-28 9:39 [PATCHv5 for-rc1 v5 0/8] Fix the problem that rxe can not work in net namespace Zhu Yanjun
` (4 preceding siblings ...)
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 5/8] RDMA/rxe: Replace global variable with sock lookup functions Zhu Yanjun
@ 2023-04-28 9:39 ` Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 7/8] RDMA/rxe: Add the support of net namespace notifier Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 8/8] RDMA/rxe: Replace l_sk6 with sk6 in net namespace Zhu Yanjun
7 siblings, 0 replies; 17+ messages in thread
From: Zhu Yanjun @ 2023-04-28 9:39 UTC (permalink / raw)
To: zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer; +Cc: Zhu Yanjun, Rain River
From: Zhu Yanjun <yanjun.zhu@linux.dev>
Originally init_net is used to indicate the current net namespace.
Currently more net namespaces are supported.
Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
drivers/infiniband/sw/rxe/rxe.c | 2 +-
drivers/infiniband/sw/rxe/rxe_net.c | 33 +++++++++++++++++------------
drivers/infiniband/sw/rxe/rxe_net.h | 2 +-
3 files changed, 22 insertions(+), 15 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index e81c2164d77f..4a17e4a003f5 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -196,7 +196,7 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
goto err;
}
- err = rxe_net_init();
+ err = rxe_net_init(ndev);
if (err)
return err;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index b56e2c32fbf7..9af90587642a 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -32,7 +32,7 @@ static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
memcpy(&fl.daddr, daddr, sizeof(*daddr));
fl.flowi4_proto = IPPROTO_UDP;
- rt = ip_route_output_key(&init_net, &fl);
+ rt = ip_route_output_key(dev_net(ndev), &fl);
if (IS_ERR(rt)) {
rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr);
return NULL;
@@ -56,7 +56,8 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
struct sock *sk;
rcu_read_lock();
- sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any, htons(ROCE_V2_UDP_DPORT), 0);
+ sk = udp6_lib_lookup(dev_net(ndev), NULL, 0, &in6addr_any,
+ htons(ROCE_V2_UDP_DPORT), 0);
rcu_read_unlock();
if (!sk) {
pr_info("file: %s +%d, error\n", __FILE__, __LINE__);
@@ -549,9 +550,13 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
void rxe_net_del(struct ib_device *dev)
{
struct sock *sk;
+ struct rxe_dev *rdev;
+
+ rdev = container_of(dev, struct rxe_dev, ib_dev);
rcu_read_lock();
- sk = udp4_lib_lookup(&init_net, 0, 0, htonl(INADDR_ANY), htons(ROCE_V2_UDP_DPORT), 0);
+ sk = udp4_lib_lookup(dev_net(rdev->ndev), 0, 0, htonl(INADDR_ANY),
+ htons(ROCE_V2_UDP_DPORT), 0);
rcu_read_unlock();
if (!sk)
return;
@@ -564,7 +569,8 @@ void rxe_net_del(struct ib_device *dev)
rxe_release_udp_tunnel(sk->sk_socket);
rcu_read_lock();
- sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any, htons(ROCE_V2_UDP_DPORT), 0);
+ sk = udp6_lib_lookup(dev_net(rdev->ndev), NULL, 0, &in6addr_any,
+ htons(ROCE_V2_UDP_DPORT), 0);
rcu_read_unlock();
if (!sk)
return;
@@ -636,6 +642,7 @@ static int rxe_notify(struct notifier_block *not_blk,
switch (event) {
case NETDEV_UNREGISTER:
ib_unregister_device_queued(&rxe->ib_dev);
+ rxe_net_del(&rxe->ib_dev);
break;
case NETDEV_UP:
rxe_port_up(rxe);
@@ -669,19 +676,19 @@ static struct notifier_block rxe_net_notifier = {
.notifier_call = rxe_notify,
};
-static int rxe_net_ipv4_init(void)
+static int rxe_net_ipv4_init(struct net_device *ndev)
{
struct sock *sk;
struct socket *sock;
rcu_read_lock();
- sk = udp4_lib_lookup(&init_net, 0, 0, htonl(INADDR_ANY),
+ sk = udp4_lib_lookup(dev_net(ndev), 0, 0, htonl(INADDR_ANY),
htons(ROCE_V2_UDP_DPORT), 0);
rcu_read_unlock();
if (sk)
return 0;
- sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), false);
+ sock = rxe_setup_udp_tunnel(dev_net(ndev), htons(ROCE_V2_UDP_DPORT), false);
if (IS_ERR(sock)) {
pr_err("Failed to create IPv4 UDP tunnel\n");
return -1;
@@ -690,20 +697,20 @@ static int rxe_net_ipv4_init(void)
return 0;
}
-static int rxe_net_ipv6_init(void)
+static int rxe_net_ipv6_init(struct net_device *ndev)
{
#if IS_ENABLED(CONFIG_IPV6)
struct sock *sk;
struct socket *sock;
rcu_read_lock();
- sk = udp6_lib_lookup(&init_net, NULL, 0, &in6addr_any,
+ sk = udp6_lib_lookup(dev_net(ndev), NULL, 0, &in6addr_any,
htons(ROCE_V2_UDP_DPORT), 0);
rcu_read_unlock();
if (sk)
return 0;
- sock = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), true);
+ sock = rxe_setup_udp_tunnel(dev_net(ndev), htons(ROCE_V2_UDP_DPORT), true);
if (PTR_ERR(sock) == -EAFNOSUPPORT) {
pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
return 0;
@@ -735,14 +742,14 @@ void rxe_net_exit(void)
unregister_netdevice_notifier(&rxe_net_notifier);
}
-int rxe_net_init(void)
+int rxe_net_init(struct net_device *ndev)
{
int err;
- err = rxe_net_ipv4_init();
+ err = rxe_net_ipv4_init(ndev);
if (err)
return err;
- err = rxe_net_ipv6_init();
+ err = rxe_net_ipv6_init(ndev);
if (err)
goto err_out;
return 0;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index 027b20e1bab6..56249677d692 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -15,7 +15,7 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
void rxe_net_del(struct ib_device *dev);
int rxe_register_notifier(void);
-int rxe_net_init(void);
+int rxe_net_init(struct net_device *ndev);
void rxe_net_exit(void);
#endif /* RXE_NET_H */
--
2.27.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCHv5 for-rc1 v5 7/8] RDMA/rxe: Add the support of net namespace notifier
2023-04-28 9:39 [PATCHv5 for-rc1 v5 0/8] Fix the problem that rxe can not work in net namespace Zhu Yanjun
` (5 preceding siblings ...)
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 6/8] RDMA/rxe: add the support of net namespace Zhu Yanjun
@ 2023-04-28 9:39 ` Zhu Yanjun
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 8/8] RDMA/rxe: Replace l_sk6 with sk6 in net namespace Zhu Yanjun
7 siblings, 0 replies; 17+ messages in thread
From: Zhu Yanjun @ 2023-04-28 9:39 UTC (permalink / raw)
To: zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer; +Cc: Zhu Yanjun, Rain River
From: Zhu Yanjun <yanjun.zhu@linux.dev>
The functions register_pernet_subsys/unregister_pernet_subsys register a
notifier of net namespace. When a new net namespace is created, the init
function of rxe will be called to initialize sk4 and sk6 socks. When a
net namespace is destroyed, the exit function will be called to handle
sk4 and sk6 socks.
The functions rxe_ns_pernet_sk4 and rxe_ns_pernet_sk6 are used to get
sk4 and sk6 socks.
The functions rxe_ns_pernet_set_sk4 and rxe_ns_pernet_set_sk6 are used
to set sk4 and sk6 socks.
Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
drivers/infiniband/sw/rxe/Makefile | 3 +-
drivers/infiniband/sw/rxe/rxe.c | 9 ++
drivers/infiniband/sw/rxe/rxe_net.c | 50 +++++------
drivers/infiniband/sw/rxe/rxe_ns.c | 134 ++++++++++++++++++++++++++++
drivers/infiniband/sw/rxe/rxe_ns.h | 17 ++++
5 files changed, 187 insertions(+), 26 deletions(-)
create mode 100644 drivers/infiniband/sw/rxe/rxe_ns.c
create mode 100644 drivers/infiniband/sw/rxe/rxe_ns.h
diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile
index 5395a581f4bb..8380f97674cb 100644
--- a/drivers/infiniband/sw/rxe/Makefile
+++ b/drivers/infiniband/sw/rxe/Makefile
@@ -22,4 +22,5 @@ rdma_rxe-y := \
rxe_mcast.o \
rxe_task.o \
rxe_net.o \
- rxe_hw_counters.o
+ rxe_hw_counters.o \
+ rxe_ns.o
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 4a17e4a003f5..c297677bf06a 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -9,6 +9,7 @@
#include "rxe.h"
#include "rxe_loc.h"
#include "rxe_net.h"
+#include "rxe_ns.h"
MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib");
MODULE_DESCRIPTION("Soft RDMA transport");
@@ -234,6 +235,12 @@ static int __init rxe_module_init(void)
return -1;
}
+ err = rxe_namespace_init();
+ if (err) {
+ pr_err("Failed to register net namespace notifier\n");
+ return -1;
+ }
+
pr_info("loaded\n");
return 0;
}
@@ -244,6 +251,8 @@ static void __exit rxe_module_exit(void)
ib_unregister_driver(RDMA_DRIVER_RXE);
rxe_net_exit();
+ rxe_namespace_exit();
+
pr_info("unloaded\n");
}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 9af90587642a..8135876b11f6 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -17,6 +17,7 @@
#include "rxe.h"
#include "rxe_net.h"
#include "rxe_loc.h"
+#include "rxe_ns.h"
static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
struct net_device *ndev,
@@ -554,33 +555,30 @@ void rxe_net_del(struct ib_device *dev)
rdev = container_of(dev, struct rxe_dev, ib_dev);
- rcu_read_lock();
- sk = udp4_lib_lookup(dev_net(rdev->ndev), 0, 0, htonl(INADDR_ANY),
- htons(ROCE_V2_UDP_DPORT), 0);
- rcu_read_unlock();
+ sk = rxe_ns_pernet_sk4(dev_net(rdev->ndev));
if (!sk)
return;
- __sock_put(sk);
- if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
+ if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL) {
__sock_put(sk);
- else
+ } else {
rxe_release_udp_tunnel(sk->sk_socket);
+ sk = NULL;
+ rxe_ns_pernet_set_sk4(dev_net(rdev->ndev), sk);
+ }
- rcu_read_lock();
- sk = udp6_lib_lookup(dev_net(rdev->ndev), NULL, 0, &in6addr_any,
- htons(ROCE_V2_UDP_DPORT), 0);
- rcu_read_unlock();
+ sk = rxe_ns_pernet_sk6(dev_net(rdev->ndev));
if (!sk)
return;
- __sock_put(sk);
-
- if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL)
+ if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL) {
__sock_put(sk);
- else
+ } else {
rxe_release_udp_tunnel(sk->sk_socket);
+ sk = NULL;
+ rxe_ns_pernet_set_sk6(dev_net(rdev->ndev), sk);
+ }
}
#undef SK_REF_FOR_TUNNEL
@@ -681,18 +679,18 @@ static int rxe_net_ipv4_init(struct net_device *ndev)
struct sock *sk;
struct socket *sock;
- rcu_read_lock();
- sk = udp4_lib_lookup(dev_net(ndev), 0, 0, htonl(INADDR_ANY),
- htons(ROCE_V2_UDP_DPORT), 0);
- rcu_read_unlock();
- if (sk)
+ sk = rxe_ns_pernet_sk4(dev_net(ndev));
+ if (sk) {
+ sock_hold(sk);
return 0;
+ }
sock = rxe_setup_udp_tunnel(dev_net(ndev), htons(ROCE_V2_UDP_DPORT), false);
if (IS_ERR(sock)) {
pr_err("Failed to create IPv4 UDP tunnel\n");
return -1;
}
+ rxe_ns_pernet_set_sk4(dev_net(ndev), sock->sk);
return 0;
}
@@ -703,12 +701,11 @@ static int rxe_net_ipv6_init(struct net_device *ndev)
struct sock *sk;
struct socket *sock;
- rcu_read_lock();
- sk = udp6_lib_lookup(dev_net(ndev), NULL, 0, &in6addr_any,
- htons(ROCE_V2_UDP_DPORT), 0);
- rcu_read_unlock();
- if (sk)
+ sk = rxe_ns_pernet_sk6(dev_net(ndev));
+ if (sk) {
+ sock_hold(sk);
return 0;
+ }
sock = rxe_setup_udp_tunnel(dev_net(ndev), htons(ROCE_V2_UDP_DPORT), true);
if (PTR_ERR(sock) == -EAFNOSUPPORT) {
@@ -720,6 +717,9 @@ static int rxe_net_ipv6_init(struct net_device *ndev)
pr_err("Failed to create IPv6 UDP tunnel\n");
return -1;
}
+
+ rxe_ns_pernet_set_sk6(dev_net(ndev), sock->sk);
+
#endif
return 0;
}
diff --git a/drivers/infiniband/sw/rxe/rxe_ns.c b/drivers/infiniband/sw/rxe/rxe_ns.c
new file mode 100644
index 000000000000..29d08899dcda
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_ns.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <net/sock.h>
+#include <net/netns/generic.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/pid_namespace.h>
+#include <net/udp_tunnel.h>
+
+#include "rxe_ns.h"
+
+/*
+ * Per network namespace data
+ */
+struct rxe_ns_sock {
+ struct sock __rcu *rxe_sk4;
+ struct sock __rcu *rxe_sk6;
+};
+
+/*
+ * Index to store custom data for each network namespace.
+ */
+static unsigned int rxe_pernet_id;
+
+/*
+ * Called for every existing and added network namespaces
+ */
+static int __net_init rxe_ns_init(struct net *net)
+{
+ /*
+ * create (if not present) and access data item in network namespace
+ * (net) using the id (net_id)
+ */
+ struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id);
+
+ rcu_assign_pointer(ns_sk->rxe_sk4, NULL); /* initialize sock 4 socket */
+ rcu_assign_pointer(ns_sk->rxe_sk6, NULL); /* initialize sock 6 socket */
+ synchronize_rcu();
+
+ return 0;
+}
+
+static void __net_exit rxe_ns_exit(struct net *net)
+{
+ /*
+ * called when the network namespace is removed
+ */
+ struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id);
+ struct sock *rxe_sk4 = NULL;
+ struct sock *rxe_sk6 = NULL;
+
+ rcu_read_lock();
+ rxe_sk4 = rcu_dereference(ns_sk->rxe_sk4);
+ rxe_sk6 = rcu_dereference(ns_sk->rxe_sk6);
+ rcu_read_unlock();
+
+ /* close socket */
+ if (rxe_sk4 && rxe_sk4->sk_socket) {
+ udp_tunnel_sock_release(rxe_sk4->sk_socket);
+ rcu_assign_pointer(ns_sk->rxe_sk4, NULL);
+ synchronize_rcu();
+ }
+
+ if (rxe_sk6 && rxe_sk6->sk_socket) {
+ udp_tunnel_sock_release(rxe_sk6->sk_socket);
+ rcu_assign_pointer(ns_sk->rxe_sk6, NULL);
+ synchronize_rcu();
+ }
+}
+
+/*
+ * callback to make the module network namespace aware
+ */
+static struct pernet_operations rxe_net_ops __net_initdata = {
+ .init = rxe_ns_init,
+ .exit = rxe_ns_exit,
+ .id = &rxe_pernet_id,
+ .size = sizeof(struct rxe_ns_sock),
+};
+
+struct sock *rxe_ns_pernet_sk4(struct net *net)
+{
+ struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id);
+ struct sock *sk;
+
+ rcu_read_lock();
+ sk = rcu_dereference(ns_sk->rxe_sk4);
+ rcu_read_unlock();
+
+ return sk;
+}
+
+void rxe_ns_pernet_set_sk4(struct net *net, struct sock *sk)
+{
+ struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id);
+
+ rcu_assign_pointer(ns_sk->rxe_sk4, sk);
+ synchronize_rcu();
+}
+
+struct sock *rxe_ns_pernet_sk6(struct net *net)
+{
+ struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id);
+ struct sock *sk;
+
+ rcu_read_lock();
+ sk = rcu_dereference(ns_sk->rxe_sk6);
+ rcu_read_unlock();
+
+ return sk;
+}
+
+void rxe_ns_pernet_set_sk6(struct net *net, struct sock *sk)
+{
+ struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id);
+
+ rcu_assign_pointer(ns_sk->rxe_sk6, sk);
+ synchronize_rcu();
+}
+
+int __init rxe_namespace_init(void)
+{
+ return register_pernet_subsys(&rxe_net_ops);
+}
+
+void __exit rxe_namespace_exit(void)
+{
+ unregister_pernet_subsys(&rxe_net_ops);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_ns.h b/drivers/infiniband/sw/rxe/rxe_ns.h
new file mode 100644
index 000000000000..da5bfcea1274
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_ns.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_NS_H
+#define RXE_NS_H
+
+struct sock *rxe_ns_pernet_sk4(struct net *net);
+struct sock *rxe_ns_pernet_sk6(struct net *net);
+void rxe_ns_pernet_set_sk4(struct net *net, struct sock *sk);
+void rxe_ns_pernet_set_sk6(struct net *net, struct sock *sk);
+int __init rxe_namespace_init(void);
+void __exit rxe_namespace_exit(void);
+
+#endif /* RXE_NS_H */
--
2.27.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCHv5 for-rc1 v5 8/8] RDMA/rxe: Replace l_sk6 with sk6 in net namespace
2023-04-28 9:39 [PATCHv5 for-rc1 v5 0/8] Fix the problem that rxe can not work in net namespace Zhu Yanjun
` (6 preceding siblings ...)
2023-04-28 9:39 ` [PATCHv5 for-rc1 v5 7/8] RDMA/rxe: Add the support of net namespace notifier Zhu Yanjun
@ 2023-04-28 9:39 ` Zhu Yanjun
7 siblings, 0 replies; 17+ messages in thread
From: Zhu Yanjun @ 2023-04-28 9:39 UTC (permalink / raw)
To: zyjzyj2000, jgg, leon, linux-rdma, parav, lehrer; +Cc: Zhu Yanjun, Rain River
From: Zhu Yanjun <yanjun.zhu@linux.dev>
The net namespace variable sk6 can be used. As such, l_sk6 can be
replaced with it.
Tested-by: Rain River <rain.1986.08.12@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
---
drivers/infiniband/sw/rxe/rxe.c | 1 -
drivers/infiniband/sw/rxe/rxe_net.c | 20 +-------------------
drivers/infiniband/sw/rxe/rxe_verbs.h | 1 -
3 files changed, 1 insertion(+), 21 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index c297677bf06a..3260f598a7fb 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -75,7 +75,6 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
rxe->ndev->dev_addr);
rxe->max_ucontext = RXE_MAX_UCONTEXT;
- rxe->l_sk6 = NULL;
}
/* initialize port attributes */
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 8135876b11f6..ebcb86fa1e5e 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -50,24 +50,6 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
{
struct dst_entry *ndst;
struct flowi6 fl6 = { { 0 } };
- struct rxe_dev *rdev;
-
- rdev = rxe_get_dev_from_net(ndev);
- if (!rdev->l_sk6) {
- struct sock *sk;
-
- rcu_read_lock();
- sk = udp6_lib_lookup(dev_net(ndev), NULL, 0, &in6addr_any,
- htons(ROCE_V2_UDP_DPORT), 0);
- rcu_read_unlock();
- if (!sk) {
- pr_info("file: %s +%d, error\n", __FILE__, __LINE__);
- return (struct dst_entry *)sk;
- }
- __sock_put(sk);
- rdev->l_sk6 = sk->sk_socket;
- }
-
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_oif = ndev->ifindex;
@@ -76,7 +58,7 @@ static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
fl6.flowi6_proto = IPPROTO_UDP;
ndst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(ndev),
- rdev->l_sk6->sk, &fl6,
+ rxe_ns_pernet_sk6(dev_net(ndev)), &fl6,
NULL);
if (IS_ERR(ndst)) {
rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index ac9bb55820a2..c269ae2a3224 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -396,7 +396,6 @@ struct rxe_dev {
struct rxe_port port;
struct crypto_shash *tfm;
- struct socket *l_sk6;
};
static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
--
2.27.0
^ permalink raw reply related [flat|nested] 17+ messages in thread