Netdev List
 help / color / mirror / Atom feed
* [PATCH] sctp: Make sha1 as default algorithm if fips is enabled
From: Ashwin Dayanand Kamat @ 2022-12-20  5:10 UTC (permalink / raw)
  To: Vlad Yasevich, Neil Horman, Marcelo Ricardo Leitner,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	linux-sctp, netdev, linux-kernel
  Cc: Ashwin Dayanand Kamat, srivatsab, srivatsa, amakhalov,
	vsirnapalli, akaher

MD5 is not FIPS compliant. But still md5 was used as the default algorithm
for sctp if fips was enabled.
Due to this, listen() system call in ltp tests was failing for sctp
in fips environment, with below error message.

[ 6397.892677] sctp: failed to load transform for md5: -2

Fix is to not assign md5 as default algorithm for sctp
if fips_enabled is true. Instead make sha1 as default algorithm.

Signed-off-by: Ashwin Dayanand Kamat <kashwindayan@vmware.com>
---
 net/sctp/protocol.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 909a89a..b6e9810 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -34,6 +34,7 @@
 #include <linux/memblock.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/fips.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
 #include <net/ip.h>
@@ -1321,14 +1322,13 @@ static int __net_init sctp_defaults_init(struct net *net)
 	/* Whether Cookie Preservative is enabled(1) or not(0) */
 	net->sctp.cookie_preserve_enable 	= 1;
 
-	/* Default sctp sockets to use md5 as their hmac alg */
-#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5)
-	net->sctp.sctp_hmac_alg			= "md5";
-#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1)
-	net->sctp.sctp_hmac_alg			= "sha1";
-#else
-	net->sctp.sctp_hmac_alg			= NULL;
-#endif
+	/* Default sctp sockets to use md5 as default only if fips is not enabled */
+	if (!fips_enabled && IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5))
+		net->sctp.sctp_hmac_alg = "md5";
+	else if (IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1))
+		net->sctp.sctp_hmac_alg = "sha1";
+	else
+		net->sctp.sctp_hmac_alg = NULL;
 
 	/* Max.Burst		    - 4 */
 	net->sctp.max_burst			= SCTP_DEFAULT_MAX_BURST;
-- 
2.7.4


^ permalink raw reply related

* [RFC PATCH net-next v2 3/5] net/smc: add dmb attach and detach interface
From: Wen Gu @ 2022-12-20  3:21 UTC (permalink / raw)
  To: kgraul, wenjia, jaka, davem, edumazet, kuba, pabeni
  Cc: linux-s390, netdev, linux-kernel
In-Reply-To: <1671506505-104676-1-git-send-email-guwen@linux.alibaba.com>

This patch extends smcd_ops, adding two more semantic for
SMC-D device:

- attach_dmb:

  Attach an already registered dmb to a specific buf_desc,
  so that we can refer to the dmb through this buf_desc.

- detach_dmb:

  Reverse operation of attach_dmb. detach the dmb from the
  buf_desc.

This interface extension is to prepare for the reduction
of data moving from sndbuf to RMB in SMC-D loopback device.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
---
 include/net/smc.h      |  2 ++
 net/smc/smc_ism.c      | 36 ++++++++++++++++++++++++++
 net/smc/smc_ism.h      |  2 ++
 net/smc/smc_loopback.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_loopback.h |  4 +++
 5 files changed, 113 insertions(+)

diff --git a/include/net/smc.h b/include/net/smc.h
index 7699f97..60a96f7 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -63,6 +63,8 @@ struct smcd_ops {
 				u32 vid);
 	int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
 	int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
+	int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
+	int (*detach_dmb)(struct smcd_dev *dev, u64 token);
 	int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
 	int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
 	int (*set_vlan_required)(struct smcd_dev *dev);
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 1d10435..2049388 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -202,6 +202,42 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
 	return rc;
 }
 
+int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token,
+		       struct smc_buf_desc *dmb_desc)
+{
+	struct smcd_dmb dmb;
+	int rc = 0;
+
+	memset(&dmb, 0, sizeof(dmb));
+	dmb.dmb_tok = token;
+
+	/* only support loopback device now */
+	if (!dev->is_loopback)
+		return -EINVAL;
+	if (!dev->ops->attach_dmb)
+		return -EINVAL;
+
+	rc = dev->ops->attach_dmb(dev, &dmb);
+	if (!rc) {
+		dmb_desc->sba_idx = dmb.sba_idx;
+		dmb_desc->token = dmb.dmb_tok;
+		dmb_desc->cpu_addr = dmb.cpu_addr;
+		dmb_desc->dma_addr = dmb.dma_addr;
+		dmb_desc->len = dmb.dmb_len;
+	}
+	return rc;
+}
+
+int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token)
+{
+	if (!dev->is_loopback)
+		return -EINVAL;
+	if (!dev->ops->detach_dmb)
+		return -EINVAL;
+
+	return dev->ops->detach_dmb(dev, token);
+}
+
 static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
 				  struct sk_buff *skb,
 				  struct netlink_callback *cb)
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
index d6b2db6..9022979 100644
--- a/net/smc/smc_ism.h
+++ b/net/smc/smc_ism.h
@@ -38,6 +38,8 @@ struct smc_ism_vlanid {			/* VLAN id set on ISM device */
 int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
 			 struct smc_buf_desc *dmb_desc);
 int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
+int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc);
+int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token);
 int smc_ism_signal_shutdown(struct smc_link_group *lgr);
 void smc_ism_get_system_eid(u8 **eid);
 u16 smc_ism_get_chid(struct smcd_dev *dev);
diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c
index 973382a..bc3ff82 100644
--- a/net/smc/smc_loopback.c
+++ b/net/smc/smc_loopback.c
@@ -68,6 +68,7 @@ static int lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
 		goto err_node;
 	}
 	dmb_node->len = dmb->dmb_len;
+	refcount_set(&dmb_node->refcnt, 1);
 
 	/* TODO: token is random but not exclusive !
 	 * suppose to find token in dmb hask table, if has this token
@@ -78,6 +79,7 @@ static int lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
 	write_lock(&ldev->dmb_ht_lock);
 	hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token);
 	write_unlock(&ldev->dmb_ht_lock);
+	atomic_inc(&ldev->dmb_cnt);
 
 	dmb->sba_idx = dmb_node->sba_idx;
 	dmb->dmb_tok = dmb_node->token;
@@ -115,9 +117,69 @@ static int lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
 	write_unlock(&ldev->dmb_ht_lock);
 
 	clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask);
+
+	/* wait for dmb refcnt equal to 0 */
+	if (!refcount_dec_and_test(&dmb_node->refcnt))
+		wait_event(ldev->dmbs_release, !refcount_read(&dmb_node->refcnt));
 	kfree(dmb_node->cpu_addr);
 	kfree(dmb_node);
 
+	if (atomic_dec_and_test(&ldev->dmb_cnt))
+		wake_up(&ldev->ldev_release);
+
+	return 0;
+}
+
+static int lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
+{
+	struct lo_dmb_node *dmb_node = NULL, *tmp_node;
+	struct lo_dev *ldev = smcd->priv;
+
+	/* find dmb_node according to dmb->dmb_tok */
+	read_lock(&ldev->dmb_ht_lock);
+	hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) {
+		if (tmp_node->token == dmb->dmb_tok) {
+			dmb_node = tmp_node;
+			break;
+		}
+	}
+	if (!dmb_node) {
+		read_unlock(&ldev->dmb_ht_lock);
+		return -EINVAL;
+	}
+	read_unlock(&ldev->dmb_ht_lock);
+	refcount_inc(&dmb_node->refcnt);
+
+	/* provide dmb information */
+	dmb->sba_idx = dmb_node->sba_idx;
+	dmb->dmb_tok = dmb_node->token;
+	dmb->cpu_addr = dmb_node->cpu_addr;
+	dmb->dma_addr = dmb_node->dma_addr;
+	dmb->dmb_len = dmb_node->len;
+	return 0;
+}
+
+static int lo_detach_dmb(struct smcd_dev *smcd, u64 token)
+{
+	struct lo_dmb_node *dmb_node = NULL, *tmp_node;
+	struct lo_dev *ldev = smcd->priv;
+
+	/* find dmb_node according to dmb->dmb_tok */
+	read_lock(&ldev->dmb_ht_lock);
+	hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) {
+		if (tmp_node->token == token) {
+			dmb_node = tmp_node;
+			break;
+		}
+	}
+	if (!dmb_node) {
+		read_unlock(&ldev->dmb_ht_lock);
+		return -EINVAL;
+	}
+	read_unlock(&ldev->dmb_ht_lock);
+
+	if (refcount_dec_and_test(&dmb_node->refcnt))
+		wake_up_all(&ldev->dmbs_release);
 	return 0;
 }
 
@@ -193,6 +255,8 @@ static u16 lo_get_chid(struct smcd_dev *smcd)
 	.query_remote_gid = lo_query_rgid,
 	.register_dmb = lo_register_dmb,
 	.unregister_dmb = lo_unregister_dmb,
+	.attach_dmb = lo_attach_dmb,
+	.detach_dmb = lo_detach_dmb,
 	.add_vlan_id = lo_add_vlan_id,
 	.del_vlan_id = lo_del_vlan_id,
 	.set_vlan_required = lo_set_vlan_required,
@@ -218,6 +282,9 @@ static int lo_dev_init(struct lo_dev *ldev)
 	ldev->lgid = smcd->local_gid;
 	rwlock_init(&ldev->dmb_ht_lock);
 	hash_init(ldev->dmb_ht);
+	atomic_set(&ldev->dmb_cnt, 0);
+	init_waitqueue_head(&ldev->dmbs_release);
+	init_waitqueue_head(&ldev->ldev_release);
 
 	return smcd_register_dev(smcd);
 }
@@ -255,6 +322,8 @@ static int lo_dev_probe(void)
 static void lo_dev_exit(struct lo_dev *ldev)
 {
 	smcd_unregister_dev(ldev->smcd);
+	if (atomic_read(&ldev->dmb_cnt))
+		wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt));
 }
 
 static void lo_dev_remove(void)
diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h
index d7f7815..f4122be 100644
--- a/net/smc/smc_loopback.h
+++ b/net/smc/smc_loopback.h
@@ -32,6 +32,7 @@ struct lo_dmb_node {
 	u32 sba_idx;
 	void *cpu_addr;
 	dma_addr_t dma_addr;
+	refcount_t refcnt;
 };
 
 struct lo_dev {
@@ -41,6 +42,9 @@ struct lo_dev {
 	DECLARE_BITMAP(sba_idx_mask, LODEV_MAX_DMBS);
 	rwlock_t dmb_ht_lock;
 	DECLARE_HASHTABLE(dmb_ht, LODEV_MAX_DMBS_BUCKETS);
+	atomic_t dmb_cnt;
+	wait_queue_head_t dmbs_release;
+	wait_queue_head_t ldev_release;
 };
 
 struct lo_systemeid {
-- 
1.8.3.1


^ permalink raw reply related

* [RFC PATCH net-next v2 4/5] net/smc: avoid data copy from sndbuf to peer RMB in SMC-D loopback
From: Wen Gu @ 2022-12-20  3:21 UTC (permalink / raw)
  To: kgraul, wenjia, jaka, davem, edumazet, kuba, pabeni
  Cc: linux-s390, netdev, linux-kernel
In-Reply-To: <1671506505-104676-1-git-send-email-guwen@linux.alibaba.com>

This patch aims to improve SMC-D loopback performance by avoiding
data copy from local sndbuf to peer RMB. The main idea is to let
local sndbuf and peer RMB share the same physical memory.

 +----------+                     +----------+
 | socket A |                     | socket B |
 +----------+                     +----------+
       |                               ^
       |         +---------+           |
  regard as      |         | ----------|
  local sndbuf   |  B's    |     regard as
       |         |  RMB    |     local RMB
       |-------> |         |
                 +---------+

For connections using smcd loopback device:

1. Only create and maintain local RMB.
        a. Create or reuse RMB when create connection;
        b. Free RMB when lgr free;

2. Attach local sndbuf to peer RMB.
        a. sndbuf_desc describes the same memory region as peer rmb_desc.
        b. sndbuf_desc is exclusive to specific connection and won't be
           added to lgr buffer pool for reuse.
        c. sndbuf is attached to peer RMB when receive remote token after
           CLC accept/confirm message.
        d. sndbuf is detached from peer RMB when connection is freed.

Therefore, the data copied from the userspace to local sndbuf directly
reaches the peer RMB.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
---
 net/smc/af_smc.c   | 23 +++++++++++++++++++-
 net/smc/smc_core.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_core.h |  2 ++
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b9884c8..c7de566 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1073,7 +1073,6 @@ static int smc_find_proposal_devices(struct smc_sock *smc,
 	 * The RFC patch hasn't resolved this, just simply always
 	 * chooses loopback device first, and fallback if loopback
 	 * communication is impossible.
-	 *
 	 */
 	/* check if there is an ism or loopback device available */
 	if (!(ini->smcd_version & SMC_V1) ||
@@ -1397,6 +1396,17 @@ static int smc_connect_ism(struct smc_sock *smc,
 	}
 
 	smc_conn_save_peer_info(smc, aclc);
+
+	/* special for smcd loopback
+	 * conns above smcd loopback dev only create their rmbs.
+	 * their sndbufs are 'maps' of peer rmbs.
+	 */
+	if (smc->conn.lgr->smcd->is_loopback) {
+		rc = smcd_buf_attach(&smc->conn);
+		if (rc)
+			goto connect_abort;
+		smc->sk.sk_sndbuf = 2 * (smc->conn.sndbuf_desc->len);
+	}
 	smc_close_init(smc);
 	smc_rx_init(smc);
 	smc_tx_init(smc);
@@ -2464,6 +2474,17 @@ static void smc_listen_work(struct work_struct *work)
 		mutex_unlock(&smc_server_lgr_pending);
 	}
 	smc_conn_save_peer_info(new_smc, cclc);
+
+	/* special for smcd loopback
+	 * conns above smcd loopback dev only create their rmbs.
+	 * their sndbufs are 'maps' of peer rmbs.
+	 */
+	if (ini->is_smcd && new_smc->conn.lgr->smcd->is_loopback) {
+		rc = smcd_buf_attach(&new_smc->conn);
+		if (rc)
+			goto out_decl;
+		new_smc->sk.sk_sndbuf = 2 * (new_smc->conn.sndbuf_desc->len);
+	}
 	smc_listen_out_connected(new_smc);
 	SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
 	goto out_free;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index c305d8d..bf40ad3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1171,6 +1171,10 @@ void smc_conn_free(struct smc_connection *conn)
 		if (!list_empty(&lgr->list))
 			smc_ism_unset_conn(conn);
 		tasklet_kill(&conn->rx_tsklet);
+
+		/* detach sndbuf from peer rmb */
+		if (lgr->smcd->is_loopback)
+			smcd_buf_detach(conn);
 	} else {
 		smc_cdc_wait_pend_tx_wr(conn);
 		if (current_work() != &conn->abort_work)
@@ -2423,6 +2427,14 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
 {
 	int rc;
 
+	if (is_smcd && smc->conn.lgr->smcd->is_loopback) {
+		/* Conns above smcd loopback device only create and maintain
+		 * their RMBs. The sndbufs will be attached to peer RMBs once
+		 * getting the tokens.
+		 */
+		return __smc_buf_create(smc, is_smcd, true);
+	}
+
 	/* create send buffer */
 	rc = __smc_buf_create(smc, is_smcd, false);
 	if (rc)
@@ -2439,6 +2451,56 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
 	return rc;
 }
 
+/* for smcd loopback conns, attach local sndbuf to peer RMB.
+ * The data copy to sndbuf is equal to data copy to peer RMB.
+ */
+int smcd_buf_attach(struct smc_connection *conn)
+{
+	struct smcd_dev *smcd = conn->lgr->smcd;
+	u64 peer_token = conn->peer_token;
+	struct smc_buf_desc *buf_desc;
+	int rc;
+
+	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+	if (!buf_desc)
+		return -ENOMEM;
+	rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc);
+	if (rc) {
+		rc = SMC_CLC_DECL_ERR_RTOK;
+		goto free;
+	}
+
+	/* attach local sndbuf to peer RMB.
+	 * refer to local sndbuf is equal to refer to peer RMB.
+	 */
+	/* align with peer rmb */
+	buf_desc->cpu_addr = (u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg);
+	buf_desc->len -=  sizeof(struct smcd_cdc_msg);
+	conn->sndbuf_desc = buf_desc;
+	conn->sndbuf_desc->used = 1;
+	//smc->sk.sk_sndbuf = 2 * (smc->conn->sndbuf_desc->len);
+	atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len);
+	return 0;
+
+free:
+	kfree(buf_desc);
+	return rc;
+}
+
+void smcd_buf_detach(struct smc_connection *conn)
+{
+	struct smcd_dev *smcd = conn->lgr->smcd;
+	u64 peer_token = conn->peer_token;
+
+	if (!conn->sndbuf_desc)
+		return;
+
+	smc_ism_detach_dmb(smcd, peer_token);
+
+	kfree(conn->sndbuf_desc);
+	conn->sndbuf_desc = NULL;
+}
+
 static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
 {
 	int i;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 285f9bd..b51b020 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -518,6 +518,8 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
 void smc_smcd_terminate_all(struct smcd_dev *dev);
 void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
 int smc_buf_create(struct smc_sock *smc, bool is_smcd);
+int smcd_buf_attach(struct smc_connection *conn);
+void smcd_buf_detach(struct smc_connection *conn);
 int smc_uncompress_bufsize(u8 compressed);
 int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
 			    struct smc_clc_msg_accept_confirm *clc);
-- 
1.8.3.1


^ permalink raw reply related

* [RFC PATCH net-next v2 1/5] net/smc: introduce SMC-D loopback device
From: Wen Gu @ 2022-12-20  3:21 UTC (permalink / raw)
  To: kgraul, wenjia, jaka, davem, edumazet, kuba, pabeni
  Cc: linux-s390, netdev, linux-kernel
In-Reply-To: <1671506505-104676-1-git-send-email-guwen@linux.alibaba.com>

This patch introduces a kind of loopback device for SMC-D, thus
enabling the SMC communication between two local sockets in one
kernel.

The loopback device supports basic capabilities defined by SMC-D,
including registering DMB, unregistering DMB and moving data.

Considering that there is no ism device on other servers expect
IBM z13, the loopback device can be used as a dummy device to
test SMC-D logic for the broad community.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
---
 include/net/smc.h      |   1 +
 net/smc/Makefile       |   2 +-
 net/smc/af_smc.c       |  12 ++-
 net/smc/smc_cdc.c      |   6 ++
 net/smc/smc_cdc.h      |   1 +
 net/smc/smc_loopback.c | 282 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_loopback.h |  59 +++++++++++
 7 files changed, 361 insertions(+), 2 deletions(-)
 create mode 100644 net/smc/smc_loopback.c
 create mode 100644 net/smc/smc_loopback.h

diff --git a/include/net/smc.h b/include/net/smc.h
index c926d33..7699f97 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -93,6 +93,7 @@ struct smcd_dev {
 	atomic_t lgr_cnt;
 	wait_queue_head_t lgrs_deleted;
 	u8 going_away : 1;
+	u8 is_loopback : 1;
 };
 
 struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 875efcd..a8c3711 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -4,5 +4,5 @@ obj-$(CONFIG_SMC)	+= smc.o
 obj-$(CONFIG_SMC_DIAG)	+= smc_diag.o
 smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
 smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
-smc-y += smc_tracepoint.o
+smc-y += smc_tracepoint.o smc_loopback.o
 smc-$(CONFIG_SYSCTL) += smc_sysctl.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e12d4fa..9546c02 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -52,6 +52,7 @@
 #include "smc_stats.h"
 #include "smc_tracepoint.h"
 #include "smc_sysctl.h"
+#include "smc_loopback.h"
 
 static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
 						 * creation on server
@@ -3451,15 +3452,23 @@ static int __init smc_init(void)
 		goto out_sock;
 	}
 
+	rc = smc_loopback_init();
+	if (rc) {
+		pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc);
+		goto out_ib;
+	}
+
 	rc = tcp_register_ulp(&smc_ulp_ops);
 	if (rc) {
 		pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
-		goto out_ib;
+		goto out_lo;
 	}
 
 	static_branch_enable(&tcp_have_smc);
 	return 0;
 
+out_lo:
+	smc_loopback_exit();
 out_ib:
 	smc_ib_unregister_client();
 out_sock:
@@ -3494,6 +3503,7 @@ static void __exit smc_exit(void)
 	tcp_unregister_ulp(&smc_ulp_ops);
 	sock_unregister(PF_SMC);
 	smc_core_exit();
+	smc_loopback_exit();
 	smc_ib_unregister_client();
 	destroy_workqueue(smc_close_wq);
 	destroy_workqueue(smc_tcp_ls_wq);
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 53f63bf..61f5ff7 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -408,6 +408,12 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
 static void smcd_cdc_rx_tsklet(struct tasklet_struct *t)
 {
 	struct smc_connection *conn = from_tasklet(conn, t, rx_tsklet);
+
+	smcd_cdc_rx_handler(conn);
+}
+
+void smcd_cdc_rx_handler(struct smc_connection *conn)
+{
 	struct smcd_cdc_msg *data_cdc;
 	struct smcd_cdc_msg cdc;
 	struct smc_sock *smc;
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 696cc11..11559d4 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -301,5 +301,6 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn,
 				 struct smc_wr_buf *wr_buf);
 int smc_cdc_init(void) __init;
 void smcd_cdc_rx_init(struct smc_connection *conn);
+void smcd_cdc_rx_handler(struct smc_connection *conn);
 
 #endif /* SMC_CDC_H */
diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c
new file mode 100644
index 0000000..973382a
--- /dev/null
+++ b/net/smc/smc_loopback.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Shared Memory Communications Direct over loopback device.
+ *
+ *  Provide a SMC-D loopback dummy device.
+ *
+ *  Copyright (c) 2022, Alibaba Inc.
+ *
+ *  Author: Wen Gu <guwen@linux.alibaba.com>
+ *          Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/types.h>
+#include <net/smc.h>
+
+#include "smc_cdc.h"
+#include "smc_loopback.h"
+
+#define DRV_NAME "smc_lodev"
+
+struct lo_dev *lo_dev;
+
+static struct lo_systemeid LO_SYSTEM_EID = {
+	.seid_string = "SMC-SYSZ-LOSEID000000000",
+	.serial_number = "0000",
+	.type = "0000",
+};
+
+static int lo_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid,
+			 u32 vid)
+{
+	struct lo_dev *ldev = smcd->priv;
+
+	/* return local gid */
+	if (!ldev || rgid != ldev->lgid)
+		return -ENETUNREACH;
+	return 0;
+}
+
+static int lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
+{
+	struct lo_dev *ldev = smcd->priv;
+	struct lo_dmb_node *dmb_node;
+	int sba_idx, rc;
+
+	/* check space for new dmb */
+	for_each_clear_bit(sba_idx, ldev->sba_idx_mask, LODEV_MAX_DMBS) {
+		if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask))
+			break;
+	}
+	if (sba_idx == LODEV_MAX_DMBS)
+		return -ENOSPC;
+
+	dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL);
+	if (!dmb_node) {
+		rc = -ENOMEM;
+		goto err_bit;
+	}
+
+	dmb_node->sba_idx = sba_idx;
+	dmb_node->cpu_addr = kzalloc(dmb->dmb_len, GFP_KERNEL |
+			     __GFP_NOWARN | __GFP_NORETRY |
+			     __GFP_NOMEMALLOC);
+	if (!dmb_node->cpu_addr) {
+		rc = -ENOMEM;
+		goto err_node;
+	}
+	dmb_node->len = dmb->dmb_len;
+
+	/* TODO: token is random but not exclusive !
+	 * suppose to find token in dmb hask table, if has this token
+	 * already, then generate another one.
+	 */
+	/* add new dmb into hash table */
+	get_random_bytes(&dmb_node->token, sizeof(dmb_node->token));
+	write_lock(&ldev->dmb_ht_lock);
+	hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token);
+	write_unlock(&ldev->dmb_ht_lock);
+
+	dmb->sba_idx = dmb_node->sba_idx;
+	dmb->dmb_tok = dmb_node->token;
+	dmb->cpu_addr = dmb_node->cpu_addr;
+	dmb->dma_addr = dmb_node->dma_addr;
+	dmb->dmb_len = dmb_node->len;
+
+	return 0;
+
+err_node:
+	kfree(dmb_node);
+err_bit:
+	clear_bit(sba_idx, ldev->sba_idx_mask);
+	return rc;
+}
+
+static int lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
+{
+	struct lo_dmb_node *dmb_node = NULL, *tmp_node;
+	struct lo_dev *ldev = smcd->priv;
+
+	/* remove dmb from hash table */
+	write_lock(&ldev->dmb_ht_lock);
+	hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) {
+		if (tmp_node->token == dmb->dmb_tok) {
+			dmb_node = tmp_node;
+			break;
+		}
+	}
+	if (!dmb_node) {
+		write_unlock(&ldev->dmb_ht_lock);
+		return -EINVAL;
+	}
+	hash_del(&dmb_node->list);
+	write_unlock(&ldev->dmb_ht_lock);
+
+	clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask);
+	kfree(dmb_node->cpu_addr);
+	kfree(dmb_node);
+
+	return 0;
+}
+
+static int lo_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id)
+{
+	return 0;
+}
+
+static int lo_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id)
+{
+	return 0;
+}
+
+static int lo_set_vlan_required(struct smcd_dev *smcd)
+{
+	return 0;
+}
+
+static int lo_reset_vlan_required(struct smcd_dev *smcd)
+{
+	return 0;
+}
+
+static int lo_signal_ieq(struct smcd_dev *smcd, u64 rgid, u32 trigger_irq,
+			 u32 event_code, u64 info)
+{
+	return 0;
+}
+
+static int lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx,
+			bool sf, unsigned int offset, void *data,
+			unsigned int size)
+{
+	struct lo_dmb_node *rmb_node = NULL, *tmp_node;
+	struct lo_dev *ldev = smcd->priv;
+
+	read_lock(&ldev->dmb_ht_lock);
+	hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) {
+		if (tmp_node->token == dmb_tok) {
+			rmb_node = tmp_node;
+			break;
+		}
+	}
+	if (!rmb_node) {
+		read_unlock(&ldev->dmb_ht_lock);
+		return -EINVAL;
+	}
+	read_unlock(&ldev->dmb_ht_lock);
+
+	memcpy((char *)rmb_node->cpu_addr + offset, data, size);
+
+	if (sf) {
+		struct smc_connection *conn =
+			smcd->conn[rmb_node->sba_idx];
+
+		if (conn && !conn->killed)
+			smcd_cdc_rx_handler(conn);
+	}
+	return 0;
+}
+
+static u8 *lo_get_system_eid(void)
+{
+	return &LO_SYSTEM_EID.seid_string[0];
+}
+
+static u16 lo_get_chid(struct smcd_dev *smcd)
+{
+	return 0;
+}
+
+static const struct smcd_ops lo_ops = {
+	.query_remote_gid = lo_query_rgid,
+	.register_dmb = lo_register_dmb,
+	.unregister_dmb = lo_unregister_dmb,
+	.add_vlan_id = lo_add_vlan_id,
+	.del_vlan_id = lo_del_vlan_id,
+	.set_vlan_required = lo_set_vlan_required,
+	.reset_vlan_required = lo_reset_vlan_required,
+	.signal_event = lo_signal_ieq,
+	.move_data = lo_move_data,
+	.get_system_eid = lo_get_system_eid,
+	.get_chid = lo_get_chid,
+};
+
+static int lo_dev_init(struct lo_dev *ldev)
+{
+	struct smcd_dev *smcd = ldev->smcd;
+
+	/* smcd related */
+	smcd->is_loopback = 1;
+	smcd->priv = ldev;
+	get_random_bytes(&smcd->local_gid, sizeof(smcd->local_gid));
+
+	/* ldev related */
+	/* TODO: lgid is random but not exclusive !
+	 */
+	ldev->lgid = smcd->local_gid;
+	rwlock_init(&ldev->dmb_ht_lock);
+	hash_init(ldev->dmb_ht);
+
+	return smcd_register_dev(smcd);
+}
+
+static int lo_dev_probe(void)
+{
+	struct lo_dev *ldev;
+	int ret;
+
+	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
+	if (!ldev)
+		return -ENOMEM;
+
+	ldev->smcd = smcd_alloc_dev(NULL, "smcd-loopback-dev",
+				    &lo_ops, LODEV_MAX_DMBS);
+	if (!ldev->smcd) {
+		ret = -ENOMEM;
+		goto err_ldev;
+	}
+
+	ret = lo_dev_init(ldev);
+	if (ret)
+		goto err_smcd;
+
+	lo_dev = ldev;
+	return 0;
+
+err_smcd:
+	smcd_free_dev(ldev->smcd);
+err_ldev:
+	kfree(ldev);
+	return ret;
+}
+
+static void lo_dev_exit(struct lo_dev *ldev)
+{
+	smcd_unregister_dev(ldev->smcd);
+}
+
+static void lo_dev_remove(void)
+{
+	if (!lo_dev)
+		return;
+
+	lo_dev_exit(lo_dev);
+	smcd_free_dev(lo_dev->smcd);
+	kfree(lo_dev);
+}
+
+int smc_loopback_init(void)
+{
+	/* TODO: now lo_dev is a global device shared by
+	 * the whole kernel, and can't be referred to by
+	 * smc-tools command 'smcd dev'. Need to be improved.
+	 */
+	return lo_dev_probe();
+}
+
+void smc_loopback_exit(void)
+{
+	lo_dev_remove();
+}
diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h
new file mode 100644
index 0000000..d7f7815
--- /dev/null
+++ b/net/smc/smc_loopback.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Shared Memory Communications Direct over loopback device.
+ *
+ *  Provide a SMC-D loopback dummy device.
+ *
+ *  Copyright (c) 2022, Alibaba Inc.
+ *
+ *  Author: Wen Gu <guwen@linux.alibaba.com>
+ *          Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#ifndef _SMC_LOOPBACK_H
+#define _SMC_LOOPBACK_H
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <net/smc.h>
+
+#include "smc_core.h"
+
+#define LODEV_MAX_DMBS 5000
+#define LODEV_MAX_DMBS_BUCKETS 16
+
+struct lo_dmb_node {
+	struct hlist_node list;
+	u64 token;
+	u32 len;
+	u32 sba_idx;
+	void *cpu_addr;
+	dma_addr_t dma_addr;
+};
+
+struct lo_dev {
+	struct smcd_dev *smcd;
+	/* priv data */
+	u64 lgid;
+	DECLARE_BITMAP(sba_idx_mask, LODEV_MAX_DMBS);
+	rwlock_t dmb_ht_lock;
+	DECLARE_HASHTABLE(dmb_ht, LODEV_MAX_DMBS_BUCKETS);
+};
+
+struct lo_systemeid {
+	u8 seid_string[24];
+	u8 serial_number[4];
+	u8 type[4];
+};
+
+/* smcd loopback dev*/
+extern struct lo_dev *lo_dev;
+
+int smc_loopback_init(void);
+void smc_loopback_exit(void);
+
+#endif /* _SMC_LOOPBACK_H */
+
-- 
1.8.3.1


^ permalink raw reply related

* [RFC PATCH net-next v2 2/5] net/smc: choose loopback device in SMC-D communication
From: Wen Gu @ 2022-12-20  3:21 UTC (permalink / raw)
  To: kgraul, wenjia, jaka, davem, edumazet, kuba, pabeni
  Cc: linux-s390, netdev, linux-kernel
In-Reply-To: <1671506505-104676-1-git-send-email-guwen@linux.alibaba.com>

This patch allows SMC-D to use loopback device.

But noted that the implementation here is quiet simple and informal.
Loopback device will always be firstly chosen, and fallback happens
if loopback communication is impossible.

It needs to be discussed how client indicates to peer that multiple
SMC-D devices are available and how server picks a suitable one.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
---
 net/smc/af_smc.c  | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------
 net/smc/smc_clc.c |  4 +++-
 net/smc/smc_ism.c |  3 ++-
 3 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 9546c02..b9884c8 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -979,6 +979,28 @@ static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
 	return 0;
 }
 
+/* check if there is a lo device available for this connection. */
+static int smc_find_lo_device(struct smc_sock *smc, struct smc_init_info *ini)
+{
+	struct smcd_dev *sdev;
+
+	mutex_lock(&smcd_dev_list.mutex);
+	list_for_each_entry(sdev, &smcd_dev_list.list, list) {
+		if (sdev->is_loopback && !sdev->going_away &&
+		    (!ini->ism_peer_gid[0] ||
+		     !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id,
+				      sdev))) {
+			ini->ism_dev[0] = sdev;
+			break;
+		}
+	}
+	mutex_unlock(&smcd_dev_list.mutex);
+	if (!ini->ism_dev[0])
+		return SMC_CLC_DECL_NOSMCDDEV;
+	ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
+	return 0;
+}
+
 /* is chid unique for the ism devices that are already determined? */
 static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
 					   int cnt)
@@ -1044,10 +1066,20 @@ static int smc_find_proposal_devices(struct smc_sock *smc,
 {
 	int rc = 0;
 
-	/* check if there is an ism device available */
+	/* TODO:
+	 * How to indicate to peer if ism device and loopback
+	 * device are both available ?
+	 *
+	 * The RFC patch hasn't resolved this, just simply always
+	 * chooses loopback device first, and fallback if loopback
+	 * communication is impossible.
+	 *
+	 */
+	/* check if there is an ism or loopback device available */
 	if (!(ini->smcd_version & SMC_V1) ||
-	    smc_find_ism_device(smc, ini) ||
-	    smc_connect_ism_vlan_setup(smc, ini))
+	    (smc_find_lo_device(smc, ini) &&
+	    (smc_find_ism_device(smc, ini) ||
+	    smc_connect_ism_vlan_setup(smc, ini))))
 		ini->smcd_version &= ~SMC_V1;
 	/* else ISM V1 is supported for this connection */
 
@@ -2135,9 +2167,20 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
 		goto not_found;
 	ini->is_smcd = true; /* prepare ISM check */
 	ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
-	rc = smc_find_ism_device(new_smc, ini);
-	if (rc)
-		goto not_found;
+
+	/* TODO:
+	 * How to know that peer has both loopback and ism device ?
+	 *
+	 * The RFC patch hasn't resolved this, simply tries loopback
+	 * device first, then ism device.
+	 */
+	/* find available loopback or ism device */
+	if (smc_find_lo_device(new_smc, ini)) {
+		rc = smc_find_ism_device(new_smc, ini);
+		if (rc)
+			goto not_found;
+	}
+
 	ini->ism_selected = 0;
 	rc = smc_listen_ism_init(new_smc, ini);
 	if (!rc)
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index dfb9797..3887692 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -486,7 +486,9 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
 		return -ENODEV;
 
 	in_dev_for_each_ifa_rcu(ifa, in_dev) {
-		if (!inet_ifa_match(ipv4, ifa))
+		/* add loopback support */
+		if (inet_addr_type(dev_net(dst->dev), ipv4) != RTN_LOCAL &&
+		    !inet_ifa_match(ipv4, ifa))
 			continue;
 		prop->prefix_len = inet_mask_len(ifa->ifa_mask);
 		prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask;
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 911fe08..1d10435 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -227,7 +227,8 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
 	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0))
 		goto errattr;
 	memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
-	smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev);
+	if (!smcd->is_loopback)
+		smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev);
 	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid))
 		goto errattr;
 	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid))
-- 
1.8.3.1


^ permalink raw reply related

* [RFC PATCH net-next v2 0/5] net/smc:Introduce SMC-D based loopback acceleration
From: Wen Gu @ 2022-12-20  3:21 UTC (permalink / raw)
  To: kgraul, wenjia, jaka, davem, edumazet, kuba, pabeni
  Cc: linux-s390, netdev, linux-kernel

Hi, all

# Background

As previously mentioned in [1], we (Alibaba Cloud) are trying to use SMC
to accelerate TCP applications in cloud environment, improving inter-host
or inter-VM communication.

In addition of these, we also found the value of SMC-D in scenario of local
inter-process communication, such as accelerate communication between containers
within the same host. So this RFC tries to provide a SMC-D loopback solution
in such scenario, to bring a significant improvement in latency and throughput
compared to TCP loopback.

# Design

This patch set provides a kind of SMC-D loopback solution.

Patch #1/5 and #2/5 provide an SMC-D based dummy device, preparing for the
inter-process communication acceleration. Except for loopback acceleration,
the dummy device can also meet the requirements mentioned in [2], which is
providing a way to test SMC-D logic for broad community without ISM device.

 +------------------------------------------+
 |  +-----------+           +-----------+   |
 |  | process A |           | process B |   |
 |  +-----------+           +-----------+   |
 |       ^                        ^         |
 |       |    +---------------+   |         |
 |       |    |   SMC stack   |   |         |
 |       +--->| +-----------+ |<--|         |
 |            | |   dummy   | |             |
 |            | |   device  | |             |
 |            +-+-----------+-+             |
 |                   VM                     |
 +------------------------------------------+

Patch #3/5, #4/5, #5/5 provides a way to avoid data copy from sndbuf to RMB
and improve SMC-D loopback performance. Through extending smcd_ops with two
new semantic: attach_dmb and detach_dmb, sender's sndbuf shares the same
physical memory region with receiver's RMB. The data copied from userspace
to sender's sndbuf directly reaches the receiver's RMB without unnecessary
memory copy in the same kernel.

 +----------+                     +----------+
 | socket A |                     | socket B |
 +----------+                     +----------+
       |                               ^
       |         +---------+           |
  regard as      |         | ----------|
  local sndbuf   |  B's    |     regard as
       |         |  RMB    |     local RMB
       |-------> |         |
                 +---------+

# Benchmark Test

 * Test environments:
      - VM with Intel Xeon Platinum 8 core 2.50GHz, 16 GiB mem.
      - SMC sndbuf/RMB size 1MB.

 * Test object:
      - TCP: run on TCP loopback.
      - domain: run on UNIX domain.
      - SMC lo: run on SMC loopback device with patch #1/5 ~ #2/5.
      - SMC lo-nocpy: run on SMC loopback device with patch #1/5 ~ #5/5.

1. ipc-benchmark (see [3])

 - ./<foo> -c 1000000 -s 100

                       TCP              domain              SMC-lo             SMC-lo-nocpy
Message
rate (msg/s)         75140      129548(+72.41)    152266(+102.64%)         151914(+102.17%)

2. sockperf

 - serv: <smc_run> taskset -c <cpu> sockperf sr --tcp
 - clnt: <smc_run> taskset -c <cpu> sockperf { tp | pp } --tcp --msg-size={ 64000 for tp | 14 for pp } -i 127.0.0.1 -t 30

                       TCP                  SMC-lo             SMC-lo-nocpy
Bandwidth(MBps)   4943.359        4936.096(-0.15%)        8239.624(+66.68%)
Latency(us)          6.372          3.359(-47.28%)            3.25(-49.00%)

3. iperf3

 - serv: <smc_run> taskset -c <cpu> iperf3 -s
 - clnt: <smc_run> taskset -c <cpu> iperf3 -c 127.0.0.1 -t 15

                       TCP                  SMC-lo             SMC-lo-nocpy
Bitrate(Gb/s)         40.5            41.4(+2.22%)            76.4(+88.64%)

4. nginx/wrk

 - serv: <smc_run> nginx
 - clnt: <smc_run> wrk -t 8 -c 500 -d 30 http://127.0.0.1:80

                       TCP                  SMC-lo             SMC-lo-nocpy
Requests/s       154643.22      220894.03(+42.84%)        226754.3(+46.63%)


# Discussion

1. API between SMC-D and ISM device

As Jan mentioned in [2], IBM are working on placing an API between SMC-D
and the ISM device for easier use of different "devices" for SMC-D.

So, considering that the introduction of attach_dmb or detach_dmb can
effectively avoid data copying from sndbuf to RMB and brings obvious
throughput advantages in inter-VM or inter-process scenarios, can the
attach/detach semantics be taken into consideration when designing the
API to make it a standard ISM device behavior?

Maybe our RFC of SMC-D based inter-process acceleration (this one) and
inter-VM acceleration (will coming soon, which is the update of [1])
can provide some examples for new API design. And we are very glad to
discuss this on the mail list.

2. Way to select different ISM-like devices

With the proposal of SMC-D loopback 'device' (this RFC) and incoming
device used for inter-VM acceleration as update of [1], SMC-D has more
options to choose from. So we need to consider that how to indicate
supported devices, how to determine which one to use, and their priority...

IMHO, this may require an update of CLC message and negotiation mechanism.
Again, we are very glad to discuss this with you on the mailing list.

[1] https://lore.kernel.org/netdev/20220720170048.20806-1-tonylu@linux.alibaba.com/
[2] https://lore.kernel.org/netdev/35d14144-28f7-6129-d6d3-ba16dae7a646@linux.ibm.com/
[3] https://github.com/goldsborough/ipc-bench

v1->v2
 1. Fix some build WARNINGs complained by kernel test rebot
    Reported-by: kernel test robot <lkp@intel.com>
 2. Add iperf3 test data.

Wen Gu (5):
  net/smc: introduce SMC-D loopback device
  net/smc: choose loopback device in SMC-D communication
  net/smc: add dmb attach and detach interface
  net/smc: avoid data copy from sndbuf to peer RMB in SMC-D loopback
  net/smc: logic of cursors update in SMC-D loopback connections

 include/net/smc.h      |   3 +
 net/smc/Makefile       |   2 +-
 net/smc/af_smc.c       |  88 +++++++++++-
 net/smc/smc_cdc.c      |  59 ++++++--
 net/smc/smc_cdc.h      |   1 +
 net/smc/smc_clc.c      |   4 +-
 net/smc/smc_core.c     |  62 +++++++++
 net/smc/smc_core.h     |   2 +
 net/smc/smc_ism.c      |  39 +++++-
 net/smc/smc_ism.h      |   2 +
 net/smc/smc_loopback.c | 358 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_loopback.h |  63 +++++++++
 12 files changed, 662 insertions(+), 21 deletions(-)
 create mode 100644 net/smc/smc_loopback.c
 create mode 100644 net/smc/smc_loopback.h

-- 
1.8.3.1


^ permalink raw reply

* [RFC PATCH net-next v2 5/5] net/smc: logic of cursors update in SMC-D loopback connections
From: Wen Gu @ 2022-12-20  3:21 UTC (permalink / raw)
  To: kgraul, wenjia, jaka, davem, edumazet, kuba, pabeni
  Cc: linux-s390, netdev, linux-kernel
In-Reply-To: <1671506505-104676-1-git-send-email-guwen@linux.alibaba.com>

Since local sndbuf of SMC-D loopback connection shares the same
physical memory region with peer RMB, the logic of cursors update
needs to be adapted.

The main difference from original implementation is need to ensure
that the data copied to local sndbuf won't overwrite the unconsumed
data of peer.

So, for SMC-D loopback connections:

1. TX
        a. don't update fin_curs when send out cdc msg.
        b. fin_curs and sndbuf_space update will be deferred until
           receiving peer cons_curs update.

2. RX
        a. same as before. peer sndbuf is as large as local rmb,
           which guarantees that prod_curs will behind prep_curs.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
---
 net/smc/smc_cdc.c      | 53 +++++++++++++++++++++++++++++++++++++++-----------
 net/smc/smc_loopback.c |  7 +++++++
 2 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 61f5ff7..586472a 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -253,17 +253,26 @@ int smcd_cdc_msg_send(struct smc_connection *conn)
 		return rc;
 	smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn);
 	conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
-	/* Calculate transmitted data and increment free send buffer space */
-	diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
-			     &conn->tx_curs_sent);
-	/* increased by confirmed number of bytes */
-	smp_mb__before_atomic();
-	atomic_add(diff, &conn->sndbuf_space);
-	/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
-	smp_mb__after_atomic();
-	smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
+	if (!conn->lgr->smcd->is_loopback) {
+		/* Note:
+		 * For smcd loopback device:
+		 *
+		 * Don't update the fin_curs and sndbuf_space here.
+		 * Update fin_curs when peer consumes the data in RMB.
+		 */
 
-	smc_tx_sndbuf_nonfull(smc);
+		/* Calculate transmitted data and increment free send buffer space */
+		diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
+				     &conn->tx_curs_sent);
+		/* increased by confirmed number of bytes */
+		smp_mb__before_atomic();
+		atomic_add(diff, &conn->sndbuf_space);
+		/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+		smp_mb__after_atomic();
+		smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
+
+		smc_tx_sndbuf_nonfull(smc);
+	}
 	return rc;
 }
 
@@ -321,7 +330,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 {
 	union smc_host_cursor cons_old, prod_old;
 	struct smc_connection *conn = &smc->conn;
-	int diff_cons, diff_prod;
+	int diff_cons, diff_prod, diff_tx;
 
 	smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn);
 	smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
@@ -337,6 +346,28 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		atomic_add(diff_cons, &conn->peer_rmbe_space);
 		/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
 		smp_mb__after_atomic();
+
+		/* For smcd loopback device:
+		 * Update of peer cons_curs indicates that
+		 * 1. peer rmbe space increases.
+		 * 2. local sndbuf space increases.
+		 *
+		 * So local sndbuf fin_curs should be equal to peer RMB cons_curs.
+		 */
+		if (conn->lgr->is_smcd &&
+		    conn->lgr->smcd->is_loopback) {
+			/* calculate peer rmb consumed data */
+			diff_tx = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
+						&conn->local_rx_ctrl.cons);
+			/* increase local sndbuf space and fin_curs */
+			smp_mb__before_atomic();
+			atomic_add(diff_tx, &conn->sndbuf_space);
+			/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+			smp_mb__after_atomic();
+			smc_curs_copy(&conn->tx_curs_fin, &conn->local_rx_ctrl.cons, conn);
+
+			smc_tx_sndbuf_nonfull(smc);
+		}
 	}
 
 	diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c
index bc3ff82..43f0287 100644
--- a/net/smc/smc_loopback.c
+++ b/net/smc/smc_loopback.c
@@ -216,6 +216,13 @@ static int lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx,
 	struct lo_dmb_node *rmb_node = NULL, *tmp_node;
 	struct lo_dev *ldev = smcd->priv;
 
+	if (!sf) {
+		/* no need to move data.
+		 * sndbuf is equal to peer rmb.
+		 */
+		return 0;
+	}
+
 	read_lock(&ldev->dmb_ht_lock);
 	hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) {
 		if (tmp_node->token == dmb_tok) {
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH 0/2] ip/ip6_gre: Fix GRE tunnels not generating IPv6 link local addresses
From: Chris Packham @ 2022-12-20  3:07 UTC (permalink / raw)
  To: Thomas Winter, davem@davemloft.net, dsahern@kernel.org,
	linux-kernel@vger.kernel.org, yoshfuji@linux-ipv6.org,
	kuba@kernel.org, a@unstable.cc, netdev@vger.kernel.org
In-Reply-To: <90c749975ea3940b37cfae95f224da25bee6f577.camel@alliedtelesis.co.nz>


On 20/12/22 16:06, Thomas Winter wrote:
> On Tue, 2022-12-20 at 15:32 +1300, Chris Packham wrote:
>> On 20/12/22 15:25, Chris Packham wrote:
>>> Hi Thomas,
>>>
>>> On 19/12/22 14:06, Thomas Winter wrote:
>>>> For our point-to-point GRE tunnels, they have
>>>> IN6_ADDR_GEN_MODE_NONE
>>>> when they are created then we set IN6_ADDR_GEN_MODE_EUI64 when
>>>> they
>>>> come up to generate the IPv6 link local address for the
>>>> interface.
>>>> Recently we found that they were no longer generating IPv6
>>>> addresses.
>>>>
>>>> Also, non-point-to-point tunnels were not generating any IPv6
>>>> link
>>>> local address and instead generating an IPv6 compat address,
>>>> breaking IPv6 communication on the tunnel.
>>>>
>>>> These failures were caused by commit e5dd729460ca and this patch
>>>> set
>>>> aims to resolve these issues.
>>> This appears to be a v2 of
>>> https://lore.kernel.org/all/20221218215718.1491444-1-Thomas.Winter@alliedtelesis.co.nz/#t
>>> but you haven't said so in the subject nor have you included a
>>> changelog in the patches or in the cover letter.
>>>
>>> Also for networking patches you should include either "net" or
>>> "net-next" in the subject prefix. As this appears to be a bugfix
>>> "net"
>>> is appropriate.
>>>
>> Took me a bit of looking but most of this stuff is covered by
>> https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html#netdev-faq
> Thanks, the git command I used did not put in the v2 that I expected
> and I didn't check the output properly. I will send a new patch set as
> v3.
I don't think there's any particular need to rush. I wouldn't expect 
much of a response over the holiday period.
>>>> Thomas Winter (2):
>>>>     ip/ip6_gre: Fix changing addr gen mode not generating IPv6
>>>> link local
>>>>       address
>>>>     ip/ip6_gre: Fix non-point-to-point tunnel not generating IPv6
>>>> link
>>>>       local address
>>>>
>>>>    net/ipv6/addrconf.c | 57 ++++++++++++++++++++++++------------
>>>> ---------
>>>>    1 file changed, 31 insertions(+), 26 deletions(-)

^ permalink raw reply

* Re: [PATCH 0/2] ip/ip6_gre: Fix GRE tunnels not generating IPv6 link local addresses
From: Thomas Winter @ 2022-12-20  3:06 UTC (permalink / raw)
  To: davem@davemloft.net, dsahern@kernel.org,
	linux-kernel@vger.kernel.org, yoshfuji@linux-ipv6.org,
	kuba@kernel.org, a@unstable.cc, Chris Packham,
	netdev@vger.kernel.org
In-Reply-To: <b9e601b5-c220-0c3f-5499-317f9fd706b9@alliedtelesis.co.nz>

On Tue, 2022-12-20 at 15:32 +1300, Chris Packham wrote:
> On 20/12/22 15:25, Chris Packham wrote:
> > Hi Thomas,
> > 
> > On 19/12/22 14:06, Thomas Winter wrote:
> > > For our point-to-point GRE tunnels, they have
> > > IN6_ADDR_GEN_MODE_NONE
> > > when they are created then we set IN6_ADDR_GEN_MODE_EUI64 when
> > > they
> > > come up to generate the IPv6 link local address for the
> > > interface.
> > > Recently we found that they were no longer generating IPv6
> > > addresses.
> > > 
> > > Also, non-point-to-point tunnels were not generating any IPv6
> > > link
> > > local address and instead generating an IPv6 compat address,
> > > breaking IPv6 communication on the tunnel.
> > > 
> > > These failures were caused by commit e5dd729460ca and this patch
> > > set
> > > aims to resolve these issues.
> > 
> > This appears to be a v2 of 
> > https://lore.kernel.org/all/20221218215718.1491444-1-Thomas.Winter@alliedtelesis.co.nz/#t 
> > but you haven't said so in the subject nor have you included a 
> > changelog in the patches or in the cover letter.
> > 
> > Also for networking patches you should include either "net" or 
> > "net-next" in the subject prefix. As this appears to be a bugfix
> > "net" 
> > is appropriate.
> > 
> Took me a bit of looking but most of this stuff is covered by 
> https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html#netdev-faq

Thanks, the git command I used did not put in the v2 that I expected
and I didn't check the output properly. I will send a new patch set as
v3.

> > > Thomas Winter (2):
> > >    ip/ip6_gre: Fix changing addr gen mode not generating IPv6
> > > link local
> > >      address
> > >    ip/ip6_gre: Fix non-point-to-point tunnel not generating IPv6
> > > link
> > >      local address
> > > 
> > >   net/ipv6/addrconf.c | 57 ++++++++++++++++++++++++------------
> > > ---------
> > >   1 file changed, 31 insertions(+), 26 deletions(-)

^ permalink raw reply

* [PATCH] 9p/rdma: unmap receive dma buffer in rdma_request()
From: Zhengchao Shao @ 2022-12-20  3:12 UTC (permalink / raw)
  To: v9fs-developer, netdev, ericvh, lucho, asmadeus, davem, edumazet,
	kuba, pabeni
  Cc: linux_oss, tom, weiyongjun1, yuehaibing, shaozhengchao

When down_interruptible() failed in rdma_request(), receive dma buffer
is not unmapped. Add unmap action to error path.

Fixes: fc79d4b104f0 ("9p: rdma: RDMA Transport Support for 9P")
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
---
 net/9p/trans_rdma.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 83f9100d46bf..da83023fecbf 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -499,6 +499,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 
 	if (down_interruptible(&rdma->sq_sem)) {
 		err = -EINTR;
+		ib_dma_unmap_single(rdma->cm_id->device, c->busa,
+				    c->req->tc.size, DMA_TO_DEVICE);
 		goto send_error;
 	}
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH] net: bridge: mcast: read ngrec once in igmp3/mld2 report
From: Joy Gu @ 2022-12-20  2:48 UTC (permalink / raw)
  To: bridge
  Cc: roopa, razor, davem, edumazet, kuba, pabeni, joern, netdev,
	linux-kernel, Joy Gu

In br_ip4_multicast_igmp3_report() and br_ip6_multicast_mld2_report(),
"ih" or "mld2r" is a pointer into the skb header. It's dereferenced to
get "num", which is used in the for-loop condition that follows.

Compilers are free to not spend a register on "num" and dereference that
pointer every time "num" would be used, i.e. every loop iteration. Which
would be a bug if pskb_may_pull() (called by ip_mc_may_pull() or
ipv6_mc_may_pull() in the loop body) were to change pointers pointing
into the skb header, e.g. by freeing "skb->head".

We can avoid this by using READ_ONCE().

Suggested-by: Joern Engel <joern@purestorage.com>
Signed-off-by: Joy Gu <jgu@purestorage.com>
---
 net/bridge/br_multicast.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 48170bd3785e..2ac4b099e00d 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2624,11 +2624,11 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx,
 	bool changed = false;
 	int err = 0;
 	u16 nsrcs;
 
 	ih = igmpv3_report_hdr(skb);
-	num = ntohs(ih->ngrec);
+	num = ntohs(READ_ONCE(ih->ngrec));
 	len = skb_transport_offset(skb) + sizeof(*ih);
 
 	for (i = 0; i < num; i++) {
 		len += sizeof(*grec);
 		if (!ip_mc_may_pull(skb, len))
@@ -2750,11 +2750,11 @@ static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx,
 
 	if (!ipv6_mc_may_pull(skb, sizeof(*mld2r)))
 		return -EINVAL;
 
 	mld2r = (struct mld2_report *)icmp6_hdr(skb);
-	num = ntohs(mld2r->mld2r_ngrec);
+	num = ntohs(READ_ONCE(mld2r->mld2r_ngrec));
 	len = skb_transport_offset(skb) + sizeof(*mld2r);
 
 	for (i = 0; i < num; i++) {
 		__be16 *_nsrcs, __nsrcs;
 		u16 nsrcs;
-- 
2.39.0


^ permalink raw reply related

* Re: [PATCH 0/2] ip/ip6_gre: Fix GRE tunnels not generating IPv6 link local addresses
From: Chris Packham @ 2022-12-20  2:32 UTC (permalink / raw)
  To: Thomas Winter, davem@davemloft.net, yoshfuji@linux-ipv6.org,
	dsahern@kernel.org, kuba@kernel.org, a@unstable.cc,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <8e4ad680-4406-e6df-5335-ffe53a60aa83@alliedtelesis.co.nz>


On 20/12/22 15:25, Chris Packham wrote:
> Hi Thomas,
>
> On 19/12/22 14:06, Thomas Winter wrote:
>> For our point-to-point GRE tunnels, they have IN6_ADDR_GEN_MODE_NONE
>> when they are created then we set IN6_ADDR_GEN_MODE_EUI64 when they
>> come up to generate the IPv6 link local address for the interface.
>> Recently we found that they were no longer generating IPv6 addresses.
>>
>> Also, non-point-to-point tunnels were not generating any IPv6 link
>> local address and instead generating an IPv6 compat address,
>> breaking IPv6 communication on the tunnel.
>>
>> These failures were caused by commit e5dd729460ca and this patch set
>> aims to resolve these issues.
>
> This appears to be a v2 of 
> https://lore.kernel.org/all/20221218215718.1491444-1-Thomas.Winter@alliedtelesis.co.nz/#t 
> but you haven't said so in the subject nor have you included a 
> changelog in the patches or in the cover letter.
>
> Also for networking patches you should include either "net" or 
> "net-next" in the subject prefix. As this appears to be a bugfix "net" 
> is appropriate.
>
Took me a bit of looking but most of this stuff is covered by 
https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html#netdev-faq
>>
>> Thomas Winter (2):
>>    ip/ip6_gre: Fix changing addr gen mode not generating IPv6 link local
>>      address
>>    ip/ip6_gre: Fix non-point-to-point tunnel not generating IPv6 link
>>      local address
>>
>>   net/ipv6/addrconf.c | 57 ++++++++++++++++++++++++---------------------
>>   1 file changed, 31 insertions(+), 26 deletions(-)
>>

^ permalink raw reply

* Re: [PATCH 0/2] ip/ip6_gre: Fix GRE tunnels not generating IPv6 link local addresses
From: Chris Packham @ 2022-12-20  2:25 UTC (permalink / raw)
  To: Thomas Winter, davem@davemloft.net, yoshfuji@linux-ipv6.org,
	dsahern@kernel.org, kuba@kernel.org, a@unstable.cc,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20221219010619.1826599-1-Thomas.Winter@alliedtelesis.co.nz>

Hi Thomas,

On 19/12/22 14:06, Thomas Winter wrote:
> For our point-to-point GRE tunnels, they have IN6_ADDR_GEN_MODE_NONE
> when they are created then we set IN6_ADDR_GEN_MODE_EUI64 when they
> come up to generate the IPv6 link local address for the interface.
> Recently we found that they were no longer generating IPv6 addresses.
>
> Also, non-point-to-point tunnels were not generating any IPv6 link
> local address and instead generating an IPv6 compat address,
> breaking IPv6 communication on the tunnel.
>
> These failures were caused by commit e5dd729460ca and this patch set
> aims to resolve these issues.

This appears to be a v2 of 
https://lore.kernel.org/all/20221218215718.1491444-1-Thomas.Winter@alliedtelesis.co.nz/#t 
but you haven't said so in the subject nor have you included a changelog 
in the patches or in the cover letter.

Also for networking patches you should include either "net" or 
"net-next" in the subject prefix. As this appears to be a bugfix "net" 
is appropriate.

>
> Thomas Winter (2):
>    ip/ip6_gre: Fix changing addr gen mode not generating IPv6 link local
>      address
>    ip/ip6_gre: Fix non-point-to-point tunnel not generating IPv6 link
>      local address
>
>   net/ipv6/addrconf.c | 57 ++++++++++++++++++++++++---------------------
>   1 file changed, 31 insertions(+), 26 deletions(-)
>

^ permalink raw reply

* Re: [PATCH net 1/3] Documentation: devlink: add missing toc entry for etas_es58x devlink doc
From: patchwork-bot+netdevbpf @ 2022-12-20  2:00 UTC (permalink / raw)
  To: Marc Kleine-Budde
  Cc: netdev, davem, kuba, linux-can, kernel, mailhol.vincent, sfr, lkp
In-Reply-To: <20221219155210.1143439-2-mkl@pengutronix.de>

Hello:

This series was applied to netdev/net.git (master)
by Marc Kleine-Budde <mkl@pengutronix.de>:

On Mon, 19 Dec 2022 16:52:08 +0100 you wrote:
> From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
> 
> toc entry is missing for etas_es58x devlink doc and triggers this warning:
> 
>   Documentation/networking/devlink/etas_es58x.rst: WARNING: document isn't included in any toctree
> 
> Add the missing toc entry.
> 
> [...]

Here is the summary with links:
  - [net,1/3] Documentation: devlink: add missing toc entry for etas_es58x devlink doc
    https://git.kernel.org/netdev/net/c/115dd5469019
  - [net,2/3] can: flexcan: avoid unbalanced pm_runtime_enable warning
    https://git.kernel.org/netdev/net/c/3bc2afcba812
  - [net,3/3] can: kvaser_usb: hydra: help gcc-13 to figure out cmd_len
    https://git.kernel.org/netdev/net/c/f006229135b7

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net v4 0/3] Stop corrupting socket's task_frag
From: patchwork-bot+netdevbpf @ 2022-12-20  2:00 UTC (permalink / raw)
  To: Benjamin Coddington
  Cc: kuba, pabeni, davem, edumazet, gnault, philipp.reisner,
	lars.ellenberg, christoph.boehmwalder, axboe, josef, kbusch, hch,
	sagi, lduncan, cleech, michael.christie, jejb, martin.petersen,
	valentina.manea.m, shuah, gregkh, dhowells, marc.dionne, sfrench,
	ccaulfie, teigland, mark, jlbec, joseph.qi, ericvh, lucho,
	asmadeus, idryomov, xiubli, chuck.lever, jlayton, trond.myklebust,
	anna, steffen.klassert, herbert, netdev
In-Reply-To: <cover.1671194454.git.bcodding@redhat.com>

Hello:

This series was applied to netdev/net.git (master)
by Jakub Kicinski <kuba@kernel.org>:

On Fri, 16 Dec 2022 07:45:25 -0500 you wrote:
> The networking code uses flags in sk_allocation to determine if it can use
> current->task_frag, however in-kernel users of sockets may stop setting
> sk_allocation when they convert to the preferred memalloc_nofs_save/restore,
> as SUNRPC has done in commit a1231fda7e94 ("SUNRPC: Set memalloc_nofs_save()
> on all rpciod/xprtiod jobs").
> 
> This will cause corruption in current->task_frag when recursing into the
> network layer for those subsystems during page fault or reclaim.  The
> corruption is difficult to diagnose because stack traces may not contain the
> offending subsystem at all.  The corruption is unlikely to show up in
> testing because it requires memory pressure, and so subsystems that
> convert to memalloc_nofs_save/restore are likely to continue to run into
> this issue.
> 
> [...]

Here is the summary with links:
  - [net,v4,1/3] net: Introduce sk_use_task_frag in struct sock.
    https://git.kernel.org/netdev/net/c/fb87bd47516d
  - [net,v4,2/3] Treewide: Stop corrupting socket's task_frag
    https://git.kernel.org/netdev/net/c/98123866fcf3
  - [net,v4,3/3] net: simplify sk_page_frag
    https://git.kernel.org/netdev/net/c/08f65892c5ee

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf: pull before calling skb_postpull_rcsum()
From: Stanislav Fomichev @ 2022-12-20  1:55 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: daniel, bpf, netdev, Anand Parthasarathy, martin.lau, song,
	john.fastabend
In-Reply-To: <20221219174505.67014ea5@kernel.org>

On Mon, Dec 19, 2022 at 5:45 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 19 Dec 2022 17:21:27 -0800 Stanislav Fomichev wrote:
> > > -       skb_postpull_rcsum(skb, skb->data + off, len);
> > > -       memmove(skb->data + len, skb->data, off);
> > > +       old_data = skb->data;
> > >         __skb_pull(skb, len);
> >
> > [..]
>
> Very counter-productively trimmed ;)
>
> > > +       skb_postpull_rcsum(skb, old_data + off, len);
> >
> > Are you sure about the 'old_data + off' part here (for
> > CHECKSUM_COMPLETE)? Shouldn't it be old_data?
>
> AFAIU before:
>
>       old_data (aka skb->data before)
>      /
>     / off  len
>    V-----><--->
> [ .=======xxxxx... buffer ...... ]
>           ^
>            \
>             the xxx part is what we delete
>
> after:
>           skb->data (after)
>          /
>         v
> [ .yyyyy=======... buffer ...... ]
>    <---><----->
>     len   off
>    ^
>     \
>      the yyy part is technically the old values of === but now "outside"
>      of the valid packet data
>
> > I'm assuming we need to negate the old parts that we've pulled?
>
> Yes.
>
> > Maybe safer/more correct to do the following?
> >
> > skb_pull_rcsum(skb, off);
>
> This just pulls from the front, we need to skip over various L2/L3
> headers thanks to off. Hopefully the diagrams help, LMK if they are
> wrong.

Ah, yeah, you're totally correct, thanks for the pics :-) Not sure how
I mixed up off/len in skb_pull_rcsum..

Acked-by: Stanislav Fomichev <sdf@google.com>

> > memmove(skb->data, skb->data-off, off);

^ permalink raw reply

* Re: [PATCH net-next v1 0/2] net: marvell: prestera: add ipv6 routes offloading
From: Jakub Kicinski @ 2022-12-20  1:46 UTC (permalink / raw)
  To: Yevhen Orlov
  Cc: netdev, Volodymyr Mytnyk, Taras Chornyi, Mickey Rachamim,
	Serhiy Pshyk, David S. Miller, Eric Dumazet, Paolo Abeni,
	Andrew Lunn, Stephen Hemminger, linux-kernel
In-Reply-To: <Y5+RDIIGWGeKGUAo@yorlov.ow.s>

On Mon, 19 Dec 2022 00:15:40 +0200 Yevhen Orlov wrote:
> Add support for IPv6 nexthop/blackhole/connected routes for Marvell Prestera driver.
> Handle AF_INET6 neigbours, fib entries.
> 
> Add features:
>  - IPv6:
>    - Support "offload", "offload_failed", "trap" flags
>    - Support blackhole, nexthop, local/connected/unreachable/etc (trap)
>      e.g.: "ip addr add 2001:1::1/64 dev sw1p2"
>      e.g.: "ip route add 2002:2::/64 via 2001:2::2"
>      e.g.: "ip route add blachole 2003:2::/64 dev lo"

# Form letter - net-next is closed

We have already submitted the networking pull request to Linus
for v6.2 and therefore net-next is closed for new drivers, features,
code refactoring and optimizations. We are currently accepting
bug fixes only.

Please repost when net-next reopens after Jan 2nd.

RFC patches sent for review only are obviously welcome at any time.

^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf: pull before calling skb_postpull_rcsum()
From: Jakub Kicinski @ 2022-12-20  1:45 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: daniel, bpf, netdev, Anand Parthasarathy, martin.lau, song,
	john.fastabend
In-Reply-To: <CAKH8qBvVTHXsgVLHuCmdFM1dnYEiDFovOFfXNq1=8igPCCO7jQ@mail.gmail.com>

On Mon, 19 Dec 2022 17:21:27 -0800 Stanislav Fomichev wrote:
> > -       skb_postpull_rcsum(skb, skb->data + off, len);
> > -       memmove(skb->data + len, skb->data, off);
> > +       old_data = skb->data;
> >         __skb_pull(skb, len);  
> 
> [..]

Very counter-productively trimmed ;)

> > +       skb_postpull_rcsum(skb, old_data + off, len);  
> 
> Are you sure about the 'old_data + off' part here (for
> CHECKSUM_COMPLETE)? Shouldn't it be old_data?

AFAIU before:

      old_data (aka skb->data before)
     /
    / off  len 
   V-----><--->
[ .=======xxxxx... buffer ...... ]
          ^
           \
            the xxx part is what we delete

after:
          skb->data (after)
         /
        v
[ .yyyyy=======... buffer ...... ]
   <---><----->
    len   off  
   ^
    \
     the yyy part is technically the old values of === but now "outside"
     of the valid packet data

> I'm assuming we need to negate the old parts that we've pulled?

Yes.

> Maybe safer/more correct to do the following?
> 
> skb_pull_rcsum(skb, off);

This just pulls from the front, we need to skip over various L2/L3
headers thanks to off. Hopefully the diagrams help, LMK if they are
wrong.

> memmove(skb->data, skb->data-off, off);

^ permalink raw reply

* Re: Possible race with xsk_flush
From: Shawn Bohrer @ 2022-12-20  1:31 UTC (permalink / raw)
  To: Magnus Karlsson; +Cc: netdev, bpf, bjorn, magnus.karlsson, kernel-team
In-Reply-To: <CAJ8uoz1GKvoaM0DCo1Ki8q=LHR1cjrNC=1BK7chTKKW9Po5F5A@mail.gmail.com>

On Fri, Dec 16, 2022 at 11:05:19AM +0100, Magnus Karlsson wrote:
> To summarize, we are expecting this ordering:
> 
> CPU 0 __xsk_rcv_zc()
> CPU 0 __xsk_map_flush()
> CPU 2 __xsk_rcv_zc()
> CPU 2 __xsk_map_flush()
> 
> But we are seeing this order:
> 
> CPU 0 __xsk_rcv_zc()
> CPU 2 __xsk_rcv_zc()
> CPU 0 __xsk_map_flush()
> CPU 2 __xsk_map_flush()
> 
> Here is the veth NAPI poll loop:
> 
> static int veth_poll(struct napi_struct *napi, int budget)
> {
>     struct veth_rq *rq =
>     container_of(napi, struct veth_rq, xdp_napi);
>     struct veth_stats stats = {};
>     struct veth_xdp_tx_bq bq;
>     int done;
> 
>     bq.count = 0;
> 
>     xdp_set_return_frame_no_direct();
>     done = veth_xdp_rcv(rq, budget, &bq, &stats);
> 
>     if (done < budget && napi_complete_done(napi, done)) {
>         /* Write rx_notify_masked before reading ptr_ring */
>        smp_store_mb(rq->rx_notify_masked, false);
>        if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
>            if (napi_schedule_prep(&rq->xdp_napi)) {
>                WRITE_ONCE(rq->rx_notify_masked, true);
>                __napi_schedule(&rq->xdp_napi);
>             }
>         }
>     }
> 
>     if (stats.xdp_tx > 0)
>         veth_xdp_flush(rq, &bq);
>     if (stats.xdp_redirect > 0)
>         xdp_do_flush();
>     xdp_clear_return_frame_no_direct();
> 
>     return done;
> }
> 
> Something I have never seen before is that there is
> napi_complete_done() and a __napi_schedule() before xdp_do_flush().
> Let us check if this has something to do with it. So two suggestions
> to be executed separately:
> 
> * Put a probe at the __napi_schedule() above and check if it gets
> triggered before this problem
> * Move the "if (stats.xdp_redirect > 0) xdp_do_flush();" to just
> before "if (done < budget && napi_complete_done(napi, done)) {"
> 
> This might provide us some hints on what is going on.

After staring at this code for way too long I finally made a
breakthrough!  I could not understand how this race could occur when
napi_poll() calls netpoll_poll_lock().  Here is netpoll_poll_lock():

```
  static inline void *netpoll_poll_lock(struct napi_struct *napi)
  {
    struct net_device *dev = napi->dev;

    if (dev && dev->npinfo) {
      int owner = smp_processor_id();

      while (cmpxchg(&napi->poll_owner, -1, owner) != -1)
        cpu_relax();

      return napi;
    }
    return NULL;
  }
```
If dev or dev->npinfo are NULL then it doesn't acquire a lock at all!
Adding some more trace points I see:

```
  iperf2-1325    [002] ..s1. 264246.626880: __napi_poll: (__napi_poll+0x0/0x150) n=0xffff91c885bff000 poll_owner=-1 dev=0xffff91c881d4e000 npinfo=0x0
  iperf2-1325    [002] d.Z1. 264246.626882: __xsk_rcv_zc_L7: (__xsk_rcv_zc+0x3b/0xc0) addr=0x1503100 len=0x42 xs=0xffff91c8bfe77000 fq=0xffff91c8c1a43f80 dev=0xffff91c881d4e000
  iperf2-1325    [002] d.Z1. 264246.626883: __xsk_rcv_zc_L7: (__xsk_rcv_zc+0x42/0xc0) addr=0x1503100 len=0x42 xs=0xffff91c8bfe77000 fq=0xffff91c8c1a43f80 dev=0xffff91c881d4e000
  iperf2-1325    [002] d.Z1. 264246.626884: xsk_flush: (__xsk_map_flush+0x32/0xb0) xs=0xffff91c8bfe77000 
```

Here you can see that poll_owner=-1 meaning the lock was never
acquired because npinfo is NULL.  This means that the same veth rx
queue can be napi_polled from multiple CPU and nothing stops it from
running concurrently.  They all look like this, just most of the time
there aren't concurrent napi_polls running for the same queue.  They
do however move around CPUs as I explained earlier.

I'll note that I've ran with your suggested change of moving
xdp_do_flush() before napi_complete_done() all weekend and I have not
reproduced the issue.  I don't know if that truly means the issue is
fixed by that change or not.  I suspect it does fix the issue because
it prevents the napi_struct from being scheduled again before the
first poll has completed, and nap_schedule_prep() ensures that only
one instance is ever running.

If we think this is the correct fix I'll let it run for another day or
two and prepare a patch.

Thanks,
Shawn Bohrer

^ permalink raw reply

* Re: [Patch net] net: dsa: microchip: remove IRQF_TRIGGER_FALLING in request_threaded_irq
From: patchwork-bot+netdevbpf @ 2022-12-20  1:30 UTC (permalink / raw)
  To: Arun Ramadoss
  Cc: linux-kernel, netdev, woojung.huh, UNGLinuxDriver, andrew,
	vivien.didelot, f.fainelli, olteanv, davem, edumazet, kuba,
	pabeni, linux, Tristram.Ha, ceggers
In-Reply-To: <20221213101440.24667-1-arun.ramadoss@microchip.com>

Hello:

This patch was applied to netdev/net.git (master)
by Jakub Kicinski <kuba@kernel.org>:

On Tue, 13 Dec 2022 15:44:40 +0530 you wrote:
> KSZ swithes used interrupts for detecting the phy link up and down.
> During registering the interrupt handler, it used IRQF_TRIGGER_FALLING
> flag. But this flag has to be retrieved from device tree instead of hard
> coding in the driver, so removing the flag.
> 
> Fixes: ff319a644829 ("net: dsa: microchip: move interrupt handling logic from lan937x to ksz_common")
> Reported-by: Christian Eggers <ceggers@arri.de>
> Signed-off-by: Arun Ramadoss <arun.ramadoss@microchip.com>
> 
> [...]

Here is the summary with links:
  - [net] net: dsa: microchip: remove IRQF_TRIGGER_FALLING in request_threaded_irq
    https://git.kernel.org/netdev/net/c/62e027fb0e52

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net v2] mctp: Remove device type check at unregister
From: patchwork-bot+netdevbpf @ 2022-12-20  1:30 UTC (permalink / raw)
  To: Matt Johnston; +Cc: netdev, davem, kuba, pabeni, edumazet, jk, alexander.duyck
In-Reply-To: <20221215054933.2403401-1-matt@codeconstruct.com.au>

Hello:

This patch was applied to netdev/net.git (master)
by Jakub Kicinski <kuba@kernel.org>:

On Thu, 15 Dec 2022 13:49:33 +0800 you wrote:
> The unregister check could be incorrectly triggered if a netdev
> changes its type after register. That is possible for a tun device
> using TUNSETLINK ioctl, resulting in mctp unregister failing
> and the netdev unregister waiting forever.
> 
> This was encountered by https://github.com/openthread/openthread/issues/8523
> 
> [...]

Here is the summary with links:
  - [net,v2] mctp: Remove device type check at unregister
    https://git.kernel.org/netdev/net/c/b389a902dd5b

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf: pull before calling skb_postpull_rcsum()
From: Stanislav Fomichev @ 2022-12-20  1:21 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: daniel, bpf, netdev, Anand Parthasarathy, martin.lau, song,
	john.fastabend
In-Reply-To: <20221220004701.402165-1-kuba@kernel.org>

On Mon, Dec 19, 2022 at 4:47 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> Anand hit a BUG() when pulling off headers on egress to a SW tunnel.
> We get to skb_checksum_help() with an invalid checksum offset
> (commit d7ea0d9df2a6 ("net: remove two BUG() from skb_checksum_help()")
> converted those BUGs to WARN_ONs()).
> He points out oddness in how skb_postpull_rcsum() gets used.
> Indeed looks like we should pull before "postpull", otherwise
> the CHECKSUM_PARTIAL fixup from skb_postpull_rcsum() will not
> be able to do its job:
>
>         if (skb->ip_summed == CHECKSUM_PARTIAL &&
>             skb_checksum_start_offset(skb) < 0)
>                 skb->ip_summed = CHECKSUM_NONE;
>
> Reported-by: Anand Parthasarathy <anpartha@meta.com>
> Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
> CC: daniel@iogearbox.net
> CC: martin.lau@linux.dev
> CC: song@kernel.org
> CC: john.fastabend@gmail.com
> CC: sdf@google.com
> CC: bpf@vger.kernel.org
> ---
>  net/core/filter.c | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 929358677183..43cc1fe58a2c 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3180,15 +3180,18 @@ static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
>
>  static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
>  {
> +       void *old_data;
> +
>         /* skb_ensure_writable() is not needed here, as we're
>          * already working on an uncloned skb.
>          */
>         if (unlikely(!pskb_may_pull(skb, off + len)))
>                 return -ENOMEM;
>
> -       skb_postpull_rcsum(skb, skb->data + off, len);
> -       memmove(skb->data + len, skb->data, off);
> +       old_data = skb->data;
>         __skb_pull(skb, len);

[..]

> +       skb_postpull_rcsum(skb, old_data + off, len);

Are you sure about the 'old_data + off' part here (for
CHECKSUM_COMPLETE)? Shouldn't it be old_data?
I'm assuming we need to negate the old parts that we've pulled?

Maybe safer/more correct to do the following?

skb_pull_rcsum(skb, off);
memmove(skb->data, skb->data-off, off);


> +       memmove(skb->data, old_data, off);
>
>         return 0;
>  }
> --
> 2.38.1
>

^ permalink raw reply

* Re: [RFC bpf-next 2/8] net: introduce XDP features flag
From: Jakub Kicinski @ 2022-12-20  1:13 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: bpf, netdev, ast, daniel, andrii, davem, hawk, pabeni, edumazet,
	toke, memxor, alardam, saeedm, anthony.l.nguyen, gospo,
	vladimir.oltean, nbd, john, leon, simon.horman, aelior,
	christophe.jaillet, ecree.xilinx, grygorii.strashko, mst, bjorn,
	magnus.karlsson, maciej.fijalkowski, intel-wired-lan,
	lorenzo.bianconi
In-Reply-To: <43c340d440d8a87396198b301c5ffbf5ab56f304.1671462950.git.lorenzo@kernel.org>

On Mon, 19 Dec 2022 16:41:31 +0100 Lorenzo Bianconi wrote:
> +=====================
> +Netdev XDP features
> +=====================
> +
> + * XDP FEATURES FLAGS
> +
> +Following netdev xdp features flags can be retrieved over route netlink
> +interface (compact form) - the same way as netdev feature flags.

How likely is it that I'll be able to convince you that cramming more
stuff in rtnl is a bad idea? I can convert this for you to a YAML-
-compatible genetlink family for you in a jiffy, just say yes :S

rtnl is hard to parse, and already overloaded with random stuff.
And the messages are enormous.

> +These features flags are read only and cannot be change at runtime.
> +
> +*  XDP_ABORTED
> +
> +This feature informs if netdev supports xdp aborted action.
> +
> +*  XDP_DROP
> +
> +This feature informs if netdev supports xdp drop action.
> +
> +*  XDP_PASS
> +
> +This feature informs if netdev supports xdp pass action.
> +
> +*  XDP_TX
> +
> +This feature informs if netdev supports xdp tx action.
> +
> +*  XDP_REDIRECT
> +
> +This feature informs if netdev supports xdp redirect action.
> +It assumes the all beforehand mentioned flags are enabled.
> +
> +*  XDP_SOCK_ZEROCOPY
> +
> +This feature informs if netdev driver supports xdp zero copy.
> +It assumes the all beforehand mentioned flags are enabled.

Why is this "assumption" worth documenting?

> +*  XDP_HW_OFFLOAD
> +
> +This feature informs if netdev driver supports xdp hw oflloading.
> +
> +*  XDP_TX_LOCK
> +
> +This feature informs if netdev ndo_xdp_xmit function requires locking.

Why is it relevant to the user?

> +*  XDP_REDIRECT_TARGET
> +
> +This feature informs if netdev implements ndo_xdp_xmit callback.

Does it make sense to rename XDP_REDIRECT -> XDP_REDIRECT_SOURCE then?

> +*  XDP_FRAG_RX
> +
> +This feature informs if netdev implements non-linear xdp buff support in
> +the driver napi callback.

Who's the target audience? Maybe FRAG is not the best name?
Scatter-gather or multi-buf may be more widely understood.

> +*  XDP_FRAG_TARGET
> +
> +This feature informs if netdev implements non-linear xdp buff support in
> +ndo_xdp_xmit callback. XDP_FRAG_TARGET requires XDP_REDIRECT_TARGET is properly
> +supported.

^ permalink raw reply

* [PATCH bpf 2/2] selftests/bpf: tunnel: add sanity test for checksums
From: Jakub Kicinski @ 2022-12-20  0:47 UTC (permalink / raw)
  To: daniel; +Cc: bpf, netdev, Jakub Kicinski
In-Reply-To: <20221220004701.402165-1-kuba@kernel.org>

Simple netdevsim based test. Netdevsim will validate xmit'ed
packets, in particular we care about checksum sanity (along
the lines of checks inside skb_checksum_help()). Triggering
skb_checksum_help() directly would require the right HW device
or a crypto device setup, netdevsim is much simpler.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/netdevsim/netdev.c                |  5 ++++
 tools/testing/selftests/bpf/test_tc_tunnel.sh | 27 +++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 6db6a75ff9b9..e4808a6d37a4 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -33,6 +33,11 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (!nsim_ipsec_tx(ns, skb))
 		goto out;
 
+	/* Validate the packet */
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		WARN_ON_ONCE((unsigned int)skb_checksum_start_offset(skb) >=
+			     skb_headlen(skb));
+
 	u64_stats_update_begin(&ns->syncp);
 	ns->tx_packets++;
 	ns->tx_bytes += skb->len;
diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
index 334bdfeab940..4dac87f6a6fa 100755
--- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
@@ -15,6 +15,7 @@ readonly ns1_v4=192.168.1.1
 readonly ns2_v4=192.168.1.2
 readonly ns1_v6=fd::1
 readonly ns2_v6=fd::2
+readonly nsim_v4=192.168.2.1
 
 # Must match port used by bpf program
 readonly udpport=5555
@@ -67,6 +68,10 @@ cleanup() {
 	if [[ -n $server_pid ]]; then
 		kill $server_pid 2> /dev/null
 	fi
+
+	if [ -e /sys/bus/netdevsim/devices/netdevsim1 ]; then
+	    echo 1 > /sys/bus/netdevsim/del_device
+	fi
 }
 
 server_listen() {
@@ -93,6 +98,25 @@ verify_data() {
 	fi
 }
 
+decap_sanity() {
+    echo "test decap sanity"
+    modprobe netdevsim
+    echo 1 1 > /sys/bus/netdevsim/new_device
+    udevadm settle
+    nsim=$(ls /sys/bus/netdevsim/devices/netdevsim1/net/)
+    ip link set dev $nsim up
+    ip addr add dev $nsim $nsim_v4/24
+
+    tc qdisc add dev $nsim clsact
+    tc filter add dev $nsim egress \
+       bpf direct-action object-file ${BPF_FILE} section decap
+
+    echo abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz | \
+	nc -u 192.168.2.2 7777
+
+    echo 1 > /sys/bus/netdevsim/del_device
+}
+
 set -e
 
 # no arguments: automated test, run all
@@ -138,6 +162,9 @@ if [[ "$#" -eq "0" ]]; then
 		$0 ipv6 ip6udp $mac 2000
 	done
 
+	echo "decap sanity check"
+	decap_sanity
+
 	echo "OK. All tests passed"
 	exit 0
 fi
-- 
2.38.1


^ permalink raw reply related

* [PATCH bpf 1/2] bpf: pull before calling skb_postpull_rcsum()
From: Jakub Kicinski @ 2022-12-20  0:47 UTC (permalink / raw)
  To: daniel
  Cc: bpf, netdev, Jakub Kicinski, Anand Parthasarathy, martin.lau,
	song, john.fastabend, sdf

Anand hit a BUG() when pulling off headers on egress to a SW tunnel.
We get to skb_checksum_help() with an invalid checksum offset
(commit d7ea0d9df2a6 ("net: remove two BUG() from skb_checksum_help()")
converted those BUGs to WARN_ONs()).
He points out oddness in how skb_postpull_rcsum() gets used.
Indeed looks like we should pull before "postpull", otherwise
the CHECKSUM_PARTIAL fixup from skb_postpull_rcsum() will not
be able to do its job:

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    skb_checksum_start_offset(skb) < 0)
		skb->ip_summed = CHECKSUM_NONE;

Reported-by: Anand Parthasarathy <anpartha@meta.com>
Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
CC: daniel@iogearbox.net
CC: martin.lau@linux.dev
CC: song@kernel.org
CC: john.fastabend@gmail.com
CC: sdf@google.com
CC: bpf@vger.kernel.org
---
 net/core/filter.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 929358677183..43cc1fe58a2c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3180,15 +3180,18 @@ static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
 
 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
 {
+	void *old_data;
+
 	/* skb_ensure_writable() is not needed here, as we're
 	 * already working on an uncloned skb.
 	 */
 	if (unlikely(!pskb_may_pull(skb, off + len)))
 		return -ENOMEM;
 
-	skb_postpull_rcsum(skb, skb->data + off, len);
-	memmove(skb->data + len, skb->data, off);
+	old_data = skb->data;
 	__skb_pull(skb, len);
+	skb_postpull_rcsum(skb, old_data + off, len);
+	memmove(skb->data, old_data, off);
 
 	return 0;
 }
-- 
2.38.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox