Netdev List
 help / color / mirror / Atom feed
* [RFC crypto v3 4/9] chcr: Key Macro
From: Atul Gupta @ 2017-12-20 11:37 UTC (permalink / raw)
  To: herbert, linux-crypto, ganeshgr; +Cc: netdev, davem, davejwatson

Define macro for TLS Key context

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
 drivers/crypto/chelsio/chcr_algo.h | 42 +++++++++++++++++++++++++++++
 drivers/crypto/chelsio/chcr_core.h | 55 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.h b/drivers/crypto/chelsio/chcr_algo.h
index d1673a5..f263cd4 100644
--- a/drivers/crypto/chelsio/chcr_algo.h
+++ b/drivers/crypto/chelsio/chcr_algo.h
@@ -86,6 +86,39 @@
 	 KEY_CONTEXT_OPAD_PRESENT_M)
 #define KEY_CONTEXT_OPAD_PRESENT_F      KEY_CONTEXT_OPAD_PRESENT_V(1U)
 
+#define TLS_KEYCTX_RXFLIT_CNT_S 24
+#define TLS_KEYCTX_RXFLIT_CNT_V(x) ((x) << TLS_KEYCTX_RXFLIT_CNT_S)
+
+#define TLS_KEYCTX_RXPROT_VER_S 20
+#define TLS_KEYCTX_RXPROT_VER_M 0xf
+#define TLS_KEYCTX_RXPROT_VER_V(x) ((x) << TLS_KEYCTX_RXPROT_VER_S)
+
+#define TLS_KEYCTX_RXCIPH_MODE_S 16
+#define TLS_KEYCTX_RXCIPH_MODE_M 0xf
+#define TLS_KEYCTX_RXCIPH_MODE_V(x) ((x) << TLS_KEYCTX_RXCIPH_MODE_S)
+
+#define TLS_KEYCTX_RXAUTH_MODE_S 12
+#define TLS_KEYCTX_RXAUTH_MODE_M 0xf
+#define TLS_KEYCTX_RXAUTH_MODE_V(x) ((x) << TLS_KEYCTX_RXAUTH_MODE_S)
+
+#define TLS_KEYCTX_RXCIAU_CTRL_S 11
+#define TLS_KEYCTX_RXCIAU_CTRL_V(x) ((x) << TLS_KEYCTX_RXCIAU_CTRL_S)
+
+#define TLS_KEYCTX_RX_SEQCTR_S 9
+#define TLS_KEYCTX_RX_SEQCTR_M 0x3
+#define TLS_KEYCTX_RX_SEQCTR_V(x) ((x) << TLS_KEYCTX_RX_SEQCTR_S)
+
+#define TLS_KEYCTX_RX_VALID_S 8
+#define TLS_KEYCTX_RX_VALID_V(x) ((x) << TLS_KEYCTX_RX_VALID_S)
+
+#define TLS_KEYCTX_RXCK_SIZE_S 3
+#define TLS_KEYCTX_RXCK_SIZE_M 0x7
+#define TLS_KEYCTX_RXCK_SIZE_V(x) ((x) << TLS_KEYCTX_RXCK_SIZE_S)
+
+#define TLS_KEYCTX_RXMK_SIZE_S 0
+#define TLS_KEYCTX_RXMK_SIZE_M 0x7
+#define TLS_KEYCTX_RXMK_SIZE_V(x) ((x) << TLS_KEYCTX_RXMK_SIZE_S)
+
 #define CHCR_HASH_MAX_DIGEST_SIZE 64
 #define CHCR_MAX_SHA_DIGEST_SIZE 64
 
@@ -176,6 +209,15 @@
 		      KEY_CONTEXT_SALT_PRESENT_V(1) | \
 		      KEY_CONTEXT_CTX_LEN_V((ctx_len)))
 
+#define  FILL_KEY_CRX_HDR(ck_size, mk_size, d_ck, opad, ctx_len) \
+		htonl(TLS_KEYCTX_RXMK_SIZE_V(mk_size) | \
+		      TLS_KEYCTX_RXCK_SIZE_V(ck_size) | \
+		      TLS_KEYCTX_RX_VALID_V(1) | \
+		      TLS_KEYCTX_RX_SEQCTR_V(3) | \
+		      TLS_KEYCTX_RXAUTH_MODE_V(4) | \
+		      TLS_KEYCTX_RXCIPH_MODE_V(2) | \
+		      TLS_KEYCTX_RXFLIT_CNT_V((ctx_len)))
+
 #define FILL_WR_OP_CCTX_SIZE \
 		htonl( \
 			FW_CRYPTO_LOOKASIDE_WR_OPCODE_V( \
diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h
index 3c29ee0..77056a9 100644
--- a/drivers/crypto/chelsio/chcr_core.h
+++ b/drivers/crypto/chelsio/chcr_core.h
@@ -65,10 +65,58 @@
 struct _key_ctx {
 	__be32 ctx_hdr;
 	u8 salt[MAX_SALT];
-	__be64 reserverd;
+	__be64 iv_to_auth;
 	unsigned char key[0];
 };
 
+#define KEYCTX_TX_WR_IV_S  55
+#define KEYCTX_TX_WR_IV_M  0x1ffULL
+#define KEYCTX_TX_WR_IV_V(x) ((x) << KEYCTX_TX_WR_IV_S)
+#define KEYCTX_TX_WR_IV_G(x) \
+	(((x) >> KEYCTX_TX_WR_IV_S) & KEYCTX_TX_WR_IV_M)
+
+#define KEYCTX_TX_WR_AAD_S 47
+#define KEYCTX_TX_WR_AAD_M 0xffULL
+#define KEYCTX_TX_WR_AAD_V(x) ((x) << KEYCTX_TX_WR_AAD_S)
+#define KEYCTX_TX_WR_AAD_G(x) (((x) >> KEYCTX_TX_WR_AAD_S) & \
+				KEYCTX_TX_WR_AAD_M)
+
+#define KEYCTX_TX_WR_AADST_S 39
+#define KEYCTX_TX_WR_AADST_M 0xffULL
+#define KEYCTX_TX_WR_AADST_V(x) ((x) << KEYCTX_TX_WR_AADST_S)
+#define KEYCTX_TX_WR_AADST_G(x) \
+	(((x) >> KEYCTX_TX_WR_AADST_S) & KEYCTX_TX_WR_AADST_M)
+
+#define KEYCTX_TX_WR_CIPHER_S 30
+#define KEYCTX_TX_WR_CIPHER_M 0x1ffULL
+#define KEYCTX_TX_WR_CIPHER_V(x) ((x) << KEYCTX_TX_WR_CIPHER_S)
+#define KEYCTX_TX_WR_CIPHER_G(x) \
+	(((x) >> KEYCTX_TX_WR_CIPHER_S) & KEYCTX_TX_WR_CIPHER_M)
+
+#define KEYCTX_TX_WR_CIPHERST_S 23
+#define KEYCTX_TX_WR_CIPHERST_M 0x7f
+#define KEYCTX_TX_WR_CIPHERST_V(x) ((x) << KEYCTX_TX_WR_CIPHERST_S)
+#define KEYCTX_TX_WR_CIPHERST_G(x) \
+	(((x) >> KEYCTX_TX_WR_CIPHERST_S) & KEYCTX_TX_WR_CIPHERST_M)
+
+#define KEYCTX_TX_WR_AUTH_S 14
+#define KEYCTX_TX_WR_AUTH_M 0x1ff
+#define KEYCTX_TX_WR_AUTH_V(x) ((x) << KEYCTX_TX_WR_AUTH_S)
+#define KEYCTX_TX_WR_AUTH_G(x) \
+	(((x) >> KEYCTX_TX_WR_AUTH_S) & KEYCTX_TX_WR_AUTH_M)
+
+#define KEYCTX_TX_WR_AUTHST_S 7
+#define KEYCTX_TX_WR_AUTHST_M 0x7f
+#define KEYCTX_TX_WR_AUTHST_V(x) ((x) << KEYCTX_TX_WR_AUTHST_S)
+#define KEYCTX_TX_WR_AUTHST_G(x) \
+	(((x) >> KEYCTX_TX_WR_AUTHST_S) & KEYCTX_TX_WR_AUTHST_M)
+
+#define KEYCTX_TX_WR_AUTHIN_S 0
+#define KEYCTX_TX_WR_AUTHIN_M 0x7f
+#define KEYCTX_TX_WR_AUTHIN_V(x) ((x) << KEYCTX_TX_WR_AUTHIN_S)
+#define KEYCTX_TX_WR_AUTHIN_G(x) \
+	(((x) >> KEYCTX_TX_WR_AUTHIN_S) & KEYCTX_TX_WR_AUTHIN_M)
+
 struct chcr_wr {
 	struct fw_crypto_lookaside_wr wreq;
 	struct ulp_txpkt ulptx;
@@ -90,6 +138,11 @@ struct uld_ctx {
 	struct chcr_dev *dev;
 };
 
+struct sge_opaque_hdr {
+	void *dev;
+	dma_addr_t addr[MAX_SKB_FRAGS + 1];
+};
+
 struct chcr_ipsec_req {
 	struct ulp_txpkt ulptx;
 	struct ulptx_idata sc_imm;
-- 
1.8.3.1

^ permalink raw reply related

* [RFC crypto v3 5/9] chtls: Key program
From: Atul Gupta @ 2017-12-20 11:37 UTC (permalink / raw)
  To: herbert, linux-crypto, ganeshgr; +Cc: netdev, davem, davejwatson

Program the tx and rx key on chip.

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
v3: made some functions static
---
 drivers/crypto/chelsio/chtls/chtls_hw.c | 394 ++++++++++++++++++++++++++++++++
 1 file changed, 394 insertions(+)
 create mode 100644 drivers/crypto/chelsio/chtls/chtls_hw.c

diff --git a/drivers/crypto/chelsio/chtls/chtls_hw.c b/drivers/crypto/chelsio/chtls/chtls_hw.c
new file mode 100644
index 0000000..c3e17159
--- /dev/null
+++ b/drivers/crypto/chelsio/chtls/chtls_hw.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2017 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Written by: Atul Gupta (atul.gupta@chelsio.com)
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/tls.h>
+#include <net/tls.h>
+
+#include "chtls.h"
+#include "chtls_cm.h"
+
+static void __set_tcb_field_direct(struct chtls_sock *csk,
+				   struct cpl_set_tcb_field *req, u16 word,
+				   u64 mask, u64 val, u8 cookie, int no_reply)
+{
+	struct ulptx_idata *sc;
+
+	INIT_TP_WR_CPL(req, CPL_SET_TCB_FIELD, csk->tid);
+	req->wr.wr_mid |= htonl(FW_WR_FLOWID_V(csk->tid));
+	req->reply_ctrl = htons(NO_REPLY_V(no_reply) |
+				QUEUENO_V(csk->rss_qid));
+	req->word_cookie = htons(TCB_WORD_V(word) | TCB_COOKIE_V(cookie));
+	req->mask = cpu_to_be64(mask);
+	req->val = cpu_to_be64(val);
+	sc = (struct ulptx_idata *)(req + 1);
+	sc->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_NOOP));
+	sc->len = htonl(0);
+}
+
+static void __set_tcb_field(struct sock *sk, struct sk_buff *skb, u16 word,
+			    u64 mask, u64 val, u8 cookie, int no_reply)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct cpl_set_tcb_field *req;
+	struct ulptx_idata *sc;
+	unsigned int wrlen = roundup(sizeof(*req) + sizeof(*sc), 16);
+
+	req = (struct cpl_set_tcb_field *)__skb_put(skb, wrlen);
+	__set_tcb_field_direct(csk, req, word, mask, val, cookie, no_reply);
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->port_id);
+}
+
+static int chtls_set_tcb_field(struct sock *sk, u16 word, u64 mask, u64 val)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct sk_buff *skb;
+	struct cpl_set_tcb_field *req;
+	struct ulptx_idata *sc;
+	unsigned int wrlen = roundup(sizeof(*req) + sizeof(*sc), 16);
+	unsigned int credits_needed = DIV_ROUND_UP(wrlen, 16);
+
+	skb = alloc_skb(wrlen, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	__set_tcb_field(sk, skb, word, mask, val, 0, 1);
+	set_queue(skb, (csk->txq_idx << 1) | CPL_PRIORITY_DATA, sk);
+	csk->wr_credits -= credits_needed;
+	csk->wr_unacked += credits_needed;
+	enqueue_wr(csk, skb);
+	cxgb4_ofld_send(csk->egress_dev, skb);
+	return 0;
+}
+
+/*
+ * Set one of the t_flags bits in the TCB.
+ */
+int chtls_set_tcb_tflag(struct sock *sk, unsigned int bit_pos, int val)
+{
+	return chtls_set_tcb_field(sk, 1, 1ULL << bit_pos,
+			    val << bit_pos);
+}
+
+static int chtls_set_tcb_keyid(struct sock *sk, int keyid)
+{
+	return chtls_set_tcb_field(sk, 31, 0xFFFFFFFFULL, keyid);
+}
+
+static int chtls_set_tcb_seqno(struct sock *sk)
+{
+	return chtls_set_tcb_field(sk, 28, ~0ULL, 0);
+}
+
+static int chtls_set_tcb_quiesce(struct sock *sk, int val)
+{
+	return chtls_set_tcb_field(sk, 1, (1ULL << TF_RX_QUIESCE_S),
+				   TF_RX_QUIESCE_V(val));
+}
+
+static void *chtls_alloc_mem(unsigned long size)
+{
+	void *p = kmalloc(size, GFP_KERNEL);
+
+	if (!p)
+		p = vmalloc(size);
+	if (p)
+		memset(p, 0, size);
+	return p;
+}
+
+static void chtls_free_mem(void *addr)
+{
+	unsigned long p = (unsigned long)addr;
+
+	if (p >= VMALLOC_START && p < VMALLOC_END)
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+/* TLS Key bitmap processing */
+int chtls_init_kmap(struct chtls_dev *cdev, struct cxgb4_lld_info *lldi)
+{
+	unsigned int num_key_ctx, bsize;
+
+	num_key_ctx = (lldi->vr->key.size / TLS_KEY_CONTEXT_SZ);
+	bsize = BITS_TO_LONGS(num_key_ctx);
+
+	cdev->kmap.size = num_key_ctx;
+	cdev->kmap.available = bsize;
+	cdev->kmap.addr = chtls_alloc_mem(sizeof(*cdev->kmap.addr) *
+					  bsize);
+	if (!cdev->kmap.addr)
+		return -1;
+
+	cdev->kmap.start = lldi->vr->key.start;
+	spin_lock_init(&cdev->kmap.lock);
+	return 0;
+}
+
+void chtls_free_kmap(struct chtls_dev *cdev)
+{
+	if (cdev->kmap.addr)
+		chtls_free_mem(cdev->kmap.addr);
+}
+
+static int get_new_keyid(struct chtls_sock *csk, u32 optname)
+{
+	struct chtls_dev *cdev = csk->cdev;
+	struct chtls_hws *hws = &csk->tlshws;
+	struct net_device *dev = csk->egress_dev;
+	struct adapter *adap = netdev2adap(dev);
+	int keyid;
+
+	spin_lock_bh(&cdev->kmap.lock);
+	keyid = find_first_zero_bit(cdev->kmap.addr, cdev->kmap.size);
+	if (keyid < cdev->kmap.size) {
+		__set_bit(keyid, cdev->kmap.addr);
+		if (optname == TLS_RX)
+			hws->rxkey = keyid;
+		else
+			hws->txkey = keyid;
+		atomic_inc(&adap->chcr_stats.tls_key);
+	} else {
+		keyid = -1;
+	}
+	spin_unlock_bh(&cdev->kmap.lock);
+	return keyid;
+}
+
+void free_tls_keyid(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct net_device *dev = csk->egress_dev;
+	struct adapter *adap = netdev2adap(dev);
+	struct chtls_dev *cdev = csk->cdev;
+	struct chtls_hws *hws = &csk->tlshws;
+
+	if (!cdev->kmap.addr)
+		return;
+
+	spin_lock_bh(&cdev->kmap.lock);
+	if (hws->rxkey >= 0) {
+		__clear_bit(hws->rxkey, cdev->kmap.addr);
+		atomic_dec(&adap->chcr_stats.tls_key);
+		hws->rxkey = -1;
+	}
+	if (hws->txkey >= 0) {
+		__clear_bit(hws->txkey, cdev->kmap.addr);
+		atomic_dec(&adap->chcr_stats.tls_key);
+		hws->txkey = -1;
+	}
+	spin_unlock_bh(&cdev->kmap.lock);
+}
+
+static unsigned int keyid_to_addr(int start_addr, int keyid)
+{
+	return ((start_addr + (keyid * TLS_KEY_CONTEXT_SZ)) >> 5);
+}
+
+static void chtls_rxkey_ivauth(struct _key_ctx *kctx)
+{
+	kctx->iv_to_auth = cpu_to_be64(KEYCTX_TX_WR_IV_V(6ULL) |
+				  KEYCTX_TX_WR_AAD_V(1ULL) |
+				  KEYCTX_TX_WR_AADST_V(5ULL) |
+				  KEYCTX_TX_WR_CIPHER_V(14ULL) |
+				  KEYCTX_TX_WR_CIPHERST_V(0ULL) |
+				  KEYCTX_TX_WR_AUTH_V(14ULL) |
+				  KEYCTX_TX_WR_AUTHST_V(16ULL) |
+				  KEYCTX_TX_WR_AUTHIN_V(16ULL));
+}
+
+static int chtls_key_info(struct chtls_sock *csk,
+			  struct _key_ctx *kctx,
+			  u32 keylen, u32 optname)
+{
+	unsigned char key[CHCR_KEYCTX_CIPHER_KEY_SIZE_256];
+	struct crypto_cipher *cipher;
+	struct tls12_crypto_info_aes_gcm_128 *gcm_ctx =
+		(struct tls12_crypto_info_aes_gcm_128 *)
+		&csk->tlshws.crypto_info;
+	unsigned char ghash_h[AEAD_H_SIZE];
+	int ck_size, key_ctx_size;
+	int ret;
+
+	key_ctx_size = sizeof(struct _key_ctx) +
+		       roundup(keylen, 16) + AEAD_H_SIZE;
+
+	if (keylen == AES_KEYSIZE_128) {
+		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_128;
+	} else if (keylen == AES_KEYSIZE_192) {
+		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_192;
+	} else if (keylen == AES_KEYSIZE_256) {
+		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_256;
+	} else {
+		pr_err("GCM: Invalid key length %d\n", keylen);
+		return -EINVAL;
+	}
+	memcpy(key, gcm_ctx->key, keylen);
+
+	/* Calculate the H = CIPH(K, 0 repeated 16 times).
+	 * It will go in key context
+	 */
+	cipher = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(cipher)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = crypto_cipher_setkey(cipher, key, keylen);
+	if (ret)
+		goto out1;
+
+	memset(ghash_h, 0, AEAD_H_SIZE);
+	crypto_cipher_encrypt_one(cipher, ghash_h, ghash_h);
+	csk->tlshws.keylen = key_ctx_size;
+
+	/* Copy the Key context */
+	if (optname == TLS_RX) {
+		int key_ctx;
+
+		key_ctx = ((key_ctx_size >> 4) << 3);
+		kctx->ctx_hdr = FILL_KEY_CRX_HDR(ck_size,
+						 CHCR_KEYCTX_MAC_KEY_SIZE_128,
+						 0, 0, key_ctx);
+		chtls_rxkey_ivauth(kctx);
+	} else {
+		kctx->ctx_hdr = FILL_KEY_CTX_HDR(ck_size,
+						 CHCR_KEYCTX_MAC_KEY_SIZE_128,
+						 0, 0, key_ctx_size >> 4);
+	}
+
+	memcpy(kctx->salt, gcm_ctx->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	memcpy(kctx->key, gcm_ctx->key, keylen);
+	memcpy(kctx->key + keylen, ghash_h, AEAD_H_SIZE);
+	/* erase key info from driver */
+	memset(gcm_ctx->key, 0, keylen);
+
+out1:
+	crypto_free_cipher(cipher);
+out:
+	return ret;
+}
+
+static void chtls_set_scmd(struct chtls_sock *csk)
+{
+	struct chtls_hws *hws = &csk->tlshws;
+
+	hws->scmd.seqno_numivs =
+		SCMD_SEQ_NO_CTRL_V(3) |
+		SCMD_PROTO_VERSION_V(0) |
+		SCMD_ENC_DEC_CTRL_V(0) |
+		SCMD_CIPH_AUTH_SEQ_CTRL_V(1) |
+		SCMD_CIPH_MODE_V(2) |
+		SCMD_AUTH_MODE_V(4) |
+		SCMD_HMAC_CTRL_V(0) |
+		SCMD_IV_SIZE_V(4) |
+		SCMD_NUM_IVS_V(1);
+
+	hws->scmd.ivgen_hdrlen =
+		SCMD_IV_GEN_CTRL_V(1) |
+		SCMD_KEY_CTX_INLINE_V(0) |
+		SCMD_TLS_FRAG_ENABLE_V(1);
+}
+
+int chtls_setkey(struct chtls_sock *csk, u32 keylen, u32 optname)
+{
+	struct chtls_dev *cdev = csk->cdev;
+	struct sock *sk = csk->sk;
+	struct tls_key_req *kwr;
+	struct _key_ctx *kctx;
+	struct sk_buff *skb;
+	int wrlen, klen, len;
+	int keyid;
+	int ret = 0;
+
+	klen = roundup((keylen + AEAD_H_SIZE) + sizeof(*kctx), 32);
+	wrlen = roundup(sizeof(*kwr), 16);
+	len = klen + wrlen;
+
+	/* Flush out-standing data before new key takes effect */
+	if (optname == TLS_TX) {
+		lock_sock(sk);
+		if (skb_queue_len(&csk->txq))
+			chtls_push_frames(csk, 0);
+		release_sock(sk);
+	}
+
+	keyid = get_new_keyid(csk, optname);
+	if (keyid < 0)
+		return -ENOSPC;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	kwr = (struct tls_key_req *)__skb_put_zero(skb, len);
+	kwr->wr.op_to_compl =
+		cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR) | FW_WR_COMPL_F |
+		      FW_WR_ATOMIC_V(1U));
+	kwr->wr.flowid_len16 =
+		cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(len, 16) |
+			    FW_WR_FLOWID_V(csk->tid)));
+	kwr->wr.protocol = 0;
+	kwr->wr.mfs = htons(TLS_MFS);
+	kwr->wr.reneg_to_write_rx = optname;
+
+	/* ulptx command */
+	kwr->req.cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE) |
+			    T5_ULP_MEMIO_ORDER_V(1) |
+			    T5_ULP_MEMIO_IMM_V(1));
+	kwr->req.len16 = cpu_to_be32((csk->tid << 8) |
+			      DIV_ROUND_UP(len - sizeof(kwr->wr), 16));
+	kwr->req.dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(klen >> 5));
+	kwr->req.lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(keyid_to_addr
+					(cdev->kmap.start, keyid)));
+
+	/* sub command */
+	kwr->sc_imm.cmd_more = cpu_to_be32(ULPTX_CMD_V(ULP_TX_SC_IMM));
+	kwr->sc_imm.len = cpu_to_be32(klen);
+
+	/* key info */
+	kctx = (struct _key_ctx *)(kwr + 1);
+	ret = chtls_key_info(csk, kctx, keylen, optname);
+
+	csk->wr_credits -= DIV_ROUND_UP(len, 16);
+	csk->wr_unacked += DIV_ROUND_UP(len, 16);
+	enqueue_wr(csk, skb);
+	cxgb4_ofld_send(csk->egress_dev, skb);
+
+	chtls_set_scmd(csk);
+	/* Clear quiesce for Rx key */
+	if (optname == TLS_RX) {
+		chtls_set_tcb_keyid(sk, keyid);
+		chtls_set_tcb_field(sk, 0,
+				    TCB_ULP_RAW_V(TCB_ULP_RAW_M),
+				    TCB_ULP_RAW_V((TF_TLS_KEY_SIZE_V(1) |
+						  TF_TLS_CONTROL_V(1) |
+						  TF_TLS_ACTIVE_V(1) |
+						  TF_TLS_ENABLE_V(1))));
+		chtls_set_tcb_seqno(sk);
+		chtls_set_tcb_quiesce(sk, 0);
+		csk->tlshws.rxkey = keyid;
+	} else {
+		csk->tlshws.tx_seq_no = 0;
+		csk->tlshws.txkey = keyid;
+	}
+
+	return ret;
+}
-- 
1.8.3.1

^ permalink raw reply related

* [RFC crypto v3 6/9] chtls: CPL handler definition
From: Atul Gupta @ 2017-12-20 11:37 UTC (permalink / raw)
  To: herbert, linux-crypto, ganeshgr; +Cc: netdev, davem, davejwatson

CPL handlers for TLS session, record transmit and receive.

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
v3: made some functions static and removed un-needed semicolon
---
 drivers/crypto/chelsio/chtls/chtls_cm.c | 2045 +++++++++++++++++++++++++++++++
 net/ipv4/tcp_minisocks.c                |    1 +
 2 files changed, 2046 insertions(+)
 create mode 100644 drivers/crypto/chelsio/chtls/chtls_cm.c

diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c
new file mode 100644
index 0000000..dac3c3a
--- /dev/null
+++ b/drivers/crypto/chelsio/chtls/chtls_cm.c
@@ -0,0 +1,2045 @@
+/*
+ * Copyright (c) 2017 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Written by: Atul Gupta (atul.gupta@chelsio.com)
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/sched/signal.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/if_vlan.h>
+#include <net/tcp.h>
+#include <net/dst.h>
+
+#include "chtls.h"
+#include "chtls_cm.h"
+
+extern struct request_sock_ops chtls_rsk_ops;
+
+/*
+ * State transitions and actions for close.  Note that if we are in SYN_SENT
+ * we remain in that state as we cannot control a connection while it's in
+ * SYN_SENT; such connections are allowed to establish and are then aborted.
+ */
+static unsigned char new_state[16] = {
+	/* current state:     new state:      action: */
+	/* (Invalid)       */ TCP_CLOSE,
+	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+	/* TCP_SYN_SENT    */ TCP_SYN_SENT,
+	/* TCP_SYN_RECV    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+	/* TCP_FIN_WAIT1   */ TCP_FIN_WAIT1,
+	/* TCP_FIN_WAIT2   */ TCP_FIN_WAIT2,
+	/* TCP_TIME_WAIT   */ TCP_CLOSE,
+	/* TCP_CLOSE       */ TCP_CLOSE,
+	/* TCP_CLOSE_WAIT  */ TCP_LAST_ACK | TCP_ACTION_FIN,
+	/* TCP_LAST_ACK    */ TCP_LAST_ACK,
+	/* TCP_LISTEN      */ TCP_CLOSE,
+	/* TCP_CLOSING     */ TCP_CLOSING,
+};
+
+static struct chtls_sock *chtls_sock_create(struct chtls_dev *cdev)
+{
+	struct chtls_sock *csk = kzalloc(sizeof(*csk), GFP_ATOMIC);
+
+	if (!csk)
+		return NULL;
+
+	csk->txdata_skb_cache = alloc_skb(TXDATA_SKB_LEN, GFP_ATOMIC);
+	if (!csk->txdata_skb_cache) {
+		kfree(csk);
+		return NULL;
+	}
+
+	kref_init(&csk->kref);
+	csk->cdev = cdev;
+	skb_queue_head_init(&csk->txq);
+	csk->wr_skb_head = NULL;
+	csk->wr_skb_tail = NULL;
+	csk->mss = MAX_MSS;
+	csk->tlshws.ofld = 1;
+	csk->tlshws.txkey = -1;
+	csk->tlshws.rxkey = -1;
+	csk->tlshws.mfs = TLS_MFS;
+	skb_queue_head_init(&csk->tlshws.sk_recv_queue);
+	return csk;
+}
+
+static void chtls_sock_release(struct kref *ref)
+{
+	struct chtls_sock *csk =
+		container_of(ref, struct chtls_sock, kref);
+
+	kfree(csk);
+}
+
+static struct net_device *chtls_ipv4_netdev(struct chtls_dev *cdev,
+					    struct sock *sk)
+{
+	struct net_device *ndev = cdev->ports[0];
+
+	if (likely(!inet_sk(sk)->inet_rcv_saddr))
+		return ndev;
+
+	ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr);
+	if (!ndev)
+		return NULL;
+
+	if (is_vlan_dev(ndev))
+		return vlan_dev_real_dev(ndev);
+	return ndev;
+}
+
+static void assign_rxopt(struct sock *sk, unsigned int opt)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct chtls_dev *cdev;
+
+	cdev = csk->cdev;
+	tp->tcp_header_len           = sizeof(struct tcphdr);
+	tp->rx_opt.mss_clamp         = cdev->mtus[TCPOPT_MSS_G(opt)] - 40;
+	tp->mss_cache                = tp->rx_opt.mss_clamp;
+	tp->rx_opt.tstamp_ok         = TCPOPT_TSTAMP_G(opt);
+	tp->rx_opt.snd_wscale        = TCPOPT_SACK_G(opt);
+	tp->rx_opt.wscale_ok         = TCPOPT_WSCALE_OK_G(opt);
+	SND_WSCALE(tp)               = TCPOPT_SND_WSCALE_G(opt);
+	if (!tp->rx_opt.wscale_ok)
+		tp->rx_opt.rcv_wscale = 0;
+	if (tp->rx_opt.tstamp_ok) {
+		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
+		tp->rx_opt.mss_clamp -= TCPOLEN_TSTAMP_ALIGNED;
+	} else if (csk->opt2 & TSTAMPS_EN_F) {
+		csk->opt2 &= ~TSTAMPS_EN_F;
+		csk->mtu_idx = TCPOPT_MSS_G(opt);
+	}
+}
+
+static void chtls_purge_rcv_queue(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		skb_dst_set(skb, (void *)NULL);
+		kfree_skb(skb);
+	}
+}
+
+static void chtls_purge_write_queue(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&csk->txq))) {
+		sk->sk_wmem_queued -= skb->truesize;
+		__kfree_skb(skb);
+	}
+}
+
+static void chtls_purge_receive_queue(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_hws *tlsk = &csk->tlshws;
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&tlsk->sk_recv_queue)) != NULL) {
+		skb_dst_set(skb, NULL);
+		kfree_skb(skb);
+	}
+}
+
+static void abort_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct chtls_dev *cdev = (struct chtls_dev *)handle;
+	struct cpl_abort_req *req = cplhdr(skb);
+
+	req->cmd = CPL_ABORT_NO_RST;
+	cxgb4_ofld_send(cdev->lldi->ports[0], skb);
+}
+
+static struct sk_buff *alloc_ctrl_skb(struct sk_buff *skb, int len)
+{
+	if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) {
+		__skb_trim(skb, 0);
+		refcount_add(2, &skb->users);
+	} else {
+		skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL);
+	}
+	return skb;
+}
+
+static void chtls_send_abort(struct sock *sk, int mode, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct cpl_abort_req *req;
+
+	if (!skb)
+		skb = alloc_ctrl_skb(csk->txdata_skb_cache, sizeof(*req));
+
+	req = (struct cpl_abort_req *)skb_put(skb, sizeof(*req));
+	INIT_TP_WR_CPL(req, CPL_ABORT_REQ, csk->tid);
+	set_queue(skb, (csk->txq_idx << 1) | CPL_PRIORITY_DATA, sk);
+	req->rsvd0 = htonl(tp->snd_nxt);
+	req->rsvd1 = !csk_flag_nochk(csk, CSK_TX_DATA_SENT);
+	req->cmd = mode;
+	t4_set_arp_err_handler(skb, csk->cdev, abort_arp_failure);
+	send_or_defer(sk, tp, skb, mode == CPL_ABORT_SEND_RST);
+}
+
+static int chtls_send_reset(struct sock *sk, int mode, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+
+	if (unlikely(csk_flag_nochk(csk, CSK_ABORT_SHUTDOWN) ||
+		     !csk->cdev)) {
+		if (sk->sk_state == TCP_SYN_RECV)
+			csk_set_flag(csk, CSK_RST_ABORTED);
+		goto out;
+	}
+
+	if (!csk_flag_nochk(csk, CSK_TX_DATA_SENT)) {
+		struct tcp_sock *tp = tcp_sk(sk);
+
+		if (send_tx_flowc_wr(sk, 0, tp->snd_nxt, tp->rcv_nxt) < 0)
+			WARN_ONCE(1, "send tx flowc error");
+		csk_set_flag(csk, CSK_TX_DATA_SENT);
+	}
+
+	csk_set_flag(csk, CSK_ABORT_RPL_PENDING);
+	chtls_purge_write_queue(sk);
+
+	csk_set_flag(csk, CSK_ABORT_SHUTDOWN);
+	if (sk->sk_state != TCP_SYN_RECV)
+		chtls_send_abort(sk, mode, skb);
+	else
+		goto out;
+
+	return 0;
+out:
+	if (skb)
+		kfree_skb(skb);
+	return 1;
+}
+
+static void release_tcp_port(struct sock *sk)
+{
+	if (inet_csk(sk)->icsk_bind_hash)
+		inet_put_port(sk);
+}
+
+static void tcp_uncork(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->nonagle & TCP_NAGLE_CORK) {
+		tp->nonagle &= ~TCP_NAGLE_CORK;
+		chtls_tcp_push(sk, 0);
+	}
+}
+
+static void chtls_close_conn(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	unsigned int len = roundup(sizeof(struct cpl_close_con_req), 16);
+	struct cpl_close_con_req *req;
+	unsigned int tid = csk->tid;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL);
+	req = (struct cpl_close_con_req *)__skb_put(skb, len);
+	memset(req, 0, len);
+	req->wr.wr_hi = htonl(FW_WR_OP_V(FW_TP_WR) |
+			      FW_WR_IMMDLEN_V(sizeof(*req) -
+					      sizeof(req->wr)));
+	req->wr.wr_mid = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16)) |
+			       FW_WR_FLOWID_V(tid));
+
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+
+	tcp_uncork(sk);
+	skb_entail(sk, skb, ULPCB_FLAG_NO_HDR | ULPCB_FLAG_NO_APPEND);
+	if (sk->sk_state != TCP_SYN_SENT)
+		chtls_push_frames(csk, 1);
+}
+
+/*
+ * Perform a state transition during close and return the actions indicated
+ * for the transition.  Do not make this function inline, the main reason
+ * it exists at all is to avoid multiple inlining of tcp_set_state.
+ */
+static int make_close_transition(struct sock *sk)
+{
+	int next = (int)new_state[sk->sk_state];
+
+	tcp_set_state(sk, next & TCP_STATE_MASK);
+	return next & TCP_ACTION_FIN;
+}
+
+void chtls_close(struct sock *sk, long timeout)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	int data_lost, prev_state;
+
+	lock_sock(sk);
+	if (sk->sk_prot->close != chtls_close) {
+		release_sock(sk);
+		return sk->sk_prot->close(sk, timeout);
+	}
+
+	sk->sk_shutdown |= SHUTDOWN_MASK;
+
+	data_lost = skb_queue_len(&sk->sk_receive_queue);
+	data_lost |= skb_queue_len(&csk->tlshws.sk_recv_queue);
+	chtls_purge_receive_queue(sk);
+	chtls_purge_rcv_queue(sk);
+
+	if (sk->sk_state == TCP_CLOSE) {
+		goto wait;
+	} else if (data_lost || sk->sk_state == TCP_SYN_SENT) {
+		chtls_send_reset(sk, CPL_ABORT_SEND_RST, NULL);
+		release_tcp_port(sk);
+		goto unlock;
+	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+		sk->sk_prot->disconnect(sk, 0);
+	} else if (make_close_transition(sk)) {
+		chtls_close_conn(sk);
+	}
+wait:
+	if (timeout)
+		sk_stream_wait_close(sk, timeout);
+
+unlock:
+	prev_state = sk->sk_state;
+	sock_hold(sk);
+	sock_orphan(sk);
+
+	release_sock(sk);
+
+	local_bh_disable();
+	bh_lock_sock(sk);
+
+	if (prev_state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	if (sk->sk_state == TCP_FIN_WAIT2 && tcp_sk(sk)->linger2 < 0 &&
+	    !csk_flag(sk, CSK_ABORT_SHUTDOWN)) {
+		struct sk_buff *skb;
+
+		skb = alloc_skb(sizeof(struct cpl_abort_req), GFP_ATOMIC);
+		if (skb)
+			chtls_send_reset(sk, CPL_ABORT_SEND_RST, skb);
+	}
+
+	if (sk->sk_state == TCP_CLOSE)
+		inet_csk_destroy_sock(sk);
+
+out:
+	bh_unlock_sock(sk);
+	local_bh_enable();
+	sock_put(sk);
+}
+
+/*
+ * Wait until a socket enters on of the given states.
+ */
+static int wait_for_states(struct sock *sk, unsigned int states)
+{
+	struct socket_wq _sk_wq;
+	long current_timeo = 200;
+	DECLARE_WAITQUEUE(wait, current);
+	int err = 0;
+
+	/*
+	 * We want this to work even when there's no associated struct socket.
+	 * In that case we provide a temporary wait_queue_head_t.
+	 */
+	if (!sk->sk_wq) {
+		init_waitqueue_head(&_sk_wq.wait);
+		_sk_wq.fasync_list = NULL;
+		init_rcu_head_on_stack(&_sk_wq.rcu);
+		sk->sk_wq = &_sk_wq;
+	}
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (!sk_in_state(sk, states)) {
+		if (!current_timeo) {
+			err = -EBUSY;
+			break;
+		}
+		if (signal_pending(current)) {
+			err = sock_intr_errno(current_timeo);
+			break;
+		}
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		release_sock(sk);
+		if (!sk_in_state(sk, states))
+			current_timeo = schedule_timeout(current_timeo);
+		__set_current_state(TASK_RUNNING);
+		lock_sock(sk);
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	if (sk->sk_wq == &_sk_wq)
+		sk->sk_wq = NULL;
+	return err;
+}
+
+int chtls_disconnect(struct sock *sk, int flags)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct chtls_sock *csk;
+	int err;
+
+	if (sk->sk_prot->disconnect != chtls_disconnect)
+		return sk->sk_prot->disconnect(sk, flags);
+
+	csk = rcu_dereference_sk_user_data(sk);
+	chtls_purge_receive_queue(sk);
+	chtls_purge_rcv_queue(sk);
+	chtls_purge_write_queue(sk);
+
+	if (sk->sk_state != TCP_CLOSE) {
+		sk->sk_err = ECONNRESET;
+		chtls_send_reset(sk, CPL_ABORT_SEND_RST, NULL);
+		err = wait_for_states(sk, TCPF_CLOSE);
+		if (err)
+			return err;
+	}
+	if (sk->sk_prot->disconnect != chtls_disconnect)
+		return sk->sk_prot->disconnect(sk, flags);
+
+	chtls_purge_receive_queue(sk);
+	chtls_purge_rcv_queue(sk);
+	tp->max_window = 0xFFFF << (tp->rx_opt.snd_wscale);
+	return tcp_disconnect(sk, flags);
+}
+
+#define SHUTDOWN_ELIGIBLE_STATE (TCPF_ESTABLISHED | \
+				 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)
+void chtls_shutdown(struct sock *sk, int how)
+{
+	if (sk->sk_prot->shutdown != chtls_shutdown)
+		return sk->sk_prot->shutdown(sk, how);
+
+	if ((how & SEND_SHUTDOWN) &&
+	    sk_in_state(sk, SHUTDOWN_ELIGIBLE_STATE) &&
+	    make_close_transition(sk))
+		chtls_close_conn(sk);
+}
+
+void chtls_destroy_sock(struct sock *sk)
+{
+	struct chtls_sock *csk;
+
+	if (sk->sk_prot->destroy != chtls_destroy_sock)
+		return sk->sk_prot->destroy(sk);
+
+	csk = rcu_dereference_sk_user_data(sk);
+	chtls_purge_receive_queue(sk);
+	csk->ulp_mode = ULP_MODE_NONE;
+	chtls_purge_write_queue(sk);
+	free_tls_keyid(sk);
+	kref_put(&csk->kref, chtls_sock_release);
+
+	sk->sk_prot = &tcp_prot;
+	sk->sk_prot->destroy(sk);
+}
+
+static void reset_listen_child(struct sock *child)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(child);
+	struct sk_buff *skb;
+
+	skb = alloc_ctrl_skb(csk->txdata_skb_cache,
+			     sizeof(struct cpl_abort_req));
+
+	chtls_send_reset(child, CPL_ABORT_SEND_RST, skb);
+	sock_orphan(child);
+	INC_ORPHAN_COUNT(child);
+	if (child->sk_state == TCP_CLOSE)
+		inet_csk_destroy_sock(child);
+}
+
+static void chtls_disconnect_acceptq(struct sock *listen_sk)
+{
+	struct request_sock **pprev;
+
+	pprev = ACCEPT_QUEUE(listen_sk);
+	while (*pprev) {
+		struct request_sock *req = *pprev;
+
+		if (req->rsk_ops == &chtls_rsk_ops) {
+			struct sock *child = req->sk;
+
+			*pprev = req->dl_next;
+			sk_acceptq_removed(listen_sk);
+			reqsk_put(req);
+			sock_hold(child);
+			local_bh_disable();
+			bh_lock_sock(child);
+			release_tcp_port(child);
+			reset_listen_child(child);
+			bh_unlock_sock(child);
+			local_bh_enable();
+			sock_put(child);
+		} else {
+			pprev = &req->dl_next;
+		}
+	}
+}
+
+static int listen_hashfn(const struct sock *sk)
+{
+	return ((unsigned long)sk >> 10) & (LISTEN_INFO_HASH_SIZE - 1);
+}
+
+static struct listen_info *listen_hash_add(struct chtls_dev *cdev,
+					   struct sock *sk,
+					   unsigned int stid)
+{
+	struct listen_info *p = kmalloc(sizeof(*p), GFP_KERNEL);
+
+	if (p) {
+		int key = listen_hashfn(sk);
+
+		p->sk = sk;
+		p->stid = stid;
+		spin_lock(&cdev->listen_lock);
+		p->next = cdev->listen_hash_tab[key];
+		cdev->listen_hash_tab[key] = p;
+		spin_unlock(&cdev->listen_lock);
+	}
+	return p;
+}
+
+static int listen_hash_find(struct chtls_dev *cdev,
+			    struct sock *sk)
+{
+	int key = listen_hashfn(sk);
+	struct listen_info *p;
+	int stid = -1;
+
+	spin_lock(&cdev->listen_lock);
+	for (p = cdev->listen_hash_tab[key]; p; p = p->next)
+		if (p->sk == sk) {
+			stid = p->stid;
+			break;
+		}
+	spin_unlock(&cdev->listen_lock);
+	return stid;
+}
+
+static int listen_hash_del(struct chtls_dev *cdev,
+			   struct sock *sk)
+{
+	int key = listen_hashfn(sk);
+	struct listen_info *p, **prev = &cdev->listen_hash_tab[key];
+	int stid = -1;
+
+	spin_lock(&cdev->listen_lock);
+	for (p = *prev; p; prev = &p->next, p = p->next)
+		if (p->sk == sk) {
+			stid = p->stid;
+			*prev = p->next;
+			kfree(p);
+			break;
+		}
+	spin_unlock(&cdev->listen_lock);
+	return stid;
+}
+
+int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk)
+{
+	struct net_device *ndev;
+	struct listen_ctx *ctx;
+	struct adapter *adap;
+	struct port_info *pi;
+	int stid;
+	int ret;
+
+	if (sk->sk_family != PF_INET)
+		return -EAGAIN;
+
+	rcu_read_lock();
+	ndev = chtls_ipv4_netdev(cdev, sk);
+	rcu_read_unlock();
+	if (!ndev)
+		return -EBADF;
+
+	pi = netdev_priv(ndev);
+	adap = pi->adapter;
+	if (!(adap->flags & FULL_INIT_DONE))
+		return -EBADF;
+
+	if (listen_hash_find(cdev, sk) >= 0)   /* already have it */
+		return -EADDRINUSE;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	__module_get(THIS_MODULE);
+	ctx->lsk = sk;
+	ctx->cdev = cdev;
+	ctx->state = T4_LISTEN_START_PENDING;
+
+	if (cdev->lldi->enable_fw_ofld_conn &&
+	    sk->sk_family == PF_INET)
+		stid = cxgb4_alloc_sftid(cdev->tids, sk->sk_family, ctx);
+	else
+		stid = cxgb4_alloc_stid(cdev->tids, sk->sk_family, ctx);
+
+	if (stid < 0)
+		goto free_ctx;
+
+	sock_hold(sk);
+	if (!listen_hash_add(cdev, sk, stid))
+		goto free_stid;
+
+	if (cdev->lldi->enable_fw_ofld_conn) {
+		ret = cxgb4_create_server_filter(ndev, stid,
+						 inet_sk(sk)->inet_rcv_saddr,
+						 inet_sk(sk)->inet_sport, 0,
+						 cdev->lldi->rxq_ids[0], 0, 0);
+	} else {
+		ret = cxgb4_create_server(ndev, stid,
+					  inet_sk(sk)->inet_rcv_saddr,
+					  inet_sk(sk)->inet_sport, 0,
+					  cdev->lldi->rxq_ids[0]);
+	}
+	if (ret > 0)
+		ret = net_xmit_errno(ret);
+	if (ret)
+		goto del_hash;
+	return 0;
+del_hash:
+	listen_hash_del(cdev, sk);
+free_stid:
+	cxgb4_free_stid(cdev->tids, stid, sk->sk_family);
+	sock_put(sk);
+free_ctx:
+	kfree(ctx);
+	module_put(THIS_MODULE);
+	return -EBADF;
+}
+
+void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk)
+{
+	int stid;
+
+	stid = listen_hash_del(cdev, sk);
+	if (stid < 0)
+		return;
+
+	if (cdev->lldi->enable_fw_ofld_conn) {
+		cxgb4_remove_server_filter(cdev->lldi->ports[0], stid,
+					   cdev->lldi->rxq_ids[0], 0);
+	} else {
+		cxgb4_remove_server(cdev->lldi->ports[0], stid,
+				    cdev->lldi->rxq_ids[0], 0);
+	}
+	chtls_disconnect_acceptq(sk);
+}
+
+static int chtls_pass_open_rpl(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_pass_open_rpl *rpl = cplhdr(skb) + RSS_HDR;
+	unsigned int stid = GET_TID(rpl);
+	struct listen_ctx *listen_ctx;
+
+	listen_ctx = (struct listen_ctx *)lookup_stid(cdev->tids, stid);
+	if (!listen_ctx)
+		return 1;
+
+	if (listen_ctx->state == T4_LISTEN_START_PENDING) {
+		listen_ctx->state = T4_LISTEN_STARTED;
+		return 1;
+	}
+
+	if (rpl->status != CPL_ERR_NONE) {
+		pr_info("Unexpected PASS_OPEN_RPL status %u for STID %u\n",
+			rpl->status, stid);
+		return 1;
+	}
+	cxgb4_free_stid(cdev->tids, stid, listen_ctx->lsk->sk_family);
+	sock_put(listen_ctx->lsk);
+	kfree(listen_ctx);
+	module_put(THIS_MODULE);
+
+	return 0;
+}
+
+static int chtls_close_listsrv_rpl(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_close_listsvr_rpl *rpl = cplhdr(skb) + RSS_HDR;
+	unsigned int stid = GET_TID(rpl);
+	void *data = lookup_stid(cdev->tids, stid);
+	struct listen_ctx *listen_ctx = (struct listen_ctx *)data;
+
+	if (rpl->status != CPL_ERR_NONE) {
+		pr_info("Unexpected CLOSE_LISTSRV_RPL status %u for STID %u\n",
+			rpl->status, stid);
+		return 1;
+	}
+
+	cxgb4_free_stid(cdev->tids, stid, listen_ctx->lsk->sk_family);
+	sock_put(listen_ctx->lsk);
+	kfree(listen_ctx);
+	module_put(THIS_MODULE);
+
+	return 0;
+}
+
+static void conn_remove_handle(struct chtls_dev *cdev,
+			       int tid)
+{
+	spin_lock_bh(&cdev->aidr_lock);
+	idr_remove(&cdev->aidr, tid);
+	spin_unlock_bh(&cdev->aidr_lock);
+}
+
+void free_atid(struct chtls_sock *csk, struct chtls_dev *cdev,
+	       unsigned int atid)
+{
+	struct tid_info *tids = cdev->tids;
+
+	conn_remove_handle(cdev, atid);
+	cxgb4_free_atid(tids, atid);
+	sock_put(csk->sk);
+	kref_put(&csk->kref, chtls_sock_release);
+}
+
+static void chtls_release_resources(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_dev *cdev = csk->cdev;
+	unsigned int tid = csk->tid;
+	struct tid_info *tids;
+
+	if (!cdev)
+		return;
+
+	tids = cdev->tids;
+	kfree_skb(csk->txdata_skb_cache);
+	csk->txdata_skb_cache = NULL;
+
+	if (csk->l2t_entry) {
+		cxgb4_l2t_release(csk->l2t_entry);
+		csk->l2t_entry = NULL;
+	}
+
+	if (sk->sk_state == TCP_SYN_SENT) {
+		free_atid(csk, cdev, tid);
+		__skb_queue_purge(&csk->ooo_queue);
+	} else {
+		cxgb4_remove_tid(tids, csk->port_id, tid, sk->sk_family);
+		sock_put(sk);
+	}
+}
+
+static void cleanup_syn_rcv_conn(struct sock *child, struct sock *parent)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(child);
+	struct request_sock *req = csk->passive_reap_next;
+
+	reqsk_queue_removed(&inet_csk(parent)->icsk_accept_queue, req);
+	chtls_reqsk_free(req);
+	csk->passive_reap_next = NULL;
+}
+
+static void chtls_conn_done(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_DEAD))
+		chtls_purge_rcv_queue(sk);
+	sk_wakeup_sleepers(sk, 0);
+	tcp_done(sk);
+}
+
+static void do_abort_syn_rcv(struct sock *child, struct sock *parent)
+{
+	/*
+	 * If the server is still open we clean up the child connection,
+	 * otherwise the server already did the clean up as it was purging
+	 * its SYN queue and the skb was just sitting in its backlog.
+	 */
+	if (likely(parent->sk_state == TCP_LISTEN)) {
+		cleanup_syn_rcv_conn(child, parent);
+		/* Without the below call to sock_orphan,
+		 * we leak the socket resource with syn_flood test
+		 * as inet_csk_destroy_sock will not be called
+		 * in tcp_done since SOCK_DEAD flag is not set.
+		 * Kernel handles this differently where new socket is
+		 * created only after 3 way handshake is done.
+		 */
+		sock_orphan(child);
+		percpu_counter_inc((child)->sk_prot->orphan_count);
+		chtls_release_resources(child);
+		chtls_conn_done(child);
+	} else {
+		if (csk_flag(child, CSK_RST_ABORTED)) {
+			chtls_release_resources(child);
+			chtls_conn_done(child);
+		}
+	}
+}
+
+static void pass_open_abort(struct sock *child, struct sock *parent,
+			    struct sk_buff *skb)
+{
+	do_abort_syn_rcv(child, parent);
+	kfree_skb(skb);
+}
+
+static void bl_pass_open_abort(struct sock *lsk, struct sk_buff *skb)
+{
+	pass_open_abort(skb->sk, lsk, skb);
+}
+
+static void chtls_pass_open_arp_failure(struct sock *sk,
+					struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_dev *cdev = csk->cdev;
+	const struct request_sock *oreq;
+	struct sock *parent;
+	void *data;
+
+	/*
+	 * If the connection is being aborted due to the parent listening
+	 * socket going away there's nothing to do, the ABORT_REQ will close
+	 * the connection.
+	 */
+	if (csk_flag(sk, CSK_ABORT_RPL_PENDING)) {
+		kfree_skb(skb);
+		return;
+	}
+
+	oreq = csk->passive_reap_next;
+	data = lookup_stid(cdev->tids, oreq->ts_recent);
+	parent = ((struct listen_ctx *)data)->lsk;
+
+	bh_lock_sock(parent);
+	if (!sock_owned_by_user(parent)) {
+		pass_open_abort(sk, parent, skb);
+	} else {
+		BLOG_SKB_CB(skb)->backlog_rcv = bl_pass_open_abort;
+		__sk_add_backlog(parent, skb);
+	}
+	bh_unlock_sock(parent);
+}
+
+static void chtls_accept_rpl_arp_failure(void *handle,
+					 struct sk_buff *skb)
+{
+	struct sock *sk = (struct sock *)handle;
+
+	sock_hold(sk);
+	process_cpl_msg(chtls_pass_open_arp_failure, sk, skb);
+	sock_put(sk);
+}
+
+static unsigned int chtls_select_mss(const struct chtls_sock *csk,
+				     unsigned int pmtu,
+				     struct cpl_pass_accept_req *req)
+{
+	struct sock *sk = csk->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct chtls_dev *cdev = csk->cdev;
+	unsigned int iphdrsz;
+	unsigned int tcpoptsz = 0;
+	unsigned int mtu_idx;
+	unsigned int mss = ntohs(req->tcpopt.mss);
+
+	iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr);
+	if (req->tcpopt.tstamp)
+		tcpoptsz += round_up(TCPOLEN_TIMESTAMP, 4);
+
+	tp->advmss = dst_metric_advmss(dst);
+	if (USER_MSS(tp) && tp->advmss > USER_MSS(tp))
+		tp->advmss = USER_MSS(tp);
+	if (tp->advmss > pmtu - iphdrsz)
+		tp->advmss = pmtu - iphdrsz;
+	if (mss && tp->advmss > mss)
+		tp->advmss = mss;
+
+	tp->advmss = cxgb4_best_aligned_mtu(cdev->lldi->mtus,
+					    iphdrsz + tcpoptsz,
+					    tp->advmss - tcpoptsz,
+					    8, &mtu_idx);
+	tp->advmss -= iphdrsz;
+
+	inet_csk(sk)->icsk_pmtu_cookie = pmtu;
+	return mtu_idx;
+}
+
+static unsigned int select_rcv_wnd(struct chtls_sock *csk)
+{
+	struct sock *sk = csk->sk;
+	unsigned int wnd = tcp_full_space(sk);
+	unsigned int rcvwnd;
+
+	if (wnd < MIN_RCV_WND)
+		wnd = MIN_RCV_WND;
+
+	rcvwnd = MAX_RCV_WND;
+
+	csk_set_flag(csk, CSK_UPDATE_RCV_WND);
+	return min(wnd, rcvwnd);
+}
+
+static void chtls_pass_accept_rpl(struct sk_buff *skb,
+				  struct cpl_pass_accept_req *req,
+				  unsigned int tid)
+
+{
+	struct cpl_t5_pass_accept_rpl *rpl5;
+	unsigned int len = roundup(sizeof(*rpl5), 16);
+	struct cxgb4_lld_info *lldi;
+	const struct tcphdr *tcph;
+	const struct tcp_sock *tp;
+	struct chtls_sock *csk;
+	struct sock *sk;
+	u64 opt0;
+	u32 opt2, hlen;
+
+	sk = skb->sk;
+	tp = tcp_sk(sk);
+	csk = sk->sk_user_data;
+	csk->tid = tid;
+	lldi = csk->cdev->lldi;
+
+	rpl5 = __skb_put_zero(skb, len);
+	INIT_TP_WR(rpl5, tid);
+
+	OPCODE_TID(rpl5) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+						     csk->tid));
+	csk->mtu_idx = chtls_select_mss(csk, dst_mtu(__sk_dst_get(sk)),
+					req);
+	opt0 = TCAM_BYPASS_F |
+	       WND_SCALE_V((tp)->rx_opt.rcv_wscale) |
+	       MSS_IDX_V(csk->mtu_idx) |
+	       L2T_IDX_V(csk->l2t_entry->idx) |
+	       NAGLE_V(!(tp->nonagle & TCP_NAGLE_OFF)) |
+	       TX_CHAN_V(csk->tx_chan) |
+	       SMAC_SEL_V(csk->smac_idx) |
+	       DSCP_V(csk->tos >> 2) |
+	       ULP_MODE_V(ULP_MODE_TLS) |
+	       RCV_BUFSIZ_V(min(tp->rcv_wnd >> 10, RCV_BUFSIZ_M));
+
+	opt2 = RX_CHANNEL_V(0) |
+		RSS_QUEUE_VALID_F | RSS_QUEUE_V(csk->rss_qid);
+
+	if (!is_t5(lldi->adapter_type))
+		opt2 |= RX_FC_DISABLE_F;
+	if (req->tcpopt.tstamp)
+		opt2 |= TSTAMPS_EN_F;
+	if (req->tcpopt.sack)
+		opt2 |= SACK_EN_F;
+	hlen = ntohl(req->hdr_len);
+
+	tcph = (struct tcphdr *)((u8 *)(req + 1) +
+			T6_ETH_HDR_LEN_G(hlen) + T6_IP_HDR_LEN_G(hlen));
+	if (tcph->ece && tcph->cwr)
+		opt2 |= CCTRL_ECN_V(1);
+	opt2 |= CONG_CNTRL_V(CONG_ALG_NEWRENO);
+	opt2 |= T5_ISS_F;
+	opt2 |= T5_OPT_2_VALID_F;
+	rpl5->opt0 = cpu_to_be64(opt0);
+	rpl5->opt2 = cpu_to_be32(opt2);
+	rpl5->iss = cpu_to_be32((prandom_u32() & ~7UL) - 1);
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id);
+	t4_set_arp_err_handler(skb, sk, chtls_accept_rpl_arp_failure);
+	cxgb4_l2t_send(csk->egress_dev, skb, csk->l2t_entry);
+}
+
+static void inet_inherit_port(struct inet_hashinfo *hash_info,
+			      struct sock *lsk, struct sock *newsk)
+{
+	local_bh_disable();
+	__inet_inherit_port(lsk, newsk);
+	local_bh_enable();
+}
+
+static int chtls_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb->protocol) {
+		kfree_skb(skb);
+		return 0;
+	}
+	BLOG_SKB_CB(skb)->backlog_rcv(sk, skb);
+	return 0;
+}
+
+static struct sock *chtls_recv_sock(struct sock *lsk,
+				    struct request_sock *oreq,
+				    void *network_hdr,
+				    const struct cpl_pass_accept_req *req,
+				    struct chtls_dev *cdev)
+
+{
+	const struct iphdr *iph = (const struct iphdr *)network_hdr;
+	struct dst_entry *dst = NULL;
+	const struct tcphdr *tcph;
+	struct inet_sock *newinet;
+	struct net_device *ndev;
+	struct chtls_sock *csk;
+	struct neighbour *n;
+	struct tcp_sock *tp;
+	struct sock *newsk;
+	u16 port_id;
+	int rxq_idx;
+	int step;
+
+	newsk = tcp_create_openreq_child(lsk, oreq, cdev->askb);
+	if (!newsk)
+		goto free_oreq;
+
+	dst = inet_csk_route_child_sock(lsk, newsk, oreq);
+	if (!dst)
+		goto free_sk;
+
+	tcph = (struct tcphdr *)(iph + 1);
+	n = dst_neigh_lookup(dst, &iph->saddr);
+	if (!n)
+		goto free_sk;
+
+	ndev = n->dev;
+	if (!ndev)
+		goto free_dst;
+	port_id = cxgb4_port_idx(ndev);
+
+	csk = chtls_sock_create(cdev);
+	if (!csk)
+		goto free_dst;
+
+	csk->l2t_entry = cxgb4_l2t_get(cdev->lldi->l2t, n, ndev, 0);
+	if (!csk->l2t_entry)
+		goto free_csk;
+
+	newsk->sk_user_data = csk;
+	newsk->sk_backlog_rcv = chtls_backlog_rcv;
+
+	tp = tcp_sk(newsk);
+	newinet = inet_sk(newsk);
+
+	newinet->inet_daddr = iph->saddr;
+	newinet->inet_rcv_saddr = iph->daddr;
+	newinet->inet_saddr = iph->daddr;
+
+	oreq->ts_recent = PASS_OPEN_TID_G(ntohl(req->tos_stid));
+	sk_setup_caps(newsk, dst);
+	csk->sk = newsk;
+	csk->passive_reap_next = oreq;
+	csk->tx_chan = cxgb4_port_chan(ndev);
+	csk->port_id = port_id;
+	csk->egress_dev = ndev;
+	csk->tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+	csk->ulp_mode = ULP_MODE_TLS;
+	step = cdev->lldi->nrxq / cdev->lldi->nchan;
+	csk->rss_qid = cdev->lldi->rxq_ids[port_id * step];
+	rxq_idx = port_id * step;
+	csk->txq_idx = (rxq_idx < cdev->lldi->ntxq) ? rxq_idx :
+			port_id * step;
+	csk->sndbuf = newsk->sk_sndbuf;
+	csk->smac_idx = cxgb4_tp_smt_idx(cdev->lldi->adapter_type,
+					 cxgb4_port_viid(ndev));
+	tp->rcv_wnd = select_rcv_wnd(csk);
+
+	neigh_release(n);
+	lsk->sk_prot->hash(newsk);
+	inet_inherit_port(&tcp_hashinfo, lsk, newsk);
+	bh_unlock_sock(newsk); /* tcp_create_openreq_child ->sk_clone_lock */
+
+	return newsk;
+free_csk:
+	chtls_sock_release(&csk->kref);
+free_dst:
+	dst_release(dst);
+free_sk:
+	inet_csk_prepare_forced_close(newsk);
+	tcp_done(newsk);
+free_oreq:
+	chtls_reqsk_free(oreq);
+	return NULL;
+}
+
+/*
+ * Populate a TID_RELEASE WR.  The skb must be already propely sized.
+ */
+static  void mk_tid_release(struct sk_buff *skb,
+			    unsigned int chan, unsigned int tid)
+{
+	unsigned int len = roundup(sizeof(struct cpl_tid_release), 16);
+	struct cpl_tid_release *req;
+
+	req = (struct cpl_tid_release *)__skb_put(skb, len);
+	memset(req, 0, len);
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, chan);
+	INIT_TP_WR_CPL(req, CPL_TID_RELEASE, tid);
+}
+
+static int chtls_get_module(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (!try_module_get(icsk->icsk_ulp_ops->owner))
+		return -1;
+
+	return 0;
+}
+
+static void chtls_pass_accept_request(struct sock *sk,
+				      struct sk_buff *skb)
+{
+	struct cpl_pass_accept_req *req = cplhdr(skb) + RSS_HDR;
+	struct chtls_dev *cdev = BLOG_SKB_CB(skb)->cdev;
+	struct cpl_t5_pass_accept_rpl *rpl;
+	unsigned int len = roundup(sizeof(*rpl), 16);
+	struct request_sock *oreq = NULL;
+	unsigned int tid = GET_TID(req);
+	struct sk_buff *reply_skb;
+	struct sock *newsk;
+	struct ethhdr *eh;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	void *network_hdr;
+
+	newsk = lookup_tid(cdev->tids, tid);
+	if (newsk) {
+		pr_info("tid (%d) already in use\n", tid);
+		return;
+	}
+
+	reply_skb = alloc_skb(len, GFP_ATOMIC);
+	if (!reply_skb) {
+		cxgb4_remove_tid(cdev->tids, 0, tid, sk->sk_family);
+		kfree_skb(skb);
+		return;
+	}
+
+	if (sk->sk_state != TCP_LISTEN)
+		goto reject;
+
+	if (inet_csk_reqsk_queue_is_full(sk))
+		goto reject;
+
+	if (sk_acceptq_is_full(sk))
+		goto reject;
+
+	oreq = inet_reqsk_alloc(&chtls_rsk_ops, sk, true);
+	if (!oreq)
+		goto reject;
+
+	oreq->rsk_rcv_wnd = 0;
+	oreq->rsk_window_clamp = 0;
+	oreq->cookie_ts = 0;
+	oreq->mss = 0;
+	oreq->ts_recent = 0;
+
+	eh = (struct ethhdr *)(req + 1);
+	iph = (struct iphdr *)(eh + 1);
+	if (iph->version != 0x4)
+		goto free_oreq;
+
+	network_hdr = (void *)(eh + 1);
+	tcph = (struct tcphdr *)(iph + 1);
+
+	tcp_rsk(oreq)->tfo_listener = false;
+	tcp_rsk(oreq)->rcv_isn = ntohl(tcph->seq);
+	chtls_set_req_port(oreq, tcph->source, tcph->dest);
+	inet_rsk(oreq)->ecn_ok = 0;
+	chtls_set_req_addr(oreq, iph->daddr, iph->saddr);
+	if (req->tcpopt.wsf <= 14) {
+		inet_rsk(oreq)->wscale_ok = 1;
+		inet_rsk(oreq)->snd_wscale = req->tcpopt.wsf;
+	}
+	inet_rsk(oreq)->ir_iif = sk->sk_bound_dev_if;
+
+	newsk = chtls_recv_sock(sk, oreq, network_hdr, req, cdev);
+	if (!newsk)
+		goto reject;
+
+	if (chtls_get_module(newsk))
+		goto reject;
+	inet_csk_reqsk_queue_added(sk);
+	reply_skb->sk = newsk;
+	chtls_install_cpl_ops(newsk);
+	cxgb4_insert_tid(cdev->tids, newsk, tid, newsk->sk_family);
+	chtls_pass_accept_rpl(reply_skb, req, tid);
+	kfree_skb(skb);
+	return;
+
+free_oreq:
+	chtls_reqsk_free(oreq);
+reject:
+	mk_tid_release(reply_skb, 0, tid);
+	cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb);
+	kfree_skb(skb);
+}
+
+/*
+ * Handle a CPL_PASS_ACCEPT_REQ message.
+ */
+static int chtls_pass_accept_req(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_pass_accept_req *req = cplhdr(skb) + RSS_HDR;
+	unsigned int stid = PASS_OPEN_TID_G(ntohl(req->tos_stid));
+	unsigned int tid = GET_TID(req);
+	struct listen_ctx *ctx;
+	struct sock *lsk;
+	void *data;
+
+	data = lookup_stid(cdev->tids, stid);
+	if (!data)
+		return 1;
+
+	ctx = (struct listen_ctx *)data;
+	lsk = ctx->lsk;
+
+	if (unlikely(tid >= cdev->tids->ntids)) {
+		pr_info("passive open TID %u too large\n", tid);
+		return 1;
+	}
+
+	BLOG_SKB_CB(skb)->cdev = cdev;
+	process_cpl_msg(chtls_pass_accept_request, lsk, skb);
+	return 0;
+}
+
+/*
+ * Completes some final bits of initialization for just established connections
+ * and changes their state to TCP_ESTABLISHED.
+ *
+ * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
+ */
+static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->pushed_seq = snd_isn;
+	tp->write_seq = snd_isn;
+	tp->snd_nxt = snd_isn;
+	tp->snd_una = snd_isn;
+	inet_sk(sk)->inet_id = tp->write_seq ^ jiffies;
+	assign_rxopt(sk, opt);
+
+	if (tp->rcv_wnd > (RCV_BUFSIZ_M << 10))
+		tp->rcv_wup -= tp->rcv_wnd - (RCV_BUFSIZ_M << 10);
+
+	dst_confirm(sk->sk_dst_cache);
+
+	smp_mb();
+	tcp_set_state(sk, TCP_ESTABLISHED);
+}
+
+static void chtls_abort_conn(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *abort_skb;
+
+	abort_skb = alloc_skb(sizeof(struct cpl_abort_req), GFP_ATOMIC);
+	if (abort_skb)
+		chtls_send_reset(sk, CPL_ABORT_SEND_RST, abort_skb);
+}
+
+static struct sock *reap_list;
+static DEFINE_SPINLOCK(reap_list_lock);
+
+/*
+ * Process the reap list.
+ */
+DECLARE_TASK_FUNC(process_reap_list, task_param)
+{
+	spin_lock_bh(&reap_list_lock);
+	while (reap_list) {
+		struct sock *sk = reap_list;
+		struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+
+		reap_list = csk->passive_reap_next;
+		csk->passive_reap_next = NULL;
+		spin_unlock(&reap_list_lock);
+		sock_hold(sk);
+
+		bh_lock_sock(sk);
+		chtls_abort_conn(sk, NULL);
+		sock_orphan(sk);
+		if (sk->sk_state == TCP_CLOSE)
+			inet_csk_destroy_sock(sk);
+		bh_unlock_sock(sk);
+		sock_put(sk);
+		spin_lock(&reap_list_lock);
+	}
+	spin_unlock_bh(&reap_list_lock);
+}
+
+static DECLARE_WORK(reap_task, process_reap_list);
+
+static void add_to_reap_list(struct sock *sk)
+{
+	struct chtls_sock *csk = sk->sk_user_data;
+
+	local_bh_disable();
+	bh_lock_sock(sk);
+	release_tcp_port(sk); /* release the port immediately */
+
+	spin_lock(&reap_list_lock);
+	csk->passive_reap_next = reap_list;
+	reap_list = sk;
+	if (!csk->passive_reap_next)
+		schedule_work(&reap_task);
+	spin_unlock(&reap_list_lock);
+	bh_unlock_sock(sk);
+	local_bh_enable();
+}
+
+static void add_pass_open_to_parent(struct sock *child, struct sock *lsk,
+				    struct chtls_dev *cdev)
+{
+	struct chtls_sock *csk = child->sk_user_data;
+	struct request_sock *oreq;
+
+	if (lsk->sk_state != TCP_LISTEN)
+		return;
+
+	oreq = csk->passive_reap_next;
+	csk->passive_reap_next = NULL;
+
+	reqsk_queue_removed(&inet_csk(lsk)->icsk_accept_queue, oreq);
+
+	if (sk_acceptq_is_full(lsk)) {
+		chtls_reqsk_free(oreq);
+		add_to_reap_list(child);
+	} else {
+		refcount_set(&oreq->rsk_refcnt, 1);
+		inet_csk_reqsk_queue_add(lsk, oreq, child);
+		lsk->sk_data_ready(lsk);
+	}
+}
+
+static void bl_add_pass_open_to_parent(struct sock *lsk, struct sk_buff *skb)
+{
+	struct sock *child = skb->sk;
+
+	skb->sk = NULL;
+	add_pass_open_to_parent(child, lsk, BLOG_SKB_CB(skb)->cdev);
+	kfree_skb(skb);
+}
+
+static int chtls_pass_establish(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_pass_establish *req = cplhdr(skb) + RSS_HDR;
+	unsigned int hwtid = GET_TID(req);
+	struct chtls_sock *csk;
+	struct sock *lsk, *sk;
+
+	sk = lookup_tid(cdev->tids, hwtid);
+	if (!sk)
+		return 1;
+
+	bh_lock_sock(sk);
+	if (unlikely(sock_owned_by_user(sk))) {
+		kfree_skb(skb);
+	} else {
+		void *data;
+		unsigned int stid;
+
+		csk = sk->sk_user_data;
+		csk->wr_max_credits = 64;
+		csk->wr_credits = 64;
+		csk->wr_unacked = 0;
+		make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+		stid = PASS_OPEN_TID_G(ntohl(req->tos_stid));
+		sk->sk_state_change(sk);
+		if (unlikely(sk->sk_socket))
+			sk_wake_async(sk, 0, POLL_OUT);
+
+		data = lookup_stid(cdev->tids, stid);
+		lsk = ((struct listen_ctx *)data)->lsk;
+
+		bh_lock_sock(lsk);
+		if (likely(!sock_owned_by_user(lsk))) {
+			kfree_skb(skb);
+			add_pass_open_to_parent(sk, lsk, cdev);
+		} else {
+			skb->sk = sk;
+			BLOG_SKB_CB(skb)->cdev = cdev;
+			BLOG_SKB_CB(skb)->backlog_rcv =
+				bl_add_pass_open_to_parent;
+			__sk_add_backlog(lsk, skb);
+		}
+		bh_unlock_sock(lsk);
+	}
+	bh_unlock_sock(sk);
+	return 0;
+}
+
+/*
+ * Handle receipt of an urgent pointer.
+ */
+static void handle_urg_ptr(struct sock *sk, u32 urg_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	urg_seq--;
+	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
+		return;	/* duplicate pointer */
+
+	sk_send_sigurg(sk);
+	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+	    !sock_flag(sk, SOCK_URGINLINE) &&
+	    tp->copied_seq != tp->rcv_nxt) {
+		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+		tp->copied_seq++;
+		if (skb && tp->copied_seq - ULP_SKB_CB(skb)->seq >= skb->len)
+			chtls_free_skb(sk, skb);
+	}
+
+	tp->urg_data = TCP_URG_NOTYET;
+	tp->urg_seq = urg_seq;
+}
+
+static void check_sk_callbacks(struct chtls_sock *csk)
+{
+	struct sock *sk = csk->sk;
+
+	if (unlikely(sk->sk_user_data &&
+		     !csk_flag_nochk(csk, CSK_CALLBACKS_CHKD)))
+		csk_set_flag(csk, CSK_CALLBACKS_CHKD);
+}
+
+/*
+ * Handles Rx data that arrives in a state where the socket isn't accepting
+ * new data.
+ */
+static void handle_excess_rx(struct sock *sk, struct sk_buff *skb)
+{
+	if (!csk_flag(sk, CSK_ABORT_SHUTDOWN))
+		chtls_abort_conn(sk, skb);
+
+	kfree_skb(skb);
+}
+
+static void chtls_recv_data(struct sock *sk, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct cpl_rx_data *hdr = cplhdr(skb) + RSS_HDR;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (unlikely(sk->sk_shutdown & RCV_SHUTDOWN)) {
+		handle_excess_rx(sk, skb);
+		return;
+	}
+
+	ULP_SKB_CB(skb)->seq = ntohl(hdr->seq);
+	ULP_SKB_CB(skb)->psh = hdr->psh;
+	skb_ulp_mode(skb) = ULP_MODE_NONE;
+
+	skb_reset_transport_header(skb);
+	__skb_pull(skb, sizeof(*hdr) + RSS_HDR);
+	if (!skb->data_len)
+		__skb_trim(skb, ntohs(hdr->len));
+
+	if (unlikely(hdr->urg))
+		handle_urg_ptr(sk, tp->rcv_nxt + ntohs(hdr->urg));
+	if (unlikely(tp->urg_data == TCP_URG_NOTYET &&
+		     tp->urg_seq - tp->rcv_nxt < skb->len))
+		tp->urg_data = TCP_URG_VALID |
+			       skb->data[tp->urg_seq - tp->rcv_nxt];
+
+	if (unlikely(hdr->dack_mode != csk->delack_mode)) {
+		csk->delack_mode = hdr->dack_mode;
+		csk->delack_seq = tp->rcv_nxt;
+	}
+
+	tcp_hdr(skb)->fin = 0;
+	tp->rcv_nxt += skb->len;
+
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		check_sk_callbacks(csk);
+		sk->sk_data_ready(sk);
+	}
+}
+
+static int chtls_rx_data(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_rx_data *req = cplhdr(skb) + RSS_HDR;
+	unsigned int hwtid = GET_TID(req);
+	struct sock *sk;
+
+	sk = lookup_tid(cdev->tids, hwtid);
+	skb_dst_set(skb, NULL);
+	process_cpl_msg(chtls_recv_data, sk, skb);
+	return 0;
+}
+
+static void chtls_recv_pdu(struct sock *sk, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_hws *tlsk = &csk->tlshws;
+	struct cpl_tls_data *hdr = cplhdr(skb);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (unlikely(sk->sk_shutdown & RCV_SHUTDOWN)) {
+		handle_excess_rx(sk, skb);
+		return;
+	}
+
+	ULP_SKB_CB(skb)->seq = ntohl(hdr->seq);
+	ULP_SKB_CB(skb)->flags = 0;
+	skb_ulp_mode(skb) = ULP_MODE_TLS;
+
+	skb_reset_transport_header(skb);
+	__skb_pull(skb, sizeof(*hdr));
+	if (!skb->data_len)
+		__skb_trim(skb,
+			   CPL_TLS_DATA_LENGTH_G(ntohl(hdr->length_pkd)));
+
+	if (unlikely(tp->urg_data == TCP_URG_NOTYET && tp->urg_seq -
+		     tp->rcv_nxt < skb->len))
+		tp->urg_data = TCP_URG_VALID |
+			       skb->data[tp->urg_seq - tp->rcv_nxt];
+
+	tcp_hdr(skb)->fin = 0;
+	tlsk->pldlen = CPL_TLS_DATA_LENGTH_G(ntohl(hdr->length_pkd));
+	__skb_queue_tail(&tlsk->sk_recv_queue, skb);
+}
+
+static int chtls_rx_pdu(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_tls_data *req = cplhdr(skb);
+	unsigned int hwtid = GET_TID(req);
+	struct sock *sk;
+
+	sk = lookup_tid(cdev->tids, hwtid);
+	skb_dst_set(skb, NULL);
+	process_cpl_msg(chtls_recv_pdu, sk, skb);
+	return 0;
+}
+
+static void chtls_set_hdrlen(struct sk_buff *skb, unsigned int nlen)
+{
+	struct tlsrx_cmp_hdr *tls_cmp_hdr = cplhdr(skb);
+
+	skb->hdr_len = ntohs(tls_cmp_hdr->length);
+	tls_cmp_hdr->length = ntohs(nlen);
+}
+
+static void chtls_rx_hdr(struct sock *sk, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct cpl_rx_tls_cmp *cmp_cpl = cplhdr(skb);
+	struct chtls_hws *tlsk = &csk->tlshws;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb_rec = NULL;
+
+	ULP_SKB_CB(skb)->seq = ntohl(cmp_cpl->seq);
+	ULP_SKB_CB(skb)->flags = 0;
+
+	skb_reset_transport_header(skb);
+	__skb_pull(skb, sizeof(*cmp_cpl));
+	if (!skb->data_len)
+		__skb_trim(skb, CPL_RX_TLS_CMP_LENGTH_G
+				(ntohl(cmp_cpl->pdulength_length)));
+
+	tp->rcv_nxt +=
+		CPL_RX_TLS_CMP_PDULENGTH_G(ntohl(cmp_cpl->pdulength_length));
+
+	skb_rec = __skb_dequeue(&tlsk->sk_recv_queue);
+	if (!skb_rec) {
+		ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_TLS_ND;
+		__skb_queue_tail(&sk->sk_receive_queue, skb);
+	} else {
+		chtls_set_hdrlen(skb, tlsk->pldlen);
+		tlsk->pldlen = 0;
+		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		__skb_queue_tail(&sk->sk_receive_queue, skb_rec);
+	}
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		check_sk_callbacks(csk);
+		sk->sk_data_ready(sk);
+	}
+}
+
+static int chtls_rx_cmp(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_rx_tls_cmp *req = cplhdr(skb);
+	unsigned int hwtid = GET_TID(req);
+	struct sock *sk;
+
+	sk = lookup_tid(cdev->tids, hwtid);
+	skb_dst_set(skb, NULL);
+	process_cpl_msg(chtls_rx_hdr, sk, skb);
+
+	return 0;
+}
+
+static void chtls_timewait(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->rcv_nxt++;
+	tp->rx_opt.ts_recent_stamp = get_seconds();
+	tp->srtt_us = 0;
+	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+}
+
+static void chtls_peer_close(struct sock *sk, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+
+	sk->sk_shutdown |= RCV_SHUTDOWN;
+	sock_set_flag(sk, SOCK_DONE);
+
+	switch (sk->sk_state) {
+	case TCP_SYN_RECV:
+	case TCP_ESTABLISHED:
+		tcp_set_state(sk, TCP_CLOSE_WAIT);
+		break;
+	case TCP_FIN_WAIT1:
+		tcp_set_state(sk, TCP_CLOSING);
+		break;
+	case TCP_FIN_WAIT2:
+		chtls_release_resources(sk);
+		if (csk_flag_nochk(csk, CSK_ABORT_RPL_PENDING))
+			chtls_conn_done(sk);
+		else
+			chtls_timewait(sk);
+		break;
+	default:
+		pr_info("cpl_peer_close in bad state %d\n", sk->sk_state);
+	}
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);
+		/* Do not send POLL_HUP for half duplex close. */
+
+		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
+		    sk->sk_state == TCP_CLOSE)
+			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+		else
+			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	}
+}
+
+static void chtls_close_con_rpl(struct sock *sk, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct cpl_close_con_rpl *rpl = cplhdr(skb) + RSS_HDR;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
+
+	switch (sk->sk_state) {
+	case TCP_CLOSING:
+		chtls_release_resources(sk);
+		if (csk_flag_nochk(csk, CSK_ABORT_RPL_PENDING))
+			chtls_conn_done(sk);
+		else
+			chtls_timewait(sk);
+		break;
+	case TCP_LAST_ACK:
+		chtls_release_resources(sk);
+		chtls_conn_done(sk);
+		break;
+	case TCP_FIN_WAIT1:
+		tcp_set_state(sk, TCP_FIN_WAIT2);
+		sk->sk_shutdown |= SEND_SHUTDOWN;
+		dst_confirm(sk->sk_dst_cache);
+
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+		else if (tcp_sk(sk)->linger2 < 0 &&
+			 !csk_flag_nochk(csk, CSK_ABORT_SHUTDOWN))
+			chtls_abort_conn(sk, skb);
+		break;
+	default:
+		pr_info("close_con_rpl in bad state %d\n", sk->sk_state);
+	}
+	kfree_skb(skb);
+}
+
+static struct sk_buff *get_cpl_skb(struct sk_buff *skb,
+				   size_t len, gfp_t gfp)
+{
+	if (likely(!skb_is_nonlinear(skb) && !skb_cloned(skb))) {
+		WARN_ONCE(skb->len < len, "skb alloc error");
+		__skb_trim(skb, len);
+		skb_get(skb);
+	} else {
+		skb = alloc_skb(len, gfp);
+		if (skb)
+			__skb_put(skb, len);
+	}
+	return skb;
+}
+
+static void set_abort_rpl_wr(struct sk_buff *skb, unsigned int tid,
+			     int cmd)
+{
+	struct cpl_abort_rpl *rpl = cplhdr(skb);
+
+	INIT_TP_WR_CPL(rpl, CPL_ABORT_RPL, tid);
+	rpl->cmd = cmd;
+}
+
+static void send_defer_abort_rpl(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+	struct sk_buff *reply_skb;
+
+	reply_skb = alloc_skb(sizeof(struct cpl_abort_rpl),
+			      GFP_KERNEL | __GFP_NOFAIL);
+	if (!reply_skb)
+		return;
+
+	__skb_put(reply_skb, sizeof(struct cpl_abort_rpl));
+	set_abort_rpl_wr(reply_skb, GET_TID(req),
+			 (req->status & CPL_ABORT_NO_RST));
+	set_wr_txq(reply_skb, CPL_PRIORITY_DATA, req->status >> 1);
+	cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb);
+	kfree_skb(skb);
+}
+
+static void send_abort_rpl(struct sock *sk, struct sk_buff *skb,
+			   struct chtls_dev *cdev, int status, int queue)
+{
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+	struct sk_buff *reply_skb;
+
+	reply_skb = alloc_skb(sizeof(struct cpl_abort_rpl),
+			      GFP_KERNEL);
+
+	if (!reply_skb) {
+		req->status = (queue << 1);
+		send_defer_abort_rpl(cdev, skb);
+		return;
+	}
+
+	set_abort_rpl_wr(reply_skb, GET_TID(req), status);
+	kfree_skb(skb);
+
+	set_wr_txq(reply_skb, CPL_PRIORITY_DATA, queue);
+	if (sock_flag(sk, SOCK_INLINE)) {
+		struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+		struct l2t_entry *e = csk->l2t_entry;
+
+		if (e && sk->sk_state != TCP_SYN_RECV) {
+			cxgb4_l2t_send(csk->egress_dev, reply_skb, e);
+			return;
+		}
+	}
+	cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb);
+}
+
+/*
+ * Add an skb to the deferred skb queue for processing from process context.
+ */
+static void t4_defer_reply(struct sk_buff *skb, struct chtls_dev *cdev,
+			   defer_handler_t handler)
+{
+	DEFERRED_SKB_CB(skb)->handler = handler;
+	spin_lock_bh(&cdev->deferq.lock);
+	__skb_queue_tail(&cdev->deferq, skb);
+	if (skb_queue_len(&cdev->deferq) == 1)
+		schedule_work(&cdev->deferq_task);
+	spin_unlock_bh(&cdev->deferq.lock);
+}
+
+static void chtls_send_abort_rpl(struct sock *sk, struct sk_buff *skb,
+				 struct chtls_dev *cdev,
+				 int status, int queue)
+{
+	struct cpl_abort_req_rss *req = cplhdr(skb) + RSS_HDR;
+	unsigned int tid = GET_TID(req);
+	struct sk_buff *reply_skb;
+
+	reply_skb = get_cpl_skb(skb, sizeof(struct cpl_abort_rpl), gfp_any());
+	if (!reply_skb) {
+		req->status = (queue << 1) | status;
+		t4_defer_reply(skb, cdev, send_defer_abort_rpl);
+		return;
+	}
+
+	set_abort_rpl_wr(reply_skb, tid, status);
+	set_wr_txq(reply_skb, CPL_PRIORITY_DATA, queue);
+	if (sock_flag(sk, SOCK_INLINE)) {
+		struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+		struct l2t_entry *e = csk->l2t_entry;
+
+		if (e && sk->sk_state != TCP_SYN_RECV) {
+			cxgb4_l2t_send(csk->egress_dev, reply_skb, e);
+			return;
+		}
+	}
+	cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb);
+	kfree_skb(skb);
+}
+
+/*
+ * This is run from a listener's backlog to abort a child connection in
+ * SYN_RCV state (i.e., one on the listener's SYN queue).
+ */
+static void bl_abort_syn_rcv(struct sock *lsk, struct sk_buff *skb)
+{
+	struct sock *child = skb->sk;
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(child);
+	int queue = csk->txq_idx;
+
+	skb->sk	= NULL;
+	do_abort_syn_rcv(child, lsk);
+	send_abort_rpl(child, skb, BLOG_SKB_CB(skb)->cdev,
+		       CPL_ABORT_NO_RST, queue);
+}
+
+static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = sk->sk_user_data;
+	const struct request_sock *oreq = csk->passive_reap_next;
+	struct chtls_dev *cdev = csk->cdev;
+	struct listen_ctx *listen_ctx;
+	struct sock *psk;
+	void *ctx;
+
+	if (!oreq)
+		return -1;
+
+	ctx = lookup_stid(cdev->tids, oreq->ts_recent);
+	if (!ctx)
+		return -1;
+
+	listen_ctx = (struct listen_ctx *)ctx;
+	psk = listen_ctx->lsk;
+
+	bh_lock_sock(psk);
+	if (!sock_owned_by_user(psk)) {
+		int queue = csk->txq_idx;
+
+		do_abort_syn_rcv(sk, psk);
+		send_abort_rpl(sk, skb, cdev, CPL_ABORT_NO_RST, queue);
+	} else {
+		skb->sk = sk;
+		BLOG_SKB_CB(skb)->backlog_rcv = bl_abort_syn_rcv;
+		__sk_add_backlog(psk, skb);
+	}
+	bh_unlock_sock(psk);
+	return 0;
+}
+
+static void chtls_abort_req_rss(struct sock *sk, struct sk_buff *skb)
+{
+	const struct cpl_abort_req_rss *req = cplhdr(skb) + RSS_HDR;
+	struct chtls_sock *csk = sk->sk_user_data;
+	int rst_status = CPL_ABORT_NO_RST;
+	int queue = csk->txq_idx;
+
+	if (is_neg_adv(req->status)) {
+		if (sk->sk_state == TCP_SYN_RECV)
+			chtls_set_tcb_tflag(sk, 0, 0);
+
+		kfree_skb(skb);
+		return;
+	}
+
+	csk_reset_flag(csk, CSK_ABORT_REQ_RCVD);
+
+	if (!csk_flag_nochk(csk, CSK_ABORT_SHUTDOWN) &&
+	    !csk_flag_nochk(csk, CSK_TX_DATA_SENT)) {
+		struct tcp_sock *tp = tcp_sk(sk);
+
+		if (send_tx_flowc_wr(sk, 0, tp->snd_nxt, tp->rcv_nxt) < 0)
+			WARN_ONCE(1, "send_tx_flowc error");
+		csk_set_flag(csk, CSK_TX_DATA_SENT);
+	}
+
+	csk_set_flag(csk, CSK_ABORT_SHUTDOWN);
+
+	if (!csk_flag_nochk(csk, CSK_ABORT_RPL_PENDING)) {
+		sk->sk_err = ETIMEDOUT;
+
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_error_report(sk);
+
+		if (sk->sk_state == TCP_SYN_RECV && !abort_syn_rcv(sk, skb))
+			return;
+
+		chtls_release_resources(sk);
+		chtls_conn_done(sk);
+	}
+
+	chtls_send_abort_rpl(sk, skb, csk->cdev, rst_status, queue);
+}
+
+static void chtls_abort_rpl_rss(struct sock *sk, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct cpl_abort_rpl_rss *rpl = cplhdr(skb) + RSS_HDR;
+	struct chtls_dev *cdev = csk->cdev;
+
+	if (csk_flag_nochk(csk, CSK_ABORT_RPL_PENDING)) {
+		csk_reset_flag(csk, CSK_ABORT_RPL_PENDING);
+		if (!csk_flag_nochk(csk, CSK_ABORT_REQ_RCVD)) {
+			if (sk->sk_state == TCP_SYN_SENT) {
+				cxgb4_remove_tid(cdev->tids,
+						 csk->port_id,
+						 GET_TID(rpl),
+						 sk->sk_family);
+				sock_put(sk);
+			}
+			chtls_release_resources(sk);
+			chtls_conn_done(sk);
+		}
+	}
+	kfree_skb(skb);
+}
+
+static int chtls_conn_cpl(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	u8 opcode = ((const struct rss_header *)cplhdr(skb))->opcode;
+	struct cpl_peer_close *req = cplhdr(skb) + RSS_HDR;
+	void (*fn)(struct sock *sk, struct sk_buff *skb);
+	unsigned int hwtid = GET_TID(req);
+	struct sock *sk;
+
+	sk = lookup_tid(cdev->tids, hwtid);
+	if (!sk)
+		goto rel_skb;
+
+	switch (opcode) {
+	case CPL_PEER_CLOSE:
+		fn = chtls_peer_close;
+		break;
+	case CPL_CLOSE_CON_RPL:
+		fn = chtls_close_con_rpl;
+		break;
+	case CPL_ABORT_REQ_RSS:
+		fn = chtls_abort_req_rss;
+		break;
+	case CPL_ABORT_RPL_RSS:
+		fn = chtls_abort_rpl_rss;
+		break;
+	default:
+		goto rel_skb;
+	}
+
+	process_cpl_msg(fn, sk, skb);
+	return 0;
+
+rel_skb:
+	kfree_skb(skb);
+	return 0;
+}
+
+static struct sk_buff *dequeue_wr(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct sk_buff *skb = csk->wr_skb_head;
+
+	if (likely(skb)) {
+	/* Don't bother clearing the tail */
+		csk->wr_skb_head = WR_SKB_CB(skb)->next_wr;
+		WR_SKB_CB(skb)->next_wr = NULL;
+	}
+	return skb;
+}
+
+static void chtls_rx_ack(struct sock *sk, struct sk_buff *skb)
+{
+	struct cpl_fw4_ack *hdr = cplhdr(skb) + RSS_HDR;
+	struct chtls_sock *csk = sk->sk_user_data;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 snd_una = ntohl(hdr->snd_una);
+	u8 credits = hdr->credits;
+
+	csk->wr_credits += credits;
+
+	if (csk->wr_unacked > csk->wr_max_credits - csk->wr_credits)
+		csk->wr_unacked = csk->wr_max_credits - csk->wr_credits;
+
+	while (credits) {
+		struct sk_buff *pskb = csk->wr_skb_head;
+
+		if (unlikely(!pskb)) {
+			if (csk->wr_nondata)
+				csk->wr_nondata -= credits;
+				break;
+		}
+		if (unlikely(credits < pskb->csum)) {
+			pskb->csum -= credits;
+			break;
+		}
+		dequeue_wr(sk);
+		credits -= pskb->csum;
+		kfree_skb(pskb);
+	}
+	if (hdr->seq_vld & CPL_FW4_ACK_FLAGS_SEQVAL) {
+		if (unlikely(before(snd_una, tp->snd_una))) {
+			kfree_skb(skb);
+			return;
+		}
+
+		if (tp->snd_una != snd_una) {
+			tp->snd_una = snd_una;
+			dst_confirm(sk->sk_dst_cache);
+			tp->rcv_tstamp = tcp_time_stamp(tp);
+			if (tp->snd_una == tp->snd_nxt &&
+			    !csk_flag_nochk(csk, CSK_TX_FAILOVER))
+				csk_reset_flag(csk, CSK_TX_WAIT_IDLE);
+		}
+	}
+
+	if (hdr->seq_vld & CPL_FW4_ACK_FLAGS_CH) {
+		unsigned int fclen16 = roundup(failover_flowc_wr_len, 16);
+
+		csk->wr_credits -= fclen16;
+		csk_reset_flag(csk, CSK_TX_WAIT_IDLE);
+		csk_reset_flag(csk, CSK_TX_FAILOVER);
+	}
+	if (skb_queue_len(&csk->txq) && chtls_push_frames(csk, 0))
+		sk->sk_write_space(sk);
+
+	kfree_skb(skb);
+}
+
+static int chtls_wr_ack(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+	struct cpl_fw4_ack *rpl = cplhdr(skb) + RSS_HDR;
+	unsigned int hwtid = GET_TID(rpl);
+	struct sock *sk;
+
+	sk = lookup_tid(cdev->tids, hwtid);
+	process_cpl_msg(chtls_rx_ack, sk, skb);
+
+	return 0;
+}
+
+chtls_handler_func chtls_handlers[NUM_CPL_CMDS] = {
+	[CPL_PASS_OPEN_RPL]     = chtls_pass_open_rpl,
+	[CPL_CLOSE_LISTSRV_RPL] = chtls_close_listsrv_rpl,
+	[CPL_PASS_ACCEPT_REQ]   = chtls_pass_accept_req,
+	[CPL_PASS_ESTABLISH]    = chtls_pass_establish,
+	[CPL_RX_DATA]           = chtls_rx_data,
+	[CPL_TLS_DATA]          = chtls_rx_pdu,
+	[CPL_RX_TLS_CMP]        = chtls_rx_cmp,
+	[CPL_PEER_CLOSE]        = chtls_conn_cpl,
+	[CPL_CLOSE_CON_RPL]     = chtls_conn_cpl,
+	[CPL_ABORT_REQ_RSS]     = chtls_conn_cpl,
+	[CPL_ABORT_RPL_RSS]     = chtls_conn_cpl,
+	[CPL_FW4_ACK]           = chtls_wr_ack,
+};
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e36eff0..0926de5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -325,6 +325,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	tcp_update_metrics(sk);
 	tcp_done(sk);
 }
+EXPORT_SYMBOL(tcp_time_wait);
 
 void tcp_twsk_destructor(struct sock *sk)
 {
-- 
1.8.3.1

^ permalink raw reply related

* [RFC crypto v3 7/9] chtls: Inline crypto request Tx/Rx
From: Atul Gupta @ 2017-12-20 11:38 UTC (permalink / raw)
  To: herbert, linux-crypto, ganeshgr; +Cc: netdev, davem, davejwatson

TLS handler for record transmit and receive.
Create Inline TLS work request and post to FW.

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
v3: made some functions static and initialized few variables
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 1867 +++++++++++++++++++++++++++++++
 1 file changed, 1867 insertions(+)
 create mode 100644 drivers/crypto/chelsio/chtls/chtls_io.c

diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
new file mode 100644
index 0000000..a0f03fb
--- /dev/null
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -0,0 +1,1867 @@
+/*
+ * Copyright (c) 2017 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Written by: Atul Gupta (atul.gupta@chelsio.com)
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/sched/signal.h>
+#include <net/tcp.h>
+#include <net/busy_poll.h>
+#include <crypto/aes.h>
+
+#include "chtls.h"
+#include "chtls_cm.h"
+
+static bool is_tls_hw(struct chtls_sock *csk)
+{
+	return csk->tlshws.ofld;
+}
+
+static bool is_tls_rx(struct chtls_sock *csk)
+{
+	return (csk->tlshws.rxkey >= 0);
+}
+
+static bool is_tls_tx(struct chtls_sock *csk)
+{
+	return (csk->tlshws.txkey >= 0);
+}
+
+static bool is_tls_skb(struct chtls_sock *csk, const struct sk_buff *skb)
+{
+	return (is_tls_hw(csk) && skb_ulp_tls_skb_flags(skb));
+}
+
+static int key_size(void *sk)
+{
+	return 16; /* Key on DDR */
+}
+
+#define ceil(x, y) \
+	({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
+
+static int data_sgl_len(const struct sk_buff *skb)
+{
+	unsigned int cnt;
+
+	cnt = skb_shinfo(skb)->nr_frags;
+	return (sgl_len(cnt) * 8);
+}
+
+static int nos_ivs(struct sock *sk, unsigned int size)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+
+	return ceil(size, csk->tlshws.mfs);
+}
+
+#define TLS_WR_CPL_LEN \
+	(sizeof(struct fw_tlstx_data_wr) + \
+	sizeof(struct cpl_tx_tls_sfo))
+
+static int is_ivs_imm(struct sock *sk, const struct sk_buff *skb)
+{
+	int ivs_size = nos_ivs(sk, skb->len) * CIPHER_BLOCK_SIZE;
+	int hlen = TLS_WR_CPL_LEN + data_sgl_len(skb);
+
+	if ((hlen + key_size(sk) + ivs_size) <
+	    MAX_IMM_OFLD_TX_DATA_WR_LEN) {
+		ULP_SKB_CB(skb)->ulp.tls.iv = 1;
+		return 1;
+	}
+	ULP_SKB_CB(skb)->ulp.tls.iv = 0;
+	return 0;
+}
+
+static int max_ivs_size(struct sock *sk, int size)
+{
+	return (nos_ivs(sk, size) * CIPHER_BLOCK_SIZE);
+}
+
+static int ivs_size(struct sock *sk, const struct sk_buff *skb)
+{
+	return (is_ivs_imm(sk, skb) ? (nos_ivs(sk, skb->len) *
+		 CIPHER_BLOCK_SIZE) : 0);
+}
+
+static int flowc_wr_credits(int nparams, int *flowclenp)
+{
+	int flowclen16, flowclen;
+
+	flowclen = offsetof(struct fw_flowc_wr, mnemval[nparams]);
+	flowclen16 = DIV_ROUND_UP(flowclen, 16);
+	flowclen = flowclen16 * 16;
+
+	if (flowclenp)
+		*flowclenp = flowclen;
+
+	return flowclen16;
+}
+
+static struct sk_buff *create_flowc_wr_skb(struct sock *sk,
+					   struct fw_flowc_wr *flowc,
+					   int flowclen)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct sk_buff *skb;
+
+	skb = alloc_skb(flowclen, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	memcpy(__skb_put(skb, flowclen), flowc, flowclen);
+	set_queue(skb, (csk->txq_idx << 1) | CPL_PRIORITY_DATA, sk);
+
+	return skb;
+}
+
+static int send_flowc_wr(struct sock *sk, struct fw_flowc_wr *flowc,
+			 int flowclen)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	bool syn_sent = (sk->sk_state == TCP_SYN_SENT);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int flowclen16 = flowclen / 16;
+	struct sk_buff *skb;
+
+	if (csk_flag(sk, CSK_TX_DATA_SENT)) {
+		skb = create_flowc_wr_skb(sk, flowc, flowclen);
+		if (!skb)
+			return -ENOMEM;
+
+		if (syn_sent)
+			__skb_queue_tail(&csk->ooo_queue, skb);
+		else
+			skb_entail(sk, skb,
+				   ULPCB_FLAG_NO_HDR | ULPCB_FLAG_NO_APPEND);
+		return 0;
+	}
+
+	if (!syn_sent) {
+		int ret;
+
+		ret = cxgb4_immdata_send(csk->egress_dev,
+					 csk->txq_idx,
+					 flowc, flowclen);
+		if (!ret)
+			return flowclen16;
+	}
+	skb = create_flowc_wr_skb(sk, flowc, flowclen);
+	if (!skb)
+		return -ENOMEM;
+	send_or_defer(sk, tp, skb, 0);
+	return flowclen16;
+}
+
+static u8 tcp_state_to_flowc_state(u8 state)
+{
+	u8 ret = FW_FLOWC_MNEM_TCPSTATE_ESTABLISHED;
+
+	switch (state) {
+	case TCP_ESTABLISHED:
+		ret = FW_FLOWC_MNEM_TCPSTATE_ESTABLISHED;
+		break;
+	case TCP_CLOSE_WAIT:
+		ret = FW_FLOWC_MNEM_TCPSTATE_CLOSEWAIT;
+		break;
+	case TCP_FIN_WAIT1:
+		ret = FW_FLOWC_MNEM_TCPSTATE_FINWAIT1;
+		break;
+	case TCP_CLOSING:
+		ret = FW_FLOWC_MNEM_TCPSTATE_CLOSING;
+		break;
+	case TCP_LAST_ACK:
+		ret = FW_FLOWC_MNEM_TCPSTATE_LASTACK;
+		break;
+	case TCP_FIN_WAIT2:
+		ret = FW_FLOWC_MNEM_TCPSTATE_FINWAIT2;
+		break;
+	}
+
+	return ret;
+}
+
+int send_tx_flowc_wr(struct sock *sk, int compl,
+		     u32 snd_nxt, u32 rcv_nxt)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	int nparams, paramidx, flowclen16, flowclen;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct flowc_packed {
+		struct fw_flowc_wr fc;
+		struct fw_flowc_mnemval mnemval[FW_FLOWC_MNEM_MAX];
+	} __packed sflowc;
+	struct fw_flowc_wr *flowc;
+
+	memset(&sflowc, 0, sizeof(sflowc));
+	flowc = &sflowc.fc;
+
+#define FLOWC_PARAM(__m, __v) \
+	do { \
+		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
+		flowc->mnemval[paramidx].val = cpu_to_be32(__v); \
+		paramidx++; \
+	} while (0)
+
+	paramidx = 0;
+
+	FLOWC_PARAM(PFNVFN, FW_PFVF_CMD_PFN_V(csk->cdev->lldi->pf));
+	FLOWC_PARAM(CH, csk->tx_chan);
+	FLOWC_PARAM(PORT, csk->tx_chan);
+	FLOWC_PARAM(IQID, csk->rss_qid);
+	FLOWC_PARAM(SNDNXT, tp->snd_nxt);
+	FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
+	FLOWC_PARAM(SNDBUF, csk->sndbuf);
+	FLOWC_PARAM(MSS, tp->mss_cache);
+	FLOWC_PARAM(TCPSTATE, tcp_state_to_flowc_state(sk->sk_state));
+
+	if (SND_WSCALE(tp))
+		FLOWC_PARAM(RCV_SCALE, SND_WSCALE(tp));
+
+	if (csk->ulp_mode == ULP_MODE_TLS)
+		FLOWC_PARAM(ULD_MODE, ULP_MODE_TLS);
+
+	if (csk->tlshws.fcplenmax)
+		FLOWC_PARAM(TXDATAPLEN_MAX, csk->tlshws.fcplenmax);
+
+	nparams = paramidx;
+#undef FLOWC_PARAM
+
+	flowclen16 = flowc_wr_credits(nparams, &flowclen);
+	flowc->op_to_nparams =
+		cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
+			    FW_WR_COMPL_V(compl) |
+			    FW_FLOWC_WR_NPARAMS_V(nparams));
+	flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(flowclen16) |
+					  FW_WR_FLOWID_V(csk->tid));
+
+	return send_flowc_wr(sk, flowc, flowclen);
+}
+
+/* Copy IVs to WR */
+static int tls_copy_ivs(struct sock *sk, struct sk_buff *skb)
+
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_hws *hws = &csk->tlshws;
+	u16 number_of_ivs = 0;
+	int err = 0;
+	unsigned char *ivs;
+	unsigned char *iv_loc = NULL;
+	unsigned int max_frag_size = 0;
+	struct page *page;
+
+	max_frag_size = hws->mfs;
+	number_of_ivs = nos_ivs(sk, skb->len);
+
+	if (number_of_ivs > MAX_IVS_PAGE) {
+		pr_warn("MAX IVs in PAGE exceeded %d\n", number_of_ivs);
+		return -ENOMEM;
+	}
+
+	/* generate the  IVs */
+	ivs = kmalloc(number_of_ivs * CIPHER_BLOCK_SIZE, GFP_ATOMIC);
+	if (!ivs)
+		return -ENOMEM;
+	get_random_bytes(ivs, number_of_ivs * CIPHER_BLOCK_SIZE);
+
+	if (skb_ulp_tls_skb_iv(skb)) {
+		/* send the IVs as immediate data in the WR */
+		iv_loc = (unsigned char *)__skb_push(skb, number_of_ivs *
+						CIPHER_BLOCK_SIZE);
+		if (iv_loc)
+			memcpy(iv_loc, ivs, number_of_ivs * CIPHER_BLOCK_SIZE);
+
+		hws->ivsize = number_of_ivs * CIPHER_BLOCK_SIZE;
+	} else {
+		/* Send the IVs as sgls */
+		/* Already accounted IV DSGL for credits */
+		skb_shinfo(skb)->nr_frags--;
+		page = alloc_pages(sk->sk_allocation | __GFP_COMP, 0);
+		if (!page) {
+			pr_info("%s : Page allocation for IVs failed\n",
+				__func__);
+			err = -ENOMEM;
+			goto out;
+		}
+		memcpy(page_address(page), ivs, number_of_ivs *
+		       CIPHER_BLOCK_SIZE);
+		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, page, 0,
+				   number_of_ivs * CIPHER_BLOCK_SIZE);
+		hws->ivsize = 0;
+	}
+out:
+	kfree(ivs);
+	return err;
+}
+
+static unsigned int keyid_to_addr(int start_addr, int keyid)
+{
+	return ((start_addr + (keyid * TLS_KEY_CONTEXT_SZ)) >> 5);
+}
+
+/* Copy Key to WR */
+static void tls_copy_tx_key(struct sock *sk, struct sk_buff *skb)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct ulptx_sc_memrd *sc_memrd;
+	struct ulptx_idata *sc;
+	struct chtls_hws *hws = &csk->tlshws;
+	struct chtls_dev *cdev = csk->cdev;
+	u32 immdlen;
+
+	immdlen = sizeof(*sc) + sizeof(*sc_memrd);
+	sc = (struct ulptx_idata *)__skb_push(skb, immdlen);
+	if (sc) {
+		sc->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_NOOP));
+		sc->len = htonl(0);
+		sc_memrd = (struct ulptx_sc_memrd *)(sc + 1);
+		sc_memrd->cmd_to_len =
+				htonl(ULPTX_CMD_V(ULP_TX_SC_MEMRD) |
+				ULP_TX_SC_MORE_V(1) |
+				ULPTX_LEN16_V(hws->keylen >> 4));
+		sc_memrd->addr = htonl(keyid_to_addr(cdev->kmap.start,
+						     hws->txkey));
+	}
+}
+
+static __be64 tls_sequence_number(struct chtls_hws *hws)
+{
+	return (hws->tx_seq_no++);
+}
+
+static int is_sg_request(const struct sk_buff *skb)
+{
+	return (skb->peeked ||
+		(skb->len > MAX_IMM_ULPTX_WR_LEN));
+}
+
+/*
+ * Returns true if an sk_buff carries urgent data.
+ */
+static int skb_urgent(struct sk_buff *skb)
+{
+	return (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_URG) != 0;
+}
+
+/* TLS content type for CPL SFO */
+static unsigned char tls_content_type(unsigned char content_type)
+{
+	switch (content_type) {
+	case TLS_HDR_TYPE_CCS:
+		return CPL_TX_TLS_SFO_TYPE_CCS;
+	case TLS_HDR_TYPE_ALERT:
+		return CPL_TX_TLS_SFO_TYPE_ALERT;
+	case TLS_HDR_TYPE_HANDSHAKE:
+		return CPL_TX_TLS_SFO_TYPE_HANDSHAKE;
+	case TLS_HDR_TYPE_HEARTBEAT:
+		return CPL_TX_TLS_SFO_TYPE_HEARTBEAT;
+	}
+	return CPL_TX_TLS_SFO_TYPE_DATA;
+}
+
+static void tls_tx_data_wr(struct sock *sk, struct sk_buff *skb,
+			   int dlen, int tls_immd, u32 credits,
+			   int expn, int pdus)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_hws *hws = &csk->tlshws;
+	struct net_device *dev = csk->egress_dev;
+	struct adapter *adap = netdev2adap(dev);
+	struct fw_tlstx_data_wr *req_wr = NULL;
+	struct cpl_tx_tls_sfo *req_cpl = NULL;
+	int iv_imm = skb_ulp_tls_skb_iv(skb);
+	struct tls_scmd *scmd = &hws->scmd;
+	struct tls_scmd *updated_scmd;
+	unsigned int wr_ulp_mode_force;
+	unsigned char data_type = 0;
+	unsigned char *req = NULL;
+	int len = dlen + expn;
+	int immd_len;
+
+	dlen = (dlen < hws->mfs) ? dlen : hws->mfs;
+	atomic_inc(&adap->chcr_stats.tls_pdu_tx);
+
+	updated_scmd = scmd;
+	updated_scmd->seqno_numivs &= 0xffffff80;
+	updated_scmd->seqno_numivs |= SCMD_NUM_IVS_V(pdus);
+	hws->scmd = *updated_scmd;
+
+	req = (unsigned char *)__skb_push(skb, sizeof(struct cpl_tx_tls_sfo));
+	req_cpl = (struct cpl_tx_tls_sfo *)req;
+	req = (unsigned char *)__skb_push(skb, (sizeof(struct
+				fw_tlstx_data_wr)));
+
+	req_wr = (struct fw_tlstx_data_wr *)req;
+	immd_len = (tls_immd ? dlen : 0);
+	req_wr->op_to_immdlen =
+		htonl(FW_WR_OP_V(FW_TLSTX_DATA_WR) |
+		FW_TLSTX_DATA_WR_COMPL_V(1) |
+		FW_TLSTX_DATA_WR_IMMDLEN_V(immd_len));
+	req_wr->flowid_len16 = htonl(FW_TLSTX_DATA_WR_FLOWID_V(csk->tid) |
+				     FW_TLSTX_DATA_WR_LEN16_V(credits));
+	wr_ulp_mode_force = TX_ULP_MODE_V(ULP_MODE_TLS);
+
+	if (is_sg_request(skb))
+		wr_ulp_mode_force |= FW_OFLD_TX_DATA_WR_ALIGNPLD_F |
+			((tcp_sk(sk)->nonagle & TCP_NAGLE_OFF) ? 0 :
+			FW_OFLD_TX_DATA_WR_SHOVE_F);
+
+	req_wr->lsodisable_to_flags =
+			htonl(TX_ULP_MODE_V(ULP_MODE_TLS) |
+			      FW_OFLD_TX_DATA_WR_URGENT_V(skb_urgent(skb)) |
+			      T6_TX_FORCE_F | wr_ulp_mode_force |
+			      TX_SHOVE_V((!csk_flag(sk, CSK_TX_MORE_DATA)) &&
+					 skb_queue_empty(&csk->txq)));
+
+	req_wr->ctxloc_to_exp =
+			htonl(FW_TLSTX_DATA_WR_NUMIVS_V(pdus) |
+			      FW_TLSTX_DATA_WR_EXP_V(expn) |
+			      FW_TLSTX_DATA_WR_CTXLOC_V(CHTLS_KEY_CONTEXT_DDR) |
+			      FW_TLSTX_DATA_WR_IVDSGL_V(!iv_imm) |
+			      FW_TLSTX_DATA_WR_KEYSIZE_V(hws->keylen >> 4));
+
+	/* Fill in the length */
+	req_wr->plen = htonl(len);
+	req_wr->mfs = htons(hws->mfs);
+	req_wr->adjustedplen_pkd =
+		htons(FW_TLSTX_DATA_WR_ADJUSTEDPLEN_V(hws->adjustlen));
+	req_wr->expinplenmax_pkd =
+		htons(FW_TLSTX_DATA_WR_EXPINPLENMAX_V(hws->expansion));
+	req_wr->pdusinplenmax_pkd =
+		FW_TLSTX_DATA_WR_PDUSINPLENMAX_V(hws->pdus);
+	req_wr->r10 = 0;
+
+	data_type = tls_content_type(ULP_SKB_CB(skb)->ulp.tls.type);
+	req_cpl->op_to_seg_len = htonl(CPL_TX_TLS_SFO_OPCODE_V(CPL_TX_TLS_SFO) |
+				       CPL_TX_TLS_SFO_DATA_TYPE_V(data_type) |
+				       CPL_TX_TLS_SFO_CPL_LEN_V(2) |
+				       CPL_TX_TLS_SFO_SEG_LEN_V(dlen));
+	req_cpl->pld_len = htonl(len - expn);
+
+	req_cpl->type_protover = htonl(CPL_TX_TLS_SFO_TYPE_V
+		((data_type == CPL_TX_TLS_SFO_TYPE_HEARTBEAT) ?
+		TLS_HDR_TYPE_HEARTBEAT : 0) |
+		CPL_TX_TLS_SFO_PROTOVER_V(0));
+
+	/* create the s-command */
+	req_cpl->r1_lo = 0;
+	req_cpl->seqno_numivs  = htonl(hws->scmd.seqno_numivs);
+	req_cpl->ivgen_hdrlen = htonl(hws->scmd.ivgen_hdrlen);
+	req_cpl->scmd1 = cpu_to_be64(tls_sequence_number(hws));
+}
+
+/*
+ * Calculate the TLS data expansion size
+ */
+static int chtls_expansion_size(struct sock *sk, int data_len,
+				int fullpdu,
+				unsigned short *pducnt)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_hws *hws = &csk->tlshws;
+	struct tls_scmd *scmd = &hws->scmd;
+	int hdrlen = TLS_HEADER_LENGTH;
+	int expnsize = 0, frcnt = 0, fraglast = 0, fragsize = 0;
+	int expppdu = 0;
+
+	do {
+		fragsize = hws->mfs;
+
+		if (SCMD_CIPH_MODE_G(scmd->seqno_numivs) ==
+			SCMD_CIPH_MODE_AES_GCM) {
+			frcnt = (data_len / fragsize);
+			expppdu = GCM_TAG_SIZE + AEAD_EXPLICIT_DATA_SIZE +
+					hdrlen;
+			expnsize =  frcnt * expppdu;
+
+			if (fullpdu) {
+				*pducnt = data_len / (expppdu + fragsize);
+
+				if (*pducnt > 32)
+					*pducnt = 32;
+				else if (!*pducnt)
+					*pducnt = 1;
+				expnsize = (*pducnt) * expppdu;
+					break;
+			}
+			fraglast = data_len % fragsize;
+			if (fraglast > 0) {
+				frcnt += 1;
+				expnsize += expppdu;
+			}
+			break;
+		}
+		pr_info("unsupported cipher\n");
+		expnsize = 0;
+	} while (0);
+
+	return expnsize;
+}
+
+/* WR with IV, KEY and CPL SFO added */
+void make_tlstx_data_wr(struct sock *sk, struct sk_buff *skb,
+			int tls_tx_imm, int tls_len, u32 credits)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_hws *hws = &csk->tlshws;
+	int pdus = ceil(tls_len, hws->mfs);
+	unsigned short pdus_per_ulp = 0;
+	int expn_sz = 0;
+
+	expn_sz = chtls_expansion_size(sk, tls_len, 0, NULL);
+	if (!hws->compute) {
+		hws->expansion = chtls_expansion_size(sk,
+						      hws->fcplenmax,
+						      1, &pdus_per_ulp);
+		hws->pdus = pdus_per_ulp;
+		hws->adjustlen = hws->pdus *
+			((hws->expansion / hws->pdus) + hws->mfs);
+		hws->compute = 1;
+	}
+	tls_copy_ivs(sk, skb);
+	tls_copy_tx_key(sk, skb);
+	tls_tx_data_wr(sk, skb, tls_len, tls_tx_imm, credits, expn_sz, pdus);
+	hws->tx_seq_no += (pdus - 1);
+}
+
+static void make_tx_data_wr(struct sock *sk, struct sk_buff *skb,
+			    unsigned int immdlen, int len,
+			    u32 credits, u32 compl)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	unsigned int opcode = FW_OFLD_TX_DATA_WR;
+	struct fw_ofld_tx_data_wr *req;
+	unsigned int wr_ulp_mode_force;
+
+	req = (struct fw_ofld_tx_data_wr *)__skb_push(skb, sizeof(*req));
+	req->op_to_immdlen = htonl(WR_OP_V(opcode) |
+				FW_WR_COMPL_V(compl) |
+				FW_WR_IMMDLEN_V(immdlen));
+	req->flowid_len16 = htonl(FW_WR_FLOWID_V(csk->tid) |
+				FW_WR_LEN16_V(credits));
+
+	wr_ulp_mode_force = TX_ULP_MODE_V(csk->ulp_mode);
+	if (is_sg_request(skb))
+		wr_ulp_mode_force |= FW_OFLD_TX_DATA_WR_ALIGNPLD_F |
+			((tcp_sk(sk)->nonagle & TCP_NAGLE_OFF) ? 0 :
+				FW_OFLD_TX_DATA_WR_SHOVE_F);
+
+	req->tunnel_to_proxy = htonl(wr_ulp_mode_force |
+			FW_OFLD_TX_DATA_WR_URGENT_V(skb_urgent(skb)) |
+			FW_OFLD_TX_DATA_WR_SHOVE_V((!csk_flag
+					(sk, CSK_TX_MORE_DATA)) &&
+					 skb_queue_empty(&csk->txq)));
+	req->plen = htonl(len);
+}
+
+static int chtls_wr_size(struct chtls_sock *csk, const struct sk_buff *skb,
+			 bool size)
+{
+	int wr_size;
+
+	wr_size = TLS_WR_CPL_LEN;
+	wr_size += key_size(csk->sk);
+	wr_size += ivs_size(csk->sk, skb);
+
+	if (size)
+		return wr_size;
+
+	/* frags counted for IV dsgl */
+	if (!skb_ulp_tls_skb_iv(skb))
+		skb_shinfo(skb)->nr_frags++;
+
+	return wr_size;
+}
+
+static bool is_ofld_imm(struct chtls_sock *csk,
+			const struct sk_buff *skb)
+{
+	int length = skb->len;
+
+	if (skb->peeked || skb->len > MAX_IMM_ULPTX_WR_LEN)
+		return false;
+
+	if (likely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR)) {
+		/* Check TLS header len for Immediate */
+		if (csk->ulp_mode == ULP_MODE_TLS &&
+		    is_tls_skb(csk, skb))
+			length += chtls_wr_size(csk, skb, true);
+		else
+			length += sizeof(struct fw_ofld_tx_data_wr);
+
+		return length <= MAX_IMM_OFLD_TX_DATA_WR_LEN;
+	}
+	return true;
+}
+
+static unsigned int calc_tx_flits(const struct sk_buff *skb,
+				  unsigned int immdlen)
+{
+	unsigned int flits, cnt;
+
+	flits = immdlen / 8;   /* headers */
+	cnt = skb_shinfo(skb)->nr_frags;
+	if (skb_tail_pointer(skb) != skb_transport_header(skb))
+		cnt++;
+	return flits + sgl_len(cnt);
+}
+
+static void arp_failure_discard(void *handle, struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+int chtls_push_frames(struct chtls_sock *csk, int comp)
+{
+	struct sock *sk = csk->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct chtls_hws *hws = &csk->tlshws;
+	struct sk_buff *skb;
+	int total_size = 0;
+	int wr_size = sizeof(struct fw_ofld_tx_data_wr);
+
+	if (unlikely(sk_in_state(sk, TCPF_SYN_SENT | TCPF_CLOSE)))
+		return 0;
+
+	if (unlikely(csk_flag(sk, CSK_ABORT_SHUTDOWN)))
+		return 0;
+
+	while (csk->wr_credits && (skb = skb_peek(&csk->txq)) &&
+	       (!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_HOLD) ||
+		skb_queue_len(&csk->txq) > 1)) {
+		unsigned int immdlen;
+		unsigned int credits_needed;
+		int len = skb->len;    /* length [ulp bytes] inserted by hw */
+		int tls_len = skb->len;/* TLS data len before IV/key */
+		unsigned int credit_len = skb->len;
+		unsigned int completion = 0;
+		int flowclen16 = 0;
+		int tls_tx_imm = 0;
+
+		immdlen = skb->len;
+		if (!is_ofld_imm(csk, skb)) {
+			immdlen = skb_transport_offset(skb);
+			if (is_tls_skb(csk, skb))
+				wr_size = chtls_wr_size(csk, skb, false);
+			credit_len = 8 * calc_tx_flits(skb, immdlen);
+		} else {
+			if (is_tls_skb(csk, skb)) {
+				wr_size = chtls_wr_size(csk, skb, false);
+				tls_tx_imm = 1;
+			}
+		}
+		if (likely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR))
+			credit_len += wr_size;
+		credits_needed = DIV_ROUND_UP(credit_len, 16);
+		if (!csk_flag_nochk(csk, CSK_TX_DATA_SENT)) {
+			flowclen16 = send_tx_flowc_wr(sk, 1, tp->snd_nxt,
+						      tp->rcv_nxt);
+			if (flowclen16 <= 0)
+				break;
+			csk->wr_credits -= flowclen16;
+			csk->wr_unacked += flowclen16;
+			csk->wr_nondata += flowclen16;
+			csk_set_flag(csk, CSK_TX_DATA_SENT);
+		}
+
+		if (csk->wr_credits < credits_needed) {
+			if (is_tls_skb(csk, skb) &&
+			    !skb_ulp_tls_skb_iv(skb))
+				skb_shinfo(skb)->nr_frags--;
+			break;
+		}
+
+		__skb_unlink(skb, &csk->txq);
+		set_queue(skb, (csk->txq_idx << 1) | CPL_PRIORITY_DATA, sk);
+		if (is_tls_hw(csk))
+			hws->txqid = (skb->queue_mapping >> 1);
+		skb->csum = credits_needed + csk->wr_nondata;
+		csk->wr_credits -= credits_needed;
+		csk->wr_unacked += credits_needed;
+		csk->wr_nondata = 0;
+		enqueue_wr(csk, skb);
+
+		if (likely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR)) {
+			if ((comp && csk->wr_unacked == credits_needed) ||
+			    (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_COMPL) ||
+			    csk->wr_unacked >= csk->wr_max_credits / 2) {
+				completion = 1;
+				csk->wr_unacked = 0;
+			}
+			if (is_tls_skb(csk, skb))
+				make_tlstx_data_wr(sk, skb, tls_tx_imm,
+						   tls_len, credits_needed);
+			else
+				make_tx_data_wr(sk, skb, immdlen, len,
+						credits_needed, completion);
+			tp->snd_nxt += len;
+			tp->lsndtime = tcp_time_stamp(tp);
+			if (completion)
+				ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_NEED_HDR;
+		} else {
+			struct cpl_close_con_req *req = cplhdr(skb);
+			unsigned int cmd  = CPL_OPCODE_G(ntohl
+					     (OPCODE_TID(req)));
+
+			if (cmd == CPL_CLOSE_CON_REQ)
+				csk_set_flag(csk,
+					     CSK_CLOSE_CON_REQUESTED);
+
+			if ((ULP_SKB_CB(skb)->flags & ULPCB_FLAG_COMPL) &&
+			    (csk->wr_unacked >= csk->wr_max_credits / 2)) {
+				req->wr.wr_hi |= htonl(FW_WR_COMPL_F);
+				csk->wr_unacked = 0;
+			}
+		}
+		total_size += skb->truesize;
+		if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_BARRIER)
+			csk_set_flag(csk, CSK_TX_WAIT_IDLE);
+		t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+		cxgb4_l2t_send(csk->egress_dev, skb, csk->l2t_entry);
+	}
+	sk->sk_wmem_queued -= total_size;
+	return total_size;
+}
+
+static void mark_urg(struct tcp_sock *tp, int flags,
+		     struct sk_buff *skb)
+{
+	if (unlikely(flags & MSG_OOB)) {
+		tp->snd_up = tp->write_seq;
+			ULP_SKB_CB(skb)->flags = ULPCB_FLAG_URG |
+						 ULPCB_FLAG_BARRIER |
+						 ULPCB_FLAG_NO_APPEND |
+						 ULPCB_FLAG_NEED_HDR;
+	}
+}
+
+/*
+ * Returns true if a connection should send more data to the TOE ASAP.
+ */
+static int should_push(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct chtls_dev *cdev = csk->cdev;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/*
+	 * If we've released our offload resources there's nothing to do ...
+	 */
+	if (!cdev)
+		return 0;
+
+	/*
+	 * If there aren't any work requests in flight, or there isn't enough
+	 * data in flight, or Nagle is off then send the current TX_DATA
+	 * otherwise hold it and wait to accumulate more data.
+	 */
+	return csk->wr_credits == csk->wr_max_credits ||
+		(tp->nonagle & TCP_NAGLE_OFF);
+}
+
+/*
+ * Returns true if a TCP socket is corked.
+ */
+static int corked(const struct tcp_sock *tp, int flags)
+{
+	return (flags & MSG_MORE) | (tp->nonagle & TCP_NAGLE_CORK);
+}
+
+/*
+ * Returns true if a send should try to push new data.
+ */
+static int send_should_push(struct sock *sk, int flags)
+{
+	return should_push(sk) && !corked(tcp_sk(sk), flags);
+}
+
+void chtls_tcp_push(struct sock *sk, int flags)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	int qlen = skb_queue_len(&csk->txq);
+
+	if (likely(qlen)) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		struct sk_buff *skb = skb_peek_tail(&csk->txq);
+
+		mark_urg(tp, flags, skb);
+
+		if (!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) &&
+		    corked(tp, flags)) {
+			ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_HOLD;
+			return;
+		}
+
+		ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_HOLD;
+		if (qlen == 1 &&
+		    ((ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
+		     should_push(sk)))
+			chtls_push_frames(csk, 1);
+	}
+}
+
+/*
+ * Calculate the size for a new send sk_buff.  It's maximum size so we can
+ * pack lots of data into it, unless we plan to send it immediately, in which
+ * case we size it more tightly.
+ *
+ * Note: we don't bother compensating for MSS < PAGE_SIZE because it doesn't
+ * arise in normal cases and when it does we are just wasting memory.
+ */
+static int select_size(struct sock *sk, int io_len,
+		       int flags, int len)
+{
+	const int pgbreak = SKB_MAX_HEAD(len);
+
+	/*
+	 * If the data wouldn't fit in the main body anyway, put only the
+	 * header in the main body so it can use immediate data and place all
+	 * the payload in page fragments.
+	 */
+	if (io_len > pgbreak)
+		return 0;
+
+	/*
+	 * If we will be accumulating payload get a large main body.
+	 */
+	if (!send_should_push(sk, flags))
+		return pgbreak;
+
+	return io_len;
+}
+
+void skb_entail(struct sock *sk, struct sk_buff *skb, int flags)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ULP_SKB_CB(skb)->seq = tp->write_seq;
+	ULP_SKB_CB(skb)->flags = flags;
+	__skb_queue_tail(&csk->txq, skb);
+	sk->sk_wmem_queued += skb->truesize;
+
+	if (TCP_PAGE(sk) && TCP_OFF(sk)) {
+		put_page(TCP_PAGE(sk));
+		TCP_PAGE(sk) = NULL;
+		TCP_OFF(sk) = 0;
+	}
+}
+
+static struct sk_buff *get_tx_skb(struct sock *sk, int size)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(size + TX_HEADER_LEN, sk->sk_allocation);
+	if (likely(skb)) {
+		skb_reserve(skb, TX_HEADER_LEN);
+		skb_entail(sk, skb, ULPCB_FLAG_NEED_HDR);
+		skb_reset_transport_header(skb);
+	}
+	return skb;
+}
+
+static struct sk_buff *get_record_skb(struct sock *sk, int size, bool zcopy)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct sk_buff *skb;
+
+	skb = alloc_skb(((zcopy ? 0 : size) + TX_TLSHDR_LEN +
+			key_size(sk) + max_ivs_size(sk, size)),
+			sk->sk_allocation);
+	if (likely(skb)) {
+		skb_reserve(skb, (TX_TLSHDR_LEN +
+			    key_size(sk) + max_ivs_size(sk, size)));
+		skb_entail(sk, skb, ULPCB_FLAG_NEED_HDR);
+		skb_reset_transport_header(skb);
+		ULP_SKB_CB(skb)->ulp.tls.ofld = 1;
+		ULP_SKB_CB(skb)->ulp.tls.type = csk->tlshws.type;
+	}
+	return skb;
+}
+
+static void tx_skb_finalize(struct sk_buff *skb)
+{
+	struct ulp_skb_cb *cb = ULP_SKB_CB(skb);
+
+	if (!(cb->flags & ULPCB_FLAG_NO_HDR))
+		cb->flags = ULPCB_FLAG_NEED_HDR;
+	cb->flags |= ULPCB_FLAG_NO_APPEND;
+}
+
+static void push_frames_if_head(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+
+	if (skb_queue_len(&csk->txq) == 1)
+		chtls_push_frames(csk, 1);
+}
+
+static int chtls_skb_copy_to_page_nocache(struct sock *sk,
+					  struct iov_iter *from,
+					  struct sk_buff *skb,
+					  struct page *page,
+					  int off, int copy)
+{
+	int err;
+
+	err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) +
+				       off, copy, skb->len);
+	if (err)
+		return err;
+
+	skb->len             += copy;
+	skb->data_len        += copy;
+	skb->truesize        += copy;
+	sk->sk_wmem_queued   += copy;
+	return 0;
+}
+
+/* Read TLS header to find content type and data length */
+static int tls_header_read(struct tls_hdr *thdr, struct iov_iter *from)
+{
+	if (copy_from_iter(thdr, sizeof(*thdr), from) != sizeof(*thdr))
+		return -EFAULT;
+	return htons(thdr->length);
+}
+
+int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct chtls_dev *cdev = csk->cdev;
+	struct sk_buff *skb;
+	int mss, flags, err;
+	int copied = 0;
+	int hdrlen = 0;
+	int recordsz = 0;
+	long timeo;
+
+	lock_sock(sk);
+	flags = msg->msg_flags;
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
+		err = sk_stream_wait_connect(sk, &timeo);
+		if (err)
+			goto out_err;
+	}
+
+	if (sk->sk_prot->sendmsg != chtls_sendmsg) {
+		release_sock(sk);
+		if (sk->sk_prot->sendmsg)
+			return sk->sk_prot->sendmsg(sk, msg, size);
+		else
+			return sk->sk_socket->ops->sendmsg(sk->sk_socket,
+							   msg, size);
+	}
+
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		goto out_err;
+
+	mss = csk->mss;
+	csk_set_flag(csk, CSK_TX_MORE_DATA);
+
+	while (msg_data_left(msg)) {
+		int copy = 0;
+
+		skb = skb_peek_tail(&csk->txq);
+		if (skb) {
+			copy = mss - skb->len;
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		}
+
+		if (is_tls_tx(csk) && !csk->tlshws.txleft) {
+			struct tls_hdr hdr;
+
+			recordsz = tls_header_read(&hdr, &msg->msg_iter);
+			size -= TLS_HEADER_LENGTH;
+			hdrlen += TLS_HEADER_LENGTH;
+			csk->tlshws.txleft = recordsz;
+			csk->tlshws.type = hdr.type;
+			if (skb)
+				ULP_SKB_CB(skb)->ulp.tls.type = hdr.type;
+		}
+
+		if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
+		    copy <= 0) {
+new_buf:
+			if (skb) {
+				tx_skb_finalize(skb);
+				push_frames_if_head(sk);
+			}
+
+			if (is_tls_tx(csk)) {
+				skb = get_record_skb(sk,
+						     select_size(sk,
+								 recordsz,
+								 flags,
+								 TX_TLSHDR_LEN),
+								 false);
+			} else {
+				skb = get_tx_skb(sk,
+						 select_size(sk, size, flags,
+							     TX_HEADER_LEN));
+			}
+			if (unlikely(!skb))
+				goto wait_for_memory;
+
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			copy = mss;
+		}
+		if (copy > size)
+			copy = size;
+
+		if (skb_tailroom(skb) > 0) {
+			copy = min(copy, skb_tailroom(skb));
+			if (is_tls_tx(csk))
+				copy = min_t(int, copy, csk->tlshws.txleft);
+			err = skb_add_data_nocache(sk, skb,
+						   &msg->msg_iter, copy);
+			if (err)
+				goto do_fault;
+		} else {
+			bool merge;
+			int off = TCP_OFF(sk);
+			int i = skb_shinfo(skb)->nr_frags;
+			struct page *page = TCP_PAGE(sk);
+			int pg_size = PAGE_SIZE;
+
+			if (page)
+				pg_size <<= compound_order(page);
+
+			if (off < pg_size &&
+			    skb_can_coalesce(skb, i, page, off)) {
+				merge = 1;
+				goto copy;
+			}
+			merge = 0;
+			if (i == (is_tls_tx(csk) ? (MAX_SKB_FRAGS - 1) :
+			    MAX_SKB_FRAGS))
+				goto new_buf;
+
+			if (page && off == pg_size) {
+				put_page(page);
+				TCP_PAGE(sk) = page = NULL;
+				pg_size = PAGE_SIZE;
+			}
+
+			if (!page) {
+				gfp_t gfp = sk->sk_allocation;
+				int order = cdev->send_page_order;
+
+				if (order) {
+					page = alloc_pages(gfp | __GFP_COMP |
+							   __GFP_NOWARN |
+							   __GFP_NORETRY,
+							   order);
+					if (page)
+						pg_size <<=
+							compound_order(page);
+				}
+				if (!page) {
+					page = alloc_page(gfp);
+					pg_size = PAGE_SIZE;
+				}
+				if (!page)
+					goto wait_for_memory;
+				off = 0;
+			}
+copy:
+			if (copy > pg_size - off)
+				copy = pg_size - off;
+			if (is_tls_tx(csk))
+				copy = min_t(int, copy, csk->tlshws.txleft);
+
+			err = chtls_skb_copy_to_page_nocache(sk, &msg->msg_iter,
+							     skb, page,
+							     off, copy);
+			if (unlikely(err)) {
+				if (!TCP_PAGE(sk)) {
+					TCP_PAGE(sk) = page;
+					TCP_OFF(sk) = 0;
+				}
+				goto do_fault;
+			}
+			/* Update the skb. */
+			if (merge) {
+				skb_shinfo(skb)->frags[i - 1].size += copy;
+			} else {
+				skb_fill_page_desc(skb, i, page, off, copy);
+				if (off + copy < pg_size) {
+					/* space left keep page */
+					get_page(page);
+					TCP_PAGE(sk) = page;
+				} else {
+					TCP_PAGE(sk) = NULL;
+				}
+			}
+			TCP_OFF(sk) = off + copy;
+		}
+		if (unlikely(skb->len == mss))
+			tx_skb_finalize(skb);
+		tp->write_seq += copy;
+		copied += copy;
+		size -= copy;
+
+		if (is_tls_tx(csk))
+			csk->tlshws.txleft -= copy;
+
+		if (corked(tp, flags) &&
+		    (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)))
+			ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_NO_APPEND;
+
+		if (size == 0)
+			goto out;
+
+		if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND)
+			push_frames_if_head(sk);
+		continue;
+wait_for_memory:
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto do_error;
+	}
+out:
+	csk_reset_flag(csk, CSK_TX_MORE_DATA);
+	if (copied)
+		chtls_tcp_push(sk, flags);
+done:
+	release_sock(sk);
+	return copied + hdrlen;
+do_fault:
+	if (!skb->len) {
+		__skb_unlink(skb, &csk->txq);
+		sk->sk_wmem_queued -= skb->truesize;
+		__kfree_skb(skb);
+	}
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	if (sock_flag(sk, SOCK_INLINE))
+		csk_reset_flag(csk, CSK_TX_MORE_DATA);
+	copied = sk_stream_error(sk, flags, err);
+	goto done;
+}
+
+int chtls_sendpage(struct sock *sk, struct page *page,
+		   int offset, size_t size, int flags)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct chtls_sock *csk;
+	int mss, err, copied = 0;
+	long timeo;
+
+	if (sk->sk_prot->sendpage != chtls_sendpage) {
+		release_sock(sk);
+		if (sk->sk_prot->sendpage)
+			return sk->sk_prot->sendpage(sk, page, offset,
+						     size, flags);
+		else
+			return sk->sk_socket->ops->sendpage(sk->sk_socket,
+							    page, offset,
+							    size, flags);
+	}
+
+	csk = rcu_dereference_sk_user_data(sk);
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	err = sk_stream_wait_connect(sk, &timeo);
+	if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+	    err != 0)
+		goto out_err;
+
+	mss = csk->mss;
+	csk_set_flag(csk, CSK_TX_MORE_DATA);
+
+	while (size > 0) {
+		int copy, i;
+		struct sk_buff *skb = skb_peek_tail(&csk->txq);
+
+		copy = mss - skb->len;
+		if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
+		    copy <= 0) {
+new_buf:
+
+			if (is_tls_tx(csk)) {
+				skb = get_record_skb(sk,
+						     select_size(sk, size,
+								 flags,
+								 TX_TLSHDR_LEN),
+						     true);
+			} else {
+				skb = get_tx_skb(sk, 0);
+			}
+			if (!skb)
+				goto do_error;
+			copy = mss;
+		}
+		if (copy > size)
+			copy = size;
+
+		i = skb_shinfo(skb)->nr_frags;
+		if (skb_can_coalesce(skb, i, page, offset)) {
+			skb_shinfo(skb)->frags[i - 1].size += copy;
+		} else if (i < MAX_SKB_FRAGS) {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, copy);
+		} else {
+			tx_skb_finalize(skb);
+			push_frames_if_head(sk);
+			goto new_buf;
+		}
+
+		skb->len += copy;
+		if (skb->len == mss)
+			tx_skb_finalize(skb);
+		skb->data_len += copy;
+		skb->truesize += copy;
+		sk->sk_wmem_queued += copy;
+		tp->write_seq += copy;
+		copied += copy;
+		offset += copy;
+		size -= copy;
+
+		if (corked(tp, flags) &&
+		    (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)))
+			ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_NO_APPEND;
+
+		if (!size)
+			break;
+
+		if (unlikely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND))
+			push_frames_if_head(sk);
+		continue;
+
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+	}
+out:
+	csk_reset_flag(csk, CSK_TX_MORE_DATA);
+	if (copied)
+		chtls_tcp_push(sk, flags);
+done:
+	release_sock(sk);
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+
+out_err:
+	if (sock_flag(sk, SOCK_INLINE))
+		csk_reset_flag(csk, CSK_TX_MORE_DATA);
+	copied = sk_stream_error(sk, flags, err);
+	goto done;
+}
+
+/*
+ * Returns true if a socket cannot accept new Rx data.
+ */
+static int sk_no_receive(const struct sock *sk)
+{
+	return (sk->sk_shutdown & RCV_SHUTDOWN);
+}
+
+static void chtls_select_window(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int wnd = tp->rcv_wnd;
+
+	wnd = max_t(unsigned int, wnd, tcp_full_space(sk));
+	wnd = max_t(unsigned int, MIN_RCV_WND, wnd);
+
+	if (wnd > MAX_RCV_WND)
+		wnd = MAX_RCV_WND;
+
+/*
+ * Check if we need to grow the receive window in response to an increase in
+ * the socket's receive buffer size.  Some applications increase the buffer
+ * size dynamically and rely on the window to grow accordingly.
+ */
+
+	if (wnd > tp->rcv_wnd) {
+		tp->rcv_wup -= wnd - tp->rcv_wnd;
+		tp->rcv_wnd = wnd;
+		/* Mark the receive window as updated */
+		csk_reset_flag(csk, CSK_UPDATE_RCV_WND);
+	}
+}
+
+static u32 send_rx_credits(struct chtls_sock *csk, u32 credits)
+{
+	struct cpl_rx_data_ack *req;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(sizeof(*req), GFP_ATOMIC);
+	if (!skb)
+		return 0;
+	__skb_put(skb, sizeof(*req));
+	req = (struct cpl_rx_data_ack *)skb->head;
+
+	set_wr_txq(skb, CPL_PRIORITY_ACK, csk->port_id);
+	INIT_TP_WR(req, csk->tid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK,
+						    csk->tid));
+	req->credit_dack = cpu_to_be32(RX_CREDITS_V(credits) |
+				       RX_FORCE_ACK_F);
+	cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb);
+	return credits;
+}
+
+#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | \
+			     TCPF_FIN_WAIT1 | \
+			     TCPF_FIN_WAIT2)
+/*
+ * Called after some received data has been read.  It returns RX credits
+ * to the HW for the amount of data processed.
+ */
+static void chtls_cleanup_rbuf(struct sock *sk, int copied)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct tcp_sock *tp;
+	int must_send;
+	u32 credits;
+	u32 thres = 15 * 1024;
+
+	if (!sk_in_state(sk, CREDIT_RETURN_STATE))
+		return;
+
+	chtls_select_window(sk);
+	tp = tcp_sk(sk);
+	credits = tp->copied_seq - tp->rcv_wup;
+	if (unlikely(!credits))
+		return;
+
+	/*
+	 * For coalescing to work effectively ensure the receive window has
+	 * at least 16KB left.
+	 */
+	must_send = credits + 16384 >= tp->rcv_wnd;
+
+	if (must_send || credits >= thres)
+		tp->rcv_wup += send_rx_credits(csk, credits);
+}
+
+static int chtls_pt_recvmsg(struct sock *sk,
+			    struct msghdr *msg, size_t len,
+			    int nonblock, int flags, int *addr_len)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct net_device *dev = csk->egress_dev;
+	struct adapter *adap = netdev2adap(dev);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct chtls_hws *hws = &csk->tlshws;
+	int copied = 0, buffers_freed = 0;
+	int target;
+	int request;
+	long timeo;
+	unsigned long avail;
+
+	timeo = sock_rcvtimeo(sk, nonblock);
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+	request = len;
+
+	if (unlikely(csk_flag(sk, CSK_UPDATE_RCV_WND)))
+		chtls_cleanup_rbuf(sk, copied);
+
+	do {
+		struct sk_buff *skb;
+		u32 offset = 0;
+
+		if (unlikely(tp->urg_data &&
+			     tp->urg_seq == tp->copied_seq)) {
+			if (copied)
+				break;
+			if (signal_pending(current)) {
+				copied = timeo ? sock_intr_errno(timeo) :
+					 -EAGAIN;
+				break;
+			}
+		}
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb)
+			goto found_ok_skb;
+		if (csk->wr_credits &&
+		    skb_queue_len(&csk->txq) &&
+		    chtls_push_frames(csk, csk->wr_credits ==
+				       csk->wr_max_credits))
+			sk->sk_write_space(sk);
+
+		if (copied >= target && !sk->sk_backlog.tail)
+			break;
+
+		if (copied) {
+			if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+			    sk_no_receive(sk) ||
+			    signal_pending(current))
+				break;
+
+			if (!timeo)
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				copied = sock_error(sk);
+				break;
+			}
+			if (sk_no_receive(sk))
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				copied = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+			if (signal_pending(current)) {
+				copied = sock_intr_errno(timeo);
+				break;
+			}
+		}
+		if (sk->sk_backlog.tail) {
+			release_sock(sk);
+			lock_sock(sk);
+			chtls_cleanup_rbuf(sk, copied);
+			continue;
+		}
+
+		if (copied >= target)
+			break;
+		chtls_cleanup_rbuf(sk, copied);
+		sk_wait_data(sk, &timeo, NULL);
+		continue;
+found_ok_skb:
+		if (!skb->len) {
+			skb_dst_set(skb, NULL);
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			kfree_skb(skb);
+
+			if (!copied && !timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+
+			if (copied < target) {
+				release_sock(sk);
+				lock_sock(sk);
+				continue;
+			}
+			break;
+		}
+		offset = hws->copied_seq;
+		avail = skb->len - offset;
+		if (len < avail)
+			avail = len;
+
+		if (unlikely(tp->urg_data)) {
+			u32 urg_offset = tp->urg_seq - tp->copied_seq;
+
+			if (urg_offset < avail) {
+				if (urg_offset) {
+					avail = urg_offset;
+				} else if (!sock_flag(sk, SOCK_URGINLINE)) {
+					/* First byte is urgent, skip */
+					tp->copied_seq++;
+					offset++;
+					avail--;
+					if (!avail)
+						goto skip_copy;
+				}
+			}
+		}
+		if (hws->rstate == TLS_RCV_ST_READ_BODY) {
+			if (skb_copy_datagram_msg(skb, offset,
+						  msg, avail)) {
+				if (!copied) {
+					copied = -EFAULT;
+					break;
+				}
+			}
+		} else {
+			struct tlsrx_cmp_hdr *tls_hdr_pkt =
+				(struct tlsrx_cmp_hdr *)skb->data;
+
+			if ((tls_hdr_pkt->res_to_mac_error &
+			     TLSRX_HDR_PKT_ERROR_M))
+				tls_hdr_pkt->type = 0x7F;
+
+			/* CMP pld len is for recv seq */
+			hws->rcvpld = skb->hdr_len;
+			if (skb_copy_datagram_msg(skb, offset, msg, avail)) {
+				if (!copied) {
+					copied = -EFAULT;
+					break;
+				}
+			}
+		}
+		copied += avail;
+		len -= avail;
+		hws->copied_seq += avail;
+skip_copy:
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
+			tp->urg_data = 0;
+
+		if (hws->rstate == TLS_RCV_ST_READ_BODY &&
+		    (avail + offset) >= skb->len) {
+			if (likely(skb))
+				chtls_free_skb(sk, skb);
+			buffers_freed++;
+			hws->rstate = TLS_RCV_ST_READ_HEADER;
+			atomic_inc(&adap->chcr_stats.tls_pdu_rx);
+			tp->copied_seq += hws->rcvpld;
+			hws->copied_seq = 0;
+			if (copied >= target &&
+			    !skb_peek(&sk->sk_receive_queue))
+				break;
+		} else {
+			if (likely(skb)) {
+				if (ULP_SKB_CB(skb)->flags &
+				    ULPCB_FLAG_TLS_ND)
+					hws->rstate =
+						TLS_RCV_ST_READ_HEADER;
+				else
+					hws->rstate =
+						TLS_RCV_ST_READ_BODY;
+				chtls_free_skb(sk, skb);
+			}
+			buffers_freed++;
+			tp->copied_seq += avail;
+			hws->copied_seq = 0;
+		}
+	} while (len > 0);
+
+	if (buffers_freed)
+		chtls_cleanup_rbuf(sk, copied);
+	release_sock(sk);
+	return copied;
+}
+
+/*
+ * Peek at data in a socket's receive buffer.
+ */
+static int peekmsg(struct sock *sk, struct msghdr *msg,
+		   size_t len, int nonblock, int flags)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	u32 peek_seq, offset;
+	int copied = 0;
+	size_t avail;          /* amount of available data in current skb */
+	long timeo;
+
+	lock_sock(sk);
+	timeo = sock_rcvtimeo(sk, nonblock);
+	peek_seq = tp->copied_seq;
+
+	do {
+		if (unlikely(tp->urg_data && tp->urg_seq == peek_seq)) {
+			if (copied)
+				break;
+			if (signal_pending(current)) {
+				copied = timeo ? sock_intr_errno(timeo) :
+						 -EAGAIN;
+				break;
+			}
+		}
+
+		skb_queue_walk(&sk->sk_receive_queue, skb) {
+			offset = peek_seq - ULP_SKB_CB(skb)->seq;
+			if (offset < skb->len)
+				goto found_ok_skb;
+		}
+
+		/* empty receive queue */
+		if (copied)
+			break;
+		if (sock_flag(sk, SOCK_DONE))
+			break;
+		if (sk->sk_err) {
+			copied = sock_error(sk);
+			break;
+		}
+		if (sk_no_receive(sk))
+			break;
+		if (sk->sk_state == TCP_CLOSE) {
+			copied = -ENOTCONN;
+			break;
+		}
+		if (!timeo) {
+			copied = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			copied = sock_intr_errno(timeo);
+			break;
+		}
+
+		if (sk->sk_backlog.tail) {
+			/* Do not sleep, just process backlog. */
+			release_sock(sk);
+			lock_sock(sk);
+		} else {
+			sk_wait_data(sk, &timeo, NULL);
+		}
+
+		if (unlikely(peek_seq != tp->copied_seq)) {
+			if (net_ratelimit())
+				pr_info("TCP(%s:%d), race in MSG_PEEK.\n",
+					current->comm, current->pid);
+			peek_seq = tp->copied_seq;
+		}
+		continue;
+
+found_ok_skb:
+		avail = skb->len - offset;
+		if (len < avail)
+			avail = len;
+
+		/*
+		 * Do we have urgent data here?  We need to skip over the
+		 * urgent byte.
+		 */
+		if (unlikely(tp->urg_data)) {
+			u32 urg_offset = tp->urg_seq - peek_seq;
+
+			if (urg_offset < avail) {
+				/*
+				 * The amount of data we are preparing to copy
+				 * contains urgent data.
+				 */
+				if (!urg_offset) { /* First byte is urgent */
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						peek_seq++;
+						offset++;
+						avail--;
+					}
+					if (!avail)
+						continue;
+				} else {
+					/* stop short of the urgent data */
+					avail = urg_offset;
+				}
+			}
+		}
+
+		/*
+		 * If MSG_TRUNC is specified the data is discarded.
+		 */
+		if (likely(!(flags & MSG_TRUNC)))
+			if (skb_copy_datagram_msg(skb, offset, msg, len)) {
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+		peek_seq += avail;
+		copied += avail;
+		len -= avail;
+	} while (len > 0);
+
+	release_sock(sk);
+	return copied;
+}
+
+int chtls_recvmsg(struct sock *sk, struct msghdr *msg,
+		  size_t len, int nonblock,
+		  int flags, int *addr_len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct chtls_sock *csk;
+	struct chtls_hws *hws;
+	unsigned long avail;    /* amount of available data in current skb */
+	int buffers_freed = 0;
+	int copied = 0;
+	long timeo;
+	int target;             /* Read at least this many bytes */
+	int request;
+
+	if (unlikely(flags & MSG_OOB))
+		return tcp_prot.recvmsg(sk, msg, len, nonblock, flags,
+					addr_len);
+
+	if (unlikely(flags & MSG_PEEK))
+		return peekmsg(sk, msg, len, nonblock, flags);
+
+	if (sk_can_busy_loop(sk) &&
+	    skb_queue_empty(&sk->sk_receive_queue) &&
+	    sk->sk_state == TCP_ESTABLISHED)
+		sk_busy_loop(sk, nonblock);
+
+	lock_sock(sk);
+
+	if (sk->sk_prot->recvmsg != chtls_recvmsg) {
+		release_sock(sk);
+		return sk->sk_prot->recvmsg(sk, msg, len, nonblock,
+					    flags, addr_len);
+	}
+
+	csk = rcu_dereference_sk_user_data(sk);
+	hws = &csk->tlshws;
+
+	if (is_tls_rx(csk))
+		return chtls_pt_recvmsg(sk, msg, len, nonblock,
+					flags, addr_len);
+
+	timeo = sock_rcvtimeo(sk, nonblock);
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+	request = len;
+
+	if (unlikely(csk_flag(sk, CSK_UPDATE_RCV_WND)))
+		chtls_cleanup_rbuf(sk, copied);
+
+	do {
+		struct sk_buff *skb;
+		u32 offset;
+
+		if (unlikely(tp->urg_data && tp->urg_seq == tp->copied_seq)) {
+			if (copied)
+				break;
+			if (signal_pending(current)) {
+				copied = timeo ? sock_intr_errno(timeo) :
+					 -EAGAIN;
+				break;
+			}
+		}
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb)
+			goto found_ok_skb;
+
+		if (csk->wr_credits &&
+		    skb_queue_len(&csk->txq) &&
+		    chtls_push_frames(csk, csk->wr_credits ==
+				      csk->wr_max_credits))
+			sk->sk_write_space(sk);
+
+		if (copied >= target && !sk->sk_backlog.tail)
+			break;
+
+		if (copied) {
+			if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+			    sk_no_receive(sk) ||
+			    signal_pending(current))
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				copied = sock_error(sk);
+				break;
+			}
+			if (sk_no_receive(sk))
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				copied = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+			if (signal_pending(current)) {
+				copied = sock_intr_errno(timeo);
+				break;
+			}
+		}
+
+		if (sk->sk_backlog.tail) {
+			release_sock(sk);
+			lock_sock(sk);
+			chtls_cleanup_rbuf(sk, copied);
+			continue;
+		}
+
+		if (copied >= target)
+			break;
+		chtls_cleanup_rbuf(sk, copied);
+		sk_wait_data(sk, &timeo, NULL);
+		continue;
+
+found_ok_skb:
+		if (!skb->len) {
+			chtls_kfree_skb(sk, skb);
+			if (!copied && !timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+
+			if (copied < target)
+				continue;
+
+			break;
+		}
+
+		offset = tp->copied_seq - ULP_SKB_CB(skb)->seq;
+		avail = skb->len - offset;
+		if (len < avail)
+			avail = len;
+
+		if (unlikely(tp->urg_data)) {
+			u32 urg_offset = tp->urg_seq - tp->copied_seq;
+
+			if (urg_offset < avail) {
+				if (urg_offset) {
+					avail = urg_offset;
+				} else if (!sock_flag(sk, SOCK_URGINLINE)) {
+					tp->copied_seq++;
+					offset++;
+					avail--;
+					if (!avail)
+						goto skip_copy;
+				}
+			}
+		}
+
+		if (likely(!(flags & MSG_TRUNC))) {
+			if (skb_copy_datagram_msg(skb, offset,
+						  msg, avail)) {
+				if (!copied) {
+					copied = -EFAULT;
+					break;
+				}
+			}
+		}
+
+		tp->copied_seq += avail;
+		copied += avail;
+		len -= avail;
+
+skip_copy:
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
+			tp->urg_data = 0;
+
+		if (avail + offset >= skb->len) {
+			if (likely(skb))
+				chtls_free_skb(sk, skb);
+			buffers_freed++;
+
+			if  (copied >= target &&
+			     !skb_peek(&sk->sk_receive_queue))
+				break;
+		}
+	} while (len > 0);
+
+	if (buffers_freed)
+		chtls_cleanup_rbuf(sk, copied);
+
+	release_sock(sk);
+	return copied;
+}
-- 
1.8.3.1

^ permalink raw reply related

* [RFC crypto v3 8/9] chtls: Register the ULP
From: Atul Gupta @ 2017-12-20 11:38 UTC (permalink / raw)
  To: herbert, linux-crypto, ganeshgr; +Cc: netdev, davem, davejwatson

Add new uld driver for Inline TLS support. Register ULP for chtls.
Setsockopt to program key on chip. support AES GCM key size 128.

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
v3: made some functions static
---
 drivers/crypto/chelsio/chtls/chtls_main.c | 584 ++++++++++++++++++++++++++++++
 include/uapi/linux/tls.h                  |   1 +
 2 files changed, 585 insertions(+)
 create mode 100644 drivers/crypto/chelsio/chtls/chtls_main.c

diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c
new file mode 100644
index 0000000..fb2d441
--- /dev/null
+++ b/drivers/crypto/chelsio/chtls/chtls_main.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright (c) 2017 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Written by: Atul Gupta (atul.gupta@chelsio.com)
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/hash.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+#include <net/tls.h>
+
+#include "chtls.h"
+#include "chtls_cm.h"
+
+#define DRV_NAME "chtls"
+
+/*
+ * chtls device management
+ * maintains a list of the chtls devices
+ */
+static LIST_HEAD(cdev_list);
+static DEFINE_MUTEX(cdev_mutex);
+static DEFINE_MUTEX(cdev_list_lock);
+
+static struct proto chtls_base_prot;
+static struct proto chtls_cpl_prot;
+static DEFINE_MUTEX(notify_mutex);
+static RAW_NOTIFIER_HEAD(listen_notify_list);
+struct request_sock_ops chtls_rsk_ops;
+static uint send_page_order = (14 - PAGE_SHIFT < 0) ? 0 : 14 - PAGE_SHIFT;
+
+static int register_listen_notifier(struct notifier_block *nb)
+{
+	int err;
+
+	mutex_lock(&notify_mutex);
+	err = raw_notifier_chain_register(&listen_notify_list, nb);
+	mutex_unlock(&notify_mutex);
+	return err;
+}
+
+static int unregister_listen_notifier(struct notifier_block *nb)
+{
+	int err;
+
+	mutex_lock(&notify_mutex);
+	err = raw_notifier_chain_unregister(&listen_notify_list, nb);
+	mutex_unlock(&notify_mutex);
+	return err;
+}
+
+static int listen_notify_handler(struct notifier_block *this,
+				 unsigned long event, void *data)
+{
+	struct sock *sk = data;
+	struct chtls_dev *cdev;
+	int ret =  NOTIFY_DONE;
+
+	switch (event) {
+	case CHTLS_LISTEN_START:
+	case CHTLS_LISTEN_STOP:
+		mutex_lock(&cdev_list_lock);
+		list_for_each_entry(cdev, &cdev_list, list) {
+			if (event == CHTLS_LISTEN_START)
+				ret = chtls_listen_start(cdev, sk);
+			else
+				chtls_listen_stop(cdev, sk);
+		}
+		mutex_unlock(&cdev_list_lock);
+		break;
+	}
+	return ret;
+}
+
+static struct notifier_block listen_notifier = {
+	.notifier_call = listen_notify_handler
+};
+
+static int listen_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	if (likely(skb_transport_header(skb) != skb_network_header(skb)))
+		return tcp_v4_do_rcv(sk, skb);
+	BLOG_SKB_CB(skb)->backlog_rcv(sk, skb);
+	return 0;
+}
+
+static int chtls_start_listen(struct sock *sk)
+{
+	int err;
+
+	if (sk->sk_protocol != IPPROTO_TCP)
+		return -EPROTONOSUPPORT;
+
+	if (sk->sk_family == PF_INET &&
+	    LOOPBACK(inet_sk(sk)->inet_rcv_saddr))
+		return -EADDRNOTAVAIL;
+
+	sk->sk_backlog_rcv = listen_backlog_rcv;
+	mutex_lock(&notify_mutex);
+	err = raw_notifier_call_chain(&listen_notify_list, 0, sk);
+	mutex_unlock(&notify_mutex);
+	return err;
+}
+
+static int chtls_hash(struct sock *sk)
+{
+	int err;
+
+	err = tcp_prot.hash(sk);
+	if (sk->sk_state == TCP_LISTEN)
+		err |= chtls_start_listen(sk);
+
+	if (err)
+		tcp_prot.unhash(sk);
+	return err;
+}
+
+static int chtls_stop_listen(struct sock *sk)
+{
+	if (sk->sk_protocol != IPPROTO_TCP)
+		return -EPROTONOSUPPORT;
+
+	mutex_lock(&notify_mutex);
+	raw_notifier_call_chain(&listen_notify_list, 1, sk);
+	mutex_unlock(&notify_mutex);
+	return 0;
+}
+
+static void chtls_unhash(struct sock *sk)
+{
+	if (sk->sk_state == TCP_LISTEN)
+		chtls_stop_listen(sk);
+	tcp_prot.unhash(sk);
+}
+
+static void chtls_lsk_close(struct sock *sk, long timeout)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	void (*sk_proto_close)(struct sock *sk, long timeout);
+
+	lock_sock(sk);
+	sk_proto_close = ctx->sk_proto_close;
+	kfree(ctx);
+
+	release_sock(sk);
+	sk_proto_close(sk, timeout);
+}
+
+static void process_deferq(struct work_struct *task_param)
+{
+	struct chtls_dev *cdev = container_of(task_param,
+				struct chtls_dev, deferq_task);
+	struct sk_buff *skb;
+
+	spin_lock_bh(&cdev->deferq.lock);
+	while ((skb = __skb_dequeue(&cdev->deferq)) != NULL) {
+		spin_unlock_bh(&cdev->deferq.lock);
+		DEFERRED_SKB_CB(skb)->handler(cdev, skb);
+		spin_lock_bh(&cdev->deferq.lock);
+	}
+	spin_unlock_bh(&cdev->deferq.lock);
+}
+
+static int chtls_get_skb(struct chtls_dev *cdev)
+{
+	cdev->askb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
+	if (!cdev->askb)
+		return 1;
+
+	skb_put(cdev->askb, sizeof(struct tcphdr));
+	skb_reset_transport_header(cdev->askb);
+	memset(cdev->askb->data, 0, cdev->askb->len);
+	return 0;
+}
+
+static void *chtls_uld_add(const struct cxgb4_lld_info *info)
+{
+	struct cxgb4_lld_info *lldi;
+	struct chtls_dev *cdev;
+	int i;
+
+	cdev = kzalloc(sizeof(*cdev) + info->nports *
+		      (sizeof(struct net_device *)), GFP_KERNEL);
+	if (!cdev)
+		return NULL;
+
+	lldi = kzalloc(sizeof(*lldi), GFP_KERNEL);
+	if (!lldi) {
+		kfree(cdev);
+		return NULL;
+	}
+
+	if (chtls_get_skb(cdev)) {
+		kfree(lldi);
+		kfree(cdev);
+		return NULL;
+	}
+	*lldi = *info;
+	cdev->lldi = lldi;
+	cdev->pdev = lldi->pdev;
+	cdev->tids = lldi->tids;
+	cdev->ports = (struct net_device **)(cdev + 1);
+	cdev->ports = lldi->ports;
+	cdev->mtus = lldi->mtus;
+	cdev->tids = lldi->tids;
+	cdev->pfvf = FW_VIID_PFN_G(cxgb4_port_viid(lldi->ports[0]))
+			<< FW_VIID_PFN_S;
+
+	for (i = 0; i < (1 << RSPQ_HASH_BITS); i++) {
+		unsigned int size = 64 - sizeof(struct rsp_ctrl) - 8;
+
+		cdev->rspq_skb_cache[i] = __alloc_skb(size,
+						      gfp_any(), 0,
+						      lldi->nodeid);
+	}
+
+	idr_init(&cdev->aidr);
+	idr_init(&cdev->hwtid_idr);
+	INIT_WORK(&cdev->deferq_task, process_deferq);
+	spin_lock_init(&cdev->listen_lock);
+	spin_lock_init(&cdev->idr_lock);
+	spin_lock_init(&cdev->aidr_lock);
+	cdev->send_page_order = min_t(uint, get_order(32768),
+				      send_page_order);
+
+	if (lldi->vr->key.size)
+		chtls_init_kmap(cdev, lldi);
+
+	mutex_lock(&cdev_mutex);
+	list_add_tail(&cdev->list, &cdev_list);
+	mutex_unlock(&cdev_mutex);
+
+	return cdev;
+}
+
+static void chtls_free_uld(struct chtls_dev *cdev)
+{
+	int i;
+
+	mutex_lock(&cdev_mutex);
+	list_del(&cdev->list);
+	mutex_unlock(&cdev_mutex);
+	chtls_free_kmap(cdev);
+	idr_destroy(&cdev->hwtid_idr);
+	idr_destroy(&cdev->aidr);
+	for (i = 0; i < (1 << RSPQ_HASH_BITS); i++) {
+		kfree_skb(cdev->rspq_skb_cache[i]);
+		cdev->rspq_skb_cache[i] = NULL;
+	}
+	kfree(cdev->lldi);
+	if (cdev->askb)
+		kfree_skb(cdev->askb);
+	kfree(cdev);
+}
+
+static void chtls_free_all_uld(void)
+{
+	struct chtls_dev *cdev, *tmp;
+
+	list_for_each_entry_safe(cdev, tmp, &cdev_list, list)
+		chtls_free_uld(cdev);
+}
+
+static int chtls_uld_state_change(void *handle, enum cxgb4_state new_state)
+{
+	struct chtls_dev *cdev = handle;
+
+	switch (new_state) {
+	case CXGB4_STATE_UP:
+		break;
+	case CXGB4_STATE_DOWN:
+		break;
+	case CXGB4_STATE_START_RECOVERY:
+		break;
+	case CXGB4_STATE_DETACH:
+		chtls_free_uld(cdev);
+		break;
+	}
+	return 0;
+}
+
+struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl,
+				   const __be64 *rsp,
+				   u32 pktshift)
+{
+	struct sk_buff *skb;
+
+	/* Allocate space for cpl_pass_accpet_req which will be synthesized by
+	 * driver. Once driver synthesizes cpl_pass_accpet_req the skb will go
+	 * through the regular cpl_pass_accept_req processing in TOM.
+	 */
+	skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req)
+			- pktshift, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return NULL;
+	__skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req)
+		   - pktshift);
+	/* For now we will copy  cpl_rx_pkt in the skb */
+	skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_rx_pkt));
+	skb_copy_to_linear_data_offset(skb, sizeof(struct cpl_pass_accept_req)
+				       , gl->va + pktshift,
+				       gl->tot_len - pktshift);
+
+	return skb;
+}
+
+static int chtls_recv_packet(struct chtls_dev *cdev,
+			     const struct pkt_gl *gl, const __be64 *rsp)
+{
+	unsigned int opcode = *(u8 *)rsp;
+	struct sk_buff *skb;
+	int ret;
+
+	skb = copy_gl_to_skb_pkt(gl, rsp, cdev->lldi->sge_pktshift);
+	if (!skb)
+		return -ENOMEM;
+
+	ret = chtls_handlers[opcode](cdev, skb);
+	if (ret & CPL_RET_BUF_DONE)
+		kfree_skb(skb);
+
+	return 0;
+}
+
+static int chtls_recv_rsp(struct chtls_dev *cdev, const __be64 *rsp)
+{
+	unsigned int opcode = *(u8 *)rsp;
+	unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8;
+	unsigned long rspq_bin;
+	struct sk_buff *skb;
+	int ret;
+
+	rspq_bin = hash_ptr((void *)rsp, RSPQ_HASH_BITS);
+	skb = cdev->rspq_skb_cache[rspq_bin];
+	if (skb && !skb_is_nonlinear(skb) &&
+	    !skb_shared(skb) && !skb_cloned(skb)) {
+		refcount_inc(&skb->users);
+		if (refcount_read(&skb->users) == 2) {
+			__skb_trim(skb, 0);
+			if (skb_tailroom(skb) >= len)
+				goto copy_out;
+		}
+		refcount_dec(&skb->users);
+	}
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return 1;
+
+copy_out:
+	__skb_put(skb, len);
+	skb_copy_to_linear_data(skb, rsp, len);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	ret = chtls_handlers[opcode](cdev, skb);
+
+	if (ret & CPL_RET_BUF_DONE)
+		kfree_skb(skb);
+	return 0;
+}
+
+static int chtls_recv(struct chtls_dev *cdev,
+		      struct sk_buff **skbs, const __be64 *rsp)
+{
+	unsigned int opcode = *(u8 *)rsp;
+	struct sk_buff *skb = *skbs;
+	int ret;
+
+	__skb_push(skb, sizeof(struct rss_header));
+	skb_copy_to_linear_data(skb, rsp, sizeof(struct rss_header));
+
+	ret = chtls_handlers[opcode](cdev, skb);
+	if (ret & CPL_RET_BUF_DONE)
+		kfree_skb(skb);
+
+	return 0;
+}
+
+static int chtls_uld_rx_handler(void *handle, const __be64 *rsp,
+				const struct pkt_gl *gl)
+{
+	struct chtls_dev *cdev = handle;
+	unsigned int opcode = *(u8 *)rsp;
+	struct sk_buff *skb;
+
+	if (unlikely(opcode == CPL_RX_PKT)) {
+		if (chtls_recv_packet(cdev, gl, rsp) < 0)
+			goto nomem;
+		return 0;
+	}
+
+	if (!gl)
+		return chtls_recv_rsp(cdev, rsp);
+
+#define RX_PULL_LEN 128
+	skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN);
+	if (unlikely(!skb))
+		goto nomem;
+	chtls_recv(cdev, &skb, rsp);
+	return 0;
+
+nomem:
+	return 1;
+}
+
+static int do_chtls_getsockopt(struct sock *sk, char __user *optval,
+			       int __user *optlen)
+{
+	struct tls_crypto_info crypto_info;
+
+	crypto_info.version = TLS_1_2_VERSION;
+	if (copy_to_user(optval, &crypto_info, sizeof(struct tls_crypto_info)))
+		return -EFAULT;
+	return 0;
+}
+
+static int chtls_getsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, int __user *optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+
+	if (level != SOL_TLS)
+		return ctx->getsockopt(sk, level, optname, optval, optlen);
+
+	return do_chtls_getsockopt(sk, optval, optlen);
+}
+
+static int do_chtls_setsockopt(struct sock *sk, int optname,
+			       char __user *optval, unsigned int optlen)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct tls_crypto_info *crypto_info, tmp_crypto_info;
+	int keylen;
+	int rc = 0;
+
+	if (!optval || optlen < sizeof(*crypto_info)) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info));
+	if (rc) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	/* check version */
+	if (tmp_crypto_info.version != TLS_1_2_VERSION) {
+		rc = -ENOTSUPP;
+		goto out;
+	}
+
+	crypto_info = (struct tls_crypto_info *)&csk->tlshws.crypto_info;
+
+	switch (tmp_crypto_info.cipher_type) {
+	case TLS_CIPHER_AES_GCM_128: {
+		rc = copy_from_user(crypto_info, optval,
+				    sizeof(struct
+					   tls12_crypto_info_aes_gcm_128));
+
+		if (rc) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		keylen = TLS_CIPHER_AES_GCM_128_KEY_SIZE;
+		chtls_setkey(csk, keylen, optname);
+		break;
+	}
+	default:
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+static int chtls_setsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, unsigned int optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+
+	if (level != SOL_TLS)
+		return ctx->setsockopt(sk, level, optname, optval, optlen);
+
+	return do_chtls_setsockopt(sk, optname, optval, optlen);
+}
+
+static int chtls_init(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tls_context *ctx;
+	int rc = 0;
+
+	/* allocate tls context */
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	icsk->icsk_ulp_data = ctx;
+	ctx->setsockopt = sk->sk_prot->setsockopt;
+	ctx->getsockopt = sk->sk_prot->getsockopt;
+	ctx->sk_proto_close = sk->sk_prot->close;
+	sk->sk_prot = &chtls_base_prot;
+out:
+	return rc;
+}
+
+static struct cxgb4_uld_info chtls_uld_info = {
+	.name = DRV_NAME,
+	.nrxq = MAX_ULD_QSETS,
+	.ntxq = MAX_ULD_QSETS,
+	.rxq_size = 1024,
+	.add = chtls_uld_add,
+	.state_change = chtls_uld_state_change,
+	.rx_handler = chtls_uld_rx_handler,
+};
+
+static struct tcp_ulp_ops tcp_chtls_ulp_ops __read_mostly = {
+	.name                   = "chtls",
+	.owner                  = THIS_MODULE,
+	.init                   = chtls_init,
+};
+
+void chtls_install_cpl_ops(struct sock *sk)
+{
+	sk->sk_prot = &chtls_cpl_prot;
+}
+
+static void __init chtls_init_ulp_ops(void)
+{
+	chtls_base_prot			= tcp_prot;
+	chtls_base_prot.hash		= chtls_hash;
+	chtls_base_prot.unhash		= chtls_unhash;
+	chtls_base_prot.close		= chtls_lsk_close;
+
+	chtls_cpl_prot			= chtls_base_prot;
+	chtls_init_rsk_ops(&chtls_cpl_prot, &chtls_rsk_ops,
+			   &tcp_prot, PF_INET);
+	chtls_cpl_prot.close		= chtls_close;
+	chtls_cpl_prot.disconnect	= chtls_disconnect;
+	chtls_cpl_prot.destroy		= chtls_destroy_sock;
+	chtls_cpl_prot.shutdown		= chtls_shutdown;
+	chtls_cpl_prot.sendmsg		= chtls_sendmsg;
+	chtls_cpl_prot.recvmsg		= chtls_recvmsg;
+	chtls_cpl_prot.sendpage		= chtls_sendpage;
+	chtls_cpl_prot.setsockopt	= chtls_setsockopt;
+	chtls_cpl_prot.getsockopt	= chtls_getsockopt;
+}
+
+static int __init chtls_register(void)
+{
+	chtls_init_ulp_ops();
+	register_listen_notifier(&listen_notifier);
+	cxgb4_register_uld(CXGB4_ULD_TLS, &chtls_uld_info);
+	tcp_register_ulp(&tcp_chtls_ulp_ops);
+	return 0;
+}
+
+static void __exit chtls_unregister(void)
+{
+	unregister_listen_notifier(&listen_notifier);
+	tcp_unregister_ulp(&tcp_chtls_ulp_ops);
+	chtls_free_all_uld();
+	cxgb4_unregister_uld(CXGB4_ULD_TLS);
+}
+
+module_init(chtls_register);
+module_exit(chtls_unregister);
+
+MODULE_DESCRIPTION("Chelsio TLS Inline driver");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Chelsio Communications");
+MODULE_VERSION(DRV_VERSION);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index 293b2cd..6048b27 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -38,6 +38,7 @@
 
 /* TLS socket options */
 #define TLS_TX			1	/* Set transmit parameters */
+#define TLS_RX			2       /* Set receive parameters */
 
 /* Supported versions */
 #define TLS_VERSION_MINOR(ver)	((ver) & 0xFF)
-- 
1.8.3.1

^ permalink raw reply related

* [RFC crypto v3 9/9] Makefile Kconfig
From: Atul Gupta @ 2017-12-20 11:38 UTC (permalink / raw)
  To: herbert, linux-crypto, ganeshgr; +Cc: netdev, davem, davejwatson

Entry for Inline TLS as another driver dependent on cxgb4 and chcr

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
 drivers/crypto/chelsio/Kconfig        | 10 ++++++++++
 drivers/crypto/chelsio/Makefile       |  1 +
 drivers/crypto/chelsio/chtls/Makefile |  4 ++++
 3 files changed, 15 insertions(+)
 create mode 100644 drivers/crypto/chelsio/chtls/Makefile

diff --git a/drivers/crypto/chelsio/Kconfig b/drivers/crypto/chelsio/Kconfig
index 51932c7..686d246 100644
--- a/drivers/crypto/chelsio/Kconfig
+++ b/drivers/crypto/chelsio/Kconfig
@@ -28,3 +28,13 @@ config CHELSIO_IPSEC_INLINE
         default n
         ---help---
           Enable support for IPSec Tx Inline.
+
+config CRYPTO_DEV_CHELSIO_TLS
+        tristate "Chelsio Crypto Inline TLS Driver"
+        depends on CHELSIO_T4
+        select CRYPTO_DEV_CHELSIO
+        ---help---
+          Support Chelsio Inline TLS with Chelsio crypto accelerator.
+
+          To compile this driver as a module, choose M here: the module
+          will be called chtls.
diff --git a/drivers/crypto/chelsio/Makefile b/drivers/crypto/chelsio/Makefile
index eaecaf1..639e571 100644
--- a/drivers/crypto/chelsio/Makefile
+++ b/drivers/crypto/chelsio/Makefile
@@ -3,3 +3,4 @@ ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
 obj-$(CONFIG_CRYPTO_DEV_CHELSIO) += chcr.o
 chcr-objs :=  chcr_core.o chcr_algo.o
 chcr-$(CONFIG_CHELSIO_IPSEC_INLINE) += chcr_ipsec.o
+obj-$(CONFIG_CRYPTO_DEV_CHELSIO_TLS) += chtls/
diff --git a/drivers/crypto/chelsio/chtls/Makefile b/drivers/crypto/chelsio/chtls/Makefile
new file mode 100644
index 0000000..df13795
--- /dev/null
+++ b/drivers/crypto/chelsio/chtls/Makefile
@@ -0,0 +1,4 @@
+ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4 -Idrivers/crypto/chelsio/
+
+obj-$(CONFIG_CRYPTO_DEV_CHELSIO_TLS) += chtls.o
+chtls-objs := chtls_main.o chtls_cm.o chtls_io.o chtls_hw.o
-- 
1.8.3.1

^ permalink raw reply related

* [patch net-next 00/10] Add support for resource abstraction
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa

From: Jiri Pirko <jiri@mellanox.com>

Many of the ASIC's internal resources are limited and are shared between
several hardware procedures. For example, unified hash-based memory can
be used for many lookup purposes, like FDB and LPM. In many cases the user
can provide a partitioning scheme for such a resource in order to perform
fine tuning for his application. In such cases performing driver reload is
needed for the changes to take place, thus this patchset also adds support
for hot reload.

Such an abstraction can be coupled with devlink's dpipe interface, which
models the ASIC's pipeline as a graph of match/action tables. By modeling
the hardware resource object, and by coupling it to several dpipe tables,
further visibility can be achieved in order to debug ASIC-wide issues.

The proposed interface will provide the user the ability to understand the
limitations of the hardware, and receive notification regarding its occupancy.
Furthermore, monitoring the resource occupancy can be done in real-time and
can be useful in many cases.
---
Userspace part prototype can be found at https://github.com/arkadis/iproute2/
at resource_dev branch.

Arkadi Sharshevsky (10):
  devlink: Add per devlink instance lock
  devlink: Add support for resource abstraction
  devlink: Add support for reload
  devlink: Add relation between dpipe and resource
  mlxsw: pci: Add support for performing bus reset
  mlxsw: spectrum: Register KVD resources with devlink
  mlxsw: spectrum_dpipe: Connect dpipe tables to resources
  mlxsw: spectrum: Add support for getting kvdl occupancy
  mlxsw: pci: Add support for getting resource through devlink
  mlxsw: core: Add support for reload

 drivers/net/ethernet/mellanox/mlxsw/core.c         |  85 +++-
 drivers/net/ethernet/mellanox/mlxsw/core.h         |  16 +-
 drivers/net/ethernet/mellanox/mlxsw/i2c.c          |   5 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c          |  98 ++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     | 201 ++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  13 +
 .../net/ethernet/mellanox/mlxsw/spectrum_dpipe.c   |  72 ++-
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c    |  26 +
 include/net/devlink.h                              |  93 ++++
 include/uapi/linux/devlink.h                       |  16 +
 net/core/devlink.c                                 | 564 ++++++++++++++++++---
 11 files changed, 1059 insertions(+), 130 deletions(-)

-- 
2.9.5

^ permalink raw reply

* [patch net-next 01/10] devlink: Add per devlink instance lock
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

This is a preparation before introducing resources and hot reload support.
Currently there are two global lock where one protects all devlink access,
and the second one protects devlink port access. This patch adds per devlink
instance lock which protects the internal members which are the sb/dpipe/
resource/ports. By introducing this lock the global devlink port lock can
be discarded.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/devlink.h |   1 +
 net/core/devlink.c    | 122 +++++++++++++++++++++++++++-----------------------
 2 files changed, 66 insertions(+), 57 deletions(-)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b9654e1..4d2c6fc 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -30,6 +30,7 @@ struct devlink {
 	const struct devlink_ops *ops;
 	struct device *dev;
 	possible_net_t _net;
+	struct mutex lock;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 7d430c1..0114dfc 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -92,12 +92,6 @@ static LIST_HEAD(devlink_list);
  */
 static DEFINE_MUTEX(devlink_mutex);
 
-/* devlink_port_mutex
- *
- * Shared lock to guard lists of ports in all devlink devices.
- */
-static DEFINE_MUTEX(devlink_port_mutex);
-
 static struct net *devlink_net(const struct devlink *devlink)
 {
 	return read_pnet(&devlink->_net);
@@ -335,15 +329,12 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
 #define DEVLINK_NL_FLAG_NEED_DEVLINK	BIT(0)
 #define DEVLINK_NL_FLAG_NEED_PORT	BIT(1)
 #define DEVLINK_NL_FLAG_NEED_SB		BIT(2)
-#define DEVLINK_NL_FLAG_LOCK_PORTS	BIT(3)
-	/* port is not needed but we need to ensure they don't
-	 * change in the middle of command
-	 */
 
 static int devlink_nl_pre_doit(const struct genl_ops *ops,
 			       struct sk_buff *skb, struct genl_info *info)
 {
 	struct devlink *devlink;
+	int err;
 
 	mutex_lock(&devlink_mutex);
 	devlink = devlink_get_from_info(info);
@@ -351,44 +342,45 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
 		mutex_unlock(&devlink_mutex);
 		return PTR_ERR(devlink);
 	}
+	mutex_lock(&devlink->lock);
 	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
 		info->user_ptr[0] = devlink;
 	} else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
 		struct devlink_port *devlink_port;
 
-		mutex_lock(&devlink_port_mutex);
 		devlink_port = devlink_port_get_from_info(devlink, info);
 		if (IS_ERR(devlink_port)) {
-			mutex_unlock(&devlink_port_mutex);
-			mutex_unlock(&devlink_mutex);
-			return PTR_ERR(devlink_port);
+			err = PTR_ERR(devlink_port);
+			goto unlock;
 		}
 		info->user_ptr[0] = devlink_port;
 	}
-	if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) {
-		mutex_lock(&devlink_port_mutex);
-	}
 	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
 		struct devlink_sb *devlink_sb;
 
 		devlink_sb = devlink_sb_get_from_info(devlink, info);
 		if (IS_ERR(devlink_sb)) {
-			if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
-				mutex_unlock(&devlink_port_mutex);
-			mutex_unlock(&devlink_mutex);
-			return PTR_ERR(devlink_sb);
+			err = PTR_ERR(devlink_sb);
+			goto unlock;
 		}
 		info->user_ptr[1] = devlink_sb;
 	}
+
 	return 0;
+
+unlock:
+	mutex_unlock(&devlink->lock);
+	mutex_unlock(&devlink_mutex);
+	return err;
 }
 
 static void devlink_nl_post_doit(const struct genl_ops *ops,
 				 struct sk_buff *skb, struct genl_info *info)
 {
-	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT ||
-	    ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS)
-		mutex_unlock(&devlink_port_mutex);
+	struct devlink *devlink;
+
+	devlink = devlink_get_from_info(info);
+	mutex_unlock(&devlink->lock);
 	mutex_unlock(&devlink_mutex);
 }
 
@@ -614,10 +606,10 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_port, &devlink->port_list, list) {
 			if (idx < start) {
 				idx++;
@@ -628,13 +620,15 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 						   NETLINK_CB(cb->skb).portid,
 						   cb->nlh->nlmsg_seq,
 						   NLM_F_MULTI);
-			if (err)
+			if (err) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 			idx++;
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -801,6 +795,7 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			if (idx < start) {
 				idx++;
@@ -811,10 +806,13 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 						 NETLINK_CB(cb->skb).portid,
 						 cb->nlh->nlmsg_seq,
 						 NLM_F_MULTI);
-			if (err)
+			if (err) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 			idx++;
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -935,14 +933,18 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_pool_get)
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
 						   devlink_sb,
 						   NETLINK_CB(cb->skb).portid,
 						   cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -1123,22 +1125,24 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_port_pool_get)
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_port_pool_get_dumpit(msg, start, &idx,
 							devlink, devlink_sb,
 							NETLINK_CB(cb->skb).portid,
 							cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -1347,23 +1351,26 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
 			continue;
+
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
 							   devlink,
 							   devlink_sb,
 							   NETLINK_CB(cb->skb).portid,
 							   cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -2397,8 +2404,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NEED_SB |
-				  DEVLINK_NL_FLAG_LOCK_PORTS,
+				  DEVLINK_NL_FLAG_NEED_SB,
 	},
 	{
 		.cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
@@ -2406,8 +2412,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NEED_SB |
-				  DEVLINK_NL_FLAG_LOCK_PORTS,
+				  DEVLINK_NL_FLAG_NEED_SB,
 	},
 	{
 		.cmd = DEVLINK_CMD_ESWITCH_GET,
@@ -2488,6 +2493,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	mutex_init(&devlink->lock);
 	return devlink;
 }
 EXPORT_SYMBOL_GPL(devlink_alloc);
@@ -2550,16 +2556,16 @@ int devlink_port_register(struct devlink *devlink,
 			  struct devlink_port *devlink_port,
 			  unsigned int port_index)
 {
-	mutex_lock(&devlink_port_mutex);
+	mutex_lock(&devlink->lock);
 	if (devlink_port_index_exists(devlink, port_index)) {
-		mutex_unlock(&devlink_port_mutex);
+		mutex_unlock(&devlink->lock);
 		return -EEXIST;
 	}
 	devlink_port->devlink = devlink;
 	devlink_port->index = port_index;
 	devlink_port->registered = true;
 	list_add_tail(&devlink_port->list, &devlink->port_list);
-	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink->lock);
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
 	return 0;
 }
@@ -2572,10 +2578,12 @@ EXPORT_SYMBOL_GPL(devlink_port_register);
  */
 void devlink_port_unregister(struct devlink_port *devlink_port)
 {
+	struct devlink *devlink = devlink_port->devlink;
+
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
-	mutex_lock(&devlink_port_mutex);
+	mutex_lock(&devlink->lock);
 	list_del(&devlink_port->list);
-	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_port_unregister);
 
@@ -2651,7 +2659,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	struct devlink_sb *devlink_sb;
 	int err = 0;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	if (devlink_sb_index_exists(devlink, sb_index)) {
 		err = -EEXIST;
 		goto unlock;
@@ -2670,7 +2678,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	devlink_sb->egress_tc_count = egress_tc_count;
 	list_add_tail(&devlink_sb->list, &devlink->sb_list);
 unlock:
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_sb_register);
@@ -2679,11 +2687,11 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
 {
 	struct devlink_sb *devlink_sb;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
 	WARN_ON(!devlink_sb);
 	list_del(&devlink_sb->list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	kfree(devlink_sb);
 }
 EXPORT_SYMBOL_GPL(devlink_sb_unregister);
@@ -2699,9 +2707,9 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
 int devlink_dpipe_headers_register(struct devlink *devlink,
 				   struct devlink_dpipe_headers *dpipe_headers)
 {
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink->dpipe_headers = dpipe_headers;
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
@@ -2715,9 +2723,9 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
  */
 void devlink_dpipe_headers_unregister(struct devlink *devlink)
 {
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink->dpipe_headers = NULL;
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
 
@@ -2783,9 +2791,9 @@ int devlink_dpipe_table_register(struct devlink *devlink,
 	table->priv = priv;
 	table->counter_control_extern = counter_control_extern;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
@@ -2801,17 +2809,17 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
 {
 	struct devlink_dpipe_table *table;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
 					 table_name);
 	if (!table)
 		goto unlock;
 	list_del_rcu(&table->list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	kfree_rcu(table, rcu);
 	return;
 unlock:
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 02/10] devlink: Add support for resource abstraction
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Add support for hardware resource abstraction over devlink. Each resource
is identified via id, furthermore it contains information regarding its
size and its related sub resources. Each resource can also provide its
current occupancy.

In some cases the sizes of some resources can be changed, yet for those
changes to take place a hot driver reload may be needed. The reload
capability will be introduced in the next patch.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/devlink.h        |  78 ++++++++++
 include/uapi/linux/devlink.h |  10 ++
 net/core/devlink.c           | 354 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 442 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4d2c6fc..a94a155 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -26,6 +26,7 @@ struct devlink {
 	struct list_head port_list;
 	struct list_head sb_list;
 	struct list_head dpipe_table_list;
+	struct list_head resource_list;
 	struct devlink_dpipe_headers *dpipe_headers;
 	const struct devlink_ops *ops;
 	struct device *dev;
@@ -224,6 +225,45 @@ struct devlink_dpipe_headers {
 	unsigned int headers_count;
 };
 
+/**
+ * struct devlink_resource_ops - resource ops
+ * @occ_get: get the occupied size
+ * @size_validate: validate the size of the resource before update, reload
+ *                 is needed for changes to take place
+ */
+struct devlink_resource_ops {
+	u64 (*occ_get)(struct devlink *devlink);
+	int (*size_validate)(struct devlink *devlink, u64 size,
+			     struct netlink_ext_ack *extack);
+};
+
+/**
+ * struct devlink_resource - devlink resource
+ * @name: name of the resource
+ * @size: size of the resource
+ * @size_new: updated size of the resource, reload is needed
+ * @size_valid: valid in case the total size of the resource is valid
+ *              including its children
+ * @id: id, per devlink instance
+ * @parent: parent resource
+ * @list: parent list
+ * @resource_list: list of child resources
+ * @resource_ops: resource ops
+ */
+struct devlink_resource {
+	const char *name;
+	u64 size;
+	u64 size_new;
+	bool size_valid;
+	u64 id;
+	struct devlink_resource *parent;
+	struct list_head list;
+	struct list_head resource_list;
+	struct devlink_resource_ops *resource_ops;
+};
+
+#define DEVLINK_RESOURCE_ID_PARENT_TOP 0
+
 struct devlink_ops {
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
@@ -333,6 +373,19 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ethernet;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv4;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv6;
 
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      struct devlink_resource_ops *resource_ops);
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource);
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size);
+
 #else
 
 static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
@@ -469,6 +522,31 @@ devlink_dpipe_match_put(struct sk_buff *skb,
 	return 0;
 }
 
+static inline int
+devlink_resource_register(struct devlink *devlink,
+			  const char *resource_name,
+			  bool top_hierarchy,
+			  u64 resource_size,
+			  u64 resource_id,
+			  u64 parent_resource_id,
+			  struct devlink_resource_ops *resource_ops)
+{
+	return 0;
+}
+
+static inline void
+devlink_resources_unregister(struct devlink *devlink,
+			     struct devlink_resource *resource)
+{
+}
+
+static inline int
+devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
+			  u64 *p_resource_size)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6665df6..382981e 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -70,6 +70,8 @@ enum devlink_command {
 	DEVLINK_CMD_DPIPE_ENTRIES_GET,
 	DEVLINK_CMD_DPIPE_HEADERS_GET,
 	DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
+	DEVLINK_CMD_RESOURCE_SET,
+	DEVLINK_CMD_RESOURCE_DUMP,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
@@ -202,6 +204,14 @@ enum devlink_attr {
 	DEVLINK_ATTR_PAD,
 
 	DEVLINK_ATTR_ESWITCH_ENCAP_MODE,	/* u8 */
+	DEVLINK_ATTR_RESOURCE_LIST,		/* nested */
+	DEVLINK_ATTR_RESOURCE,			/* nested */
+	DEVLINK_ATTR_RESOURCE_NAME,		/* string */
+	DEVLINK_ATTR_RESOURCE_SIZE,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_NEW,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_VALID,	/* u8 */
+	DEVLINK_ATTR_RESOURCE_OCC,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_ID,		/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 0114dfc..88d5dc0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2280,6 +2280,216 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
 						counters_enable);
 }
 
+static struct devlink_resource *
+devlink_resource_find(struct devlink *devlink,
+		      struct devlink_resource *resource, u64 resource_id)
+{
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		struct devlink_resource *__resource;
+
+		if (resource->id == resource_id)
+			return resource;
+
+		__resource = devlink_resource_find(devlink, resource,
+						   resource_id);
+		if (__resource)
+			return __resource;
+	}
+	return NULL;
+}
+
+static void devlink_resource_validate_children(struct devlink_resource *resource)
+{
+	struct devlink_resource *__resource;
+	bool size_valid = true;
+	u64 parts_size = 0;
+
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	list_for_each_entry(__resource, &resource->resource_list, list)
+		parts_size += __resource->size_new;
+
+	if (parts_size > resource->size)
+		size_valid = false;
+out:
+	resource->size_valid = size_valid;
+}
+
+static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	u64 resource_id;
+	u64 size;
+	int err;
+
+	if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
+	    !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
+		return -EINVAL;
+	resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
+
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource)
+		return -EINVAL;
+
+	if (!resource->resource_ops->size_validate)
+		return -EINVAL;
+
+	size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
+	err = resource->resource_ops->size_validate(devlink, size,
+						    info->extack);
+	if (err)
+		return err;
+
+	resource->size_new = size;
+	devlink_resource_validate_children(resource);
+	if (resource->parent)
+		devlink_resource_validate_children(resource->parent);
+	return 0;
+}
+
+static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
+				struct devlink_resource *resource)
+{
+	struct devlink_resource *__resource;
+	struct nlattr *__resource_attr;
+	struct nlattr *resource_attr;
+
+	resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE);
+	if (!resource_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size,
+			      DEVLINK_ATTR_PAD) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
+			      DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+	if (resource->size != resource->size_new)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+				  resource->size_new, DEVLINK_ATTR_PAD);
+	if (resource->resource_ops && resource->resource_ops->occ_get)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+				  resource->resource_ops->occ_get(devlink),
+				  DEVLINK_ATTR_PAD);
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
+		       resource->size_valid))
+		goto nla_put_failure;
+
+	__resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!__resource_attr)
+		goto nla_put_failure;
+
+	list_for_each_entry(__resource, &resource->resource_list, list) {
+		if (devlink_resource_put(devlink, skb, __resource))
+			goto resource_put_failure;
+	}
+
+	nla_nest_end(skb, __resource_attr);
+out:
+	nla_nest_end(skb, resource_attr);
+	return 0;
+
+resource_put_failure:
+	nla_nest_cancel(skb, __resource_attr);
+nla_put_failure:
+	nla_nest_cancel(skb, resource_attr);
+	return -EMSGSIZE;
+}
+
+static int devlink_resource_fill(struct genl_info *info,
+				 enum devlink_command cmd, int flags)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	struct nlattr *resources_attr;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	bool incomplete;
+	void *hdr;
+	int i;
+	int err;
+
+	resource = list_first_entry(&devlink->resource_list,
+				    struct devlink_resource, list);
+start_again:
+	err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+	if (err)
+		return err;
+
+	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+			  &devlink_nl_family, NLM_F_MULTI, cmd);
+	if (!hdr) {
+		nlmsg_free(skb);
+		return -EMSGSIZE;
+	}
+
+	if (devlink_nl_put_handle(skb, devlink))
+		goto nla_put_failure;
+
+	resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!resources_attr)
+		goto nla_put_failure;
+
+	incomplete = false;
+	i = 0;
+	list_for_each_entry_from(resource, &devlink->resource_list, list) {
+		err = devlink_resource_put(devlink, skb, resource);
+		if (err) {
+			if (!i)
+				goto err_resource_put;
+			incomplete = true;
+			break;
+		}
+		i++;
+	}
+	nla_nest_end(skb, resources_attr);
+	genlmsg_end(skb, hdr);
+	if (incomplete)
+		goto start_again;
+send_done:
+	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+			NLMSG_DONE, 0, flags | NLM_F_MULTI);
+	if (!nlh) {
+		err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+		if (err)
+			goto err_skb_send_alloc;
+		goto send_done;
+	}
+	return genlmsg_reply(skb, info);
+
+nla_put_failure:
+	err = -EMSGSIZE;
+err_resource_put:
+err_skb_send_alloc:
+	genlmsg_cancel(skb, hdr);
+	nlmsg_free(skb);
+	return err;
+}
+
+static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+
+	if (list_empty(&devlink->resource_list))
+		return -EOPNOTSUPP;
+
+	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2298,6 +2508,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
+	[DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
+	[DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -2456,6 +2668,20 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_SET,
+		.doit = devlink_nl_cmd_resource_set,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_DUMP,
+		.doit = devlink_nl_cmd_resource_dump,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
@@ -2493,6 +2719,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	INIT_LIST_HEAD(&devlink->resource_list);
 	mutex_init(&devlink->lock);
 	return devlink;
 }
@@ -2823,6 +3050,133 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
+/**
+ *	devlink_resource_register - devlink resource register
+ *
+ *	@devlink: devlink
+ *	@resource_name: resource's name
+ *	@top_hierarchy: top hierarchy
+ *	@reload_required: reload is required for new configuration to
+ *			  apply
+ *	@resource_size: resource's size
+ *	@resource_id: resource's id
+ *	@parent_reosurce_id: resource's parent id
+ *	@resource_ops: resource ops
+ */
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      struct devlink_resource_ops *resource_ops)
+{
+	struct devlink_resource *resource;
+	struct list_head *resource_list;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (resource) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (top_hierarchy) {
+		resource_list = &devlink->resource_list;
+	} else {
+		struct devlink_resource *parent_resource;
+
+		parent_resource = devlink_resource_find(devlink, NULL,
+							parent_resource_id);
+		if (parent_resource) {
+			resource_list = &parent_resource->resource_list;
+			resource->parent = parent_resource;
+		} else {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	resource->name = resource_name;
+	resource->size = resource_size;
+	resource->size_new = resource_size;
+	resource->id = resource_id;
+	resource->resource_ops = resource_ops;
+	resource->size_valid = true;
+	INIT_LIST_HEAD(&resource->resource_list);
+	list_add_tail(&resource->list, resource_list);
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_register);
+
+/**
+ *	devlink_resources_unregister - free all resources
+ *
+ *	@devlink: devlink
+ *	@resource: resource
+ */
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource)
+{
+	struct devlink_resource *tmp, *__resource;
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	if (!resource)
+		mutex_lock(&devlink->lock);
+
+	list_for_each_entry_safe(__resource, tmp, resource_list, list) {
+		devlink_resources_unregister(devlink, __resource);
+		list_del(&__resource->list);
+		kfree(__resource);
+	}
+
+	if (!resource)
+		mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resources_unregister);
+
+/**
+ *	devlink_resource_size_get - get and update size
+ *
+ *	@devlink: devlink
+ *	@resource_id: the requested resource id
+ *	@p_resource_size: ptr to update
+ */
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size)
+{
+	struct devlink_resource *resource;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource) {
+		err = -EINVAL;
+		goto out;
+	}
+	*p_resource_size = resource->size_new;
+	resource->size = resource->size_new;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_size_get);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 03/10] devlink: Add support for reload
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Add support for performing driver hot reload.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/devlink.h        |  1 +
 include/uapi/linux/devlink.h |  5 ++++
 net/core/devlink.c           | 57 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index a94a155..340c2fc 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -265,6 +265,7 @@ struct devlink_resource {
 #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
 
 struct devlink_ops {
+	int (*reload)(struct devlink *devlink);
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
 	int (*port_split)(struct devlink *devlink, unsigned int port_index,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 382981e..bee827b 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -73,6 +73,11 @@ enum devlink_command {
 	DEVLINK_CMD_RESOURCE_SET,
 	DEVLINK_CMD_RESOURCE_DUMP,
 
+	/* Hot driver reload, makes configuration changes take place. The
+	 * devlink instance is not released during the process.
+	 */
+	DEVLINK_CMD_RELOAD,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 88d5dc0..3dedc3f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2490,6 +2490,56 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
 	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
 }
 
+static int
+devlink_resources_validate(struct devlink *devlink,
+			   struct devlink_resource *resource,
+			   struct genl_info *info)
+{
+	struct list_head *resource_list;
+	int err = 0;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		if (!resource->size_valid)
+			return -EINVAL;
+		err = devlink_resources_validate(devlink, resource, info);
+		if (err)
+			return err;
+	}
+	return err;
+}
+
+static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	int err;
+
+	if (!devlink->ops->reload)
+		return -EOPNOTSUPP;
+
+	err = devlink_resources_validate(devlink, NULL, info);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(info->extack,
+				   "resources size validation failed");
+		return err;
+	}
+
+	/* During the reload the driver performs sb/dpipe register/
+	 * unregister which requires pre release of the per instance
+	 * devlink lock. The global devlink lock is taken and protects
+	 * from disruption by user-calls.
+	 */
+	mutex_unlock(&devlink->lock);
+	err = devlink->ops->reload(devlink);
+	mutex_lock(&devlink->lock);
+
+	return err;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2682,6 +2732,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RELOAD,
+		.doit = devlink_nl_cmd_reload,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 04/10] devlink: Add relation between dpipe and resource
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

The hardware processes which are modeled via dpipe commonly use some
internal hardware resources. Such relation can improve the understanding
of hardware limitations.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/devlink.h        | 13 +++++++++++++
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 31 +++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 340c2fc..9217620 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -183,6 +183,8 @@ struct devlink_dpipe_table_ops;
  * @counters_enabled: indicates if counters are active
  * @counter_control_extern: indicates if counter control is in dpipe or
  *			    external tool
+ * @resource_valid: Indicate that the resource id is valid
+ * @resource_id: relative resource this table is related to
  * @table_ops: table operations
  * @rcu: rcu
  */
@@ -192,6 +194,8 @@ struct devlink_dpipe_table {
 	const char *name;
 	bool counters_enabled;
 	bool counter_control_extern;
+	bool resource_valid;
+	u64 resource_id;
 	struct devlink_dpipe_table_ops *table_ops;
 	struct rcu_head rcu;
 };
@@ -386,6 +390,8 @@ void devlink_resources_unregister(struct devlink *devlink,
 int devlink_resource_size_get(struct devlink *devlink,
 			      u64 resource_id,
 			      u64 *p_resource_size);
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id);
 
 #else
 
@@ -548,6 +554,13 @@ devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
 	return -EOPNOTSUPP;
 }
 
+static inline int
+devlink_dpipe_table_resource_set(struct devlink *devlink,
+				 const char *table_name, u64 resource_id)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index bee827b..b84c6aa 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -217,6 +217,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_RESOURCE_SIZE_VALID,	/* u8 */
 	DEVLINK_ATTR_RESOURCE_OCC,		/* u64 */
 	DEVLINK_ATTR_RESOURCE_ID,		/* u64 */
+	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 3dedc3f..8fbef2d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1686,6 +1686,9 @@ static int devlink_dpipe_table_put(struct sk_buff *skb,
 		       table->counters_enabled))
 		goto nla_put_failure;
 
+	if (table->resource_valid)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+				  table->resource_id, DEVLINK_ATTR_PAD);
 	if (devlink_dpipe_matches_put(table, skb))
 		goto nla_put_failure;
 
@@ -3234,6 +3237,34 @@ int devlink_resource_size_get(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devlink_resource_size_get);
 
+/**
+ *	devlink_dpipe_table_resource_set - set the resource id
+ *
+ *	@devlink: devlink
+ *	@table_name: table name
+ *	@resource_id: resource id
+ */
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id)
+{
+	struct devlink_dpipe_table *table;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name);
+	if (!table) {
+		err = -EINVAL;
+		goto out;
+	}
+	table->resource_id = resource_id;
+	table->resource_valid = true;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 05/10] mlxsw: pci: Add support for performing bus reset
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

This is a preparation stage before introducing hot reload. During the
reload process the ASIC should be resetted by accessing the PCI BAR due
to unavailability of the mailbox/emad interfaces.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.h |  1 +
 drivers/net/ethernet/mellanox/mlxsw/pci.c  | 53 ++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 6e966af..34dda96 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -332,6 +332,7 @@ struct mlxsw_bus {
 		    const struct mlxsw_config_profile *profile,
 		    struct mlxsw_res *res);
 	void (*fini)(void *bus_priv);
+	void (*reset)(void *bus_priv);
 	bool (*skb_transmit_busy)(void *bus_priv,
 				  const struct mlxsw_tx_info *tx_info);
 	int (*skb_transmit)(void *bus_priv, struct sk_buff *skb,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 23f7d82..39872fb 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -154,6 +154,7 @@ struct mlxsw_pci {
 		} comp;
 	} cmd;
 	struct mlxsw_bus_info bus_info;
+	const struct pci_device_id *id;
 };
 
 static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q)
@@ -1622,16 +1623,6 @@ static int mlxsw_pci_cmd_exec(void *bus_priv, u16 opcode, u8 opcode_mod,
 	return err;
 }
 
-static const struct mlxsw_bus mlxsw_pci_bus = {
-	.kind			= "pci",
-	.init			= mlxsw_pci_init,
-	.fini			= mlxsw_pci_fini,
-	.skb_transmit_busy	= mlxsw_pci_skb_transmit_busy,
-	.skb_transmit		= mlxsw_pci_skb_transmit,
-	.cmd_exec		= mlxsw_pci_cmd_exec,
-	.features		= MLXSW_BUS_F_TXRX,
-};
-
 static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
 			      const struct pci_device_id *id)
 {
@@ -1655,6 +1646,41 @@ static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
 	return 0;
 }
 
+static void mlxsw_pci_free_irq_vectors(struct mlxsw_pci *mlxsw_pci)
+{
+	pci_free_irq_vectors(mlxsw_pci->pdev);
+}
+
+static int mlxsw_pci_alloc_irq_vectors(struct mlxsw_pci *mlxsw_pci)
+{
+	int err;
+
+	err = pci_alloc_irq_vectors(mlxsw_pci->pdev, 1, 1, PCI_IRQ_MSIX);
+	if (err < 0)
+		dev_err(&mlxsw_pci->pdev->dev, "MSI-X init failed\n");
+	return err;
+}
+
+static void mlxsw_pci_reset(void *bus_priv)
+{
+	struct mlxsw_pci *mlxsw_pci = bus_priv;
+
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
+	mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id);
+	mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
+}
+
+static const struct mlxsw_bus mlxsw_pci_bus = {
+	.kind			= "pci",
+	.init			= mlxsw_pci_init,
+	.fini			= mlxsw_pci_fini,
+	.skb_transmit_busy	= mlxsw_pci_skb_transmit_busy,
+	.skb_transmit		= mlxsw_pci_skb_transmit,
+	.cmd_exec		= mlxsw_pci_cmd_exec,
+	.features		= MLXSW_BUS_F_TXRX,
+	.reset			= mlxsw_pci_reset,
+};
+
 static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	const char *driver_name = pdev->driver->name;
@@ -1716,7 +1742,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_sw_reset;
 	}
 
-	err = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
+	err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
 	if (err < 0) {
 		dev_err(&pdev->dev, "MSI-X init failed\n");
 		goto err_msix_init;
@@ -1725,6 +1751,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mlxsw_pci->bus_info.device_kind = driver_name;
 	mlxsw_pci->bus_info.device_name = pci_name(mlxsw_pci->pdev);
 	mlxsw_pci->bus_info.dev = &pdev->dev;
+	mlxsw_pci->id = id;
 
 	err = mlxsw_core_bus_device_register(&mlxsw_pci->bus_info,
 					     &mlxsw_pci_bus, mlxsw_pci);
@@ -1736,7 +1763,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_bus_device_register:
-	pci_free_irq_vectors(mlxsw_pci->pdev);
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
 err_msix_init:
 err_sw_reset:
 	iounmap(mlxsw_pci->hw_addr);
@@ -1756,7 +1783,7 @@ static void mlxsw_pci_remove(struct pci_dev *pdev)
 	struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev);
 
 	mlxsw_core_bus_device_unregister(mlxsw_pci->core);
-	pci_free_irq_vectors(mlxsw_pci->pdev);
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
 	iounmap(mlxsw_pci->hw_addr);
 	pci_release_regions(mlxsw_pci->pdev);
 	pci_disable_device(mlxsw_pci->pdev);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 07/10] mlxsw: spectrum_dpipe: Connect dpipe tables to resources
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Connect current dpipe tables to resources. The tables are connected
in the following fashion:
1. IPv4 host - KVD hash single
2. IPv6 host - KVD hash double
3. Adjacency - KVD linear

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_dpipe.c   | 72 ++++++++++++++++++----
 1 file changed, 60 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
index 96fdba7..282fe82 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
@@ -774,11 +774,27 @@ static struct devlink_dpipe_table_ops mlxsw_sp_host4_ops = {
 static int mlxsw_sp_dpipe_host4_table_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
 
-	return devlink_dpipe_table_register(devlink,
-					    MLXSW_SP_DPIPE_TABLE_NAME_HOST4,
-					    &mlxsw_sp_host4_ops,
-					    mlxsw_sp, false);
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_HOST4,
+					   &mlxsw_sp_host4_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_HOST4,
+					       MLXSW_SP_RESOURCE_KVD_HASH_SINGLE);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_HOST4);
+	return err;
 }
 
 static void mlxsw_sp_dpipe_host4_table_fini(struct mlxsw_sp *mlxsw_sp)
@@ -832,11 +848,27 @@ static struct devlink_dpipe_table_ops mlxsw_sp_host6_ops = {
 static int mlxsw_sp_dpipe_host6_table_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
 
-	return devlink_dpipe_table_register(devlink,
-					    MLXSW_SP_DPIPE_TABLE_NAME_HOST6,
-					    &mlxsw_sp_host6_ops,
-					    mlxsw_sp, false);
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_HOST6,
+					   &mlxsw_sp_host6_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_HOST6,
+					       MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_HOST6);
+	return err;
 }
 
 static void mlxsw_sp_dpipe_host6_table_fini(struct mlxsw_sp *mlxsw_sp)
@@ -1216,11 +1248,27 @@ static struct devlink_dpipe_table_ops mlxsw_sp_dpipe_table_adj_ops = {
 static int mlxsw_sp_dpipe_adj_table_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
 
-	return devlink_dpipe_table_register(devlink,
-					    MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
-					    &mlxsw_sp_dpipe_table_adj_ops,
-					    mlxsw_sp, false);
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
+					   &mlxsw_sp_dpipe_table_adj_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
+					       MLXSW_SP_RESOURCE_KVD_LINEAR);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_ADJ);
+	return err;
 }
 
 static void mlxsw_sp_dpipe_adj_table_fini(struct mlxsw_sp *mlxsw_sp)
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 06/10] mlxsw: spectrum: Register KVD resources with devlink
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Register the KVD resources with devlink. The KVD is a memory resource
which is subdivided into three partitions which are the linear, hash
single and hash double.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c     |   9 ++
 drivers/net/ethernet/mellanox/mlxsw/core.h     |   1 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 135 +++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  12 +++
 4 files changed, 157 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index f3315bc..2488662 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1012,6 +1012,12 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 	if (err)
 		goto err_bus_init;
 
+	if (mlxsw_driver->resources_register) {
+		err = mlxsw_driver->resources_register(mlxsw_core);
+		if (err)
+			goto err_register_resources;
+	}
+
 	err = mlxsw_ports_init(mlxsw_core);
 	if (err)
 		goto err_ports_init;
@@ -1067,6 +1073,8 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 err_ports_init:
 	mlxsw_bus->fini(bus_priv);
 err_bus_init:
+	devlink_resources_unregister(devlink, NULL);
+err_register_resources:
 	devlink_free(devlink);
 err_devlink_alloc:
 	mlxsw_core_driver_put(device_kind);
@@ -1086,6 +1094,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core)
 	mlxsw_emad_fini(mlxsw_core);
 	kfree(mlxsw_core->lag.mapping);
 	mlxsw_ports_fini(mlxsw_core);
+	devlink_resources_unregister(devlink, NULL);
 	mlxsw_core->bus->fini(mlxsw_core->bus_priv);
 	devlink_free(devlink);
 	mlxsw_core_driver_put(device_kind);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 34dda96..e23f83b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -308,6 +308,7 @@ struct mlxsw_driver {
 				       u32 *p_cur, u32 *p_max);
 	void (*txhdr_construct)(struct sk_buff *skb,
 				const struct mlxsw_tx_info *tx_info);
+	int (*resources_register)(struct mlxsw_core *mlxsw_core);
 	u8 txhdr_len;
 	const struct mlxsw_config_profile *profile;
 };
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index d373df7..6e615ae 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3979,6 +3979,140 @@ static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
 	.resource_query_enable		= 1,
 };
 
+static bool
+mlxsw_sp_resource_kvd_granularity_validate(struct netlink_ext_ack *extack,
+					   u64 size)
+{
+	const struct mlxsw_config_profile *profile;
+
+	profile = &mlxsw_sp_config_profile;
+	if (size % profile->kvd_hash_granularity) {
+		NL_SET_ERR_MSG_MOD(extack, "resource set with wrong granularity");
+		return false;
+	}
+	return true;
+}
+
+static int
+mlxsw_sp_resource_kvd_size_validate(struct devlink *devlink, u64 size,
+				    struct netlink_ext_ack *extack)
+{
+	NL_SET_ERR_MSG_MOD(extack, "kvd size cannot be changed");
+	return -EINVAL;
+}
+
+static int
+mlxsw_sp_resource_kvd_linear_size_validate(struct devlink *devlink, u64 size,
+					   struct netlink_ext_ack *extack)
+{
+	if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int
+mlxsw_sp_resource_kvd_hash_single_size_validate(struct devlink *devlink, u64 size,
+						struct netlink_ext_ack *extack)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+
+	if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
+		return -EINVAL;
+
+	if (size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE)) {
+		NL_SET_ERR_MSG_MOD(extack, "hash single size is smaller than minimum");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int
+mlxsw_sp_resource_kvd_hash_double_size_validate(struct devlink *devlink, u64 size,
+						struct netlink_ext_ack *extack)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+
+	if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
+		return -EINVAL;
+
+	if (size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE)) {
+		NL_SET_ERR_MSG_MOD(extack, "hash double size is smaller than minimum");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct devlink_resource_ops mlxsw_sp_resource_kvd_ops = {
+	.size_validate = mlxsw_sp_resource_kvd_size_validate,
+};
+
+static struct devlink_resource_ops mlxsw_sp_resource_kvd_linear_ops = {
+	.size_validate = mlxsw_sp_resource_kvd_linear_size_validate,
+};
+
+static struct devlink_resource_ops mlxsw_sp_resource_kvd_hash_single_ops = {
+	.size_validate = mlxsw_sp_resource_kvd_hash_single_size_validate,
+};
+
+static struct devlink_resource_ops mlxsw_sp_resource_kvd_hash_double_ops = {
+	.size_validate = mlxsw_sp_resource_kvd_hash_double_size_validate,
+};
+
+static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	u32 kvd_size, single_size, double_size, linear_size;
+	const struct mlxsw_config_profile *profile;
+	int err;
+
+	profile = &mlxsw_sp_config_profile;
+	if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SIZE))
+		return -EIO;
+
+	kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD,
+					true, kvd_size,
+					MLXSW_SP_RESOURCE_KVD,
+					DEVLINK_RESOURCE_ID_PARENT_TOP,
+					&mlxsw_sp_resource_kvd_ops);
+	if (err)
+		return err;
+
+	linear_size = profile->kvd_linear_size;
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR,
+					false, linear_size,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					MLXSW_SP_RESOURCE_KVD,
+					&mlxsw_sp_resource_kvd_linear_ops);
+	if (err)
+		return err;
+
+	double_size = kvd_size - linear_size;
+	double_size *= profile->kvd_hash_double_parts;
+	double_size /= profile->kvd_hash_double_parts +
+		       profile->kvd_hash_single_parts;
+	double_size = rounddown(double_size, profile->kvd_hash_granularity);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE,
+					false, double_size,
+					MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+					MLXSW_SP_RESOURCE_KVD,
+					&mlxsw_sp_resource_kvd_hash_double_ops);
+	if (err)
+		return err;
+
+	single_size = kvd_size - double_size - linear_size;
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE,
+					false, single_size,
+					MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+					MLXSW_SP_RESOURCE_KVD,
+					&mlxsw_sp_resource_kvd_hash_single_ops);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 static struct mlxsw_driver mlxsw_sp_driver = {
 	.kind				= mlxsw_sp_driver_name,
 	.priv_size			= sizeof(struct mlxsw_sp),
@@ -3998,6 +4132,7 @@ static struct mlxsw_driver mlxsw_sp_driver = {
 	.sb_occ_port_pool_get		= mlxsw_sp_sb_occ_port_pool_get,
 	.sb_occ_tc_port_bind_get	= mlxsw_sp_sb_occ_tc_port_bind_get,
 	.txhdr_construct		= mlxsw_sp_txhdr_construct,
+	.resources_register		= mlxsw_sp_resources_register,
 	.txhdr_len			= MLXSW_TXHDR_LEN,
 	.profile			= &mlxsw_sp_config_profile,
 };
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index a0adcd8..5abc8c5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -66,6 +66,18 @@
 #define MLXSW_SP_KVD_LINEAR_SIZE 98304 /* entries */
 #define MLXSW_SP_KVD_GRANULARITY 128
 
+#define MLXSW_SP_RESOURCE_NAME_KVD "kvd"
+#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR "linear"
+#define MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE "hash_single"
+#define MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE "hash_double"
+
+enum mlxsw_sp_resource_id {
+	MLXSW_SP_RESOURCE_KVD,
+	MLXSW_SP_RESOURCE_KVD_LINEAR,
+	MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+	MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+};
+
 struct mlxsw_sp_port;
 struct mlxsw_sp_rif;
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 08/10] mlxsw: spectrum: Add support for getting kvdl occupancy
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Add support for getting the kvdl occupancy through the resource interface.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     |  9 ++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  1 +
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c    | 26 ++++++++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 6e615ae..68e7d13 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4043,12 +4043,21 @@ mlxsw_sp_resource_kvd_hash_double_size_validate(struct devlink *devlink, u64 siz
 	return 0;
 }
 
+static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+
+	return mlxsw_sp_kvdl_occ_get(mlxsw_sp);
+}
+
 static struct devlink_resource_ops mlxsw_sp_resource_kvd_ops = {
 	.size_validate = mlxsw_sp_resource_kvd_size_validate,
 };
 
 static struct devlink_resource_ops mlxsw_sp_resource_kvd_linear_ops = {
 	.size_validate = mlxsw_sp_resource_kvd_linear_size_validate,
+	.occ_get = mlxsw_sp_resource_kvd_linear_occ_get,
 };
 
 static struct devlink_resource_ops mlxsw_sp_resource_kvd_hash_single_ops = {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 5abc8c5..ff8d32b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -469,6 +469,7 @@ void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index);
 int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
 				   unsigned int entry_count,
 				   unsigned int *p_alloc_size);
+u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp);
 
 struct mlxsw_sp_acl_rule_info {
 	unsigned int priority;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
index 310c382..cfacc17 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -286,6 +286,32 @@ static void mlxsw_sp_kvdl_parts_fini(struct mlxsw_sp *mlxsw_sp)
 		mlxsw_sp_kvdl_part_fini(mlxsw_sp, i);
 }
 
+u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part *part)
+{
+	unsigned int nr_entries;
+	int bit = -1;
+	u64 occ = 0;
+
+	nr_entries = (part->info->end_index -
+		      part->info->start_index + 1) /
+		      part->info->alloc_size;
+	while ((bit = find_next_bit(part->usage, nr_entries, bit + 1))
+		< nr_entries)
+		occ += part->info->alloc_size;
+	return occ;
+}
+
+u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_kvdl_part *part;
+	u64 occ = 0;
+
+	list_for_each_entry(part, &mlxsw_sp->kvdl->parts_list, list)
+		occ += mlxsw_sp_kvdl_part_occ(part);
+
+	return occ;
+}
+
 int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct mlxsw_sp_kvdl *kvdl;
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 09/10] mlxsw: pci: Add support for getting resource through devlink
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Up until now the KVD partition was static. This patch introduces the
ability to get the resource sizes via devlink. In case the resource is not
available the default configuration is used.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c     | 16 ++++++++
 drivers/net/ethernet/mellanox/mlxsw/core.h     |  9 ++++
 drivers/net/ethernet/mellanox/mlxsw/pci.c      | 40 +++++-------------
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 57 ++++++++++++++++++++++++++
 4 files changed, 92 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 2488662..9fe25b1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1800,6 +1800,22 @@ void mlxsw_core_flush_owq(void)
 }
 EXPORT_SYMBOL(mlxsw_core_flush_owq);
 
+int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size)
+{
+	struct mlxsw_driver *driver =  mlxsw_core->driver;
+
+	if (!driver->kvd_sizes_get)
+		return -EINVAL;
+
+	return driver->kvd_sizes_get(mlxsw_core, profile,
+				     p_single_size, p_double_size,
+				     p_linear_size);
+}
+EXPORT_SYMBOL(mlxsw_core_kvd_sizes_get);
+
 static int __init mlxsw_core_module_init(void)
 {
 	int err;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e23f83b..e44061d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -309,10 +309,19 @@ struct mlxsw_driver {
 	void (*txhdr_construct)(struct sk_buff *skb,
 				const struct mlxsw_tx_info *tx_info);
 	int (*resources_register)(struct mlxsw_core *mlxsw_core);
+	int (*kvd_sizes_get)(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size);
 	u8 txhdr_len;
 	const struct mlxsw_config_profile *profile;
 };
 
+int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size);
+
 bool mlxsw_core_res_valid(struct mlxsw_core *mlxsw_core,
 			  enum mlxsw_res_id res_id);
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 39872fb..6468cf2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1053,38 +1053,18 @@ static int mlxsw_pci_resources_query(struct mlxsw_pci *mlxsw_pci, char *mbox,
 }
 
 static int
-mlxsw_pci_profile_get_kvd_sizes(const struct mlxsw_config_profile *profile,
+mlxsw_pci_profile_get_kvd_sizes(const struct mlxsw_pci *mlxsw_pci,
+				const struct mlxsw_config_profile *profile,
 				struct mlxsw_res *res)
 {
-	u32 single_size, double_size, linear_size;
-
-	if (!MLXSW_RES_VALID(res, KVD_SINGLE_MIN_SIZE) ||
-	    !MLXSW_RES_VALID(res, KVD_DOUBLE_MIN_SIZE) ||
-	    !profile->used_kvd_split_data)
-		return -EIO;
-
-	linear_size = profile->kvd_linear_size;
+	u64 single_size, double_size, linear_size;
+	int err;
 
-	/* The hash part is what left of the kvd without the
-	 * linear part. It is split to the single size and
-	 * double size by the parts ratio from the profile.
-	 * Both sizes must be a multiplications of the
-	 * granularity from the profile.
-	 */
-	double_size = MLXSW_RES_GET(res, KVD_SIZE) - linear_size;
-	double_size *= profile->kvd_hash_double_parts;
-	double_size /= profile->kvd_hash_double_parts +
-		       profile->kvd_hash_single_parts;
-	double_size /= profile->kvd_hash_granularity;
-	double_size *= profile->kvd_hash_granularity;
-	single_size = MLXSW_RES_GET(res, KVD_SIZE) - double_size -
-		      linear_size;
-
-	/* Check results are legal. */
-	if (single_size < MLXSW_RES_GET(res, KVD_SINGLE_MIN_SIZE) ||
-	    double_size < MLXSW_RES_GET(res, KVD_DOUBLE_MIN_SIZE) ||
-	    MLXSW_RES_GET(res, KVD_SIZE) < linear_size)
-		return -EIO;
+	err = mlxsw_core_kvd_sizes_get(mlxsw_pci->core, profile,
+				       &single_size, &double_size,
+				       &linear_size);
+	if (err)
+		return err;
 
 	MLXSW_RES_SET(res, KVD_SINGLE_SIZE, single_size);
 	MLXSW_RES_SET(res, KVD_DOUBLE_SIZE, double_size);
@@ -1185,7 +1165,7 @@ static int mlxsw_pci_config_profile(struct mlxsw_pci *mlxsw_pci, char *mbox,
 			mbox, profile->adaptive_routing_group_cap);
 	}
 	if (MLXSW_RES_VALID(res, KVD_SIZE)) {
-		err = mlxsw_pci_profile_get_kvd_sizes(profile, res);
+		err = mlxsw_pci_profile_get_kvd_sizes(mlxsw_pci, profile, res);
 		if (err)
 			return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 68e7d13..ab41438 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4122,6 +4122,62 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
 	return 0;
 }
 
+static int mlxsw_sp_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+				  const struct mlxsw_config_profile *profile,
+				  u64 *p_single_size, u64 *p_double_size,
+				  u64 *p_linear_size)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	u64 double_size;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SINGLE_MIN_SIZE) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_core, KVD_DOUBLE_MIN_SIZE) ||
+	    !profile->used_kvd_split_data)
+		return -EIO;
+
+	/* The hash part is what left of the kvd without the
+	 * linear part. It is split to the single size and
+	 * double size by the parts ratio from the profile.
+	 * Both sizes must be a multiplications of the
+	 * granularity from the profile. In case the user
+	 * provided the sizes they are obtained via devlink.
+	 */
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					p_linear_size);
+	if (err)
+		*p_linear_size = profile->kvd_linear_size;
+
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+					p_double_size);
+	if (err) {
+		double_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) -
+			      *p_linear_size;
+		double_size *= profile->kvd_hash_double_parts;
+		double_size /= profile->kvd_hash_double_parts +
+			       profile->kvd_hash_single_parts;
+		*p_double_size = rounddown(double_size,
+					   profile->kvd_hash_granularity);
+	}
+
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+					p_single_size);
+	if (err)
+		*p_single_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) -
+				 *p_double_size - *p_linear_size;
+
+	/* Check results are legal. */
+	if (*p_single_size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE) ||
+	    *p_double_size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE) ||
+	    MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) < *p_linear_size)
+		return -EIO;
+
+	return 0;
+}
+
 static struct mlxsw_driver mlxsw_sp_driver = {
 	.kind				= mlxsw_sp_driver_name,
 	.priv_size			= sizeof(struct mlxsw_sp),
@@ -4142,6 +4198,7 @@ static struct mlxsw_driver mlxsw_sp_driver = {
 	.sb_occ_tc_port_bind_get	= mlxsw_sp_sb_occ_tc_port_bind_get,
 	.txhdr_construct		= mlxsw_sp_txhdr_construct,
 	.resources_register		= mlxsw_sp_resources_register,
+	.kvd_sizes_get			= mlxsw_sp_kvd_sizes_get,
 	.txhdr_len			= MLXSW_TXHDR_LEN,
 	.profile			= &mlxsw_sp_config_profile,
 };
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 10/10] mlxsw: core: Add support for reload
From: Jiri Pirko @ 2017-12-20 11:58 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171220115821.22171-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Add support for hot reload. First, all the driver/core resources are
released but the PCI and devlink instances, then reset is performed
through the PCI interface. Finally the driver performs initialization.

In case of reload failure the driver is left in a partially initialized
state. Special care is taken during the driver removal in order to
properly handle this state.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c | 64 +++++++++++++++++++++++-------
 drivers/net/ethernet/mellanox/mlxsw/core.h |  5 ++-
 drivers/net/ethernet/mellanox/mlxsw/i2c.c  |  5 ++-
 drivers/net/ethernet/mellanox/mlxsw/pci.c  |  5 ++-
 4 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 9fe25b1..4b33919 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -113,6 +113,7 @@ struct mlxsw_core {
 	struct mlxsw_thermal *thermal;
 	struct mlxsw_core_port *ports;
 	unsigned int max_ports;
+	bool reload_fail;
 	unsigned long driver_priv[0];
 	/* driver_priv has to be always the last item */
 };
@@ -962,7 +963,28 @@ mlxsw_devlink_sb_occ_tc_port_bind_get(struct devlink_port *devlink_port,
 						     pool_type, p_cur, p_max);
 }
 
+static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	const struct mlxsw_bus *mlxsw_bus = mlxsw_core->bus;
+	int err;
+
+	if (!mlxsw_bus->reset)
+		return -EOPNOTSUPP;
+
+	mlxsw_core_bus_device_unregister(mlxsw_core, true);
+	mlxsw_bus->reset(mlxsw_core->bus_priv);
+	err = mlxsw_core_bus_device_register(mlxsw_core->bus_info,
+					     mlxsw_core->bus,
+					     mlxsw_core->bus_priv, true,
+					     devlink);
+	if (err)
+		mlxsw_core->reload_fail = true;
+	return err;
+}
+
 static const struct devlink_ops mlxsw_devlink_ops = {
+	.reload				= mlxsw_devlink_core_bus_device_reload,
 	.port_type_set			= mlxsw_devlink_port_type_set,
 	.port_split			= mlxsw_devlink_port_split,
 	.port_unsplit			= mlxsw_devlink_port_unsplit,
@@ -980,23 +1002,26 @@ static const struct devlink_ops mlxsw_devlink_ops = {
 
 int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 				   const struct mlxsw_bus *mlxsw_bus,
-				   void *bus_priv)
+				   void *bus_priv, bool reload,
+				   struct devlink *devlink)
 {
 	const char *device_kind = mlxsw_bus_info->device_kind;
 	struct mlxsw_core *mlxsw_core;
 	struct mlxsw_driver *mlxsw_driver;
-	struct devlink *devlink;
 	size_t alloc_size;
 	int err;
 
 	mlxsw_driver = mlxsw_core_driver_get(device_kind);
 	if (!mlxsw_driver)
 		return -EINVAL;
-	alloc_size = sizeof(*mlxsw_core) + mlxsw_driver->priv_size;
-	devlink = devlink_alloc(&mlxsw_devlink_ops, alloc_size);
-	if (!devlink) {
-		err = -ENOMEM;
-		goto err_devlink_alloc;
+
+	if (!reload) {
+		alloc_size = sizeof(*mlxsw_core) + mlxsw_driver->priv_size;
+		devlink = devlink_alloc(&mlxsw_devlink_ops, alloc_size);
+		if (!devlink) {
+			err = -ENOMEM;
+			goto err_devlink_alloc;
+		}
 	}
 
 	mlxsw_core = devlink_priv(devlink);
@@ -1012,7 +1037,7 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 	if (err)
 		goto err_bus_init;
 
-	if (mlxsw_driver->resources_register) {
+	if (mlxsw_driver->resources_register && !reload) {
 		err = mlxsw_driver->resources_register(mlxsw_core);
 		if (err)
 			goto err_register_resources;
@@ -1038,9 +1063,11 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 	if (err)
 		goto err_emad_init;
 
-	err = devlink_register(devlink, mlxsw_bus_info->dev);
-	if (err)
-		goto err_devlink_register;
+	if (!reload) {
+		err = devlink_register(devlink, mlxsw_bus_info->dev);
+		if (err)
+			goto err_devlink_register;
+	}
 
 	err = mlxsw_hwmon_init(mlxsw_core, mlxsw_bus_info, &mlxsw_core->hwmon);
 	if (err)
@@ -1082,20 +1109,29 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 }
 EXPORT_SYMBOL(mlxsw_core_bus_device_register);
 
-void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core)
+void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
+				      bool reload)
 {
 	const char *device_kind = mlxsw_core->bus_info->device_kind;
 	struct devlink *devlink = priv_to_devlink(mlxsw_core);
 
+	if (mlxsw_core->reload_fail)
+		goto out;
+
 	if (mlxsw_core->driver->fini)
 		mlxsw_core->driver->fini(mlxsw_core);
 	mlxsw_thermal_fini(mlxsw_core->thermal);
-	devlink_unregister(devlink);
+	if (!reload)
+		devlink_unregister(devlink);
 	mlxsw_emad_fini(mlxsw_core);
 	kfree(mlxsw_core->lag.mapping);
 	mlxsw_ports_fini(mlxsw_core);
-	devlink_resources_unregister(devlink, NULL);
+	if (!reload)
+		devlink_resources_unregister(devlink, NULL);
 	mlxsw_core->bus->fini(mlxsw_core->bus_priv);
+	if (reload)
+		return;
+out:
 	devlink_free(devlink);
 	mlxsw_core_driver_put(device_kind);
 }
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e44061d..5ddafd7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -66,8 +66,9 @@ void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver);
 
 int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 				   const struct mlxsw_bus *mlxsw_bus,
-				   void *bus_priv);
-void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core);
+				   void *bus_priv, bool reload,
+				   struct devlink *devlink);
+void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, bool reload);
 
 struct mlxsw_tx_info {
 	u8 local_port;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.c b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
index c0dcfa0..25f9915 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/i2c.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
@@ -539,7 +539,8 @@ static int mlxsw_i2c_probe(struct i2c_client *client,
 	mlxsw_i2c->dev = &client->dev;
 
 	err = mlxsw_core_bus_device_register(&mlxsw_i2c->bus_info,
-					     &mlxsw_i2c_bus, mlxsw_i2c);
+					     &mlxsw_i2c_bus, mlxsw_i2c, false,
+					     NULL);
 	if (err) {
 		dev_err(&client->dev, "Fail to register core bus\n");
 		return err;
@@ -557,7 +558,7 @@ static int mlxsw_i2c_remove(struct i2c_client *client)
 {
 	struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client);
 
-	mlxsw_core_bus_device_unregister(mlxsw_i2c->core);
+	mlxsw_core_bus_device_unregister(mlxsw_i2c->core, false);
 	mutex_destroy(&mlxsw_i2c->cmd.lock);
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 6468cf2..ceb8785 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1734,7 +1734,8 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mlxsw_pci->id = id;
 
 	err = mlxsw_core_bus_device_register(&mlxsw_pci->bus_info,
-					     &mlxsw_pci_bus, mlxsw_pci);
+					     &mlxsw_pci_bus, mlxsw_pci, false,
+					     NULL);
 	if (err) {
 		dev_err(&pdev->dev, "cannot register bus device\n");
 		goto err_bus_device_register;
@@ -1762,7 +1763,7 @@ static void mlxsw_pci_remove(struct pci_dev *pdev)
 {
 	struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev);
 
-	mlxsw_core_bus_device_unregister(mlxsw_pci->core);
+	mlxsw_core_bus_device_unregister(mlxsw_pci->core, false);
 	mlxsw_pci_free_irq_vectors(mlxsw_pci);
 	iounmap(mlxsw_pci->hw_addr);
 	pci_release_regions(mlxsw_pci->pdev);
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 0/2] bpftool improvements for xlated dump
From: Daniel Borkmann @ 2017-12-20 12:42 UTC (permalink / raw)
  To: alexei.starovoitov
  Cc: netdev, jakub.kicinski, quentin.monnet, Daniel Borkmann

This work adds correlation of maps and calls into the bpftool
xlated dump in order to help debugging and introspection of
loaded BPF progs. First patch makes kallsyms work on subprogs
with bpf calls, and second implements the actual correlation.
Details and example output can be found in the 2nd patch.

Thanks a lot!

Daniel Borkmann (2):
  bpf: fix kallsyms handling for subprogs
  bpf: allow for correlation of maps and helpers in dump

 include/linux/filter.h   |   9 +++
 kernel/bpf/core.c        |   4 +-
 kernel/bpf/disasm.c      |  65 ++++++++++++++---
 kernel/bpf/disasm.h      |  29 ++++++--
 kernel/bpf/syscall.c     |  93 ++++++++++++++++++++++--
 kernel/bpf/verifier.c    |  33 +++++++--
 tools/bpf/bpftool/prog.c | 181 ++++++++++++++++++++++++++++++++++++++++++++---
 7 files changed, 379 insertions(+), 35 deletions(-)

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next 1/2] bpf: fix kallsyms handling for subprogs
From: Daniel Borkmann @ 2017-12-20 12:42 UTC (permalink / raw)
  To: alexei.starovoitov
  Cc: netdev, jakub.kicinski, quentin.monnet, Daniel Borkmann
In-Reply-To: <20171220124257.4512-1-daniel@iogearbox.net>

Right now kallsyms handling is not working with JITed subprogs.
The reason is that when in 1c2a088a6626 ("bpf: x64: add JIT support
for multi-function programs") in jit_subprogs() they are passed
to bpf_prog_kallsyms_add(), then their prog type is 0, which BPF
core will think it's a cBPF program as only cBPF programs have a
0 type. Thus, they need to inherit the type from the main prog.

Once that is fixed, they are indeed added to the BPF kallsyms
infra, but their tag is 0. Therefore, since intention is to add
them as bpf_prog_F_<tag>, we need to pass them to bpf_prog_calc_tag()
first. And once this is resolved, there is a use-after-free on
prog cleanup: we remove the kallsyms entry from the main prog,
later walk all subprogs and call bpf_jit_free() on them. However,
the kallsyms linkage was never released on them. Thus, do that
for all subprogs right in __bpf_prog_put() when refcount hits 0.

Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c  | 6 ++++++
 kernel/bpf/verifier.c | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e2e1c78..30e728d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -937,10 +937,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		int i;
+
 		trace_bpf_prog_put_rcu(prog);
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
+
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			bpf_prog_kallsyms_del(prog->aux->func[i]);
 		bpf_prog_kallsyms_del(prog);
+
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 48b2901..811e6e9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5062,7 +5062,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			goto out_free;
 		memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
 		       len * sizeof(struct bpf_insn));
+		func[i]->type = prog->type;
 		func[i]->len = len;
+		if (bpf_prog_calc_tag(func[i]))
+			goto out_free;
 		func[i]->is_func = 1;
 		/* Use bpf_prog_F_tag to indicate functions in stack traces.
 		 * Long term would need debug info to populate names
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 2/2] bpf: allow for correlation of maps and helpers in dump
From: Daniel Borkmann @ 2017-12-20 12:42 UTC (permalink / raw)
  To: alexei.starovoitov
  Cc: netdev, jakub.kicinski, quentin.monnet, Daniel Borkmann
In-Reply-To: <20171220124257.4512-1-daniel@iogearbox.net>

Currently a dump of an xlated prog (post verifier stage) doesn't
correlate used helpers as well as maps. The prog info lists
involved map ids, however there's no correlation of where in the
program they are used as of today. Likewise, bpftool does not
correlate helper calls with the target functions.

The latter can be done w/o any kernel changes through kallsyms,
and also has the advantage that this works with inlined helpers
and BPF calls.

Example, via interpreter:

  # tc filter show dev foo ingress
  filter protocol all pref 49152 bpf chain 0
  filter protocol all pref 49152 bpf chain 0 handle 0x1 foo.o:[ingress] \
                      direct-action not_in_hw id 1 tag c74773051b364165   <-- prog id:1

  * Output before patch (calls/maps remain unclear):

  # bpftool prog dump xlated id 1             <-- dump prog id:1
   0: (b7) r1 = 2
   1: (63) *(u32 *)(r10 -4) = r1
   2: (bf) r2 = r10
   3: (07) r2 += -4
   4: (18) r1 = 0xffff95c47a8d4800
   6: (85) call unknown#73040
   7: (15) if r0 == 0x0 goto pc+18
   8: (bf) r2 = r10
   9: (07) r2 += -4
  10: (bf) r1 = r0
  11: (85) call unknown#73040
  12: (15) if r0 == 0x0 goto pc+23
  [...]

  * Output after patch:

  # bpftool prog dump xlated id 1
   0: (b7) r1 = 2
   1: (63) *(u32 *)(r10 -4) = r1
   2: (bf) r2 = r10
   3: (07) r2 += -4
   4: (18) r1 = map[id:2]                     <-- map id:2
   6: (85) call bpf_map_lookup_elem#73424     <-- helper call
   7: (15) if r0 == 0x0 goto pc+18
   8: (bf) r2 = r10
   9: (07) r2 += -4
  10: (bf) r1 = r0
  11: (85) call bpf_map_lookup_elem#73424
  12: (15) if r0 == 0x0 goto pc+23
  [...]

  # bpftool map show id 2                     <-- show/dump/etc map id:2
  2: hash_of_maps  flags 0x0
        key 4B  value 4B  max_entries 3  memlock 4096B

Example, JITed, same prog:

  # tc filter show dev foo ingress
  filter protocol all pref 49152 bpf chain 0
  filter protocol all pref 49152 bpf chain 0 handle 0x1 foo.o:[ingress] \
                  direct-action not_in_hw id 3 tag c74773051b364165 jited

  # bpftool prog show id 3
  3: sched_cls  tag c74773051b364165
        loaded_at Dec 19/13:48  uid 0
        xlated 384B  jited 257B  memlock 4096B  map_ids 2

  # bpftool prog dump xlated id 3
   0: (b7) r1 = 2
   1: (63) *(u32 *)(r10 -4) = r1
   2: (bf) r2 = r10
   3: (07) r2 += -4
   4: (18) r1 = map[id:2]                      <-- map id:2
   6: (85) call __htab_map_lookup_elem#77408   <-+ inlined rewrite
   7: (15) if r0 == 0x0 goto pc+2                |
   8: (07) r0 += 56                              |
   9: (79) r0 = *(u64 *)(r0 +0)                <-+
  10: (15) if r0 == 0x0 goto pc+24
  11: (bf) r2 = r10
  12: (07) r2 += -4
  [...]

Example, same prog, but kallsyms disabled (in that case we are
also not allowed to pass any relative offsets, etc, so prog
becomes pointer sanitized on dump):

  # sysctl kernel.kptr_restrict=2
  kernel.kptr_restrict = 2

  # bpftool prog dump xlated id 3
   0: (b7) r1 = 2
   1: (63) *(u32 *)(r10 -4) = r1
   2: (bf) r2 = r10
   3: (07) r2 += -4
   4: (18) r1 = map[id:2]
   6: (85) call bpf_unspec#0
   7: (15) if r0 == 0x0 goto pc+2
  [...]

Example, BPF calls via interpreter:

  # bpftool prog dump xlated id 1
   0: (85) call pc+2#__bpf_prog_run_args32
   1: (b7) r0 = 1
   2: (95) exit
   3: (b7) r0 = 2
   4: (95) exit

Example, BPF calls via JIT:

  # sysctl net.core.bpf_jit_enable=1
  net.core.bpf_jit_enable = 1
  # sysctl net.core.bpf_jit_kallsyms=1
  net.core.bpf_jit_kallsyms = 1

  # bpftool prog dump xlated id 1
   0: (85) call pc+2#bpf_prog_3b185187f1855c4c_F
   1: (b7) r0 = 1
   2: (95) exit
   3: (b7) r0 = 2
   4: (95) exit

And finally, an example for tail calls that is now working
as well wrt correlation:

  # bpftool prog dump xlated id 2
  [...]
  10: (b7) r2 = 8
  11: (85) call bpf_trace_printk#-41312
  12: (bf) r1 = r6
  13: (18) r2 = map[id:1]
  15: (b7) r3 = 0
  16: (85) call bpf_tail_call#12
  17: (b7) r1 = 42
  18: (6b) *(u16 *)(r6 +46) = r1
  19: (b7) r0 = 0
  20: (95) exit

  # bpftool map show id 1
  1: prog_array  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h   |   9 +++
 kernel/bpf/core.c        |   4 +-
 kernel/bpf/disasm.c      |  65 ++++++++++++++---
 kernel/bpf/disasm.h      |  29 ++++++--
 kernel/bpf/syscall.c     |  87 +++++++++++++++++++++--
 kernel/bpf/verifier.c    |  30 ++++++--
 tools/bpf/bpftool/prog.c | 181 ++++++++++++++++++++++++++++++++++++++++++++---
 7 files changed, 370 insertions(+), 35 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index e872b4e..2b0df27 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -18,6 +18,7 @@
 #include <linux/capability.h>
 #include <linux/cryptohash.h>
 #include <linux/set_memory.h>
+#include <linux/kallsyms.h>
 
 #include <net/sch_generic.h>
 
@@ -724,6 +725,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_helper_changes_pkt_data(void *func);
 
+static inline bool bpf_dump_raw_ok(void)
+{
+	/* Reconstruction of call-sites is dependent on kallsyms,
+	 * thus make dump the same restriction.
+	 */
+	return kallsyms_show_value() == 1;
+}
+
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 768e0a0..70a5345 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -771,7 +771,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 
 /* Base function for offset calculation. Needs to go into .text section,
  * therefore keeping it non-static as well; will also be used by JITs
- * anyway later on, so do not let the compiler omit it.
+ * anyway later on, so do not let the compiler omit it. This also needs
+ * to go into kallsyms for correlation from e.g. bpftool, so naming
+ * must not change.
  */
 noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 883f88f..8740406 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -21,10 +21,39 @@ static const char * const func_id_str[] = {
 };
 #undef __BPF_FUNC_STR_FN
 
-const char *func_id_name(int id)
+static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
+				   const struct bpf_insn *insn,
+				   char *buff, size_t len)
 {
 	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
 
+	if (insn->src_reg != BPF_PSEUDO_CALL &&
+	    insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
+	    func_id_str[insn->imm])
+		return func_id_str[insn->imm];
+
+	if (cbs && cbs->cb_call)
+		return cbs->cb_call(cbs->private_data, insn);
+
+	if (insn->src_reg == BPF_PSEUDO_CALL)
+		snprintf(buff, len, "%+d", insn->imm);
+
+	return buff;
+}
+
+static const char *__func_imm_name(const struct bpf_insn_cbs *cbs,
+				   const struct bpf_insn *insn,
+				   u64 full_imm, char *buff, size_t len)
+{
+	if (cbs && cbs->cb_imm)
+		return cbs->cb_imm(cbs->private_data, insn, full_imm);
+
+	snprintf(buff, len, "0x%llx", (unsigned long long)full_imm);
+	return buff;
+}
+
+const char *func_id_name(int id)
+{
 	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
 		return func_id_str[id];
 	else
@@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = {
 	[BPF_EXIT >> 4] = "exit",
 };
 
-static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+static void print_bpf_end_insn(bpf_insn_print_t verbose,
 			       struct bpf_verifier_env *env,
 			       const struct bpf_insn *insn)
 {
@@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose,
 		insn->imm, insn->dst_reg);
 }
 
-void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
-		    const struct bpf_insn *insn, bool allow_ptr_leaks)
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+		    struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn,
+		    bool allow_ptr_leaks)
 {
+	const bpf_insn_print_t verbose = cbs->cb_print;
 	u8 class = BPF_CLASS(insn->code);
 
 	if (class == BPF_ALU || class == BPF_ALU64) {
@@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
 			 */
 			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
 			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+			char tmp[64];
 
 			if (map_ptr && !allow_ptr_leaks)
 				imm = 0;
 
-			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
-				insn->dst_reg, (unsigned long long)imm);
+			verbose(env, "(%02x) r%d = %s\n",
+				insn->code, insn->dst_reg,
+				__func_imm_name(cbs, insn, imm,
+						tmp, sizeof(tmp)));
 		} else {
 			verbose(env, "BUG_ld_%02x\n", insn->code);
 			return;
@@ -189,12 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
 		u8 opcode = BPF_OP(insn->code);
 
 		if (opcode == BPF_CALL) {
-			if (insn->src_reg == BPF_PSEUDO_CALL)
-				verbose(env, "(%02x) call pc%+d\n", insn->code,
-					insn->imm);
-			else
+			char tmp[64];
+
+			if (insn->src_reg == BPF_PSEUDO_CALL) {
+				verbose(env, "(%02x) call pc%s\n",
+					insn->code,
+					__func_get_name(cbs, insn,
+							tmp, sizeof(tmp)));
+			} else {
+				strcpy(tmp, "unknown");
 				verbose(env, "(%02x) call %s#%d\n", insn->code,
-					func_id_name(insn->imm), insn->imm);
+					__func_get_name(cbs, insn,
+							tmp, sizeof(tmp)),
+					insn->imm);
+			}
 		} else if (insn->code == (BPF_JMP | BPF_JA)) {
 			verbose(env, "(%02x) goto pc%+d\n",
 				insn->code, insn->off);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index 8de977e..e0857d0 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -17,16 +17,35 @@
 #include <linux/bpf.h>
 #include <linux/kernel.h>
 #include <linux/stringify.h>
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <string.h>
+#endif
+
+struct bpf_verifier_env;
 
 extern const char *const bpf_alu_string[16];
 extern const char *const bpf_class_string[8];
 
 const char *func_id_name(int id);
 
-struct bpf_verifier_env;
-typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
-				  const char *, ...);
-void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
-		    const struct bpf_insn *insn, bool allow_ptr_leaks);
+typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+				 const char *, ...);
+typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
+					      const struct bpf_insn *insn);
+typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
+					    const struct bpf_insn *insn,
+					    __u64 full_imm);
+
+struct bpf_insn_cbs {
+	bpf_insn_print_t	cb_print;
+	bpf_insn_revmap_call_t	cb_call;
+	bpf_insn_print_imm_t	cb_imm;
+	void			*private_data;
+};
 
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+		    struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn,
+		    bool allow_ptr_leaks);
 #endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 30e728d..007802c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1558,6 +1558,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
+static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
+					      unsigned long addr)
+{
+	int i;
+
+	for (i = 0; i < prog->aux->used_map_cnt; i++)
+		if (prog->aux->used_maps[i] == (void *)addr)
+			return prog->aux->used_maps[i];
+	return NULL;
+}
+
+static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
+{
+	const struct bpf_map *map;
+	struct bpf_insn *insns;
+	u64 imm;
+	int i;
+
+	insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
+			GFP_USER);
+	if (!insns)
+		return insns;
+
+	for (i = 0; i < prog->len; i++) {
+		if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
+			insns[i].code = BPF_JMP | BPF_CALL;
+			insns[i].imm = BPF_FUNC_tail_call;
+			/* fall-through */
+		}
+		if (insns[i].code == (BPF_JMP | BPF_CALL) ||
+		    insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
+			if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
+				insns[i].code = BPF_JMP | BPF_CALL;
+			if (!bpf_dump_raw_ok())
+				insns[i].imm = 0;
+			continue;
+		}
+
+		if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
+			continue;
+
+		imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
+		map = bpf_map_from_imm(prog, imm);
+		if (map) {
+			insns[i].src_reg = BPF_PSEUDO_MAP_FD;
+			insns[i].imm = map->id;
+			insns[i + 1].imm = 0;
+			continue;
+		}
+
+		if (!bpf_dump_raw_ok() &&
+		    imm == (unsigned long)prog->aux) {
+			insns[i].imm = 0;
+			insns[i + 1].imm = 0;
+			continue;
+		}
+	}
+
+	return insns;
+}
+
 static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 				   const union bpf_attr *attr,
 				   union bpf_attr __user *uattr)
@@ -1608,18 +1669,34 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	ulen = info.jited_prog_len;
 	info.jited_prog_len = prog->jited_len;
 	if (info.jited_prog_len && ulen) {
-		uinsns = u64_to_user_ptr(info.jited_prog_insns);
-		ulen = min_t(u32, info.jited_prog_len, ulen);
-		if (copy_to_user(uinsns, prog->bpf_func, ulen))
-			return -EFAULT;
+		if (bpf_dump_raw_ok()) {
+			uinsns = u64_to_user_ptr(info.jited_prog_insns);
+			ulen = min_t(u32, info.jited_prog_len, ulen);
+			if (copy_to_user(uinsns, prog->bpf_func, ulen))
+				return -EFAULT;
+		} else {
+			info.jited_prog_insns = 0;
+		}
 	}
 
 	ulen = info.xlated_prog_len;
 	info.xlated_prog_len = bpf_prog_insn_size(prog);
 	if (info.xlated_prog_len && ulen) {
+		struct bpf_insn *insns_sanitized;
+		bool fault;
+
+		if (prog->blinded && !bpf_dump_raw_ok()) {
+			info.xlated_prog_insns = 0;
+			goto done;
+		}
+		insns_sanitized = bpf_insn_prepare_dump(prog);
+		if (!insns_sanitized)
+			return -ENOMEM;
 		uinsns = u64_to_user_ptr(info.xlated_prog_insns);
 		ulen = min_t(u32, info.xlated_prog_len, ulen);
-		if (copy_to_user(uinsns, prog->insnsi, ulen))
+		fault = copy_to_user(uinsns, insns_sanitized, ulen);
+		kfree(insns_sanitized);
+		if (fault)
 			return -EFAULT;
 	}
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 811e6e9..f423726 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4426,9 +4426,12 @@ static int do_check(struct bpf_verifier_env *env)
 		}
 
 		if (env->log.level) {
+			const struct bpf_insn_cbs cbs = {
+				.cb_print	= verbose,
+			};
+
 			verbose(env, "%d: ", insn_idx);
-			print_bpf_insn(verbose, env, insn,
-				       env->allow_ptr_leaks);
+			print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
 		}
 
 		err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
@@ -5016,14 +5019,14 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog, **func, *tmp;
 	int i, j, subprog_start, subprog_end = 0, len, subprog;
-	struct bpf_insn *insn = prog->insnsi;
+	struct bpf_insn *insn;
 	void *old_bpf_func;
 	int err = -ENOMEM;
 
 	if (env->subprog_cnt == 0)
 		return 0;
 
-	for (i = 0; i < prog->len; i++, insn++) {
+	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
@@ -5115,6 +5118,25 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		bpf_prog_lock_ro(func[i]);
 		bpf_prog_kallsyms_add(func[i]);
 	}
+
+	/* Last step: make now unused interpreter insns from main
+	 * prog consistent for later dump requests, so they can
+	 * later look the same as if they were interpreted only.
+	 */
+	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+		unsigned long addr;
+
+		if (insn->code != (BPF_JMP | BPF_CALL) ||
+		    insn->src_reg != BPF_PSEUDO_CALL)
+			continue;
+		insn->off = env->insn_aux_data[i].call_imm;
+		subprog = find_subprog(env, i + insn->off + 1);
+		addr  = (unsigned long)func[subprog + 1]->bpf_func;
+		addr &= PAGE_MASK;
+		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
+			    addr - __bpf_call_base;
+	}
+
 	prog->jited = 1;
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 037484c..42ee889 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -401,6 +401,88 @@ static int do_show(int argc, char **argv)
 	return err;
 }
 
+#define SYM_MAX_NAME	256
+
+struct kernel_sym {
+	unsigned long address;
+	char name[SYM_MAX_NAME];
+};
+
+struct dump_data {
+	unsigned long address_call_base;
+	struct kernel_sym *sym_mapping;
+	__u32 sym_count;
+	char scratch_buff[SYM_MAX_NAME];
+};
+
+static int kernel_syms_cmp(const void *sym_a, const void *sym_b)
+{
+	return ((struct kernel_sym *)sym_a)->address -
+	       ((struct kernel_sym *)sym_b)->address;
+}
+
+static void kernel_syms_load(struct dump_data *dd)
+{
+	struct kernel_sym *sym;
+	char buff[256];
+	void *tmp, *address;
+	FILE *fp;
+
+	fp = fopen("/proc/kallsyms", "r");
+	if (!fp)
+		return;
+
+	while (!feof(fp)) {
+		if (!fgets(buff, sizeof(buff), fp))
+			break;
+		tmp = realloc(dd->sym_mapping,
+			      (dd->sym_count + 1) *
+			      sizeof(*dd->sym_mapping));
+		if (!tmp) {
+out:
+			free(dd->sym_mapping);
+			dd->sym_mapping = NULL;
+			fclose(fp);
+			return;
+		}
+		dd->sym_mapping = tmp;
+		sym = &dd->sym_mapping[dd->sym_count];
+		if (sscanf(buff, "%p %*c %s", &address, sym->name) != 2)
+			continue;
+		sym->address = (unsigned long)address;
+		if (!strcmp(sym->name, "__bpf_call_base")) {
+			dd->address_call_base = sym->address;
+			/* sysctl kernel.kptr_restrict was set */
+			if (!sym->address)
+				goto out;
+		}
+		if (sym->address)
+			dd->sym_count++;
+	}
+
+	fclose(fp);
+
+	qsort(dd->sym_mapping, dd->sym_count,
+	      sizeof(*dd->sym_mapping), kernel_syms_cmp);
+}
+
+static void kernel_syms_destroy(struct dump_data *dd)
+{
+	free(dd->sym_mapping);
+}
+
+static struct kernel_sym *kernel_syms_search(struct dump_data *dd,
+					     unsigned long key)
+{
+	struct kernel_sym sym = {
+		.address = key,
+	};
+
+	return dd->sym_mapping ?
+	       bsearch(&sym, dd->sym_mapping, dd->sym_count,
+		       sizeof(*dd->sym_mapping), kernel_syms_cmp) : NULL;
+}
+
 static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...)
 {
 	va_list args;
@@ -410,8 +492,71 @@ static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...)
 	va_end(args);
 }
 
-static void dump_xlated_plain(void *buf, unsigned int len, bool opcodes)
+static const char *print_call_pcrel(struct dump_data *dd,
+				    struct kernel_sym *sym,
+				    unsigned long address,
+				    const struct bpf_insn *insn)
 {
+	if (sym)
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			 "%+d#%s", insn->off, sym->name);
+	else
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			 "%+d#0x%lx", insn->off, address);
+	return dd->scratch_buff;
+}
+
+static const char *print_call_helper(struct dump_data *dd,
+				     struct kernel_sym *sym,
+				     unsigned long address)
+{
+	if (sym)
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			 "%s", sym->name);
+	else
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			 "0x%lx", address);
+	return dd->scratch_buff;
+}
+
+static const char *print_call(void *private_data,
+			      const struct bpf_insn *insn)
+{
+	struct dump_data *dd = private_data;
+	unsigned long address = dd->address_call_base + insn->imm;
+	struct kernel_sym *sym;
+
+	sym = kernel_syms_search(dd, address);
+	if (insn->src_reg == BPF_PSEUDO_CALL)
+		return print_call_pcrel(dd, sym, address, insn);
+	else
+		return print_call_helper(dd, sym, address);
+}
+
+static const char *print_imm(void *private_data,
+			     const struct bpf_insn *insn,
+			     __u64 full_imm)
+{
+	struct dump_data *dd = private_data;
+
+	if (insn->src_reg == BPF_PSEUDO_MAP_FD)
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			 "map[id:%u]", insn->imm);
+	else
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			 "0x%llx", (unsigned long long)full_imm);
+	return dd->scratch_buff;
+}
+
+static void dump_xlated_plain(struct dump_data *dd, void *buf,
+			      unsigned int len, bool opcodes)
+{
+	const struct bpf_insn_cbs cbs = {
+		.cb_print	= print_insn,
+		.cb_call	= print_call,
+		.cb_imm		= print_imm,
+		.private_data	= dd,
+	};
 	struct bpf_insn *insn = buf;
 	bool double_insn = false;
 	unsigned int i;
@@ -425,7 +570,7 @@ static void dump_xlated_plain(void *buf, unsigned int len, bool opcodes)
 		double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW);
 
 		printf("% 4d: ", i);
-		print_bpf_insn(print_insn, NULL, insn + i, true);
+		print_bpf_insn(&cbs, NULL, insn + i, true);
 
 		if (opcodes) {
 			printf("       ");
@@ -454,8 +599,15 @@ static void print_insn_json(struct bpf_verifier_env *env, const char *fmt, ...)
 	va_end(args);
 }
 
-static void dump_xlated_json(void *buf, unsigned int len, bool opcodes)
+static void dump_xlated_json(struct dump_data *dd, void *buf,
+			     unsigned int len, bool opcodes)
 {
+	const struct bpf_insn_cbs cbs = {
+		.cb_print	= print_insn_json,
+		.cb_call	= print_call,
+		.cb_imm		= print_imm,
+		.private_data	= dd,
+	};
 	struct bpf_insn *insn = buf;
 	bool double_insn = false;
 	unsigned int i;
@@ -470,7 +622,7 @@ static void dump_xlated_json(void *buf, unsigned int len, bool opcodes)
 
 		jsonw_start_object(json_wtr);
 		jsonw_name(json_wtr, "disasm");
-		print_bpf_insn(print_insn_json, NULL, insn + i, true);
+		print_bpf_insn(&cbs, NULL, insn + i, true);
 
 		if (opcodes) {
 			jsonw_name(json_wtr, "opcodes");
@@ -505,6 +657,7 @@ static void dump_xlated_json(void *buf, unsigned int len, bool opcodes)
 static int do_dump(int argc, char **argv)
 {
 	struct bpf_prog_info info = {};
+	struct dump_data dd = {};
 	__u32 len = sizeof(info);
 	unsigned int buf_size;
 	char *filepath = NULL;
@@ -592,6 +745,14 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}
 
+	if ((member_len == &info.jited_prog_len &&
+	     info.jited_prog_insns == 0) ||
+	    (member_len == &info.xlated_prog_len &&
+	     info.xlated_prog_insns == 0)) {
+		p_err("error retrieving insn dump: kernel.kptr_restrict set?");
+		goto err_free;
+	}
+
 	if (filepath) {
 		fd = open(filepath, O_WRONLY | O_CREAT | O_TRUNC, 0600);
 		if (fd < 0) {
@@ -608,17 +769,19 @@ static int do_dump(int argc, char **argv)
 			goto err_free;
 		}
 	} else {
-		if (member_len == &info.jited_prog_len)
+		if (member_len == &info.jited_prog_len) {
 			disasm_print_insn(buf, *member_len, opcodes);
-		else
+		} else {
+			kernel_syms_load(&dd);
 			if (json_output)
-				dump_xlated_json(buf, *member_len, opcodes);
+				dump_xlated_json(&dd, buf, *member_len, opcodes);
 			else
-				dump_xlated_plain(buf, *member_len, opcodes);
+				dump_xlated_plain(&dd, buf, *member_len, opcodes);
+			kernel_syms_destroy(&dd);
+		}
 	}
 
 	free(buf);
-
 	return 0;
 
 err_free:
-- 
2.9.5

^ permalink raw reply related

* Re: sparc64 verifier failures..
From: Daniel Borkmann @ 2017-12-20 13:05 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, alexei.starovoitov
In-Reply-To: <20171219.153636.783486472609136549.davem@davemloft.net>

On 12/19/2017 09:36 PM, David Miller wrote:
> 
> I'm getting about 100 verifier failures on sparc64.
> 
> The vast majority of them seem to be due to misaligned packet
> accesses.  Here is a sample of some of the failures.

Thanks, I'll check it next days. It would probably make sense to have
a --strict-align mode for test_verifier that enables it on all tests
unconditionally so selftests suite would always run in both modes (at
least on archs that don't have this restriction).

^ permalink raw reply

* [PATCH net-next] net: packet: allow bind to device which is !IFF_UP
From: yuan linyu @ 2017-12-20 13:20 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, yuan linyu

From: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>

this try to allow tcpdump to capture packet once device IFF_UP

Signed-off-by: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
---
 net/packet/af_packet.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index da215e5..11b19fc 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3124,13 +3124,8 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 	if (proto == 0 || !need_rehook)
 		goto out_unlock;
 
-	if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
+	if (!unlisted)
 		register_prot_hook(sk);
-	} else {
-		sk->sk_err = ENETDOWN;
-		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
-	}
 
 out_unlock:
 	rcu_read_unlock();
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v3,net-next 2/2] ip6_gre: fix potential memory leak in ip6erspan_rcv
From: William Tu @ 2017-12-20 14:11 UTC (permalink / raw)
  To: Haishuang Yan
  Cc: David S. Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Linux Kernel Network Developers, linux-kernel
In-Reply-To: <1513735621-21913-3-git-send-email-yanhaishuang@cmss.chinamobile.com>

On Tue, Dec 19, 2017 at 6:07 PM, Haishuang Yan
<yanhaishuang@cmss.chinamobile.com> wrote:
> If md is NULL, tun_dst must be freed, otherwise it will cause memory
> leak.
>
> Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
> Cc: William Tu <u9012063@gmail.com>
> Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
>
> ---
> Changes since v3:
>   * Rebase on latest master branch.
>   * Fix wrong commit information.
> ---
>  net/ipv6/ip6_gre.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
> index 9bd1103..45038a9 100644
> --- a/net/ipv6/ip6_gre.c
> +++ b/net/ipv6/ip6_gre.c
> @@ -550,8 +550,10 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
>
>                         info = &tun_dst->u.tun_info;
>                         md = ip_tunnel_info_opts(info);
> -                       if (!md)
> +                       if (!md) {
> +                               dst_release((struct dst_entry *)tun_dst);
>                                 return PACKET_REJECT;
isn't md allocated previously at
    tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id,
                                           sizeof(*md));
so md should never be null after we check tun_dst?
William

^ permalink raw reply

* Re: [PATCH v1] net: bonding: Replace mac address parsing
From: Andy Gospodarek @ 2017-12-20 14:25 UTC (permalink / raw)
  To: Andy Shevchenko; +Cc: Jay Vosburgh, Veaceslav Falico, netdev
In-Reply-To: <20171219182044.5678-1-andriy.shevchenko@linux.intel.com>

On Tue, Dec 19, 2017 at 08:20:44PM +0200, Andy Shevchenko wrote:
> Replace sscanf() with mac_pton().
> 
> Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>

Nice cleanup.  Thanks!

Acked-by: Andy Gospodarek <andy@greyhouse.net>

> ---
>  drivers/net/bonding/bond_options.c | 6 +-----
>  1 file changed, 1 insertion(+), 5 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
> index 8a9b085c2a98..58c705f24f96 100644
> --- a/drivers/net/bonding/bond_options.c
> +++ b/drivers/net/bonding/bond_options.c
> @@ -1431,13 +1431,9 @@ static int bond_option_ad_actor_system_set(struct bonding *bond,
>  {
>  	u8 macaddr[ETH_ALEN];
>  	u8 *mac;
> -	int i;
>  
>  	if (newval->string) {
> -		i = sscanf(newval->string, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
> -			   &macaddr[0], &macaddr[1], &macaddr[2],
> -			   &macaddr[3], &macaddr[4], &macaddr[5]);
> -		if (i != ETH_ALEN)
> +		if (!mac_pton(newval->string, macaddr))
>  			goto err;
>  		mac = macaddr;
>  	} else {
> -- 
> 2.15.1
> 

^ permalink raw reply

* Re: [PATCH 0/5] VSOCK: add vsock_test test suite
From: Jorgen S. Hansen @ 2017-12-20 14:48 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: netdev@vger.kernel.org, Dexuan Cui
In-Reply-To: <20171213144911.6428-1-stefanha@redhat.com>


> On Dec 13, 2017, at 3:49 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> The vsock_diag.ko module already has a test suite but the core AF_VSOCK
> functionality has no tests.  This patch series adds several test cases that
> exercise AF_VSOCK SOCK_STREAM socket semantics (send/recv, connect/accept,
> half-closed connections, simultaneous connections).
> 
> The test suite is modest but I hope to cover additional cases in the future.
> My goal is to have a shared test suite so VMCI, Hyper-V, and KVM can ensure
> that our transports behave the same.
> 
> I have tested virtio-vsock.
> 
> Jorgen: Please test the VMCI transport and let me know if anything needs to be
> adjusted.  See tools/testing/vsock/README for information on how to run the
> test suite.
> 

I tried running the vsock_test on VMCI, and all the tests failed in one way or
another:
1) connection reset test: when the guest tries to connect to the host, we
  get EINVAL as the error instead of ECONNRESET. I’ll fix that.
2) client close and server close tests: On the host side, VMCI doesn’t
  support reading data from a socket that has been closed by the
  guest. When the guest closes a connection, all data is gone, and
  we return EOF on the host side. So the tests that try to read data
  after close, should not attempt that on VMCI host side. I got the
  tests to pass by adding a getsockname call to determine if
  the local CID was the host CID, and then skip the read attempt
  in that case. We could add a vmci flag, that would enable
  this behavior.
3) send_byte(fd, -EPIPE): for the VMCI transport, the close
 isn’t necessarily visible immediately on the peer. So in most
 cases, these send operations would complete with success.
 I was running these tests using nested virtualization, so I
 suspect that the problem is more likely to occur here, but
 I had to add a sleep to be sure to get the EPIPE error.
4) server close test: the connect would sometimes fail. This looks
  like an issue where we detect the peer close on the client side
  before we complete the connection handshake on the client
  side. There are two different channels used for the connection
  handshake and the disconnect. I’ll look into this to see what
  exactly is going on.
5) multiple connections tests: with the standard socket sizes,
  VMCI is only able to support about 100 concurrent stream
  connections so this test passes with MULTICONN_NFDS
  set to 100.

Thanks,
Jorgen


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox