Netdev List
 help / color / mirror / Atom feed
* [PATCH 14/20] netfilter: nf_tables: Add audit support to log statement
From: Pablo Neira Ayuso @ 2018-06-02  0:22 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180602002259.4024-1-pablo@netfilter.org>

From: Phil Sutter <phil@nwl.cc>

This extends log statement to support the behaviour achieved with
AUDIT target in iptables.

Audit logging is enabled via a pseudo log level 8. In this case any
other settings like log prefix are ignored since audit log format is
fixed.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  5 ++
 net/netfilter/nft_log.c                  | 92 +++++++++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 3d46c82a5ebd..5c7eb9b9f6d6 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1081,6 +1081,11 @@ enum nft_log_attributes {
 #define NFTA_LOG_MAX		(__NFTA_LOG_MAX - 1)
 
 /**
+ * LOGLEVEL_AUDIT - a pseudo log level enabling audit logging
+ */
+#define LOGLEVEL_AUDIT		8
+
+/**
  * enum nft_queue_attributes - nf_tables queue expression netlink attributes
  *
  * @NFTA_QUEUE_NUM: netlink queue to send messages to (NLA_U16)
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index a27be36dc0af..7eef1cffbf1b 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -9,12 +9,15 @@
  * Development of this code funded by Astaro AG (http://www.astaro.com/)
  */
 
+#include <linux/audit.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
 #include <linux/netdevice.h>
@@ -26,12 +29,93 @@ struct nft_log {
 	char			*prefix;
 };
 
+static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	struct iphdr _iph;
+	const struct iphdr *ih;
+
+	ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph);
+	if (!ih)
+		return false;
+
+	audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu",
+			 &ih->saddr, &ih->daddr, ih->protocol);
+
+	return true;
+}
+
+static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	struct ipv6hdr _ip6h;
+	const struct ipv6hdr *ih;
+	u8 nexthdr;
+	__be16 frag_off;
+
+	ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+	if (!ih)
+		return false;
+
+	nexthdr = ih->nexthdr;
+	ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off);
+
+	audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+			 &ih->saddr, &ih->daddr, nexthdr);
+
+	return true;
+}
+
+static void nft_log_eval_audit(const struct nft_pktinfo *pkt)
+{
+	struct sk_buff *skb = pkt->skb;
+	struct audit_buffer *ab;
+	int fam = -1;
+
+	if (!audit_enabled)
+		return;
+
+	ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+	if (!ab)
+		return;
+
+	audit_log_format(ab, "mark=%#x", skb->mark);
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_BRIDGE:
+		switch (eth_hdr(skb)->h_proto) {
+		case htons(ETH_P_IP):
+			fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+			break;
+		case htons(ETH_P_IPV6):
+			fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+			break;
+		}
+		break;
+	case NFPROTO_IPV4:
+		fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+		break;
+	case NFPROTO_IPV6:
+		fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+		break;
+	}
+
+	if (fam == -1)
+		audit_log_format(ab, " saddr=? daddr=? proto=-1");
+
+	audit_log_end(ab);
+}
+
 static void nft_log_eval(const struct nft_expr *expr,
 			 struct nft_regs *regs,
 			 const struct nft_pktinfo *pkt)
 {
 	const struct nft_log *priv = nft_expr_priv(expr);
 
+	if (priv->loginfo.type == NF_LOG_TYPE_LOG &&
+	    priv->loginfo.u.log.level == LOGLEVEL_AUDIT) {
+		nft_log_eval_audit(pkt);
+		return;
+	}
+
 	nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
 		      nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
 		      priv->prefix);
@@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		} else {
 			li->u.log.level = LOGLEVEL_WARNING;
 		}
-		if (li->u.log.level > LOGLEVEL_DEBUG) {
+		if (li->u.log.level > LOGLEVEL_AUDIT) {
 			err = -EINVAL;
 			goto err1;
 		}
@@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		break;
 	}
 
+	if (li->u.log.level == LOGLEVEL_AUDIT)
+		return 0;
+
 	err = nf_logger_find_get(ctx->family, li->type);
 	if (err < 0)
 		goto err1;
@@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
 	if (priv->prefix != nft_log_null_prefix)
 		kfree(priv->prefix);
 
+	if (li->u.log.level == LOGLEVEL_AUDIT)
+		return;
+
 	nf_logger_put(ctx->family, li->type);
 }
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 17/20] netfilter: nfnetlink: Remove VLA usage
From: Pablo Neira Ayuso @ 2018-06-02  0:22 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180602002259.4024-1-pablo@netfilter.org>

From: Kees Cook <keescook@chromium.org>

In the quest to remove all stack VLA usage from the kernel[1], this
allocates the maximum size expected for all possible attrs and adds
sanity-checks at both registration and usage to make sure nothing
gets out of sync.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 261820627711..4d0da7042aff 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -38,6 +38,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
 	rcu_dereference_protected(table[(id)].subsys, \
 				  lockdep_nfnl_is_held((id)))
 
+#define NFNL_MAX_ATTR_COUNT	32
+
 static struct {
 	struct mutex				mutex;
 	const struct nfnetlink_subsystem __rcu	*subsys;
@@ -77,6 +79,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
 {
+	u8 cb_id;
+
+	/* Sanity-check attr_count size to avoid stack buffer overflow. */
+	for (cb_id = 0; cb_id < n->cb_count; cb_id++)
+		if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT))
+			return -EINVAL;
+
 	nfnl_lock(n->subsys_id);
 	if (table[n->subsys_id].subsys) {
 		nfnl_unlock(n->subsys_id);
@@ -186,11 +195,17 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	{
 		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
 		u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-		struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+		struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
 		struct nlattr *attr = (void *)nlh + min_len;
 		int attrlen = nlh->nlmsg_len - min_len;
 		__u8 subsys_id = NFNL_SUBSYS_ID(type);
 
+		/* Sanity-check NFNL_MAX_ATTR_COUNT */
+		if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+			rcu_read_unlock();
+			return -ENOMEM;
+		}
+
 		err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
 				ss->cb[cb_id].policy, extack);
 		if (err < 0) {
@@ -387,10 +402,16 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
 		{
 			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
 			u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-			struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+			struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
 			struct nlattr *attr = (void *)nlh + min_len;
 			int attrlen = nlh->nlmsg_len - min_len;
 
+			/* Sanity-check NFTA_MAX_ATTR */
+			if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+				err = -ENOMEM;
+				goto ack;
+			}
+
 			err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
 					attrlen, ss->cb[cb_id].policy, NULL);
 			if (err < 0)
-- 
2.11.0

^ permalink raw reply related

* [PATCH 15/20] netfilter: nf_tables: fix chain dependency validation
From: Pablo Neira Ayuso @ 2018-06-02  0:22 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180602002259.4024-1-pablo@netfilter.org>

The following ruleset:

 add table ip filter
 add chain ip filter input { type filter hook input priority 4; }
 add chain ip filter ap
 add rule ip filter input jump ap
 add rule ip filter ap masquerade

results in a panic, because the masquerade extension should be rejected
from the filter chain. The existing validation is missing a chain
dependency check when the rule is added to the non-base chain.

This patch fixes the problem by walking down the rules from the
basechains, searching for either immediate or lookup expressions, then
jumping to non-base chains and again walking down the rules to perform
the expression validation, so we make sure the full ruleset graph is
validated. This is done only once from the commit phase, in case of
problem, we abort the transaction and perform fine grain validation for
error reporting. This patch requires 003087911af2 ("netfilter:
nfnetlink: allow commit to fail") to achieve this behaviour.

This patch also adds a cleanup callback to nfnl batch interface to reset
the validate state from the exit path.

As a result of this patch, nf_tables_check_loops() doesn't use
->validate to check for loops, instead it just checks for immediate
expressions.

Reported-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h    |   1 +
 include/net/netfilter/nf_tables.h      |   2 +
 include/net/netfilter/nf_tables_core.h |   8 ++
 include/net/netns/nftables.h           |   1 +
 net/netfilter/nf_tables_api.c          | 152 ++++++++++++++++++++++++++++-----
 net/netfilter/nfnetlink.c              |   2 +
 net/netfilter/nft_immediate.c          |  27 ++++--
 net/netfilter/nft_lookup.c             |  47 ++++++++++
 8 files changed, 208 insertions(+), 32 deletions(-)

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 34551f8aaf9d..3ecc3050be0e 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -31,6 +31,7 @@ struct nfnetlink_subsystem {
 	const struct nfnl_callback *cb;	/* callback for individual types */
 	int (*commit)(struct net *net, struct sk_buff *skb);
 	int (*abort)(struct net *net, struct sk_buff *skb);
+	void (*cleanup)(struct net *net);
 	bool (*valid_genid)(struct net *net, u32 genid);
 };
 
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 6b9ddb99f3a8..435c32d8a995 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -874,6 +874,8 @@ struct nft_chain {
 	struct nft_rule			**rules_next;
 };
 
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
+
 enum nft_chain_types {
 	NFT_CHAIN_T_DEFAULT = 0,
 	NFT_CHAIN_T_ROUTE,
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index cd6915b6c054..e0c0c2558ec4 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -2,6 +2,8 @@
 #ifndef _NET_NF_TABLES_CORE_H
 #define _NET_NF_TABLES_CORE_H
 
+#include <net/netfilter/nf_tables.h>
+
 extern struct nft_expr_type nft_imm_type;
 extern struct nft_expr_type nft_cmp_type;
 extern struct nft_expr_type nft_lookup_type;
@@ -23,6 +25,12 @@ struct nft_cmp_fast_expr {
 	u8			len;
 };
 
+struct nft_immediate_expr {
+	struct nft_data		data;
+	enum nft_registers	dreg:8;
+	u8			dlen;
+};
+
 /* Calculate the mask for the nft_cmp_fast expression. On big endian the
  * mask needs to include the *upper* bytes when interpreting that data as
  * something smaller than the full u32, therefore a cpu_to_le32 is done.
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index 29c3851b486a..94767ea3a490 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -9,6 +9,7 @@ struct netns_nftables {
 	struct list_head	commit_list;
 	unsigned int		base_seq;
 	u8			gencursor;
+	u8			validate_state;
 };
 
 #endif
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3b2ad96a9a05..c785bc5a66f1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -28,6 +28,28 @@ static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
 static u64 table_handle;
 
+enum {
+	NFT_VALIDATE_SKIP	= 0,
+	NFT_VALIDATE_NEED,
+	NFT_VALIDATE_DO,
+};
+
+static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+{
+	switch (net->nft.validate_state) {
+	case NFT_VALIDATE_SKIP:
+		WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
+		break;
+	case NFT_VALIDATE_NEED:
+		break;
+	case NFT_VALIDATE_DO:
+		if (new_validate_state == NFT_VALIDATE_NEED)
+			return;
+	}
+
+	net->nft.validate_state = new_validate_state;
+}
+
 static void nft_ctx_init(struct nft_ctx *ctx,
 			 struct net *net,
 			 const struct sk_buff *skb,
@@ -1921,19 +1943,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
 			goto err1;
 	}
 
-	if (ops->validate) {
-		const struct nft_data *data = NULL;
-
-		err = ops->validate(ctx, expr, &data);
-		if (err < 0)
-			goto err2;
-	}
-
 	return 0;
-
-err2:
-	if (ops->destroy)
-		ops->destroy(ctx, expr);
 err1:
 	expr->ops = NULL;
 	return err;
@@ -2299,6 +2309,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx,
 	nf_tables_rule_destroy(ctx, rule);
 }
 
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+{
+	struct nft_expr *expr, *last;
+	const struct nft_data *data;
+	struct nft_rule *rule;
+	int err;
+
+	list_for_each_entry(rule, &chain->rules, list) {
+		if (!nft_is_active_next(ctx->net, rule))
+			continue;
+
+		nft_rule_for_each_expr(expr, last, rule) {
+			if (!expr->ops->validate)
+				continue;
+
+			err = expr->ops->validate(ctx, expr, &data);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate);
+
+static int nft_table_validate(struct net *net, const struct nft_table *table)
+{
+	struct nft_chain *chain;
+	struct nft_ctx ctx = {
+		.net	= net,
+		.family	= table->family,
+	};
+	int err;
+
+	list_for_each_entry(chain, &table->chains, list) {
+		if (!nft_is_base_chain(chain))
+			continue;
+
+		ctx.chain = chain;
+		err = nft_chain_validate(&ctx, chain);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
 #define NFT_RULE_MAXEXPRS	128
 
 static struct nft_expr_info *info;
@@ -2426,6 +2483,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 		err = nf_tables_newexpr(&ctx, &info[i], expr);
 		if (err < 0)
 			goto err2;
+
+		if (info[i].ops->validate)
+			nft_validate_state_update(net, NFT_VALIDATE_NEED);
+
 		info[i].ops = NULL;
 		expr = nft_expr_next(expr);
 	}
@@ -2469,8 +2530,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 		}
 	}
 	chain->use++;
-	return 0;
 
+	if (net->nft.validate_state == NFT_VALIDATE_DO)
+		return nft_table_validate(net, table);
+
+	return 0;
 err2:
 	nf_tables_rule_release(&ctx, rule);
 err1:
@@ -4112,6 +4176,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 							  d2.type, d2.len);
 			if (err < 0)
 				goto err3;
+
+			if (d2.type == NFT_DATA_VERDICT &&
+			    (data.verdict.code == NFT_GOTO ||
+			     data.verdict.code == NFT_JUMP))
+				nft_validate_state_update(ctx->net,
+							  NFT_VALIDATE_NEED);
 		}
 
 		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
@@ -4211,7 +4281,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	const struct nlattr *attr;
 	struct nft_set *set;
 	struct nft_ctx ctx;
-	int rem, err = 0;
+	int rem, err;
 
 	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
 		return -EINVAL;
@@ -4232,9 +4302,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
 		err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
 		if (err < 0)
-			break;
+			return err;
 	}
-	return err;
+
+	if (net->nft.validate_state == NFT_VALIDATE_DO)
+		return nft_table_validate(net, ctx.table);
+
+	return 0;
 }
 
 /**
@@ -5867,6 +5941,27 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 	},
 };
 
+static int nf_tables_validate(struct net *net)
+{
+	struct nft_table *table;
+
+	switch (net->nft.validate_state) {
+	case NFT_VALIDATE_SKIP:
+		break;
+	case NFT_VALIDATE_NEED:
+		nft_validate_state_update(net, NFT_VALIDATE_DO);
+		/* fall through */
+	case NFT_VALIDATE_DO:
+		list_for_each_entry(table, &net->nft.tables, list) {
+			if (nft_table_validate(net, table) < 0)
+				return -EAGAIN;
+		}
+		break;
+	}
+
+	return 0;
+}
+
 static void nft_chain_commit_update(struct nft_trans *trans)
 {
 	struct nft_base_chain *basechain;
@@ -6055,6 +6150,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 	struct nft_chain *chain;
 	struct nft_table *table;
 
+	/* 0. Validate ruleset, otherwise roll back for error reporting. */
+	if (nf_tables_validate(net) < 0)
+		return -EAGAIN;
+
 	/* 1.  Allocate space for next generation rules_gen_X[] */
 	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
 		int ret;
@@ -6349,6 +6448,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 	return 0;
 }
 
+static void nf_tables_cleanup(struct net *net)
+{
+	nft_validate_state_update(net, NFT_VALIDATE_SKIP);
+}
+
 static bool nf_tables_valid_genid(struct net *net, u32 genid)
 {
 	return net->nft.base_seq == genid;
@@ -6361,6 +6465,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.cb		= nf_tables_cb,
 	.commit		= nf_tables_commit,
 	.abort		= nf_tables_abort,
+	.cleanup	= nf_tables_cleanup,
 	.valid_genid	= nf_tables_valid_genid,
 };
 
@@ -6444,19 +6549,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 
 	list_for_each_entry(rule, &chain->rules, list) {
 		nft_rule_for_each_expr(expr, last, rule) {
-			const struct nft_data *data = NULL;
+			struct nft_immediate_expr *priv;
+			const struct nft_data *data;
 			int err;
 
-			if (!expr->ops->validate)
+			if (strcmp(expr->ops->type->name, "immediate"))
 				continue;
 
-			err = expr->ops->validate(ctx, expr, &data);
-			if (err < 0)
-				return err;
-
-			if (data == NULL)
+			priv = nft_expr_priv(expr);
+			if (priv->dreg != NFT_REG_VERDICT)
 				continue;
 
+			data = &priv->data;
 			switch (data->verdict.code) {
 			case NFT_JUMP:
 			case NFT_GOTO:
@@ -6936,6 +7040,8 @@ static int __net_init nf_tables_init_net(struct net *net)
 	INIT_LIST_HEAD(&net->nft.tables);
 	INIT_LIST_HEAD(&net->nft.commit_list);
 	net->nft.base_seq = 1;
+	net->nft.validate_state = NFT_VALIDATE_SKIP;
+
 	return 0;
 }
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 5a1bd23af1a3..261820627711 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -460,6 +460,8 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
 	} else {
 		ss->abort(net, oskb);
 	}
+	if (ss->cleanup)
+		ss->cleanup(net);
 
 	nfnl_err_deliver(&err_list, oskb);
 	nfnl_unlock(subsys_id);
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index aa87ff8beae8..15adf8ca82c3 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -17,12 +17,6 @@
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
 
-struct nft_immediate_expr {
-	struct nft_data		data;
-	enum nft_registers	dreg:8;
-	u8			dlen;
-};
-
 static void nft_immediate_eval(const struct nft_expr *expr,
 			       struct nft_regs *regs,
 			       const struct nft_pktinfo *pkt)
@@ -101,12 +95,27 @@ static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
 
 static int nft_immediate_validate(const struct nft_ctx *ctx,
 				  const struct nft_expr *expr,
-				  const struct nft_data **data)
+				  const struct nft_data **d)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+	const struct nft_data *data;
+	int err;
 
-	if (priv->dreg == NFT_REG_VERDICT)
-		*data = &priv->data;
+	if (priv->dreg != NFT_REG_VERDICT)
+		return 0;
+
+	data = &priv->data;
+
+	switch (data->verdict.code) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		err = nft_chain_validate(ctx, data->verdict.chain);
+		if (err < 0)
+			return err;
+		break;
+	default:
+		break;
+	}
 
 	return 0;
 }
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index f52da5e2199f..42e6fadf1417 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -149,6 +149,52 @@ static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return -1;
 }
 
+static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
+				       struct nft_set *set,
+				       const struct nft_set_iter *iter,
+				       struct nft_set_elem *elem)
+{
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_data *data;
+
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+		return 0;
+
+	data = nft_set_ext_data(ext);
+	switch (data->verdict.code) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		return nft_chain_validate(ctx, data->verdict.chain);
+	default:
+		return 0;
+	}
+}
+
+static int nft_lookup_validate(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nft_data **d)
+{
+	const struct nft_lookup *priv = nft_expr_priv(expr);
+	struct nft_set_iter iter;
+
+	if (!(priv->set->flags & NFT_SET_MAP) ||
+	    priv->set->dtype != NFT_DATA_VERDICT)
+		return 0;
+
+	iter.genmask	= nft_genmask_next(ctx->net);
+	iter.skip	= 0;
+	iter.count	= 0;
+	iter.err	= 0;
+	iter.fn		= nft_lookup_validate_setelem;
+
+	priv->set->ops->walk(ctx, priv->set, &iter);
+	if (iter.err < 0)
+		return iter.err;
+
+	return 0;
+}
+
 static const struct nft_expr_ops nft_lookup_ops = {
 	.type		= &nft_lookup_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -156,6 +202,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
 	.init		= nft_lookup_init,
 	.destroy	= nft_lookup_destroy,
 	.dump		= nft_lookup_dump,
+	.validate	= nft_lookup_validate,
 };
 
 struct nft_expr_type nft_lookup_type __read_mostly = {
-- 
2.11.0

^ permalink raw reply related

* [PATCH 18/20] netfilter: nft_fwd_netdev: allow to forward packets via neighbour layer
From: Pablo Neira Ayuso @ 2018-06-02  0:22 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180602002259.4024-1-pablo@netfilter.org>

This allows us to forward packets from the netdev family via neighbour
layer, so you don't need an explicit link-layer destination when using
this expression from rules. The ttl/hop_limit field is decremented.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   4 +
 net/netfilter/nft_fwd_netdev.c           | 146 ++++++++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5c7eb9b9f6d6..a089af092a29 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1260,10 +1260,14 @@ enum nft_dup_attributes {
  * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes
  *
  * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_register)
+ * @NFTA_FWD_SREG_ADDR: source register of destination address (NLA_U32: nft_register)
+ * @NFTA_FWD_NFPROTO: layer 3 family of source register address (NLA_U32: enum nfproto)
  */
 enum nft_fwd_attributes {
 	NFTA_FWD_UNSPEC,
 	NFTA_FWD_SREG_DEV,
+	NFTA_FWD_SREG_ADDR,
+	NFTA_FWD_NFPROTO,
 	__NFTA_FWD_MAX
 };
 #define NFTA_FWD_MAX	(__NFTA_FWD_MAX - 1)
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index ce13a50b9189..8abb9891cdf2 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -12,8 +12,12 @@
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_dup_netdev.h>
+#include <net/neighbour.h>
+#include <net/ip.h>
 
 struct nft_fwd_netdev {
 	enum nft_registers	sreg_dev:8;
@@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
 
 static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
 	[NFTA_FWD_SREG_DEV]	= { .type = NLA_U32 },
+	[NFTA_FWD_SREG_ADDR]	= { .type = NLA_U32 },
+	[NFTA_FWD_NFPROTO]	= { .type = NLA_U32 },
 };
 
 static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -62,7 +68,133 @@ static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return -1;
 }
 
+struct nft_fwd_neigh {
+	enum nft_registers	sreg_dev:8;
+	enum nft_registers	sreg_addr:8;
+	u8			nfproto;
+};
+
+static void nft_fwd_neigh_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+	void *addr = &regs->data[priv->sreg_addr];
+	int oif = regs->data[priv->sreg_dev];
+	unsigned int verdict = NF_STOLEN;
+	struct sk_buff *skb = pkt->skb;
+	struct net_device *dev;
+	int neigh_table;
+
+	switch (priv->nfproto) {
+	case NFPROTO_IPV4: {
+		struct iphdr *iph;
+
+		if (skb->protocol != htons(ETH_P_IP)) {
+			verdict = NFT_BREAK;
+			goto out;
+		}
+		if (skb_try_make_writable(skb, sizeof(*iph))) {
+			verdict = NF_DROP;
+			goto out;
+		}
+		iph = ip_hdr(skb);
+		ip_decrease_ttl(iph);
+		neigh_table = NEIGH_ARP_TABLE;
+		break;
+		}
+	case NFPROTO_IPV6: {
+		struct ipv6hdr *ip6h;
+
+		if (skb->protocol != htons(ETH_P_IPV6)) {
+			verdict = NFT_BREAK;
+			goto out;
+		}
+		if (skb_try_make_writable(skb, sizeof(*ip6h))) {
+			verdict = NF_DROP;
+			goto out;
+		}
+		ip6h = ipv6_hdr(skb);
+		ip6h->hop_limit--;
+		neigh_table = NEIGH_ND_TABLE;
+		break;
+		}
+	default:
+		verdict = NFT_BREAK;
+		goto out;
+	}
+
+	dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+	if (dev == NULL)
+		return;
+
+	skb->dev = dev;
+	neigh_xmit(neigh_table, dev, addr, skb);
+out:
+	regs->verdict.code = verdict;
+}
+
+static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+	unsigned int addr_len;
+	int err;
+
+	if (!tb[NFTA_FWD_SREG_DEV] ||
+	    !tb[NFTA_FWD_SREG_ADDR] ||
+	    !tb[NFTA_FWD_NFPROTO])
+		return -EINVAL;
+
+	priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
+	priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
+	priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
+
+	switch (priv->nfproto) {
+	case NFPROTO_IPV4:
+		addr_len = sizeof(struct in_addr);
+		break;
+	case NFPROTO_IPV6:
+		addr_len = sizeof(struct in6_addr);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+	if (err < 0)
+		return err;
+
+	return nft_validate_register_load(priv->sreg_addr, addr_len);
+}
+
+static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
+
+static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev) ||
+	    nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr) ||
+	    nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 static struct nft_expr_type nft_fwd_netdev_type;
+static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
+	.type		= &nft_fwd_netdev_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)),
+	.eval		= nft_fwd_neigh_eval,
+	.init		= nft_fwd_neigh_init,
+	.dump		= nft_fwd_neigh_dump,
+};
+
 static const struct nft_expr_ops nft_fwd_netdev_ops = {
 	.type		= &nft_fwd_netdev_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)),
@@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
 	.dump		= nft_fwd_netdev_dump,
 };
 
+static const struct nft_expr_ops *
+nft_fwd_select_ops(const struct nft_ctx *ctx,
+		   const struct nlattr * const tb[])
+{
+	if (tb[NFTA_FWD_SREG_ADDR])
+		return &nft_fwd_neigh_netdev_ops;
+	if (tb[NFTA_FWD_SREG_DEV])
+		return &nft_fwd_netdev_ops;
+
+        return ERR_PTR(-EOPNOTSUPP);
+}
+
 static struct nft_expr_type nft_fwd_netdev_type __read_mostly = {
 	.family		= NFPROTO_NETDEV,
 	.name		= "fwd",
-	.ops		= &nft_fwd_netdev_ops,
+	.select_ops	= nft_fwd_select_ops,
 	.policy		= nft_fwd_netdev_policy,
 	.maxattr	= NFTA_FWD_MAX,
 	.owner		= THIS_MODULE,
-- 
2.11.0

^ permalink raw reply related

* [PATCH 16/20] netfilter: nf_flow_table: attach dst to skbs
From: Pablo Neira Ayuso @ 2018-06-02  0:22 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180602002259.4024-1-pablo@netfilter.org>

From: "Jason A. Donenfeld" <Jason@zx2c4.com>

Some drivers, such as vxlan and wireguard, use the skb's dst in order to
determine things like PMTU. They therefore loose functionality when flow
offloading is enabled. So, we ensure the skb has it before xmit'ing it
in the offloading path.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 82451b7e0acb..15ed91309992 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -220,7 +220,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	enum flow_offload_tuple_dir dir;
 	struct flow_offload *flow;
 	struct net_device *outdev;
-	const struct rtable *rt;
+	struct rtable *rt;
 	unsigned int thoff;
 	struct iphdr *iph;
 	__be32 nexthop;
@@ -241,7 +241,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+	rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
 
 	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
 	    (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
@@ -264,6 +264,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	skb->dev = outdev;
 	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+	skb_dst_set_noref(skb, &rt->dst);
 	neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
 
 	return NF_STOLEN;
@@ -480,6 +481,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 
 	skb->dev = outdev;
 	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+	skb_dst_set_noref(skb, &rt->dst);
 	neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
 
 	return NF_STOLEN;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 19/20] ipvs: add full ipv6 support to nfct
From: Pablo Neira Ayuso @ 2018-06-02  0:22 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180602002259.4024-1-pablo@netfilter.org>

From: Julian Anastasov <ja@ssi.bg>

Prepare NFCT to support IPv6 for FTP:

- Do not restrict the expectation callback to PF_INET

- Split the debug messages, so that the 160-byte limitation
in IP_VS_DBG_BUF is not exceeded when printing many IPv6
addresses. This means no more than 3 addresses in one message,
i.e. 1 tuple with 2 addresses or 1 connection with 3 addresses.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_nfct.c | 101 +++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 52 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 6cf3fd81a5ec..eb8b9c883889 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -67,15 +67,20 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 
 
-#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
-			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+#define FMT_TUPLE	"%s:%u->%s:%u/%u"
+#define ARG_TUPLE(T)	IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3),	\
+			ntohs((T)->src.u.all),				\
+			IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3),	\
+			ntohs((T)->dst.u.all),				\
 			(T)->dst.protonum
 
-#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
-			&((C)->vaddr.ip), ntohs((C)->vport), \
-			&((C)->daddr.ip), ntohs((C)->dport), \
+#define FMT_CONN	"%s:%u->%s:%u->%s:%u/%u:%u"
+#define ARG_CONN(C)	IP_VS_DBG_ADDR((C)->af, &((C)->caddr)),		\
+			ntohs((C)->cport),				\
+			IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)),		\
+			ntohs((C)->vport),				\
+			IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)),	\
+			ntohs((C)->dport),				\
 			(C)->protocol, (C)->state
 
 void
@@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
 		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
 			new_tuple.dst.u.tcp.port = cp->vport;
 	}
-	IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
-		  "ctinfo=%d, old reply=" FMT_TUPLE
-		  ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
-		  __func__, ct, ct->status, ctinfo,
-		  ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
-		  ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+	IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+		      "ctinfo=%d, old reply=" FMT_TUPLE "\n",
+		      __func__, ct, ct->status, ctinfo,
+		      ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple));
+	IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+		      "ctinfo=%d, new reply=" FMT_TUPLE "\n",
+		      __func__, ct, ct->status, ctinfo,
+		      ARG_TUPLE(&new_tuple));
 	nf_conntrack_alter_reply(ct, &new_tuple);
+	IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n",
+		      __func__, ct, ARG_CONN(cp));
 }
 
 int ip_vs_confirm_conntrack(struct sk_buff *skb)
@@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 	struct ip_vs_conn_param p;
 	struct net *net = nf_ct_net(ct);
 
-	if (exp->tuple.src.l3num != PF_INET)
-		return;
-
 	/*
 	 * We assume that no NF locks are held before this callback.
 	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
@@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 	cp = ip_vs_conn_out_get(&p);
 	if (cp) {
 		/* Change reply CLIENT->RS to CLIENT->VS */
+		IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp="
+			      FMT_CONN "\n",
+			      __func__, ct, ct->status, ARG_CONN(cp));
 		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
-			  __func__, ct, ct->status,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
+		IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+			      FMT_TUPLE "\n",
+			      __func__, ct, ARG_TUPLE(&new_reply));
 		new_reply.dst.u3 = cp->vaddr;
 		new_reply.dst.u.tcp.port = cp->vport;
-		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
-			  ", inout cp=" FMT_CONN "\n",
-			  __func__, ct,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
 		goto alter;
 	}
 
@@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 	cp = ip_vs_conn_in_get(&p);
 	if (cp) {
 		/* Change reply VS->CLIENT to RS->CLIENT */
+		IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp="
+			      FMT_CONN "\n",
+			      __func__, ct, ct->status, ARG_CONN(cp));
 		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
-			  __func__, ct, ct->status,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
+		IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+			      FMT_TUPLE "\n",
+			      __func__, ct, ARG_TUPLE(&new_reply));
 		new_reply.src.u3 = cp->daddr;
 		new_reply.src.u.tcp.port = cp->dport;
-		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
-			  __func__, ct,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
 		goto alter;
 	}
 
-	IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
-		  " - unknown expect\n",
-		  __func__, ct, ct->status, ARG_TUPLE(orig));
+	IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+		      " - unknown expect\n",
+		      __func__, ct, ct->status, ARG_TUPLE(orig));
 	return;
 
 alter:
@@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
 
 	exp->expectfn = ip_vs_nfct_expect_callback;
 
-	IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
-		__func__, ct, ARG_TUPLE(&exp->tuple));
+	IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+		      __func__, ct, ARG_TUPLE(&exp->tuple));
 	nf_ct_expect_related(exp);
 	nf_ct_expect_put(exp);
 }
@@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
 	tuple.dst.u3 = cp->vaddr;
 	tuple.dst.u.all = cp->vport;
 
-	IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
-		" for conn " FMT_CONN "\n",
-		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+	IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n",
+		      __func__, ARG_CONN(cp));
 
 	h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
 	if (h) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (nf_ct_kill(ct)) {
-			IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
-				FMT_TUPLE "\n",
-				__func__, ct, ARG_TUPLE(&tuple));
+			IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple="
+				      FMT_TUPLE "\n",
+				      __func__, ct, ARG_TUPLE(&tuple));
 		} else {
-			IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
-				FMT_TUPLE "\n",
-				__func__, ct, ARG_TUPLE(&tuple));
+			IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple="
+				      FMT_TUPLE "\n",
+				      __func__, ct, ARG_TUPLE(&tuple));
 		}
 		nf_ct_put(ct);
 	} else {
-		IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
-			__func__, ARG_TUPLE(&tuple));
+		IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+			      __func__, ARG_TUPLE(&tuple));
 	}
 }
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 20/20] ipvs: add ipv6 support to ftp
From: Pablo Neira Ayuso @ 2018-06-02  0:22 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180602002259.4024-1-pablo@netfilter.org>

From: Julian Anastasov <ja@ssi.bg>

Add support for FTP commands with extended format (RFC 2428):

- FTP EPRT: IPv4 and IPv6, active mode, similar to PORT
- FTP EPSV: IPv4 and IPv6, passive mode, similar to PASV.
EPSV response usually contains only port but we allow real
server to provide different address

We restrict control and data connection to be from same
address family.

Allow the "(" and ")" to be optional in PASV response.

Also, add ipvsh argument to the pkt_in/pkt_out handlers to better
access the payload after transport header.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h                   |  10 +-
 net/netfilter/ipvs/ip_vs_app.c        |  24 +-
 net/netfilter/ipvs/ip_vs_ftp.c        | 467 ++++++++++++++++++++++------------
 net/netfilter/ipvs/ip_vs_proto_sctp.c |   4 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |   4 +-
 net/netfilter/ipvs/ip_vs_proto_udp.c  |   4 +-
 6 files changed, 331 insertions(+), 182 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 0ac795b41ab8..03f567eb9536 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -763,14 +763,14 @@ struct ip_vs_app {
 	 *	   2=Mangled but checksum was not updated
 	 */
 	int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
-		       struct sk_buff *, int *diff);
+		       struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh);
 
 	/* input hook: Process packet in outin direction, diff set for TCP.
 	 * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok,
 	 *	   2=Mangled but checksum was not updated
 	 */
 	int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
-		      struct sk_buff *, int *diff);
+		      struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh);
 
 	/* ip_vs_app initializer */
 	int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *);
@@ -1328,8 +1328,10 @@ int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16
 int ip_vs_app_inc_get(struct ip_vs_app *inc);
 void ip_vs_app_inc_put(struct ip_vs_app *inc);
 
-int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
-int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
+int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb,
+		      struct ip_vs_iphdr *ipvsh);
+int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb,
+		     struct ip_vs_iphdr *ipvsh);
 
 int register_ip_vs_pe(struct ip_vs_pe *pe);
 int unregister_ip_vs_pe(struct ip_vs_pe *pe);
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 1c98c907bc63..12d74896556a 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
 }
 
 static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
-				  struct ip_vs_app *app)
+				  struct ip_vs_app *app,
+				  struct ip_vs_iphdr *ipvsh)
 {
 	int diff;
 	const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
 	if (app->pkt_out == NULL)
 		return 1;
 
-	if (!app->pkt_out(app, cp, skb, &diff))
+	if (!app->pkt_out(app, cp, skb, &diff, ipvsh))
 		return 0;
 
 	/*
@@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
  *	called by ipvs packet handler, assumes previously checked cp!=NULL
  *	returns false if it can't handle packet (oom)
  */
-int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+		      struct ip_vs_iphdr *ipvsh)
 {
 	struct ip_vs_app *app;
 
@@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
 
 	/* TCP is complicated */
 	if (cp->protocol == IPPROTO_TCP)
-		return app_tcp_pkt_out(cp, skb, app);
+		return app_tcp_pkt_out(cp, skb, app, ipvsh);
 
 	/*
 	 *	Call private output hook function
@@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
 	if (app->pkt_out == NULL)
 		return 1;
 
-	return app->pkt_out(app, cp, skb, NULL);
+	return app->pkt_out(app, cp, skb, NULL, ipvsh);
 }
 
 
 static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
-				 struct ip_vs_app *app)
+				 struct ip_vs_app *app,
+				 struct ip_vs_iphdr *ipvsh)
 {
 	int diff;
 	const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
 	if (app->pkt_in == NULL)
 		return 1;
 
-	if (!app->pkt_in(app, cp, skb, &diff))
+	if (!app->pkt_in(app, cp, skb, &diff, ipvsh))
 		return 0;
 
 	/*
@@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
  *	called by ipvs packet handler, assumes previously checked cp!=NULL.
  *	returns false if can't handle packet (oom).
  */
-int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+		     struct ip_vs_iphdr *ipvsh)
 {
 	struct ip_vs_app *app;
 
@@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
 
 	/* TCP is complicated */
 	if (cp->protocol == IPPROTO_TCP)
-		return app_tcp_pkt_in(cp, skb, app);
+		return app_tcp_pkt_in(cp, skb, app, ipvsh);
 
 	/*
 	 *	Call private input hook function
@@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
 	if (app->pkt_in == NULL)
 		return 1;
 
-	return app->pkt_in(app, cp, skb, NULL);
+	return app->pkt_in(app, cp, skb, NULL, ipvsh);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 58d5d05aec24..4398a72edec5 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -29,6 +29,8 @@
 #include <linux/moduleparam.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/netfilter.h>
@@ -44,9 +46,18 @@
 #include <net/ip_vs.h>
 
 
-#define SERVER_STRING "227 "
-#define CLIENT_STRING "PORT"
+#define SERVER_STRING_PASV "227 "
+#define CLIENT_STRING_PORT "PORT"
+#define SERVER_STRING_EPSV "229 "
+#define CLIENT_STRING_EPRT "EPRT"
 
+enum {
+	IP_VS_FTP_ACTIVE = 0,
+	IP_VS_FTP_PORT = 0,
+	IP_VS_FTP_PASV,
+	IP_VS_FTP_EPRT,
+	IP_VS_FTP_EPSV,
+};
 
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444);
 MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
 
 
-/*	Dummy variable */
-static int ip_vs_ftp_pasv;
+static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh)
+{
+	struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len);
+
+	if ((th->doff << 2) < sizeof(struct tcphdr))
+		return NULL;
 
+	return (char *)th + (th->doff << 2);
+}
 
 static int
 ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
@@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
 }
 
 
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern", ignoring before "skip" and terminated with
- * the "term" character.
- * <addr,port> is in network order.
+/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern". <addr,port> is in network order.
+ * Parse extended format depending on ext. In this case addr can be pre-set.
  */
 static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 				  const char *pattern, size_t plen,
-				  char skip, char term,
-				  __be32 *addr, __be16 *port,
-				  char **start, char **end)
+				  char skip, bool ext, int mode,
+				  union nf_inet_addr *addr, __be16 *port,
+				  __u16 af, char **start, char **end)
 {
 	char *s, c;
 	unsigned char p[6];
+	char edelim;
+	__u16 hport;
 	int i = 0;
 
 	if (data_limit - data < plen) {
@@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 			if (s == data_limit)
 				return -1;
 			if (!found) {
+				/* "(" is optional for non-extended format,
+				 * so catch the start of IPv4 address
+				 */
+				if (!ext && isdigit(*s))
+					break;
 				if (*s == skip)
 					found = 1;
 			} else if (*s != skip) {
@@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 			}
 		}
 	}
+	/* Old IPv4-only format? */
+	if (!ext) {
+		p[0] = 0;
+		for (data = s; ; data++) {
+			if (data == data_limit)
+				return -1;
+			c = *data;
+			if (isdigit(c)) {
+				p[i] = p[i]*10 + c - '0';
+			} else if (c == ',' && i < 5) {
+				i++;
+				p[i] = 0;
+			} else {
+				/* unexpected character or terminator */
+				break;
+			}
+		}
 
-	for (data = s; ; data++) {
-		if (data == data_limit)
+		if (i != 5)
 			return -1;
-		if (*data == term)
-			break;
+
+		*start = s;
+		*end = data;
+		addr->ip = get_unaligned((__be32 *) p);
+		*port = get_unaligned((__be16 *) (p + 4));
+		return 1;
 	}
-	*end = data;
+	if (s == data_limit)
+		return -1;
+	*start = s;
+	edelim = *s++;
+	if (edelim < 33 || edelim > 126)
+		return -1;
+	if (s == data_limit)
+		return -1;
+	if (*s == edelim) {
+		/* Address family is usually missing for EPSV response */
+		if (mode != IP_VS_FTP_EPSV)
+			return -1;
+		s++;
+		if (s == data_limit)
+			return -1;
+		/* Then address should be missing too */
+		if (*s != edelim)
+			return -1;
+		/* Caller can pre-set addr, if needed */
+		s++;
+	} else {
+		const char *ep;
 
-	memset(p, 0, sizeof(p));
-	for (data = s; ; data++) {
-		c = *data;
-		if (c == term)
-			break;
-		if (c >= '0' && c <= '9') {
-			p[i] = p[i]*10 + c - '0';
-		} else if (c == ',' && i < 5) {
-			i++;
-		} else {
-			/* unexpected character */
+		/* We allow address only from same family */
+		if (af == AF_INET6 && *s != '2')
 			return -1;
+		if (af == AF_INET && *s != '1')
+			return -1;
+		s++;
+		if (s == data_limit)
+			return -1;
+		if (*s != edelim)
+			return -1;
+		s++;
+		if (s == data_limit)
+			return -1;
+		if (af == AF_INET6) {
+			if (in6_pton(s, data_limit - s, (u8 *)addr, edelim,
+				     &ep) <= 0)
+				return -1;
+		} else {
+			if (in4_pton(s, data_limit - s, (u8 *)addr, edelim,
+				     &ep) <= 0)
+				return -1;
 		}
+		s = (char *) ep;
+		if (s == data_limit)
+			return -1;
+		if (*s != edelim)
+			return -1;
+		s++;
 	}
-
-	if (i != 5)
+	for (hport = 0; ; s++)
+	{
+		if (s == data_limit)
+			return -1;
+		if (!isdigit(*s))
+			break;
+		hport = hport * 10 + *s - '0';
+	}
+	if (s == data_limit || !hport || *s != edelim)
 		return -1;
-
-	*start = s;
-	*addr = get_unaligned((__be32 *) p);
-	*port = get_unaligned((__be16 *) (p + 4));
+	s++;
+	*end = s;
+	*port = htons(hport);
 	return 1;
 }
 
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
+/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command
  * from the server (inside-to-outside).
  * When we see one, we build a connection entry with the client address,
  * client port 0 (unknown at the moment), the server address and the
@@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
  * The outgoing packet should be something like
  *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
  * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ * The extended format for EPSV response provides usually only port:
+ *   "229 Entering Extended Passive Mode (|||ppp|)"
  */
 static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
-			 struct sk_buff *skb, int *diff)
+			 struct sk_buff *skb, int *diff,
+			 struct ip_vs_iphdr *ipvsh)
 {
-	struct iphdr *iph;
-	struct tcphdr *th;
 	char *data, *data_limit;
 	char *start, *end;
 	union nf_inet_addr from;
@@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 
 	*diff = 0;
 
-#ifdef CONFIG_IP_VS_IPV6
-	/* This application helper doesn't work with IPv6 yet,
-	 * so turn this into a no-op for IPv6 packets
-	 */
-	if (cp->af == AF_INET6)
-		return 1;
-#endif
-
 	/* Only useful for established sessions */
 	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
 		return 1;
@@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	if (!skb_make_writable(skb, skb->len))
 		return 0;
 
-	if (cp->app_data == &ip_vs_ftp_pasv) {
-		iph = ip_hdr(skb);
-		th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-		data = (char *)th + (th->doff << 2);
+	if (cp->app_data == (void *) IP_VS_FTP_PASV) {
+		data = ip_vs_ftp_data_ptr(skb, ipvsh);
 		data_limit = skb_tail_pointer(skb);
 
+		if (!data || data >= data_limit)
+			return 1;
+
 		if (ip_vs_ftp_get_addrport(data, data_limit,
-					   SERVER_STRING,
-					   sizeof(SERVER_STRING)-1,
-					   '(', ')',
-					   &from.ip, &port,
+					   SERVER_STRING_PASV,
+					   sizeof(SERVER_STRING_PASV)-1,
+					   '(', false, IP_VS_FTP_PASV,
+					   &from, &port, cp->af,
 					   &start, &end) != 1)
 			return 1;
 
-		IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+		IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n",
 			  &from.ip, ntohs(port), &cp->caddr.ip, 0);
+	} else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+		data = ip_vs_ftp_data_ptr(skb, ipvsh);
+		data_limit = skb_tail_pointer(skb);
 
-		/*
-		 * Now update or create an connection entry for it
+		if (!data || data >= data_limit)
+			return 1;
+
+		/* Usually, data address is not specified but
+		 * we support different address, so pre-set it.
 		 */
-		{
-			struct ip_vs_conn_param p;
-			ip_vs_conn_fill_param(cp->ipvs, AF_INET,
-					      iph->protocol, &from, port,
-					      &cp->caddr, 0, &p);
-			n_cp = ip_vs_conn_out_get(&p);
-		}
-		if (!n_cp) {
-			struct ip_vs_conn_param p;
-			ip_vs_conn_fill_param(cp->ipvs,
-					      AF_INET, IPPROTO_TCP, &cp->caddr,
-					      0, &cp->vaddr, port, &p);
-			/* As above, this is ipv4 only */
-			n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
-					      IP_VS_CONN_F_NO_CPORT |
-					      IP_VS_CONN_F_NFCT,
-					      cp->dest, skb->mark);
-			if (!n_cp)
-				return 0;
+		from = cp->daddr;
+		if (ip_vs_ftp_get_addrport(data, data_limit,
+					   SERVER_STRING_EPSV,
+					   sizeof(SERVER_STRING_EPSV)-1,
+					   '(', true, IP_VS_FTP_EPSV,
+					   &from, &port, cp->af,
+					   &start, &end) != 1)
+			return 1;
 
-			/* add its controller */
-			ip_vs_control_add(n_cp, cp);
-		}
+		IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n",
+			      IP_VS_DBG_ADDR(cp->af, &from), ntohs(port),
+			      IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0);
+	} else {
+		return 1;
+	}
 
-		/*
-		 * Replace the old passive address with the new one
-		 */
+	/* Now update or create a connection entry for it */
+	{
+		struct ip_vs_conn_param p;
+
+		ip_vs_conn_fill_param(cp->ipvs, cp->af,
+				      ipvsh->protocol, &from, port,
+				      &cp->caddr, 0, &p);
+		n_cp = ip_vs_conn_out_get(&p);
+	}
+	if (!n_cp) {
+		struct ip_vs_conn_param p;
+
+		ip_vs_conn_fill_param(cp->ipvs,
+				      cp->af, ipvsh->protocol, &cp->caddr,
+				      0, &cp->vaddr, port, &p);
+		n_cp = ip_vs_conn_new(&p, cp->af, &from, port,
+				      IP_VS_CONN_F_NO_CPORT |
+				      IP_VS_CONN_F_NFCT,
+				      cp->dest, skb->mark);
+		if (!n_cp)
+			return 0;
+
+		/* add its controller */
+		ip_vs_control_add(n_cp, cp);
+	}
+
+	/* Replace the old passive address with the new one */
+	if (cp->app_data == (void *) IP_VS_FTP_PASV) {
 		from.ip = n_cp->vaddr.ip;
 		port = n_cp->vport;
 		snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
@@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 			 ((unsigned char *)&from.ip)[3],
 			 ntohs(port) >> 8,
 			 ntohs(port) & 0xFF);
+	} else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+		from = n_cp->vaddr;
+		port = n_cp->vport;
+		/* Only port, client will use VIP for the data connection */
+		snprintf(buf, sizeof(buf), "|||%u|",
+			 ntohs(port));
+	} else {
+		*buf = 0;
+	}
+	buf_len = strlen(buf);
 
-		buf_len = strlen(buf);
-
-		ct = nf_ct_get(skb, &ctinfo);
-		if (ct) {
-			bool mangled;
-
-			/* If mangling fails this function will return 0
-			 * which will cause the packet to be dropped.
-			 * Mangling can only fail under memory pressure,
-			 * hopefully it will succeed on the retransmitted
-			 * packet.
-			 */
-			mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
-							   iph->ihl * 4,
-							   start - data,
-							   end - start,
-							   buf, buf_len);
-			if (mangled) {
-				ip_vs_nfct_expect_related(skb, ct, n_cp,
-							  IPPROTO_TCP, 0, 0);
-				if (skb->ip_summed == CHECKSUM_COMPLETE)
-					skb->ip_summed = CHECKSUM_UNNECESSARY;
-				/* csum is updated */
-				ret = 1;
-			}
-		}
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) {
+		bool mangled;
 
-		/*
-		 * Not setting 'diff' is intentional, otherwise the sequence
-		 * would be adjusted twice.
+		/* If mangling fails this function will return 0
+		 * which will cause the packet to be dropped.
+		 * Mangling can only fail under memory pressure,
+		 * hopefully it will succeed on the retransmitted
+		 * packet.
 		 */
-
-		cp->app_data = NULL;
-		ip_vs_tcp_conn_listen(n_cp);
-		ip_vs_conn_put(n_cp);
-		return ret;
+		mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						   ipvsh->len,
+						   start - data,
+						   end - start,
+						   buf, buf_len);
+		if (mangled) {
+			ip_vs_nfct_expect_related(skb, ct, n_cp,
+						  ipvsh->protocol, 0, 0);
+			if (skb->ip_summed == CHECKSUM_COMPLETE)
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+			/* csum is updated */
+			ret = 1;
+		}
 	}
-	return 1;
+
+	/* Not setting 'diff' is intentional, otherwise the sequence
+	 * would be adjusted twice.
+	 */
+
+	cp->app_data = (void *) IP_VS_FTP_ACTIVE;
+	ip_vs_tcp_conn_listen(n_cp);
+	ip_vs_conn_put(n_cp);
+	return ret;
 }
 
 
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
+/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command
  * (outside-to-inside).
  *
  * The incoming packet having the PORT command should be something like
@@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
  * In this case, we create a connection entry using the client address and
  * port, so that the active ftp data connection from the server can reach
  * the client.
+ * Extended format:
+ *	"EPSV\r\n" when client requests server address from same family
+ *	"EPSV 1\r\n" when client requests IPv4 server address
+ *	"EPSV 2\r\n" when client requests IPv6 server address
+ *	"EPSV ALL\r\n" - not supported
+ *	EPRT with specified delimiter (ASCII 33..126), "|" by default:
+ *	"EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport
+ *	"EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport
  */
 static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
-			struct sk_buff *skb, int *diff)
+			struct sk_buff *skb, int *diff,
+			struct ip_vs_iphdr *ipvsh)
 {
-	struct iphdr *iph;
-	struct tcphdr *th;
 	char *data, *data_start, *data_limit;
 	char *start, *end;
 	union nf_inet_addr to;
@@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	/* no diff required for incoming packets */
 	*diff = 0;
 
-#ifdef CONFIG_IP_VS_IPV6
-	/* This application helper doesn't work with IPv6 yet,
-	 * so turn this into a no-op for IPv6 packets
-	 */
-	if (cp->af == AF_INET6)
-		return 1;
-#endif
-
 	/* Only useful for established sessions */
 	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
 		return 1;
@@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	if (!skb_make_writable(skb, skb->len))
 		return 0;
 
-	/*
-	 * Detecting whether it is passive
-	 */
-	iph = ip_hdr(skb);
-	th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
-	/* Since there may be OPTIONS in the TCP packet and the HLEN is
-	   the length of the header in 32-bit multiples, it is accurate
-	   to calculate data address by th+HLEN*4 */
-	data = data_start = (char *)th + (th->doff << 2);
+	data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
 	data_limit = skb_tail_pointer(skb);
+	if (!data || data >= data_limit)
+		return 1;
 
 	while (data <= data_limit - 6) {
-		if (strncasecmp(data, "PASV\r\n", 6) == 0) {
+		if (cp->af == AF_INET &&
+		    strncasecmp(data, "PASV\r\n", 6) == 0) {
 			/* Passive mode on */
 			IP_VS_DBG(7, "got PASV at %td of %td\n",
 				  data - data_start,
 				  data_limit - data_start);
-			cp->app_data = &ip_vs_ftp_pasv;
+			cp->app_data = (void *) IP_VS_FTP_PASV;
 			return 1;
 		}
+
+		/* EPSV or EPSV<space><net-prt> */
+		if (strncasecmp(data, "EPSV", 4) == 0 &&
+		    (data[4] == ' ' || data[4] == '\r')) {
+			if (data[4] == ' ') {
+				char proto = data[5];
+
+				if (data > data_limit - 7 || data[6] != '\r')
+					return 1;
+
+#ifdef CONFIG_IP_VS_IPV6
+				if (cp->af == AF_INET6 && proto == '2') {
+				} else
+#endif
+				if (cp->af == AF_INET && proto == '1') {
+				} else {
+					return 1;
+				}
+			}
+			/* Extended Passive mode on */
+			IP_VS_DBG(7, "got EPSV at %td of %td\n",
+				  data - data_start,
+				  data_limit - data_start);
+			cp->app_data = (void *) IP_VS_FTP_EPSV;
+			return 1;
+		}
+
 		data++;
 	}
 
@@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	 * then create a new connection entry for the coming data
 	 * connection.
 	 */
-	if (ip_vs_ftp_get_addrport(data_start, data_limit,
-				   CLIENT_STRING, sizeof(CLIENT_STRING)-1,
-				   ' ', '\r', &to.ip, &port,
-				   &start, &end) != 1)
+	if (cp->af == AF_INET &&
+	    ip_vs_ftp_get_addrport(data_start, data_limit,
+				   CLIENT_STRING_PORT,
+				   sizeof(CLIENT_STRING_PORT)-1,
+				   ' ', false, IP_VS_FTP_PORT,
+				   &to, &port, cp->af,
+				   &start, &end) == 1) {
+
+		IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port));
+
+		/* Now update or create a connection entry for it */
+		IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n",
+			  ip_vs_proto_name(ipvsh->protocol),
+			  &to.ip, ntohs(port), &cp->vaddr.ip,
+			  ntohs(cp->vport)-1);
+	} else if (ip_vs_ftp_get_addrport(data_start, data_limit,
+					  CLIENT_STRING_EPRT,
+					  sizeof(CLIENT_STRING_EPRT)-1,
+					  ' ', true, IP_VS_FTP_EPRT,
+					  &to, &port, cp->af,
+					  &start, &end) == 1) {
+
+		IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n",
+			      IP_VS_DBG_ADDR(cp->af, &to), ntohs(port));
+
+		/* Now update or create a connection entry for it */
+		IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n",
+			      ip_vs_proto_name(ipvsh->protocol),
+			      IP_VS_DBG_ADDR(cp->af, &to), ntohs(port),
+			      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+			      ntohs(cp->vport)-1);
+	} else {
 		return 1;
-
-	IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+	}
 
 	/* Passive mode off */
-	cp->app_data = NULL;
-
-	/*
-	 * Now update or create a connection entry for it
-	 */
-	IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
-		  ip_vs_proto_name(iph->protocol),
-		  &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+	cp->app_data = (void *) IP_VS_FTP_ACTIVE;
 
 	{
 		struct ip_vs_conn_param p;
-		ip_vs_conn_fill_param(cp->ipvs, AF_INET,
-				      iph->protocol, &to, port, &cp->vaddr,
+		ip_vs_conn_fill_param(cp->ipvs, cp->af,
+				      ipvsh->protocol, &to, port, &cp->vaddr,
 				      htons(ntohs(cp->vport)-1), &p);
 		n_cp = ip_vs_conn_in_get(&p);
 		if (!n_cp) {
-			/* This is ipv4 only */
-			n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
+			n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr,
 					      htons(ntohs(cp->dport)-1),
 					      IP_VS_CONN_F_NFCT, cp->dest,
 					      skb->mark);
@@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
 		ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
 		if (ret)
 			goto err_unreg;
-		pr_info("%s: loaded support on port[%d] = %d\n",
+		pr_info("%s: loaded support on port[%d] = %u\n",
 			app->name, i, ports[i]);
 	}
 	return 0;
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index eff7569824e5..3250c4a1111e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 			return 0;
 
 		/* Call application helper if needed */
-		ret = ip_vs_app_pkt_out(cp, skb);
+		ret = ip_vs_app_pkt_out(cp, skb, iph);
 		if (ret == 0)
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
@@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 			return 0;
 
 		/* Call application helper if needed */
-		ret = ip_vs_app_pkt_in(cp, skb);
+		ret = ip_vs_app_pkt_in(cp, skb, iph);
 		if (ret == 0)
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 569631d2b2a1..80d10ad12a15 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 			return 0;
 
 		/* Call application helper if needed */
-		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
 		if (ret == 1)
@@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn and iph ack_seq stuff
 		 */
-		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
 		if (ret == 1)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index c15ef7c2a1fa..e0ef11c3691e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 		/*
 		 *	Call application helper if needed
 		 */
-		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
 		if (ret == 1)
@@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn
 		 */
-		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
 		if (ret == 1)
-- 
2.11.0

^ permalink raw reply related

* [pull request][net-next 00/17] Mellanox, mlx5e updates 2018-06-01
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Saeed Mahameed

Hi dave,

Sorry for the extra 2 patches in this series, but mostly the series
contains small patches and some fixes to previous patches in this
submission window, with one main patch from Tariq to improve legacy
RQ buffer management, for more information please refer to that tag
log below.

Please pull and let me know if there's any problem.

Thanks,
Saeed.

---

The following changes since commit 21ad1173589ef63a93f94e05c879393e2c27588c:

  Merge branch 'hns3-next' (2018-06-01 14:23:59 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5e-updates-2018-06-01

for you to fetch changes up to f65a59ffbcc26135e225058b2e6cd49ab9f9f13f:

  net/mlx5e: TX, Separate cachelines of xmit and completion stats (2018-06-01 16:48:15 -0700)

----------------------------------------------------------------
mlx5e-updates-2018-06-01

1) From Tariq, Two patches to Fix IPoIB issues introduced in
   "net/mlx5e: TX, Use actual WQE size for SQ edge fill"

2) From Eran, Additional improvements to mlx5e statistics reporting

3) From Maor, Increase aRFS flow tables size

4) From Adi, Support MTU change for ethernet representors

5) From Ilan and Adi, Handle QP error events in FPGA

6) From Tariq, last 10 patches mainly deals with RX buffer scheme improvements for legacy RQ
   to use only order-0 pages and fragmented SKBs for large MTUs.

-  Tariq starts with some refactoring and removing HW LRO support from traditional
   (legacy) RQ, since it complicates the buffer scheme and removing it makes it smoother
   to move to cyclic descriptor buffer for traditional RQ.

- Use cyclic WQ in legacy RQ, which has many benefits and paves the way for fragmented SKBs
  for large MTUs.

- Enhance legacy Receive Queue memory scheme, such that only order-0 pages are used.
  Whenever possible, prefer using a linear SKB, and build it wrapping the WQE buffer.
  Otherwise (for example, jumbo frames on x86), use non-linear SKB, with as many frags
  as needed. In this case, multiple WQE scatter entries are used, up to a maximum of 4
  frags and 10KB of MTU.

- TX statistics access improvements.

----------------------------------------------------------------
Adi Nissim (1):
      net/mlx5e: Support configurable MTU for vport representors

Eran Ben Elisha (2):
      net/mlx5e: Present SW stats when state is not opened
      net/mlx5e: Remove redundant active_channels indication

Ilan Tayari (1):
      net/mlx5: FPGA, Handle QP error event

Maor Gottlieb (1):
      net/mlx5e: Increase aRFS flow tables size

Tariq Toukan (12):
      net/mlx5e: IPOIB, Fix overflowing SQ WQE memset
      net/mlx5e: IPOIB, Add a missing skb_pull
      net/mlx5e: TX, Obsolete maintaining local copies of skb->len/data
      net/mlx5e: RX, Generalise name of non-linear SKB head size
      net/mlx5e: RX, Generalise function of SKB frag addition
      net/mlx5e: RX, Dedicate a function for copying SKB header
      net/mlx5e: RX, Remove HW LRO support in legacy RQ
      net/mlx5e: RX, Split WQ objects for different RQ types
      net/mlx5e: RX, Use cyclic WQ in legacy RQ
      net/mlx5e: RX, Enhance legacy Receive Queue memory scheme
      net/mlx5e: RX, Always prefer Linear SKB configuration
      net/mlx5e: TX, Separate cachelines of xmit and completion stats

 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  62 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c  |   2 +-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   7 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 469 +++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  22 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 359 ++++++++++------
 .../net/ethernet/mellanox/mlx5/core/en_selftest.c  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  23 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |   9 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    |  47 +--
 .../net/ethernet/mellanox/mlx5/core/fpga/core.c    |  28 +-
 drivers/net/ethernet/mellanox/mlx5/core/wq.c       |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/wq.h       |  55 ++-
 13 files changed, 726 insertions(+), 360 deletions(-)

^ permalink raw reply

* [net-next 01/17] net/mlx5e: IPOIB, Fix overflowing SQ WQE memset
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

IPoIB WQE size is larger than a single WQEBB.  Must not fetch the WQE,
and surely not memset it, until it is guaranteed that there are enough
WQEBBs available before getting to SQ/frag edge.

Fixes: 043dc78ecf07 ("net/mlx5e: TX, Use actual WQE size for SQ edge fill")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index aafd75257fd0..9829ee02de31 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -653,8 +653,6 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	int num_dma;
 	__be16 mss;
 
-	mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
-
 	/* Calc ihs and ds cnt, no writes to wqe yet */
 	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
 	if (skb_is_gso(skb)) {
@@ -686,10 +684,12 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	frag_pi = mlx5_wq_cyc_ctr2fragix(wq, sq->pc);
 	if (unlikely(frag_pi + num_wqebbs > mlx5_wq_cyc_get_frag_size(wq))) {
+		pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 		mlx5e_fill_sq_frag_edge(sq, wq, pi, frag_pi);
-		mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
 	}
 
+	mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
+
 	/* fill wqe */
 	wi       = &sq->db.wqe_info[pi];
 	cseg     = &wqe->ctrl;
-- 
2.17.0

^ permalink raw reply related

* [net-next 03/17] net/mlx5e: Present SW stats when state is not opened
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

The driver can present all SW stats even when the state not opened.
Fixed get strings, count and stats to support it.

In addition, fix tc2txq to hold a static mapping which doesn't depend on
the amount of open channels, and cannot have the same value on two
different cells  while moving between configurations.
Example:
- OOB 16 channels
- Change to 2 channels, 8 TCs
- tc2txq[15][0] == tc2txq[1][7] == 15
This will cause multiple appearances of the same TX index in statistics
output.

Fixes: 76c3810bade3 ("net/mlx5e: Avoid reset netdev stats on configuration changes")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 19 +++++++++++++------
 .../ethernet/mellanox/mlx5/core/en_stats.c    | 10 ----------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9b19863b059d..adc55de6d4f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2633,15 +2633,21 @@ static void mlx5e_netdev_set_tcs(struct net_device *netdev)
 		netdev_set_tc_queue(netdev, tc, nch, 0);
 }
 
-static void mlx5e_build_channels_tx_maps(struct mlx5e_priv *priv)
+static void mlx5e_build_tc2txq_maps(struct mlx5e_priv *priv)
 {
-	struct mlx5e_channel *c;
-	struct mlx5e_txqsq *sq;
+	int max_nch = priv->profile->max_nch(priv->mdev);
 	int i, tc;
 
-	for (i = 0; i < priv->profile->max_nch(priv->mdev); i++)
+	for (i = 0; i < max_nch; i++)
 		for (tc = 0; tc < priv->profile->max_tc; tc++)
-			priv->channel_tc2txq[i][tc] = i + tc * priv->channels.num;
+			priv->channel_tc2txq[i][tc] = i + tc * max_nch;
+}
+
+static void mlx5e_build_tx2sq_maps(struct mlx5e_priv *priv)
+{
+	struct mlx5e_channel *c;
+	struct mlx5e_txqsq *sq;
+	int i, tc;
 
 	for (i = 0; i < priv->channels.num; i++) {
 		c = priv->channels.c[i];
@@ -2661,7 +2667,7 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 	netif_set_real_num_tx_queues(netdev, num_txqs);
 	netif_set_real_num_rx_queues(netdev, priv->channels.num);
 
-	mlx5e_build_channels_tx_maps(priv);
+	mlx5e_build_tx2sq_maps(priv);
 	mlx5e_activate_channels(&priv->channels);
 	write_lock(&priv->stats_lock);
 	priv->channels_active = true;
@@ -4446,6 +4452,7 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
 	if (err)
 		mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
 	mlx5e_build_nic_netdev(netdev);
+	mlx5e_build_tc2txq_maps(priv);
 	mlx5e_vxlan_init(priv);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 776b4d68e156..3b2aed43f660 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -1161,9 +1161,6 @@ static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv)
 {
 	int max_nch = priv->profile->max_nch(priv->mdev);
 
-	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
-		return 0;
-
 	return (NUM_RQ_STATS * max_nch) +
 	       (NUM_CH_STATS * max_nch) +
 	       (NUM_SQ_STATS * max_nch * priv->max_opened_tc);
@@ -1175,9 +1172,6 @@ static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
 	int max_nch = priv->profile->max_nch(priv->mdev);
 	int i, j, tc;
 
-	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
-		return idx;
-
 	for (i = 0; i < max_nch; i++)
 		for (j = 0; j < NUM_CH_STATS; j++)
 			sprintf(data + (idx++) * ETH_GSTRING_LEN,
@@ -1187,7 +1181,6 @@ static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
 		for (j = 0; j < NUM_RQ_STATS; j++)
 			sprintf(data + (idx++) * ETH_GSTRING_LEN, rq_stats_desc[j].format, i);
 
-	/* priv->channel_tc2txq[i][tc] is valid only when device is open */
 	for (tc = 0; tc < priv->max_opened_tc; tc++)
 		for (i = 0; i < max_nch; i++)
 			for (j = 0; j < NUM_SQ_STATS; j++)
@@ -1204,9 +1197,6 @@ static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data,
 	int max_nch = priv->profile->max_nch(priv->mdev);
 	int i, j, tc;
 
-	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
-		return idx;
-
 	for (i = 0; i < max_nch; i++)
 		for (j = 0; j < NUM_CH_STATS; j++)
 			data[idx++] =
-- 
2.17.0

^ permalink raw reply related

* [net-next 02/17] net/mlx5e: IPOIB, Add a missing skb_pull
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

A call to mlx5e_tx_skb_pull_inline was mistakenly dropped
in the cited patch. Get it back.

Fixes: 043dc78ecf07 ("net/mlx5e: TX, Use actual WQE size for SQ edge fill")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 9829ee02de31..725c06221e95 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -705,6 +705,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 
 	if (ihs) {
 		memcpy(eseg->inline_hdr.start, skb_data, ihs);
+		mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
 		eseg->inline_hdr.sz = cpu_to_be16(ihs);
 		dseg += ds_cnt_inl;
 	}
-- 
2.17.0

^ permalink raw reply related

* [net-next 04/17] net/mlx5e: Remove redundant active_channels indication
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

Now, when all channels stats are saved regardless of the channel's state
{open, closed}, we can safely remove this indication and the stats spin
lock which protects it.

Fixes: 76c3810bade3 ("net/mlx5e: Avoid reset netdev stats on configuration changes")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 7 -------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 6 ------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 5 -----
 4 files changed, 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1c04df043e07..372cdf8a496c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -777,8 +777,6 @@ struct mlx5e_priv {
 	struct mutex               state_lock; /* Protects Interface state */
 	struct mlx5e_rq            drop_rq;
 
-	rwlock_t                   stats_lock; /* Protects channels SW stats updates */
-	bool                       channels_active;
 	struct mlx5e_channels      channels;
 	u32                        tisn[MLX5E_MAX_NUM_TC];
 	struct mlx5e_rqt           indir_rqt;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index adc55de6d4f4..dd119bad102d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2669,9 +2669,6 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 
 	mlx5e_build_tx2sq_maps(priv);
 	mlx5e_activate_channels(&priv->channels);
-	write_lock(&priv->stats_lock);
-	priv->channels_active = true;
-	write_unlock(&priv->stats_lock);
 	netif_tx_start_all_queues(priv->netdev);
 
 	if (MLX5_VPORT_MANAGER(priv->mdev))
@@ -2693,9 +2690,6 @@ void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv)
 	 */
 	netif_tx_stop_all_queues(priv->netdev);
 	netif_tx_disable(priv->netdev);
-	write_lock(&priv->stats_lock);
-	priv->channels_active = false;
-	write_unlock(&priv->stats_lock);
 	mlx5e_deactivate_channels(&priv->channels);
 }
 
@@ -4269,7 +4263,6 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
 			       profile->max_nch(mdev), netdev->mtu);
 
 	mutex_init(&priv->state_lock);
-	rwlock_init(&priv->stats_lock);
 
 	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
 	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index de6364125f0f..1fb4835eac72 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -130,10 +130,6 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv)
 	struct mlx5e_sq_stats *sq_stats;
 	int i, j;
 
-	read_lock(&priv->stats_lock);
-	if (!priv->channels_active)
-	        goto out;
-
 	memset(s, 0, sizeof(*s));
 	for (i = 0; i < priv->channels.num; i++) {
 		struct mlx5e_channel *c = priv->channels.c[i];
@@ -150,8 +146,6 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv)
 			s->tx_bytes		+= sq_stats->bytes;
 		}
 	}
-out:
-	read_unlock(&priv->stats_lock);
 }
 
 static void mlx5e_rep_get_ethtool_stats(struct net_device *dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 3b2aed43f660..697dc7397ba2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -114,9 +114,6 @@ void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 	int i;
 
 	memset(s, 0, sizeof(*s));
-	read_lock(&priv->stats_lock);
-	if (!priv->channels_active)
-		goto out;
 
 	for (i = 0; i < priv->profile->max_nch(priv->mdev); i++) {
 		struct mlx5e_channel_stats *channel_stats =
@@ -177,8 +174,6 @@ void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 	}
 
 	memcpy(&priv->stats.sw, s, sizeof(*s));
-out:
-	read_unlock(&priv->stats_lock);
 }
 
 static const struct counter_desc q_stats_desc[] = {
-- 
2.17.0

^ permalink raw reply related

* [net-next 05/17] net/mlx5e: Increase aRFS flow tables size
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Maor Gottlieb, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Maor Gottlieb <maorg@mellanox.com>

Increase the aRFS flow table size to 64k so it could contain up to 64k
different streams.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index f64b5e78519b..75e4308ba786 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -213,7 +213,7 @@ static int arfs_add_default_rule(struct mlx5e_priv *priv,
 }
 
 #define MLX5E_ARFS_NUM_GROUPS	2
-#define MLX5E_ARFS_GROUP1_SIZE	BIT(12)
+#define MLX5E_ARFS_GROUP1_SIZE	(BIT(16) - 1)
 #define MLX5E_ARFS_GROUP2_SIZE	BIT(0)
 #define MLX5E_ARFS_TABLE_SIZE	(MLX5E_ARFS_GROUP1_SIZE +\
 				 MLX5E_ARFS_GROUP2_SIZE)
-- 
2.17.0

^ permalink raw reply related

* [net-next 06/17] net/mlx5e: Support configurable MTU for vport representors
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Adi Nissim, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Adi Nissim <adin@mellanox.com>

The representor MTU was hard coded to 1500 bytes.
Allow setting arbitrary MTU values up to the max supported by the FW.

Signed-off-by: Adi Nissim <adin@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  4 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 14 ++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 14 ++++++++++++++
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 372cdf8a496c..a23fa47fbbcc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1102,6 +1102,10 @@ void mlx5e_update_stats_work(struct work_struct *work);
 
 int mlx5e_bits_invert(unsigned long a, int size);
 
+typedef int (*change_hw_mtu_cb)(struct mlx5e_priv *priv);
+int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
+		     change_hw_mtu_cb set_mtu_cb);
+
 /* ethtool helpers */
 void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
 			       struct ethtool_drvinfo *drvinfo);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index dd119bad102d..ab7b2a4e6edc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3495,7 +3495,8 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 	return features;
 }
 
-static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
+int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
+		     change_hw_mtu_cb set_mtu_cb)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_channels new_channels = {};
@@ -3522,7 +3523,7 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 
 	if (!reset) {
 		params->sw_mtu = new_mtu;
-		mlx5e_set_dev_port_mtu(priv);
+		set_mtu_cb(priv);
 		netdev->mtu = params->sw_mtu;
 		goto out;
 	}
@@ -3531,7 +3532,7 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 	if (err)
 		goto out;
 
-	mlx5e_switch_priv_channels(priv, &new_channels, mlx5e_set_dev_port_mtu);
+	mlx5e_switch_priv_channels(priv, &new_channels, set_mtu_cb);
 	netdev->mtu = new_channels.params.sw_mtu;
 
 out:
@@ -3539,6 +3540,11 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 	return err;
 }
 
+static int mlx5e_change_nic_mtu(struct net_device *netdev, int new_mtu)
+{
+	return mlx5e_change_mtu(netdev, new_mtu, mlx5e_set_dev_port_mtu);
+}
+
 int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr)
 {
 	struct hwtstamp_config config;
@@ -4033,7 +4039,7 @@ static const struct net_device_ops mlx5e_netdev_ops = {
 	.ndo_vlan_rx_kill_vid    = mlx5e_vlan_rx_kill_vid,
 	.ndo_set_features        = mlx5e_set_features,
 	.ndo_fix_features        = mlx5e_fix_features,
-	.ndo_change_mtu          = mlx5e_change_mtu,
+	.ndo_change_mtu          = mlx5e_change_nic_mtu,
 	.ndo_do_ioctl            = mlx5e_ioctl,
 	.ndo_set_tx_maxrate      = mlx5e_set_tx_maxrate,
 	.ndo_udp_tunnel_add      = mlx5e_add_vxlan_port,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 1fb4835eac72..8ab4c96b7f7c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -900,6 +900,11 @@ static const struct switchdev_ops mlx5e_rep_switchdev_ops = {
 	.switchdev_port_attr_get	= mlx5e_attr_get,
 };
 
+int mlx5e_change_rep_mtu(struct net_device *netdev, int new_mtu)
+{
+	return mlx5e_change_mtu(netdev, new_mtu, NULL);
+}
+
 static const struct net_device_ops mlx5e_netdev_ops_rep = {
 	.ndo_open                = mlx5e_rep_open,
 	.ndo_stop                = mlx5e_rep_close,
@@ -909,6 +914,7 @@ static const struct net_device_ops mlx5e_netdev_ops_rep = {
 	.ndo_get_stats64         = mlx5e_rep_get_stats,
 	.ndo_has_offload_stats	 = mlx5e_has_offload_stats,
 	.ndo_get_offload_stats	 = mlx5e_get_offload_stats,
+	.ndo_change_mtu          = mlx5e_change_rep_mtu,
 };
 
 static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
@@ -935,6 +941,10 @@ static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
 
 static void mlx5e_build_rep_netdev(struct net_device *netdev)
 {
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 max_mtu;
+
 	netdev->netdev_ops = &mlx5e_netdev_ops_rep;
 
 	netdev->watchdog_timeo    = 15 * HZ;
@@ -947,6 +957,10 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev)
 	netdev->hw_features      |= NETIF_F_HW_TC;
 
 	eth_hw_addr_random(netdev);
+
+	netdev->min_mtu = ETH_MIN_MTU;
+	mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
+	netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu);
 }
 
 static void mlx5e_init_rep(struct mlx5_core_dev *mdev,
-- 
2.17.0

^ permalink raw reply related

* [net-next 08/17] net/mlx5e: TX, Obsolete maintaining local copies of skb->len/data
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Instead of maintaining a local copy of skb->len/data and updating
it upon every copy to the WQE inline part, just calculate it once
when needed, using the ihs.

This obsoletes the function mlx5e_tx_skb_pull_inline.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 42 ++++++-------------
 1 file changed, 12 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 725c06221e95..f29deb44bf3b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -188,28 +188,16 @@ static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
 	return min_t(u16, hlen, skb_headlen(skb));
 }
 
-static inline void mlx5e_tx_skb_pull_inline(unsigned char **skb_data,
-					    unsigned int *skb_len,
-					    unsigned int len)
-{
-	*skb_len -= len;
-	*skb_data += len;
-}
-
-static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs,
-				     unsigned char **skb_data,
-				     unsigned int *skb_len)
+static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs)
 {
 	struct vlan_ethhdr *vhdr = (struct vlan_ethhdr *)start;
 	int cpy1_sz = 2 * ETH_ALEN;
 	int cpy2_sz = ihs - cpy1_sz;
 
-	memcpy(vhdr, *skb_data, cpy1_sz);
-	mlx5e_tx_skb_pull_inline(skb_data, skb_len, cpy1_sz);
+	memcpy(vhdr, skb->data, cpy1_sz);
 	vhdr->h_vlan_proto = skb->vlan_proto;
 	vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb));
-	memcpy(&vhdr->h_vlan_encapsulated_proto, *skb_data, cpy2_sz);
-	mlx5e_tx_skb_pull_inline(skb_data, skb_len, cpy2_sz);
+	memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz);
 }
 
 static inline void
@@ -357,8 +345,6 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	struct mlx5e_tx_wqe_info *wi;
 
 	struct mlx5e_sq_stats *stats = sq->stats;
-	unsigned char *skb_data = skb->data;
-	unsigned int skb_len = skb->len;
 	u16 ds_cnt, ds_cnt_inl = 0;
 	u16 headlen, ihs, frag_pi;
 	u8 num_wqebbs, opcode;
@@ -385,7 +371,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	stats->bytes     += num_bytes;
 	stats->xmit_more += skb->xmit_more;
 
-	headlen = skb_len - ihs - skb->data_len;
+	headlen = skb->len - ihs - skb->data_len;
 	ds_cnt += !!headlen;
 	ds_cnt += skb_shinfo(skb)->nr_frags;
 
@@ -414,15 +400,14 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	eseg->mss = mss;
 
 	if (ihs) {
+		eseg->inline_hdr.sz = cpu_to_be16(ihs);
 		if (skb_vlan_tag_present(skb)) {
-			mlx5e_insert_vlan(eseg->inline_hdr.start, skb,
-					  ihs - VLAN_HLEN, &skb_data, &skb_len);
+			ihs -= VLAN_HLEN;
+			mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs);
 			stats->added_vlan_packets++;
 		} else {
-			memcpy(eseg->inline_hdr.start, skb_data, ihs);
-			mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
+			memcpy(eseg->inline_hdr.start, skb->data, ihs);
 		}
-		eseg->inline_hdr.sz = cpu_to_be16(ihs);
 		dseg += ds_cnt_inl;
 	} else if (skb_vlan_tag_present(skb)) {
 		eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN);
@@ -432,7 +417,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		stats->added_vlan_packets++;
 	}
 
-	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, dseg);
+	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + ihs, headlen, dseg);
 	if (unlikely(num_dma < 0))
 		goto err_drop;
 
@@ -644,8 +629,6 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	struct mlx5e_tx_wqe_info *wi;
 
 	struct mlx5e_sq_stats *stats = sq->stats;
-	unsigned char *skb_data = skb->data;
-	unsigned int skb_len = skb->len;
 	u16 headlen, ihs, pi, frag_pi;
 	u16 ds_cnt, ds_cnt_inl = 0;
 	u8 num_wqebbs, opcode;
@@ -672,7 +655,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	stats->bytes     += num_bytes;
 	stats->xmit_more += skb->xmit_more;
 
-	headlen = skb_len - ihs - skb->data_len;
+	headlen = skb->len - ihs - skb->data_len;
 	ds_cnt += !!headlen;
 	ds_cnt += skb_shinfo(skb)->nr_frags;
 
@@ -704,13 +687,12 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	eseg->mss = mss;
 
 	if (ihs) {
-		memcpy(eseg->inline_hdr.start, skb_data, ihs);
-		mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
+		memcpy(eseg->inline_hdr.start, skb->data, ihs);
 		eseg->inline_hdr.sz = cpu_to_be16(ihs);
 		dseg += ds_cnt_inl;
 	}
 
-	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, dseg);
+	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + ihs, headlen, dseg);
 	if (unlikely(num_dma < 0))
 		goto err_drop;
 
-- 
2.17.0

^ permalink raw reply related

* [net-next 07/17] net/mlx5: FPGA, Handle QP error event
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Ilan Tayari, Adi Nissim, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Ilan Tayari <ilant@mellanox.com>

Add handlers for this event to perform graceful teardown of the device.

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../ethernet/mellanox/mlx5/core/fpga/core.c   | 28 ++++++++++++++++---
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index 26caa0475985..436a8136f26f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -50,6 +50,11 @@ static const char *const mlx5_fpga_error_strings[] = {
 	"Temperature Critical",
 };
 
+static const char * const mlx5_fpga_qp_error_strings[] = {
+	"Null Syndrome",
+	"Retry Counter Expired",
+	"RNR Expired",
+};
 static struct mlx5_fpga_device *mlx5_fpga_device_alloc(void)
 {
 	struct mlx5_fpga_device *fdev = NULL;
@@ -271,23 +276,38 @@ static const char *mlx5_fpga_syndrome_to_string(u8 syndrome)
 	return "Unknown";
 }
 
+static const char *mlx5_fpga_qp_syndrome_to_string(u8 syndrome)
+{
+	if (syndrome < ARRAY_SIZE(mlx5_fpga_qp_error_strings))
+		return mlx5_fpga_qp_error_strings[syndrome];
+	return "Unknown";
+}
+
 void mlx5_fpga_event(struct mlx5_core_dev *mdev, u8 event, void *data)
 {
 	struct mlx5_fpga_device *fdev = mdev->fpga;
 	const char *event_name;
 	bool teardown = false;
 	unsigned long flags;
+	u32 fpga_qpn;
 	u8 syndrome;
 
-	if (event != MLX5_EVENT_TYPE_FPGA_ERROR) {
+	switch (event) {
+	case MLX5_EVENT_TYPE_FPGA_ERROR:
+		syndrome = MLX5_GET(fpga_error_event, data, syndrome);
+		event_name = mlx5_fpga_syndrome_to_string(syndrome);
+		break;
+	case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+		syndrome = MLX5_GET(fpga_qp_error_event, data, syndrome);
+		event_name = mlx5_fpga_qp_syndrome_to_string(syndrome);
+		fpga_qpn = MLX5_GET(fpga_qp_error_event, data, fpga_qpn);
+		break;
+	default:
 		mlx5_fpga_warn_ratelimited(fdev, "Unexpected event %u\n",
 					   event);
 		return;
 	}
 
-	syndrome = MLX5_GET(fpga_error_event, data, syndrome);
-	event_name = mlx5_fpga_syndrome_to_string(syndrome);
-
 	spin_lock_irqsave(&fdev->state_lock, flags);
 	switch (fdev->state) {
 	case MLX5_FPGA_STATUS_SUCCESS:
-- 
2.17.0

^ permalink raw reply related

* [net-next 11/17] net/mlx5e: RX, Dedicate a function for copying SKB header
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Get the logic of copying the packet header into the SKB linear part
into a generic function. Function does copy length alignment
and dma buffer sync.

It is currently called only within the MPWQE flow.
In a downstream patch, it will be called within the legacy RQ flow
as well.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 546e3bc46cd1..634540afdcfc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -324,6 +324,20 @@ mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
 			di->page, frag_offset, len, truesize);
 }
 
+static inline void
+mlx5e_copy_skb_header(struct device *pdev, struct sk_buff *skb,
+		      struct mlx5e_dma_info *dma_info,
+		      int offset_from, int offset_to, u32 headlen)
+{
+	const void *from = page_address(dma_info->page) + offset_from;
+	/* Aligning len to sizeof(long) optimizes memcpy performance */
+	unsigned int len = ALIGN(headlen, sizeof(long));
+
+	dma_sync_single_for_cpu(pdev, dma_info->addr + offset_from, len,
+				DMA_FROM_DEVICE);
+	skb_copy_to_linear_data_offset(skb, offset_to, from, len);
+}
+
 static inline void
 mlx5e_copy_skb_header_mpwqe(struct device *pdev,
 			    struct sk_buff *skb,
@@ -331,23 +345,13 @@ mlx5e_copy_skb_header_mpwqe(struct device *pdev,
 			    u32 offset, u32 headlen)
 {
 	u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - offset);
-	unsigned int len;
 
-	 /* Aligning len to sizeof(long) optimizes memcpy performance */
-	len = ALIGN(headlen_pg, sizeof(long));
-	dma_sync_single_for_cpu(pdev, dma_info->addr + offset, len,
-				DMA_FROM_DEVICE);
-	skb_copy_to_linear_data(skb, page_address(dma_info->page) + offset, len);
+	mlx5e_copy_skb_header(pdev, skb, dma_info, offset, 0, headlen_pg);
 
 	if (unlikely(offset + headlen > PAGE_SIZE)) {
 		dma_info++;
-		headlen_pg = len;
-		len = ALIGN(headlen - headlen_pg, sizeof(long));
-		dma_sync_single_for_cpu(pdev, dma_info->addr, len,
-					DMA_FROM_DEVICE);
-		skb_copy_to_linear_data_offset(skb, headlen_pg,
-					       page_address(dma_info->page),
-					       len);
+		mlx5e_copy_skb_header(pdev, skb, dma_info, 0, headlen_pg,
+				      headlen - headlen_pg);
 	}
 }
 
-- 
2.17.0

^ permalink raw reply related

* [net-next 10/17] net/mlx5e: RX, Generalise function of SKB frag addition
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Rename it and pass truesize as an extra argument, as it will be used also
in Legacy RQ in a downstream patch.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 61a76faa2779..546e3bc46cd1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -311,13 +311,11 @@ void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 		mlx5e_free_rx_wqe(rq, wi);
 }
 
-static inline void mlx5e_add_skb_frag_mpwqe(struct mlx5e_rq *rq,
-					    struct sk_buff *skb,
-					    struct mlx5e_dma_info *di,
-					    u32 frag_offset, u32 len)
+static inline void
+mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
+		   struct mlx5e_dma_info *di, u32 frag_offset, u32 len,
+		   unsigned int truesize)
 {
-	unsigned int truesize = ALIGN(len, BIT(rq->mpwqe.log_stride_sz));
-
 	dma_sync_single_for_cpu(rq->pdev,
 				di->addr + frag_offset,
 				len, DMA_FROM_DEVICE);
@@ -1094,9 +1092,11 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
 	while (byte_cnt) {
 		u32 pg_consumed_bytes =
 			min_t(u32, PAGE_SIZE - frag_offset, byte_cnt);
+		unsigned int truesize =
+			ALIGN(pg_consumed_bytes, BIT(rq->mpwqe.log_stride_sz));
 
-		mlx5e_add_skb_frag_mpwqe(rq, skb, di, frag_offset,
-					 pg_consumed_bytes);
+		mlx5e_add_skb_frag(rq, skb, di, frag_offset,
+				   pg_consumed_bytes, truesize);
 		byte_cnt -= pg_consumed_bytes;
 		frag_offset = 0;
 		di++;
-- 
2.17.0

^ permalink raw reply related

* [net-next 09/17] net/mlx5e: RX, Generalise name of non-linear SKB head size
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Make name more generic by dropping MPWRQ from it, as it will be
used also in Legacy RQ in a downstream patch.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h          | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c       | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index a23fa47fbbcc..9b4ed83783e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -112,7 +112,7 @@ struct page_pool;
 
 #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW            0x2
 
-#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD	(256)
+#define MLX5E_RX_MAX_HEAD (256)
 
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
 #define MLX5E_DEFAULT_LRO_TIMEOUT                       32
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 5551e4e22ad0..61a76faa2779 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1070,7 +1070,7 @@ struct sk_buff *
 mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 				   u16 cqe_bcnt, u32 head_offset, u32 page_idx)
 {
-	u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
+	u16 headlen = min_t(u16, MLX5E_RX_MAX_HEAD, cqe_bcnt);
 	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
 	u32 frag_offset    = head_offset + headlen;
 	u32 byte_cnt       = cqe_bcnt - headlen;
@@ -1078,7 +1078,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
 	struct sk_buff *skb;
 
 	skb = napi_alloc_skb(rq->cq.napi,
-			     ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, sizeof(long)));
+			     ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long)));
 	if (unlikely(!skb)) {
 		rq->stats->buff_alloc_err++;
 		return NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index 027f54ac1ca2..4d316cc9b008 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -100,7 +100,7 @@ static int mlx5e_test_link_speed(struct mlx5e_priv *priv)
 
 #ifdef CONFIG_INET
 /* loopback test */
-#define MLX5E_TEST_PKT_SIZE (MLX5_MPWRQ_SMALL_PACKET_THRESHOLD - NET_IP_ALIGN)
+#define MLX5E_TEST_PKT_SIZE (MLX5E_RX_MAX_HEAD - NET_IP_ALIGN)
 static const char mlx5e_test_text[ETH_GSTRING_LEN] = "MLX5E SELF TEST";
 #define MLX5E_TEST_MAGIC 0x5AEED15C001ULL
 
-- 
2.17.0

^ permalink raw reply related

* [net-next 13/17] net/mlx5e: RX, Split WQ objects for different RQ types
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Replace the common RQ WQ object with two separate ones for the
different RQ types.
This is in preparation for switching to using a cyclic WQ type
in Legacy RQ.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   4 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 128 ++++++++++++------
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   |  35 +++--
 3 files changed, 110 insertions(+), 57 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9b4ed83783e4..f2f2dcf6b23c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -498,10 +498,9 @@ enum mlx5e_rq_flag {
 
 struct mlx5e_rq {
 	/* data path */
-	struct mlx5_wq_ll      wq;
-
 	union {
 		struct {
+			struct mlx5_wq_ll      wq;
 			struct mlx5e_wqe_frag_info *frag_info;
 			u32 frag_sz;	/* max possible skb frag_sz */
 			union {
@@ -509,6 +508,7 @@ struct mlx5e_rq {
 			};
 		} wqe;
 		struct {
+			struct mlx5_wq_ll      wq;
 			struct mlx5e_umr_wqe   umr_wqe;
 			struct mlx5e_mpw_info *info;
 			mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3f1f0552843c..3a007717cba5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -319,10 +319,30 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
 	ucseg->mkey_mask     = cpu_to_be64(MLX5_MKEY_MASK_FREE);
 }
 
+static u32 mlx5e_rqwq_get_size(struct mlx5e_rq *rq)
+{
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return mlx5_wq_ll_get_size(&rq->mpwqe.wq);
+	default:
+		return mlx5_wq_ll_get_size(&rq->wqe.wq);
+	}
+}
+
+static u32 mlx5e_rqwq_get_cur_sz(struct mlx5e_rq *rq)
+{
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return rq->mpwqe.wq.cur_sz;
+	default:
+		return rq->wqe.wq.cur_sz;
+	}
+}
+
 static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
 				     struct mlx5e_channel *c)
 {
-	int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+	int wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 
 	rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
 				      GFP_KERNEL, cpu_to_node(c->cpu));
@@ -370,7 +390,7 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 
 static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
 {
-	u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->wq));
+	u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->mpwqe.wq));
 
 	return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey);
 }
@@ -397,15 +417,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	rqp->wq.db_numa_node = cpu_to_node(c->cpu);
 
-	err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
-				&rq->wq_ctrl);
-	if (err)
-		return err;
-
-	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
-
-	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
-
 	rq->wq_type = params->rq_wq_type;
 	rq->pdev    = c->pdev;
 	rq->netdev  = c->netdev;
@@ -434,8 +445,17 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->mpwqe.wq,
+					&rq->wq_ctrl);
+		if (err)
+			return err;
+
+		rq->mpwqe.wq.db = &rq->mpwqe.wq.db[MLX5_RCV_DBR];
+
+		wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 
 		pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
+
 		rq->post_wqes = mlx5e_post_rx_mpwqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -472,6 +492,15 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			goto err_destroy_umr_mkey;
 		break;
 	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
+					&rq->wq_ctrl);
+		if (err)
+			return err;
+
+		rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR];
+
+		wq_sz = mlx5_wq_ll_get_size(&rq->wqe.wq);
+
 		rq->wqe.frag_info =
 			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
 				     GFP_KERNEL, cpu_to_node(c->cpu));
@@ -538,16 +567,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		goto err_rq_wq_destroy;
 
 	for (i = 0; i < wq_sz; i++) {
-		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
-
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+			struct mlx5e_rx_wqe *wqe =
+				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
 			wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom);
-		}
+			wqe->data.byte_count = cpu_to_be32(byte_count);
+			wqe->data.lkey = rq->mkey_be;
+		} else {
+			struct mlx5e_rx_wqe *wqe =
+				mlx5_wq_ll_get_wqe(&rq->wqe.wq, i);
 
-		wqe->data.byte_count = cpu_to_be32(byte_count);
-		wqe->data.lkey = rq->mkey_be;
+			wqe->data.byte_count = cpu_to_be32(byte_count);
+			wqe->data.lkey = rq->mkey_be;
+		}
 	}
 
 	INIT_WORK(&rq->dim.work, mlx5e_rx_dim_work);
@@ -744,51 +778,65 @@ static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
 	unsigned long exp_time = jiffies + msecs_to_jiffies(wait_time);
 	struct mlx5e_channel *c = rq->channel;
 
-	struct mlx5_wq_ll *wq = &rq->wq;
-	u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5_wq_ll_get_size(wq));
+	u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5e_rqwq_get_size(rq));
 
 	do {
-		if (wq->cur_sz >= min_wqes)
+		if (mlx5e_rqwq_get_cur_sz(rq) >= min_wqes)
 			return 0;
 
 		msleep(20);
 	} while (time_before(jiffies, exp_time));
 
 	netdev_warn(c->netdev, "Failed to get min RX wqes on Channel[%d] RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n",
-		    c->ix, rq->rqn, wq->cur_sz, min_wqes);
+		    c->ix, rq->rqn, mlx5e_rqwq_get_cur_sz(rq), min_wqes);
 
 	return -ETIMEDOUT;
 }
 
 static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
-	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_ix_be;
 	u16 wqe_ix;
 
-	/* UMR WQE (if in progress) is always at wq->head */
-	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
-	    rq->mpwqe.umr_in_progress)
-		mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
-
-	while (!mlx5_wq_ll_is_empty(wq)) {
-		wqe_ix_be = *wq->tail_next;
-		wqe_ix    = be16_to_cpu(wqe_ix_be);
-		wqe       = mlx5_wq_ll_get_wqe(&rq->wq, wqe_ix);
-		rq->dealloc_wqe(rq, wqe_ix);
-		mlx5_wq_ll_pop(&rq->wq, wqe_ix_be,
-			       &wqe->next.next_wqe_index);
-	}
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+		struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+
+		if (rq->mpwqe.umr_in_progress)
+			mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
+
+		while (!mlx5_wq_ll_is_empty(wq)) {
+			struct mlx5e_rx_wqe *wqe;
+
+			wqe_ix_be = *wq->tail_next;
+			wqe_ix    = be16_to_cpu(wqe_ix_be);
+			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+			rq->dealloc_wqe(rq, wqe_ix);
+			mlx5_wq_ll_pop(wq, wqe_ix_be,
+				       &wqe->next.next_wqe_index);
+		}
+	} else {
+		struct mlx5_wq_ll *wq = &rq->wqe.wq;
+
+		while (!mlx5_wq_ll_is_empty(wq)) {
+			struct mlx5e_rx_wqe *wqe;
+
+			wqe_ix_be = *wq->tail_next;
+			wqe_ix    = be16_to_cpu(wqe_ix_be);
+			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+			rq->dealloc_wqe(rq, wqe_ix);
+			mlx5_wq_ll_pop(wq, wqe_ix_be,
+				       &wqe->next.next_wqe_index);
+		}
 
-	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST && rq->wqe.page_reuse) {
 		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
 		 * but yet to be re-posted.
 		 */
-		int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+		if (rq->wqe.page_reuse) {
+			int wq_sz = mlx5_wq_ll_get_size(wq);
 
-		for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
-			rq->dealloc_wqe(rq, wqe_ix);
+			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
+				rq->dealloc_wqe(rq, wqe_ix);
+		}
 	}
 }
 
@@ -2809,7 +2857,7 @@ static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
 
 	param->wq.db_numa_node = param->wq.buf_numa_node;
 
-	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
+	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
 				&rq->wq_ctrl);
 	if (err)
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 634540afdcfc..3b12d4de5b98 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -113,7 +113,7 @@ static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
 			mpwrq_get_cqe_consumed_strides(&cq->title);
 	else
 		cq->decmprs_wqe_counter =
-			mlx5_wq_ll_ctr2ix(&rq->wq, cq->decmprs_wqe_counter + 1);
+			mlx5_wq_ll_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
 }
 
 static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
@@ -369,7 +369,7 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 
 static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 	struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
 	rq->mpwqe.umr_in_progress = false;
@@ -470,7 +470,7 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
@@ -546,7 +546,7 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
 
 bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
@@ -987,6 +987,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -996,7 +997,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1018,7 +1019,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
@@ -1029,6 +1030,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_rep_priv *rpriv  = priv->ppriv;
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
@@ -1038,7 +1040,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1063,7 +1065,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 #endif
@@ -1164,6 +1166,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
 	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
 	struct mlx5e_rx_wqe *wqe;
+	struct mlx5_wq_ll *wq;
 	struct sk_buff *skb;
 	u16 cqe_bcnt;
 
@@ -1193,9 +1196,10 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	if (likely(wi->consumed_strides < rq->mpwqe.num_strides))
 		return;
 
-	wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
+	wq  = &rq->mpwqe.wq;
+	wqe = mlx5_wq_ll_get_wqe(wq, wqe_id);
 	mlx5e_free_rx_mpwqe(rq, wi);
-	mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
+	mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index);
 }
 
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
@@ -1399,6 +1403,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -1408,7 +1413,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1425,7 +1430,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 wq_free_wqe:
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
@@ -1435,6 +1440,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -1444,7 +1450,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1465,8 +1471,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+	mlx5_wq_ll_pop(wq, wqe_counter_be, &wqe->next.next_wqe_index);
 }
 
 #endif /* CONFIG_MLX5_EN_IPSEC */
-- 
2.17.0

^ permalink raw reply related

* [net-next 12/17] net/mlx5e: RX, Remove HW LRO support in legacy RQ
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Current LRO implementation in Legacy RQ uses high-order pages.
In downstream patches of this series we complete the transition
to using only order-0 pages in RX datapath (which was already done
in Striding RQ).

Unlike the more advanced Striding RQ, Legacy RQ does not make reuse
of any non-consumed buffers of non-full LRO sessions, and combining
it with order-0 pages has many performance drawbacks.

Hence, here we totally remove LRO support in Legacy RQ.
This guarantees having no out-of-order completions, which allows using
a cyclic work queue (instead of a linked-list) in a downstream patch.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../ethernet/mellanox/mlx5/core/en_ethtool.c  |  7 ++++
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 33 +++++++++++--------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 42bd256e680d..fffe514ba855 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1515,6 +1515,9 @@ static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable)
 			return -EOPNOTSUPP;
 		if (!mlx5e_striding_rq_possible(mdev, &priv->channels.params))
 			return -EINVAL;
+	} else if (priv->channels.params.lro_en) {
+		netdev_warn(netdev, "Can't set legacy RQ with LRO, disable LRO first\n");
+		return -EINVAL;
 	}
 
 	new_channels.params = priv->channels.params;
@@ -1589,6 +1592,10 @@ static int mlx5e_set_priv_flags(struct net_device *netdev, u32 pflags)
 
 out:
 	mutex_unlock(&priv->state_lock);
+
+	/* Need to fix some features.. */
+	netdev_update_features(netdev);
+
 	return err;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ab7b2a4e6edc..3f1f0552843c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -182,14 +182,6 @@ void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
 	params->log_rq_mtu_frames = is_kdump_kernel() ?
 		MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE :
 		MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
-	switch (params->rq_wq_type) {
-	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		/* Extra room needed for build_skb */
-		params->lro_wqe_sz -= mlx5e_get_rq_headroom(mdev, params) +
-			SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	}
 
 	mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n",
 		       params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ,
@@ -503,14 +495,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			goto err_rq_wq_destroy;
 		}
 
-		byte_count = params->lro_en  ?
-				params->lro_wqe_sz :
-				MLX5E_SW2HW_MTU(params, params->sw_mtu);
+		byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
 #ifdef CONFIG_MLX5_EN_IPSEC
 		if (MLX5_IPSEC_DEV(mdev))
 			byte_count += MLX5E_METADATA_ETHER_LEN;
 #endif
-		rq->wqe.page_reuse = !params->xdp_prog && !params->lro_en;
+		rq->wqe.page_reuse = !params->xdp_prog;
 
 		/* calc the required page order */
 		rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->buff.headroom + byte_count);
@@ -3311,6 +3301,12 @@ static int set_feature_lro(struct net_device *netdev, bool enable)
 	mutex_lock(&priv->state_lock);
 
 	old_params = &priv->channels.params;
+	if (enable && !MLX5E_GET_PFLAG(old_params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
+		netdev_warn(netdev, "can't set LRO with legacy RQ\n");
+		err = -EINVAL;
+		goto out;
+	}
+
 	reset = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	new_channels.params = *old_params;
@@ -3480,16 +3476,24 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 					    netdev_features_t features)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_params *params;
 
 	mutex_lock(&priv->state_lock);
+	params = &priv->channels.params;
 	if (!bitmap_empty(priv->fs.vlan.active_svlans, VLAN_N_VID)) {
 		/* HW strips the outer C-tag header, this is a problem
 		 * for S-tag traffic.
 		 */
 		features &= ~NETIF_F_HW_VLAN_CTAG_RX;
-		if (!priv->channels.params.vlan_strip_disable)
+		if (!params->vlan_strip_disable)
 			netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n");
 	}
+	if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
+		features &= ~NETIF_F_LRO;
+		if (params->lro_en)
+			netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n");
+	}
+
 	mutex_unlock(&priv->state_lock);
 
 	return features;
@@ -4328,7 +4332,8 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_TX;
 	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_RX;
 
-	if (!!MLX5_CAP_ETH(mdev, lro_cap))
+	if (!!MLX5_CAP_ETH(mdev, lro_cap) &&
+	    mlx5e_check_fragmented_striding_rq_cap(mdev))
 		netdev->vlan_features    |= NETIF_F_LRO;
 
 	netdev->hw_features       = netdev->vlan_features;
-- 
2.17.0

^ permalink raw reply related

* [net-next 16/17] net/mlx5e: RX, Always prefer Linear SKB configuration
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Prefer the linear SKB configuration of Legacy RQ over the
non-linear one of Striding RQ.

This implies that ConnectX-4 LX now uses legacy RQ by default,
as it does not support the linear configuration of Striding RQ.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2c634e50d051..333d4ed52b94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4405,9 +4405,16 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
 	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def);
 
 	/* RQ */
-	if (mlx5e_striding_rq_possible(mdev, params))
-		MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ,
-				!slow_pci_heuristic(mdev));
+	/* Prefer Striding RQ, unless any of the following holds:
+	 * - Striding RQ configuration is not possible/supported.
+	 * - Slow PCI heuristic.
+	 * - Legacy RQ would use linear SKB while Striding RQ would use non-linear.
+	 */
+	if (!slow_pci_heuristic(mdev) &&
+	    mlx5e_striding_rq_possible(mdev, params) &&
+	    (mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ||
+	     !mlx5e_rx_is_linear_skb(mdev, params)))
+		MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, true);
 	mlx5e_set_rq_type(mdev, params);
 	mlx5e_init_rq_type_params(mdev, params);
 
-- 
2.17.0

^ permalink raw reply related

* [net-next 15/17] net/mlx5e: RX, Enhance legacy Receive Queue memory scheme
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Enhance the memory scheme of the legacy RQ, such that
only order-0 pages are used.

Whenever possible, prefer using a linear SKB, and build it
wrapping the WQE buffer.

Otherwise (for example, jumbo frames on x86), use non-linear SKB,
with as many frags as needed. In this case, multiple WQE
scatter entries are used, up to a maximum of 4 frags and 10KB of MTU.

This implied to remove support of HW LRO in legacy RQ, as it would
require large number of page allocations and scatter entries per WQE
on archs with PAGE_SIZE = 4KB, yielding bad performance.

In earlier patches, we guaranteed that all completions are in-order,
and that we use a cyclic WQ.
This creates an oppurtunity for a performance optimization:
The mapping between a "struct mlx5e_dma_info", and the
WQEs (struct mlx5e_wqe_frag_info) pointing to it, is constant
across different cycles of a WQ. This allows initializing
the mapping in the time of RQ creation, and not handle it
in datapath.

A struct mlx5e_dma_info that is shared between different WQEs
is allocated by the first WQE, and freed by the last one.
This implies an important requirement: WQEs that share the same
struct mlx5e_dma_info must be posted within the same NAPI.
Otherwise, upon completion, struct mlx5e_wqe_frag_info would mistakenly
point to the new struct mlx5e_dma_info, not the one that was posted
(and the HW wrote to).
This bulking requirement is actually good also for performance reasons,
hence we extend the bulk beyong the minimal requirement above.

With this memory scheme, the RQs memory footprint is reduce by a
factor of 2 on x86, and by a factor of 32 on PowerPC.
Same factors apply for the number of pages in a GRO session.

Performance tests:
ConnectX-4, single core, single RX ring, default MTU.

x86:
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

Packet rate (early drop in TC): no degradation
TCP streams: ~5% improvement

PowerPC:
CPU: POWER8 (raw), altivec supported

Packet rate (early drop in TC): 20% gain
TCP streams: 25% gain

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  44 +++-
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 238 +++++++++++++-----
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 207 ++++++++++-----
 3 files changed, 362 insertions(+), 127 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index af521dd52993..eb9eb7aa953a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -101,11 +101,15 @@ struct page_pool;
 	(MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW + \
 	 (MLX5_MPWRQ_LOG_WQE_SZ - MLX5E_ORDER2_MAX_PACKET_MTU))
 
+#define MLX5E_MIN_SKB_FRAG_SZ		(MLX5_SKB_FRAG_SZ(MLX5_RX_HEADROOM))
+#define MLX5E_LOG_MAX_RX_WQE_BULK	\
+	(ilog2(PAGE_SIZE / roundup_pow_of_two(MLX5E_MIN_SKB_FRAG_SZ)))
+
 #define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE                0x6
 #define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE                0xd
 
-#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE                0x1
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE (1 + MLX5E_LOG_MAX_RX_WQE_BULK)
 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE min_t(u8, 0xd,	\
 					       MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW)
@@ -462,8 +466,9 @@ struct mlx5e_dma_info {
 };
 
 struct mlx5e_wqe_frag_info {
-	struct mlx5e_dma_info di;
+	struct mlx5e_dma_info *di;
 	u32 offset;
+	bool last_in_page;
 };
 
 struct mlx5e_umr_dma_info {
@@ -476,6 +481,8 @@ struct mlx5e_mpw_info {
 	DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 };
 
+#define MLX5E_MAX_RX_FRAGS 4
+
 /* a single cache unit is capable to serve one napi call (for non-striding rq)
  * or a MPWQE (for striding rq).
  */
@@ -493,6 +500,9 @@ typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
 typedef struct sk_buff *
 (*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 			       u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+typedef struct sk_buff *
+(*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			 struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
 typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
 typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
 
@@ -500,16 +510,27 @@ enum mlx5e_rq_flag {
 	MLX5E_RQ_FLAG_XDP_XMIT = BIT(0),
 };
 
+struct mlx5e_rq_frag_info {
+	int frag_size;
+	int frag_stride;
+};
+
+struct mlx5e_rq_frags_info {
+	struct mlx5e_rq_frag_info arr[MLX5E_MAX_RX_FRAGS];
+	u8 num_frags;
+	u8 log_num_frags;
+	u8 wqe_bulk;
+};
+
 struct mlx5e_rq {
 	/* data path */
 	union {
 		struct {
-			struct mlx5_wq_cyc     wq;
-			struct mlx5e_wqe_frag_info *frag_info;
-			u32 frag_sz;	/* max possible skb frag_sz */
-			union {
-				bool page_reuse;
-			};
+			struct mlx5_wq_cyc          wq;
+			struct mlx5e_wqe_frag_info *frags;
+			struct mlx5e_dma_info      *di;
+			struct mlx5e_rq_frags_info  info;
+			mlx5e_fp_skb_from_cqe       skb_from_cqe;
 		} wqe;
 		struct {
 			struct mlx5_wq_ll      wq;
@@ -523,7 +544,6 @@ struct mlx5e_rq {
 	};
 	struct {
 		u16            headroom;
-		u8             page_order;
 		u8             map_dir;   /* dma map direction */
 	} buff;
 
@@ -879,6 +899,12 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 struct sk_buff *
 mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 				   u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7fd2d736fbb1..2c634e50d051 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -51,6 +51,7 @@
 struct mlx5e_rq_param {
 	u32			rqc[MLX5_ST_SZ_DW(rqc)];
 	struct mlx5_wq_param	wq;
+	struct mlx5e_rq_frags_info frags_info;
 };
 
 struct mlx5e_sq_param {
@@ -93,7 +94,7 @@ bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 	return true;
 }
 
-static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params)
+static u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params)
 {
 	if (!params->xdp_prog) {
 		u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
@@ -107,19 +108,27 @@ static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params)
 
 static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params)
 {
-	u32 linear_frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params);
+	u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params);
 
 	return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
 }
 
+static bool mlx5e_rx_is_linear_skb(struct mlx5_core_dev *mdev,
+				   struct mlx5e_params *params)
+{
+	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+
+	return !params->lro_en && frag_sz <= PAGE_SIZE;
+}
+
 static bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
 					 struct mlx5e_params *params)
 {
-	u32 frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params);
+	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
 	s8 signed_log_num_strides_param;
 	u8 log_num_strides;
 
-	if (params->lro_en || frag_sz > PAGE_SIZE)
+	if (!mlx5e_rx_is_linear_skb(mdev, params))
 		return false;
 
 	if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
@@ -145,7 +154,7 @@ static u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
 					  struct mlx5e_params *params)
 {
 	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
-		return order_base_2(mlx5e_mpwqe_get_linear_frag_sz(params));
+		return order_base_2(mlx5e_rx_get_linear_frag_sz(params));
 
 	return MLX5E_MPWQE_STRIDE_SZ(mdev,
 		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
@@ -163,16 +172,15 @@ static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
 {
 	u16 linear_rq_headroom = params->xdp_prog ?
 		XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
+	bool is_linear_skb;
 
 	linear_rq_headroom += NET_IP_ALIGN;
 
-	if (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC)
-		return linear_rq_headroom;
-
-	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
-		return linear_rq_headroom;
+	is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
+		mlx5e_rx_is_linear_skb(mdev, params) :
+		mlx5e_rx_mpwqe_is_linear_skb(mdev, params);
 
-	return 0;
+	return is_linear_skb ? linear_rq_headroom : 0;
 }
 
 void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
@@ -400,6 +408,61 @@ static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
 	return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
 }
 
+static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
+{
+	struct mlx5e_wqe_frag_info next_frag, *prev;
+	int i;
+
+	next_frag.di = &rq->wqe.di[0];
+	next_frag.offset = 0;
+	prev = NULL;
+
+	for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) {
+		struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
+		struct mlx5e_wqe_frag_info *frag =
+			&rq->wqe.frags[i << rq->wqe.info.log_num_frags];
+		int f;
+
+		for (f = 0; f < rq->wqe.info.num_frags; f++, frag++) {
+			if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE) {
+				next_frag.di++;
+				next_frag.offset = 0;
+				if (prev)
+					prev->last_in_page = true;
+			}
+			*frag = next_frag;
+
+			/* prepare next */
+			next_frag.offset += frag_info[f].frag_stride;
+			prev = frag;
+		}
+	}
+
+	if (prev)
+		prev->last_in_page = true;
+}
+
+static int mlx5e_init_di_list(struct mlx5e_rq *rq,
+			      struct mlx5e_params *params,
+			      int wq_sz, int cpu)
+{
+	int len = wq_sz << rq->wqe.info.log_num_frags;
+
+	rq->wqe.di = kvzalloc_node(len * sizeof(*rq->wqe.di),
+				   GFP_KERNEL, cpu_to_node(cpu));
+	if (!rq->wqe.di)
+		return -ENOMEM;
+
+	mlx5e_init_frags_partition(rq);
+
+	return 0;
+}
+
+static void mlx5e_free_di_list(struct mlx5e_rq *rq)
+{
+	kvfree(rq->wqe.di);
+}
+
 static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_params *params,
 			  struct mlx5e_rq_param *rqp,
@@ -409,8 +472,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	struct mlx5_core_dev *mdev = c->mdev;
 	void *rqc = rqp->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-	u32 byte_count, pool_size;
-	int npages;
+	u32 pool_size;
 	int wq_sz;
 	int err;
 	int i;
@@ -480,8 +542,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params);
 		rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params));
 
-		byte_count = rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
-
 		err = mlx5e_create_rq_umr_mkey(mdev, rq);
 		if (err)
 			goto err_rq_wq_destroy;
@@ -489,7 +549,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 		err = mlx5e_rq_alloc_mpwqe_info(rq, c);
 		if (err)
-			goto err_destroy_umr_mkey;
+			goto err_free;
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
 		err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
@@ -501,13 +561,17 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 		wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
 
-		rq->wqe.frag_info =
-			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
-				     GFP_KERNEL, cpu_to_node(c->cpu));
-		if (!rq->wqe.frag_info) {
-			err = -ENOMEM;
-			goto err_rq_wq_destroy;
-		}
+		rq->wqe.info = rqp->frags_info;
+		rq->wqe.frags =
+			kvzalloc_node((wq_sz << rq->wqe.info.log_num_frags) *
+				      sizeof(*rq->wqe.frags),
+				      GFP_KERNEL, cpu_to_node(c->cpu));
+		if (!rq->wqe.frags)
+			goto err_free;
+
+		err = mlx5e_init_di_list(rq, params, wq_sz, c->cpu);
+		if (err)
+			goto err_free;
 		rq->post_wqes = mlx5e_post_rx_wqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_wqe;
 
@@ -518,30 +582,19 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 #endif
 			rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe;
 		if (!rq->handle_rx_cqe) {
-			kfree(rq->wqe.frag_info);
 			err = -EINVAL;
 			netdev_err(c->netdev, "RX handler of RQ is not set, err %d\n", err);
-			goto err_rq_wq_destroy;
+			goto err_free;
 		}
 
-		byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
-#ifdef CONFIG_MLX5_EN_IPSEC
-		if (MLX5_IPSEC_DEV(mdev))
-			byte_count += MLX5E_METADATA_ETHER_LEN;
-#endif
-		rq->wqe.page_reuse = !params->xdp_prog;
-
-		/* calc the required page order */
-		rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->buff.headroom + byte_count);
-		npages = DIV_ROUND_UP(rq->wqe.frag_sz, PAGE_SIZE);
-		rq->buff.page_order = order_base_2(npages);
-
-		byte_count |= MLX5_HW_START_PADDING;
+		rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(mdev, params) ?
+			mlx5e_skb_from_cqe_linear :
+			mlx5e_skb_from_cqe_nonlinear;
 		rq->mkey_be = c->mkey_be;
 	}
 
 	/* Create a page_pool and register it with rxq */
-	pp_params.order     = rq->buff.page_order;
+	pp_params.order     = 0;
 	pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
 	pp_params.pool_size = pool_size;
 	pp_params.nid       = cpu_to_node(c->cpu);
@@ -555,21 +608,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	 */
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
-		if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
-			kfree(rq->wqe.frag_info);
 		err = PTR_ERR(rq->page_pool);
 		rq->page_pool = NULL;
-		goto err_rq_wq_destroy;
+		goto err_free;
 	}
 	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
 					 MEM_TYPE_PAGE_POOL, rq->page_pool);
 	if (err)
-		goto err_rq_wq_destroy;
+		goto err_free;
 
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 			struct mlx5e_rx_wqe_ll *wqe =
 				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
+			u32 byte_count =
+				rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
 			wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
@@ -578,9 +631,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		} else {
 			struct mlx5e_rx_wqe_cyc *wqe =
 				mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
+			int f;
 
-			wqe->data[0].byte_count = cpu_to_be32(byte_count);
-			wqe->data[0].lkey = rq->mkey_be;
+			for (f = 0; f < rq->wqe.info.num_frags; f++) {
+				u32 frag_size = rq->wqe.info.arr[f].frag_size |
+					MLX5_HW_START_PADDING;
+
+				wqe->data[f].byte_count = cpu_to_be32(frag_size);
+				wqe->data[f].lkey = rq->mkey_be;
+			}
+			/* check if num_frags is not a pow of two */
+			if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) {
+				wqe->data[f].byte_count = 0;
+				wqe->data[f].lkey = cpu_to_be32(MLX5_INVALID_LKEY);
+				wqe->data[f].addr = 0;
+			}
 		}
 	}
 
@@ -600,8 +665,16 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	return 0;
 
-err_destroy_umr_mkey:
-	mlx5_core_destroy_mkey(mdev, &rq->umr_mkey);
+err_free:
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		kfree(rq->mpwqe.info);
+		mlx5_core_destroy_mkey(mdev, &rq->umr_mkey);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		kvfree(rq->wqe.frags);
+		mlx5e_free_di_list(rq);
+	}
 
 err_rq_wq_destroy:
 	if (rq->xdp_prog)
@@ -631,7 +704,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
-		kfree(rq->wqe.frag_info);
+		kvfree(rq->wqe.frags);
+		mlx5e_free_di_list(rq);
 	}
 
 	for (i = rq->page_cache.head; i != rq->page_cache.tail;
@@ -823,17 +897,8 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 			rq->dealloc_wqe(rq, wqe_ix);
 			mlx5_wq_cyc_pop(wq);
 		}
-
-		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
-		 * but yet to be re-posted.
-		 */
-		if (rq->wqe.page_reuse) {
-			int wq_sz = mlx5_wq_cyc_get_size(wq);
-
-			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
-				rq->dealloc_wqe(rq, wqe_ix);
-		}
 	}
+
 }
 
 static int mlx5e_open_rq(struct mlx5e_channel *c,
@@ -1954,6 +2019,61 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
 	kfree(c);
 }
 
+#define DEFAULT_FRAG_SIZE (2048)
+
+static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
+				      struct mlx5e_params *params,
+				      struct mlx5e_rq_frags_info *info)
+{
+	u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	int frag_size_max = DEFAULT_FRAG_SIZE;
+	u32 buf_size = 0;
+	int i;
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+	if (MLX5_IPSEC_DEV(mdev))
+		byte_count += MLX5E_METADATA_ETHER_LEN;
+#endif
+
+	if (mlx5e_rx_is_linear_skb(mdev, params)) {
+		int frag_stride;
+
+		frag_stride = mlx5e_rx_get_linear_frag_sz(params);
+		frag_stride = roundup_pow_of_two(frag_stride);
+
+		info->arr[0].frag_size = byte_count;
+		info->arr[0].frag_stride = frag_stride;
+		info->num_frags = 1;
+		info->wqe_bulk = PAGE_SIZE / frag_stride;
+		goto out;
+	}
+
+	if (byte_count > PAGE_SIZE +
+	    (MLX5E_MAX_RX_FRAGS - 1) * frag_size_max)
+		frag_size_max = PAGE_SIZE;
+
+	i = 0;
+	while (buf_size < byte_count) {
+		int frag_size = byte_count - buf_size;
+
+		if (i < MLX5E_MAX_RX_FRAGS - 1)
+			frag_size = min(frag_size, frag_size_max);
+
+		info->arr[i].frag_size = frag_size;
+		info->arr[i].frag_stride = roundup_pow_of_two(frag_size);
+
+		buf_size += frag_size;
+		i++;
+	}
+	info->num_frags = i;
+	/* number of different wqes sharing a page */
+	info->wqe_bulk = 1 + (info->num_frags % 2);
+
+out:
+	info->wqe_bulk = max_t(u8, info->wqe_bulk, 8);
+	info->log_num_frags = order_base_2(info->num_frags);
+}
+
 static inline u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs)
 {
 	int sz = sizeof(struct mlx5_wqe_data_seg) * ndsegs;
@@ -1990,6 +2110,8 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
 		MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
+		mlx5e_build_rq_frags_info(mdev, params, &param->frags_info);
+		ndsegs = param->frags_info.num_frags;
 	}
 
 	MLX5_SET(wq, wq, wq_type,          params->rq_wq_type);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 3cdf2c097356..d3a1dd20e41d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -164,8 +164,6 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
 	return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
 }
 
-#define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)
-
 static inline bool mlx5e_page_is_reserved(struct page *page)
 {
 	return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id();
@@ -214,7 +212,7 @@ static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
 	stats->cache_reuse++;
 
 	dma_sync_single_for_device(rq->pdev, dma_info->addr,
-				   RQ_PAGE_SIZE(rq),
+				   PAGE_SIZE,
 				   DMA_FROM_DEVICE);
 	return true;
 }
@@ -230,7 +228,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 		return -ENOMEM;
 
 	dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0,
-				      RQ_PAGE_SIZE(rq), rq->buff.map_dir);
+				      PAGE_SIZE, rq->buff.map_dir);
 	if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
 		put_page(dma_info->page);
 		dma_info->page = NULL;
@@ -243,8 +241,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
 					struct mlx5e_dma_info *dma_info)
 {
-	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
-		       rq->buff.map_dir);
+	dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir);
 }
 
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
@@ -262,58 +259,96 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 	}
 }
 
-static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
-				    struct mlx5e_wqe_frag_info *wi)
+static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
+				    struct mlx5e_wqe_frag_info *frag)
+{
+	int err = 0;
+
+	if (!frag->offset)
+		/* On first frag (offset == 0), replenish page (dma_info actually).
+		 * Other frags that point to the same dma_info (with a different
+		 * offset) should just use the new one without replenishing again
+		 * by themselves.
+		 */
+		err = mlx5e_page_alloc_mapped(rq, frag->di);
+
+	return err;
+}
+
+static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq,
+				     struct mlx5e_wqe_frag_info *frag)
 {
-	return rq->wqe.page_reuse && wi->di.page &&
-		(wi->offset + rq->wqe.frag_sz <= RQ_PAGE_SIZE(rq)) &&
-		!mlx5e_page_is_reserved(wi->di.page);
+	if (frag->last_in_page)
+		mlx5e_page_release(rq, frag->di, true);
 }
 
 static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
 {
-	return &rq->wqe.frag_info[ix];
+	return &rq->wqe.frags[ix << rq->wqe.info.log_num_frags];
 }
 
-static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, u16 ix)
+static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe,
+			      u16 ix)
 {
-	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
+	struct mlx5e_wqe_frag_info *frag = get_frag(rq, ix);
+	int err;
+	int i;
 
-	/* check if page exists, hence can be reused */
-	if (!wi->di.page) {
-		if (unlikely(mlx5e_page_alloc_mapped(rq, &wi->di)))
-			return -ENOMEM;
-		wi->offset = 0;
+	for (i = 0; i < rq->wqe.info.num_frags; i++, frag++) {
+		err = mlx5e_get_rx_frag(rq, frag);
+		if (unlikely(err))
+			goto free_frags;
+
+		wqe->data[i].addr = cpu_to_be64(frag->di->addr +
+						frag->offset + rq->buff.headroom);
 	}
 
-	wqe->data[0].addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
 	return 0;
+
+free_frags:
+	while (--i >= 0)
+		mlx5e_put_rx_frag(rq, --frag);
+
+	return err;
 }
 
 static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq,
 				     struct mlx5e_wqe_frag_info *wi)
 {
-	mlx5e_page_release(rq, &wi->di, true);
-	wi->di.page = NULL;
+	int i;
+
+	for (i = 0; i < rq->wqe.info.num_frags; i++, wi++)
+		mlx5e_put_rx_frag(rq, wi);
 }
 
-static inline void mlx5e_free_rx_wqe_reuse(struct mlx5e_rq *rq,
-					   struct mlx5e_wqe_frag_info *wi)
+void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 {
-	if (mlx5e_page_reuse(rq, wi)) {
-		rq->stats->page_reuse++;
-		return;
-	}
+	struct mlx5e_wqe_frag_info *wi = get_frag(rq, ix);
 
 	mlx5e_free_rx_wqe(rq, wi);
 }
 
-void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
+static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
 {
-	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	int err;
+	int i;
 
-	if (wi->di.page)
-		mlx5e_free_rx_wqe(rq, wi);
+	for (i = 0; i < wqe_bulk; i++) {
+		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i);
+
+		err = mlx5e_alloc_rx_wqe(rq, wqe, ix + i);
+		if (unlikely(err))
+			goto free_wqes;
+	}
+
+	return 0;
+
+free_wqes:
+	while (--i >= 0)
+		mlx5e_dealloc_rx_wqe(rq, ix + i);
+
+	return err;
 }
 
 static inline void
@@ -476,26 +511,28 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	u8 wqe_bulk;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
-	if (mlx5_wq_cyc_is_full(wq))
+	wqe_bulk = rq->wqe.info.wqe_bulk;
+
+	if (mlx5_wq_cyc_missing(wq) < wqe_bulk)
 		return false;
 
 	do {
 		u16 head = mlx5_wq_cyc_get_head(wq);
-		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, head);
 
-		err = mlx5e_alloc_rx_wqe(rq, wqe, head);
+		err = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
 		if (unlikely(err)) {
 			rq->stats->buff_alloc_err++;
 			break;
 		}
 
-		mlx5_wq_cyc_push(wq);
-	} while (!mlx5_wq_cyc_is_full(wq));
+		mlx5_wq_cyc_push_n(wq, wqe_bulk);
+	} while (mlx5_wq_cyc_missing(wq) >= wqe_bulk);
 
 	/* ensure wqes are visible to device before updating doorbell record */
 	dma_wmb();
@@ -949,11 +986,11 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
 	return skb;
 }
 
-static inline
-struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
-			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
 {
-	struct mlx5e_dma_info *di = &wi->di;
+	struct mlx5e_dma_info *di = wi->di;
 	u16 rx_headroom = rq->buff.headroom;
 	struct sk_buff *skb;
 	void *va, *data;
@@ -968,7 +1005,6 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 				      frag_size, DMA_FROM_DEVICE);
 	prefetchw(va); /* xdp_frame data area */
 	prefetch(data);
-	wi->offset += frag_size;
 
 	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
 		rq->stats->wqe_err++;
@@ -991,6 +1027,56 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 	return skb;
 }
 
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
+{
+	struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
+	struct mlx5e_wqe_frag_info *head_wi = wi;
+	u16 headlen      = min_t(u32, MLX5E_RX_MAX_HEAD, cqe_bcnt);
+	u16 frag_headlen = headlen;
+	u16 byte_cnt     = cqe_bcnt - headlen;
+	struct sk_buff *skb;
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats->wqe_err++;
+		return NULL;
+	}
+
+	/* XDP is not supported in this configuration, as incoming packets
+	 * might spread among multiple pages.
+	 */
+	skb = napi_alloc_skb(rq->cq.napi,
+			     ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long)));
+	if (unlikely(!skb)) {
+		rq->stats->buff_alloc_err++;
+		return NULL;
+	}
+
+	prefetchw(skb->data);
+
+	while (byte_cnt) {
+		u16 frag_consumed_bytes =
+			min_t(u16, frag_info->frag_size - frag_headlen, byte_cnt);
+
+		mlx5e_add_skb_frag(rq, skb, wi->di, wi->offset + frag_headlen,
+				   frag_consumed_bytes, frag_info->frag_stride);
+		byte_cnt -= frag_consumed_bytes;
+		frag_headlen = 0;
+		frag_info++;
+		wi++;
+	}
+
+	/* copy header */
+	mlx5e_copy_skb_header(rq->pdev, skb, head_wi->di, head_wi->offset,
+			      0, headlen);
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+
+	return skb;
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1003,23 +1089,23 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
 		/* probably for XDP */
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
-			wi->di.page = NULL;
-			/* do not return page to cache, it will be returned on XDP_TX completion */
+			/* do not return page to cache,
+			 * it will be returned on XDP_TX completion.
+			 */
 			goto wq_cyc_pop;
 		}
-		/* probably an XDP_DROP, save the page-reuse checks */
-		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_cyc_pop;
+		goto free_wqe;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
@@ -1041,16 +1127,16 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
+		/* probably for XDP */
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
-			wi->di.page = NULL;
-			/* do not return page to cache, it will be returned on XDP_TX completion */
+			/* do not return page to cache,
+			 * it will be returned on XDP_TX completion.
+			 */
 			goto wq_cyc_pop;
 		}
-		/* probably an XDP_DROP, save the page-reuse checks */
-		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_cyc_pop;
+		goto free_wqe;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
@@ -1060,7 +1146,8 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
@@ -1409,7 +1496,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb)
 		goto wq_free_wqe;
 
@@ -1421,7 +1508,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	napi_gro_receive(rq->cq.napi, skb);
 
 wq_free_wqe:
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+	mlx5e_free_rx_wqe(rq, wi);
 	mlx5_wq_cyc_pop(wq);
 }
 
@@ -1441,7 +1528,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (unlikely(!skb)) {
 		/* a DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
@@ -1456,7 +1543,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
-- 
2.17.0

^ permalink raw reply related

* [net-next 14/17] net/mlx5e: RX, Use cyclic WQ in legacy RQ
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Now that LRO is not supported for Legacy RQ, there is no source of
out-of-order completions in the WQ, and we can use a cyclic one.
This has multiple advantages:
- reduces the WQE size (smaller PCI transactions).
- lower overhead in datapath (no handling of 'next' pointers).
- no reserved WQE for the WQ head (was need in linked-list).
- allows using a constant map between frag and dma_info struct, in downstream patch.

Performance tests:
ConnectX-4, single core, single RX ring.
Major gain in packet rate of single ring XDP drop.
Bottleneck is shifted form HW (at 16Mpps) to SW (at 20Mpps).

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  10 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  89 ++++++++------
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  |   2 +-
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 115 ++++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/wq.c  |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/wq.h  |  55 ++++++++-
 6 files changed, 161 insertions(+), 111 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index f2f2dcf6b23c..af521dd52993 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -186,9 +186,13 @@ struct mlx5e_tx_wqe {
 	struct mlx5_wqe_data_seg data[0];
 };
 
-struct mlx5e_rx_wqe {
+struct mlx5e_rx_wqe_ll {
 	struct mlx5_wqe_srq_next_seg  next;
-	struct mlx5_wqe_data_seg      data;
+	struct mlx5_wqe_data_seg      data[0];
+};
+
+struct mlx5e_rx_wqe_cyc {
+	struct mlx5_wqe_data_seg      data[0];
 };
 
 struct mlx5e_umr_wqe {
@@ -500,7 +504,7 @@ struct mlx5e_rq {
 	/* data path */
 	union {
 		struct {
-			struct mlx5_wq_ll      wq;
+			struct mlx5_wq_cyc     wq;
 			struct mlx5e_wqe_frag_info *frag_info;
 			u32 frag_sz;	/* max possible skb frag_sz */
 			union {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3a007717cba5..7fd2d736fbb1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -166,7 +166,7 @@ static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
 
 	linear_rq_headroom += NET_IP_ALIGN;
 
-	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST)
+	if (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC)
 		return linear_rq_headroom;
 
 	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
@@ -205,7 +205,7 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
 	params->rq_wq_type = mlx5e_striding_rq_possible(mdev, params) &&
 		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ) ?
 		MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
-		MLX5_WQ_TYPE_LINKED_LIST;
+		MLX5_WQ_TYPE_CYCLIC;
 }
 
 static void mlx5e_update_carrier(struct mlx5e_priv *priv)
@@ -325,7 +325,7 @@ static u32 mlx5e_rqwq_get_size(struct mlx5e_rq *rq)
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
 		return mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 	default:
-		return mlx5_wq_ll_get_size(&rq->wqe.wq);
+		return mlx5_wq_cyc_get_size(&rq->wqe.wq);
 	}
 }
 
@@ -491,15 +491,15 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		if (err)
 			goto err_destroy_umr_mkey;
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
-					&rq->wq_ctrl);
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
+					 &rq->wq_ctrl);
 		if (err)
 			return err;
 
 		rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR];
 
-		wq_sz = mlx5_wq_ll_get_size(&rq->wqe.wq);
+		wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
 
 		rq->wqe.frag_info =
 			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
@@ -568,19 +568,19 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
-			struct mlx5e_rx_wqe *wqe =
+			struct mlx5e_rx_wqe_ll *wqe =
 				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
-			wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom);
-			wqe->data.byte_count = cpu_to_be32(byte_count);
-			wqe->data.lkey = rq->mkey_be;
+			wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
+			wqe->data[0].byte_count = cpu_to_be32(byte_count);
+			wqe->data[0].lkey = rq->mkey_be;
 		} else {
-			struct mlx5e_rx_wqe *wqe =
-				mlx5_wq_ll_get_wqe(&rq->wqe.wq, i);
+			struct mlx5e_rx_wqe_cyc *wqe =
+				mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
 
-			wqe->data.byte_count = cpu_to_be32(byte_count);
-			wqe->data.lkey = rq->mkey_be;
+			wqe->data[0].byte_count = cpu_to_be32(byte_count);
+			wqe->data[0].lkey = rq->mkey_be;
 		}
 	}
 
@@ -630,7 +630,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		kfree(rq->mpwqe.info);
 		mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		kfree(rq->wqe.frag_info);
 	}
 
@@ -801,11 +801,12 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 		struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 
+		/* UMR WQE (if in progress) is always at wq->head */
 		if (rq->mpwqe.umr_in_progress)
 			mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
 
 		while (!mlx5_wq_ll_is_empty(wq)) {
-			struct mlx5e_rx_wqe *wqe;
+			struct mlx5e_rx_wqe_ll *wqe;
 
 			wqe_ix_be = *wq->tail_next;
 			wqe_ix    = be16_to_cpu(wqe_ix_be);
@@ -815,24 +816,19 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 				       &wqe->next.next_wqe_index);
 		}
 	} else {
-		struct mlx5_wq_ll *wq = &rq->wqe.wq;
-
-		while (!mlx5_wq_ll_is_empty(wq)) {
-			struct mlx5e_rx_wqe *wqe;
+		struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 
-			wqe_ix_be = *wq->tail_next;
-			wqe_ix    = be16_to_cpu(wqe_ix_be);
-			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+		while (!mlx5_wq_cyc_is_empty(wq)) {
+			wqe_ix = mlx5_wq_cyc_get_tail(wq);
 			rq->dealloc_wqe(rq, wqe_ix);
-			mlx5_wq_ll_pop(wq, wqe_ix_be,
-				       &wqe->next.next_wqe_index);
+			mlx5_wq_cyc_pop(wq);
 		}
 
 		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
 		 * but yet to be re-posted.
 		 */
 		if (rq->wqe.page_reuse) {
-			int wq_sz = mlx5_wq_ll_get_size(wq);
+			int wq_sz = mlx5_wq_cyc_get_size(wq);
 
 			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
 				rq->dealloc_wqe(rq, wqe_ix);
@@ -1958,6 +1954,21 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
 	kfree(c);
 }
 
+static inline u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs)
+{
+	int sz = sizeof(struct mlx5_wqe_data_seg) * ndsegs;
+
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		sz += sizeof(struct mlx5e_rx_wqe_ll);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		sz += sizeof(struct mlx5e_rx_wqe_cyc);
+	}
+
+	return order_base_2(sz);
+}
+
 static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 				 struct mlx5e_params *params,
 				 struct mlx5e_rq_param *param)
@@ -1965,6 +1976,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	int ndsegs = 1;
 
 	switch (params->rq_wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
@@ -1974,16 +1986,16 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 		MLX5_SET(wq, wq, log_wqe_stride_size,
 			 mlx5e_mpwqe_get_log_stride_size(mdev, params) -
 			 MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
-		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ);
 		MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params));
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
 	}
 
+	MLX5_SET(wq, wq, wq_type,          params->rq_wq_type);
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
-	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
+	MLX5_SET(wq, wq, log_wq_stride,
+		 mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
 	MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.pdn);
 	MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
 	MLX5_SET(rqc, rqc, vsd,            params->vlan_strip_disable);
@@ -1999,8 +2011,9 @@ static void mlx5e_build_drop_rq_param(struct mlx5e_priv *priv,
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
-	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
-	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
+	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+	MLX5_SET(wq, wq, log_wq_stride,
+		 mlx5e_get_rqwq_log_stride(MLX5_WQ_TYPE_CYCLIC, 1));
 	MLX5_SET(rqc, rqc, counter_set_id, priv->drop_rq_q_counter);
 
 	param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
@@ -2051,7 +2064,7 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 		log_cq_size = mlx5e_mpwqe_get_log_rq_size(params) +
 			mlx5e_mpwqe_get_log_num_strides(mdev, params);
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		log_cq_size = params->log_rq_mtu_frames;
 	}
 
@@ -2857,8 +2870,8 @@ static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
 
 	param->wq.db_numa_node = param->wq.buf_numa_node;
 
-	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
-				&rq->wq_ctrl);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
+				 &rq->wq_ctrl);
 	if (err)
 		return err;
 
@@ -3360,7 +3373,7 @@ static int set_feature_lro(struct net_device *netdev, bool enable)
 	new_channels.params = *old_params;
 	new_channels.params.lro_en = enable;
 
-	if (old_params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) {
+	if (old_params->rq_wq_type != MLX5_WQ_TYPE_CYCLIC) {
 		if (mlx5e_rx_mpwqe_is_linear_skb(mdev, old_params) ==
 		    mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_channels.params))
 			reset = false;
@@ -3566,7 +3579,7 @@ int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
 	new_channels.params = *params;
 	new_channels.params.sw_mtu = new_mtu;
 
-	if (params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) {
+	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 		u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params);
 		u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 8ab4c96b7f7c..3857f22b5500 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -927,7 +927,7 @@ static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
 	params->hard_mtu    = MLX5E_ETH_HARD_MTU;
 	params->sw_mtu      = mtu;
 	params->log_sq_size = MLX5E_REP_PARAMS_LOG_SQ_SIZE;
-	params->rq_wq_type  = MLX5_WQ_TYPE_LINKED_LIST;
+	params->rq_wq_type  = MLX5_WQ_TYPE_CYCLIC;
 	params->log_rq_mtu_frames = MLX5E_REP_PARAMS_LOG_RQ_SIZE;
 
 	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 3b12d4de5b98..3cdf2c097356 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -113,7 +113,7 @@ static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
 			mpwrq_get_cqe_consumed_strides(&cq->title);
 	else
 		cq->decmprs_wqe_counter =
-			mlx5_wq_ll_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
+			mlx5_wq_cyc_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
 }
 
 static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
@@ -270,7 +270,12 @@ static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
 		!mlx5e_page_is_reserved(wi->di.page);
 }
 
-static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
+{
+	return &rq->wqe.frag_info[ix];
+}
+
+static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, u16 ix)
 {
 	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
 
@@ -281,7 +286,7 @@ static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16
 		wi->offset = 0;
 	}
 
-	wqe->data.addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
+	wqe->data[0].addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
 	return 0;
 }
 
@@ -370,7 +375,7 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
-	struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+	struct mlx5e_rx_wqe_ll *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
 	rq->mpwqe.umr_in_progress = false;
 
@@ -470,31 +475,32 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
-	if (mlx5_wq_ll_is_full(wq))
+	if (mlx5_wq_cyc_is_full(wq))
 		return false;
 
 	do {
-		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+		u16 head = mlx5_wq_cyc_get_head(wq);
+		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, head);
 
-		err = mlx5e_alloc_rx_wqe(rq, wqe, wq->head);
+		err = mlx5e_alloc_rx_wqe(rq, wqe, head);
 		if (unlikely(err)) {
 			rq->stats->buff_alloc_err++;
 			break;
 		}
 
-		mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
-	} while (!mlx5_wq_ll_is_full(wq));
+		mlx5_wq_cyc_push(wq);
+	} while (!mlx5_wq_cyc_is_full(wq));
 
 	/* ensure wqes are visible to device before updating doorbell record */
 	dma_wmb();
 
-	mlx5_wq_ll_update_db_record(wq);
+	mlx5_wq_cyc_update_db_record(wq);
 
 	return !!err;
 }
@@ -987,19 +993,15 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
@@ -1007,20 +1009,19 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
 			wi->di.page = NULL;
 			/* do not return page to cache, it will be returned on XDP_TX completion */
-			goto wq_ll_pop;
+			goto wq_cyc_pop;
 		}
 		/* probably an XDP_DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 
 #ifdef CONFIG_MLX5_ESWITCH
@@ -1030,30 +1031,26 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_rep_priv *rpriv  = priv->ppriv;
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
-	__be16 wqe_counter_be;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
 			wi->di.page = NULL;
 			/* do not return page to cache, it will be returned on XDP_TX completion */
-			goto wq_ll_pop;
+			goto wq_cyc_pop;
 		}
 		/* probably an XDP_DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
@@ -1064,9 +1061,8 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 #endif
 
@@ -1165,7 +1161,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	u32 wqe_offset     = stride_ix << rq->mpwqe.log_stride_sz;
 	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
 	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
-	struct mlx5e_rx_wqe *wqe;
+	struct mlx5e_rx_wqe_ll *wqe;
 	struct mlx5_wq_ll *wq;
 	struct sk_buff *skb;
 	u16 cqe_bcnt;
@@ -1403,19 +1399,15 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb)
@@ -1430,8 +1422,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 wq_free_wqe:
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+	mlx5_wq_cyc_pop(wq);
 }
 
 #endif /* CONFIG_MLX5_CORE_IPOIB */
@@ -1440,38 +1431,34 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (unlikely(!skb)) {
 		/* a DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 	skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb);
 	if (unlikely(!skb)) {
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be, &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 
 #endif /* CONFIG_MLX5_EN_IPSEC */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index 5b8b35392025..b97bb72b4db4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -85,6 +85,7 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
 		      MLX5_GET(wq, wqc, log_wq_sz),
 		      fbc);
+	wq->sz    = wq->fbc.sz_m1 + 1;
 
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index b9d7c01fc7cb..0b47126815b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -51,6 +51,9 @@ struct mlx5_wq_ctrl {
 struct mlx5_wq_cyc {
 	struct mlx5_frag_buf_ctrl fbc;
 	__be32			*db;
+	u16			sz;
+	u16			wqe_ctr;
+	u16			cur_sz;
 };
 
 struct mlx5_wq_qp {
@@ -95,6 +98,43 @@ u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq);
 
 void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl);
 
+static inline int mlx5_wq_cyc_is_full(struct mlx5_wq_cyc *wq)
+{
+	return wq->cur_sz == wq->sz;
+}
+
+static inline int mlx5_wq_cyc_missing(struct mlx5_wq_cyc *wq)
+{
+	return wq->sz - wq->cur_sz;
+}
+
+static inline int mlx5_wq_cyc_is_empty(struct mlx5_wq_cyc *wq)
+{
+	return !wq->cur_sz;
+}
+
+static inline void mlx5_wq_cyc_push(struct mlx5_wq_cyc *wq)
+{
+	wq->wqe_ctr++;
+	wq->cur_sz++;
+}
+
+static inline void mlx5_wq_cyc_push_n(struct mlx5_wq_cyc *wq, u8 n)
+{
+	wq->wqe_ctr += n;
+	wq->cur_sz += n;
+}
+
+static inline void mlx5_wq_cyc_pop(struct mlx5_wq_cyc *wq)
+{
+	wq->cur_sz--;
+}
+
+static inline void mlx5_wq_cyc_update_db_record(struct mlx5_wq_cyc *wq)
+{
+	*wq->db = cpu_to_be32(wq->wqe_ctr);
+}
+
 static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
 {
 	return ctr & wq->fbc.sz_m1;
@@ -105,6 +145,16 @@ static inline u16 mlx5_wq_cyc_ctr2fragix(struct mlx5_wq_cyc *wq, u16 ctr)
 	return ctr & wq->fbc.frag_sz_m1;
 }
 
+static inline u16 mlx5_wq_cyc_get_head(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr);
+}
+
+static inline u16 mlx5_wq_cyc_get_tail(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr - wq->cur_sz);
+}
+
 static inline void *mlx5_wq_cyc_get_wqe(struct mlx5_wq_cyc *wq, u16 ix)
 {
 	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
@@ -179,11 +229,6 @@ static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq)
 	return !wq->cur_sz;
 }
 
-static inline u16 mlx5_wq_ll_ctr2ix(struct mlx5_wq_ll *wq, u16 ctr)
-{
-	return ctr & wq->fbc.sz_m1;
-}
-
 static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix)
 {
 	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
-- 
2.17.0

^ permalink raw reply related

* [net-next 17/17] net/mlx5e: TX, Separate cachelines of xmit and completion stats
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Avoid false sharing of cachelines by separating the cachelines of
TX stats that are dertied in xmit flow and in completion flow.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 8 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 9 +++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 697dc7397ba2..1646859974ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -64,11 +64,11 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_dropped) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xmit_more) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_recover) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
@@ -1137,11 +1137,11 @@ static const struct counter_desc sq_stats_desc[] = {
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_none) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, stopped) },
-	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, dropped) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, xmit_more) },
-	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, recover) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
 };
 
 static const struct counter_desc ch_stats_desc[] = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 390c7afa5188..643153bb3607 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -75,11 +75,11 @@ struct mlx5e_sw_stats {
 	u64 tx_csum_partial;
 	u64 tx_csum_partial_inner;
 	u64 tx_queue_stopped;
-	u64 tx_queue_wake;
 	u64 tx_queue_dropped;
 	u64 tx_xmit_more;
-	u64 tx_cqe_err;
 	u64 tx_recover;
+	u64 tx_queue_wake;
+	u64 tx_cqe_err;
 	u64 rx_wqe_err;
 	u64 rx_mpwqe_filler;
 	u64 rx_buff_alloc_err;
@@ -203,10 +203,11 @@ struct mlx5e_sq_stats {
 	/* less likely accessed in data path */
 	u64 csum_none;
 	u64 stopped;
-	u64 wake;
 	u64 dropped;
-	u64 cqe_err;
 	u64 recover;
+	/* dirtied @completion */
+	u64 wake ____cacheline_aligned_in_smp;
+	u64 cqe_err;
 };
 
 struct mlx5e_ch_stats {
-- 
2.17.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox