Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/5] netfilter: nft_compat: check extension hook mask only if set
From: Pablo Neira Ayuso @ 2017-08-24 14:43 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1503585811-13447-1-git-send-email-pablo@netfilter.org>

If the x_tables extension comes with no hook mask, skip this validation.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index f5a7cb68694e..b89f4f65b2a0 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -305,7 +305,7 @@ static int nft_target_validate(const struct nft_ctx *ctx,
 		const struct nf_hook_ops *ops = &basechain->ops[0];
 
 		hook_mask = 1 << ops->hooknum;
-		if (!(hook_mask & target->hooks))
+		if (target->hooks && !(hook_mask & target->hooks))
 			return -EINVAL;
 
 		ret = nft_compat_chain_validate_dependency(target->table,
@@ -484,7 +484,7 @@ static int nft_match_validate(const struct nft_ctx *ctx,
 		const struct nf_hook_ops *ops = &basechain->ops[0];
 
 		hook_mask = 1 << ops->hooknum;
-		if (!(hook_mask & match->hooks))
+		if (match->hooks && !(hook_mask & match->hooks))
 			return -EINVAL;
 
 		ret = nft_compat_chain_validate_dependency(match->table,
-- 
2.1.4



^ permalink raw reply related

* [PATCH 1/5] netfilter: ipt_CLUSTERIP: fix use-after-free of proc entry
From: Pablo Neira Ayuso @ 2017-08-24 14:43 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1503585811-13447-1-git-send-email-pablo@netfilter.org>

From: Sabrina Dubroca <sd@queasysnail.net>

When we delete a netns with a CLUSTERIP rule, clusterip_net_exit() is
called first, removing /proc/net/ipt_CLUSTERIP.
Then clusterip_config_entry_put() is called from clusterip_tg_destroy(),
and tries to remove its entry under /proc/net/ipt_CLUSTERIP/.

Fix this by checking that the parent directory of the entry to remove
hasn't already been deleted.

The following triggers a KASAN splat (stealing the reproducer from
202f59afd441, thanks to Jianlin Shi and Xin Long):

    ip netns add test
    ip link add veth0_in type veth peer name veth0_out
    ip link set veth0_in netns test
    ip netns exec test ip link set lo up
    ip netns exec test ip link set veth0_in up
    ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j     \
        CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \
        --local-node 1 --hashmode sourceip-sourceport
    ip netns del test

Fixes: ce4ff76c15a8 ("netfilter: ipt_CLUSTERIP: make proc directory per net namespace")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 7d72decb80f9..efaa04dcc80e 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -117,7 +117,8 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
 		 * functions are also incrementing the refcount on their own,
 		 * so it's safe to remove the entry even if it's in use. */
 #ifdef CONFIG_PROC_FS
-		proc_remove(c->pde);
+		if (cn->procdir)
+			proc_remove(c->pde);
 #endif
 		return;
 	}
@@ -815,6 +816,7 @@ static void clusterip_net_exit(struct net *net)
 #ifdef CONFIG_PROC_FS
 	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 	proc_remove(cn->procdir);
+	cn->procdir = NULL;
 #endif
 	nf_unregister_net_hook(net, &cip_arp_ops);
 }
-- 
2.1.4



^ permalink raw reply related

* [PATCH net v1 3/3] tipc: context imbalance at node read unlock
From: Parthasarathy Bhuvaragan @ 2017-08-24 14:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, tipc-discussion, jon.maloy, maloy, ying.xue
In-Reply-To: <1503585084-14079-1-git-send-email-parthasarathy.bhuvaragan@ericsson.com>

If we fail to find a valid bearer in tipc_node_get_linkname(),
node_read_unlock() is called without holding the node read lock.

This commit fixes this error.

Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
---
 net/tipc/node.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/tipc/node.c b/net/tipc/node.c
index b113a52f8914..7dd22330a6b4 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1126,8 +1126,8 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
 		strncpy(linkname, tipc_link_name(link), len);
 		err = 0;
 	}
-exit:
 	tipc_node_read_unlock(node);
+exit:
 	tipc_node_put(node);
 	return err;
 }
-- 
2.1.4

^ permalink raw reply related

* [PATCH net v1 2/3] tipc: reassign pointers after skb reallocation / linearization
From: Parthasarathy Bhuvaragan @ 2017-08-24 14:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, tipc-discussion, jon.maloy, maloy, ying.xue
In-Reply-To: <1503585084-14079-1-git-send-email-parthasarathy.bhuvaragan@ericsson.com>

In tipc_msg_reverse(), we assign skb attributes to local pointers
in stack at startup. This is followed by skb_linearize() and for
cloned buffers we perform skb relocation using pskb_expand_head().
Both these methods may update the skb attributes and thus making
the pointers incorrect.

In this commit, we fix this error by ensuring that the pointers
are re-assigned after any of these skb operations.

Fixes: 29042e19f2c60 ("tipc: let function tipc_msg_reverse() expand header
when needed")
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/msg.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index dcd90e6fa7c3..6ef379f004ac 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -479,13 +479,14 @@ bool tipc_msg_make_bundle(struct sk_buff **skb,  struct tipc_msg *msg,
 bool tipc_msg_reverse(u32 own_node,  struct sk_buff **skb, int err)
 {
 	struct sk_buff *_skb = *skb;
-	struct tipc_msg *hdr = buf_msg(_skb);
+	struct tipc_msg *hdr;
 	struct tipc_msg ohdr;
-	int dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE);
+	int dlen;
 
 	if (skb_linearize(_skb))
 		goto exit;
 	hdr = buf_msg(_skb);
+	dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE);
 	if (msg_dest_droppable(hdr))
 		goto exit;
 	if (msg_errcode(hdr))
@@ -511,6 +512,8 @@ bool tipc_msg_reverse(u32 own_node,  struct sk_buff **skb, int err)
 	    pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
 		goto exit;
 
+	/* reassign after skb header modifications */
+	hdr = buf_msg(_skb);
 	/* Now reverse the concerned fields */
 	msg_set_errcode(hdr, err);
 	msg_set_non_seq(hdr, 0);
-- 
2.1.4

^ permalink raw reply related

* [PATCH net v1 1/3] tipc: perform skb_linearize() before parsing the inner header
From: Parthasarathy Bhuvaragan @ 2017-08-24 14:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, tipc-discussion, jon.maloy, maloy, ying.xue
In-Reply-To: <1503585084-14079-1-git-send-email-parthasarathy.bhuvaragan@ericsson.com>

In tipc_rcv(), we linearize only the header and usually the packets
are consumed as the nodes permit direct reception. However, if the
skb contains tunnelled message due to fail over or synchronization
we parse it in tipc_node_check_state() without performing
linearization. This will cause link disturbances if the skb was
non linear.

In this commit, we perform linearization for the above messages.

Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/node.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 9b4dcb6a16b5..b113a52f8914 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1557,6 +1557,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 
 	/* Check/update node state before receiving */
 	if (unlikely(skb)) {
+		if (unlikely(skb_linearize(skb)))
+			goto discard;
 		tipc_node_write_lock(n);
 		if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) {
 			if (le->link) {
-- 
2.1.4

^ permalink raw reply related

* [PATCH net v1 0/3] tipc: buffer reassignment fixes
From: Parthasarathy Bhuvaragan @ 2017-08-24 14:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, tipc-discussion, jon.maloy, maloy, ying.xue

This series contains fixes for buffer reassignments and a context imbalance.

Parthasarathy Bhuvaragan (3):
  tipc: perform skb_linearize() before parsing the inner header
  tipc: reassign pointers after skb reallocation / linearization
  tipc: context imbalance at node read unlock

 net/tipc/msg.c  | 7 +++++--
 net/tipc/node.c | 4 +++-
 2 files changed, 8 insertions(+), 3 deletions(-)

-- 
2.1.4

^ permalink raw reply

* Re: [PATCH net-next 07/13] net: mvpp2: improve the link management function
From: Antoine Tenart @ 2017-08-24 14:14 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Antoine Tenart, davem, kishon, jason, sebastian.hesselbarth,
	gregory.clement, thomas.petazzoni, nadavh, linux, linux-kernel,
	mw, stefanc, miquel.raynal, netdev
In-Reply-To: <20170824140625.GH8022@lunn.ch>

[-- Attachment #1: Type: text/plain, Size: 994 bytes --]

Hi Andrew,

On Thu, Aug 24, 2017 at 04:06:25PM +0200, Andrew Lunn wrote:
> On Thu, Aug 24, 2017 at 10:38:17AM +0200, Antoine Tenart wrote:
> > @@ -5753,14 +5753,24 @@ static void mvpp2_link_event(struct net_device *dev)
> >  		port->link = phydev->link;
> >  
> >  		if (phydev->link) {
> > +			mvpp2_interrupts_enable(port);
> > +			mvpp2_port_enable(port);
> > +
> >  			mvpp2_egress_enable(port);
> >  			mvpp2_ingress_enable(port);
> > +			netif_carrier_on(dev);
> 
> Have you seen cases where it is required to change the carrier state?
> The phy state machine should be doing this. e.g. when autoneg has
> completed, force link configuration, the link goes down etc.

I don't think I saw a case where this was needed. And if phylib already
handle this I think it should be removed from here as the
mvpp2_link_event only is called by phylib.

Thanks!
Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next v1] net: gso: Add GSO support for NSH (Network Service Header)
From: Jiri Benc @ 2017-08-24 14:13 UTC (permalink / raw)
  To: Yi Yang; +Cc: netdev, simon.horman
In-Reply-To: <1503567376-64933-1-git-send-email-yi.y.yang@intel.com>

On Thu, 24 Aug 2017 17:36:16 +0800, Yi Yang wrote:
>  include/net/nsh.h             | 307 ++++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/if_ether.h |   1 +
>  net/Kconfig                   |   1 +
>  net/Makefile                  |   1 +
>  net/nsh/Kconfig               |  10 ++
>  net/nsh/Makefile              |   4 +
>  net/nsh/nsh_gso.c             | 116 ++++++++++++++++

Please consider making this a patchset, with a patch adding the NSH
structures and helper functions, a patch adding GSO and a patch adding
OVS support. That way, everything can be reviewed together, yet the
patches are reasonably contained.

> +config NET_NSH_GSO
> +	bool "NSH GSO support"
> +	depends on INET
> +	default y
> +	---help---
> +	 This allows segmentation of GSO packet that have had NSH header pushed         onto them and thus become NSH GSO packets.

Seems you're missing a newline here.

> +static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
> +				       netdev_features_t features)
> +{
> +	struct sk_buff *segs = ERR_PTR(-EINVAL);
> +	__be16 protocol = skb->protocol;
> +	__be16 inner_proto;
> +	u16 mac_offset = skb->mac_header;
> +	u16 mac_len = skb->mac_len;
> +	struct nsh_hdr *nsh;
> +	unsigned int nsh_hlen;
> +	const struct net_offload *ops;
> +	struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
> +					     netdev_features_t features);
> +
> +	skb_reset_network_header(skb);
> +	nsh = (struct nsh_hdr *)skb_network_header(skb);
> +	nsh_hlen = nsh_hdr_len(nsh);
> +	if (unlikely(!pskb_may_pull(skb, nsh_hlen)))
> +		goto out;

You have to reload nsh after this.

> +
> +	__skb_pull(skb, nsh_hlen);
> +
> +	skb_reset_mac_header(skb);
> +	skb_reset_mac_len(skb);
> +
> +	rcu_read_lock();
> +	switch (nsh->next_proto) {
> +	case NSH_P_ETHERNET:
> +		inner_proto = htons(ETH_P_TEB);
> +		gso_inner_segment = skb_mac_gso_segment;
> +		break;
> +	case NSH_P_IPV4:
> +		inner_proto = htons(ETH_P_IP);
> +		ops = rcu_dereference(inet_offloads[inner_proto]);
> +		if (!ops || !ops->callbacks.gso_segment)
> +			goto out;
> +		gso_inner_segment = ops->callbacks.gso_segment;
> +		break;
> +	case NSH_P_IPV6:
> +		inner_proto = htons(ETH_P_IPV6);
> +		ops = rcu_dereference(inet6_offloads[inner_proto]);
> +		if (!ops || !ops->callbacks.gso_segment)
> +			goto out;
> +		gso_inner_segment = ops->callbacks.gso_segment;
> +		break;
> +	case NSH_P_NSH:
> +		inner_proto = htons(ETH_P_NSH);
> +		gso_inner_segment = nsh_gso_segment;

This doesn't look correct. Recursive call to nsh_gso_segment will reset
mac header, causing skb_segment not to copy the previous NSH header(s)
to the new segments.

> +		break;
> +	default:
> +		skb_gso_error_unwind(skb, protocol, nsh_hlen, mac_offset,
> +				     mac_len);
> +		goto out;
> +	}
> +
> +	skb->protocol = inner_proto;
> +	segs = gso_inner_segment(skb, features);
> +	if (IS_ERR_OR_NULL(segs)) {
> +		skb_gso_error_unwind(skb, protocol, nsh_hlen, mac_offset,
> +				     mac_len);
> +		goto out;
> +	}
> +
> +	do {
> +		skb->mac_len = mac_len;
> +		skb->protocol = protocol;
> +
> +		skb_reset_inner_network_header(skb);

This looks superfluous.

> +
> +		__skb_push(skb, nsh_hlen);
> +
> +		skb_reset_mac_header(skb);
> +		skb_set_network_header(skb, mac_len);
> +	} while ((skb = skb->next));
> +
> +out:
> +	rcu_read_unlock();
> +	return segs;
> +}
> +
> +static struct packet_offload nsh_offload __read_mostly = {
> +	.type = cpu_to_be16(ETH_P_NSH),
> +	.priority = 15,
> +	.callbacks = {
> +		.gso_segment    =	nsh_gso_segment,
> +	},
> +};
> +
> +static int __init nsh_gso_init(void)
> +{
> +	dev_add_offload(&nsh_offload);
> +
> +	return 0;
> +}
> +
> +fs_initcall(nsh_gso_init);

device_initcall should be enough. I doubt we'll be doing NFS over
NSH ;-)

 Jiri

^ permalink raw reply

* Re: [PATCH] netfilter: SYNPROXY: fix process non tcp packet bug in {ipv4,ipv6}_synproxy_hook
From: Pablo Neira Ayuso @ 2017-08-24 14:07 UTC (permalink / raw)
  To: Lin Zhang
  Cc: davem, kadlec, fw, kuznet, yoshfuji, netdev, netfilter-devel,
	coreteam
In-Reply-To: <1501221784-18226-1-git-send-email-xiaolou4617@gmail.com>

On Fri, Jul 28, 2017 at 02:03:04PM +0800, Lin Zhang wrote:
> In function {ipv4,ipv6}_synproxy_hook we expect a normal tcp packet,
> but the real server maybe reply an icmp error packet related to the 
> exist tcp conntrack, so we will access wrong tcp data.
> 
> For fix it, we simply pass IP_CT_RELATED_REPLY packets.
> 
> Signed-off-by: Lin Zhang <xiaolou4617@gmail.com>
> ---
>  net/ipv4/netfilter/ipt_SYNPROXY.c  | 2 +-
>  net/ipv6/netfilter/ip6t_SYNPROXY.c | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
> index f1528f7..3971fd9 100644
> --- a/net/ipv4/netfilter/ipt_SYNPROXY.c
> +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
> @@ -330,7 +330,7 @@ static unsigned int ipv4_synproxy_hook(void *priv,
>  	if (synproxy == NULL)
>  		return NF_ACCEPT;
>  
> -	if (nf_is_loopback_packet(skb))
> +	if (nf_is_loopback_packet(skb) || ctinfo == IP_CT_RELATED_REPLY)

If the intention is to inspect TCP traffic only, I would suggest you
just check for the protocol field here instead. So we are sure we only
deal with TCP traffic indeed.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next 07/13] net: mvpp2: improve the link management function
From: Andrew Lunn @ 2017-08-24 14:06 UTC (permalink / raw)
  To: Antoine Tenart
  Cc: davem, kishon, jason, sebastian.hesselbarth, gregory.clement,
	thomas.petazzoni, nadavh, linux, linux-kernel, mw, stefanc,
	miquel.raynal, netdev
In-Reply-To: <20170824083823.16826-8-antoine.tenart@free-electrons.com>

On Thu, Aug 24, 2017 at 10:38:17AM +0200, Antoine Tenart wrote:
> When the link status changes, the phylib calls the link_event function
> in the mvpp2 driver. Before this patch only the egress/ingress transmit
> was enabled/disabled. This patch adds more functionality to the link
> status management code by enabling/disabling the port per-cpu
> interrupts, and the port itself. The queues are now stopped as well, and
> the netif carrier helpers are called.
> 
> Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
> ---
>  drivers/net/ethernet/marvell/mvpp2.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
> index ebcc89b8f792..99847fec1c5a 100644
> --- a/drivers/net/ethernet/marvell/mvpp2.c
> +++ b/drivers/net/ethernet/marvell/mvpp2.c
> @@ -5753,14 +5753,24 @@ static void mvpp2_link_event(struct net_device *dev)
>  		port->link = phydev->link;
>  
>  		if (phydev->link) {
> +			mvpp2_interrupts_enable(port);
> +			mvpp2_port_enable(port);
> +
>  			mvpp2_egress_enable(port);
>  			mvpp2_ingress_enable(port);
> +			netif_carrier_on(dev);

Hi Antoine

Have you seen cases where it is required to change the carrier state?
The phy state machine should be doing this. e.g. when autoneg has
completed, force link configuration, the link goes down etc.

	   Andrew

^ permalink raw reply

* Re: [PATCH net-next 12/13] arm64: dts: marvell: mcbin: add comphy references to Ethernet ports
From: Antoine Tenart @ 2017-08-24 14:03 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Antoine Tenart, davem, kishon, jason, sebastian.hesselbarth,
	gregory.clement, thomas.petazzoni, nadavh, linux, linux-kernel,
	mw, stefanc, miquel.raynal, netdev
In-Reply-To: <20170824135813.GF8022@lunn.ch>

[-- Attachment #1: Type: text/plain, Size: 576 bytes --]

Hi Andrew,

On Thu, Aug 24, 2017 at 03:58:13PM +0200, Andrew Lunn wrote:
> > @@ -189,6 +191,7 @@
> >  	status = "okay";
> >  	phy = <&ge_phy>;
> >  	phy-mode = "sgmii";
> > +	phys = <&cps_comphy0 1>;
> 
> Does the binding document describe the meaning of the specifier?

Ahhh no you're right! It's the port number i.e. there are multiple
inputs, each of which can support different modes. So say the input is
GoP#0, it can support 10G and SGMII.

Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
From: Andrew Lunn @ 2017-08-24 14:02 UTC (permalink / raw)
  To: Stefan Chulski
  Cc: Antoine Tenart, davem@davemloft.net, kishon@ti.com,
	jason@lakedaemon.net, sebastian.hesselbarth@gmail.com,
	gregory.clement@free-electrons.com,
	thomas.petazzoni@free-electrons.com, Nadav Haklai,
	linux@armlinux.org.uk, linux-kernel@vger.kernel.org,
	mw@semihalf.com, miquel.raynal@free-electrons.com,
	netdev@vger.kernel.org
In-Reply-To: <e83030e8499843e8af9004c2562380dc@IL-EXCH01.marvell.com>

> > -----Original Message-----
> > From: Andrew Lunn [mailto:andrew@lunn.ch]
> > Sent: Thursday, August 24, 2017 4:51 PM
> > To: Antoine Tenart <antoine.tenart@free-electrons.com>
> > Cc: davem@davemloft.net; kishon@ti.com; jason@lakedaemon.net;
> > sebastian.hesselbarth@gmail.com; gregory.clement@free-electrons.com;
> > thomas.petazzoni@free-electrons.com; Nadav Haklai <nadavh@marvell.com>;
> > linux@armlinux.org.uk; linux-kernel@vger.kernel.org; mw@semihalf.com;
> > Stefan Chulski <stefanc@marvell.com>; miquel.raynal@free-electrons.com;
> > netdev@vger.kernel.org
> > Subject: Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
> > 
> > Hi Antione.
> > 
> > > How would you name it if not "comphy-cp110"?
> > 
> > Good question...
> > 
> > '7000-cpmphy-cp110'
> > '8000-cpmphy-cp110'
> > 
> > ??
> > 
> > 	Andrew
> 
> A8K Marvell SoC has two South Bridge communication controllers
> and A7K only one communication controllers, but its identical
> communication controllers 110. Next generation will has another number 1xx.

Save this email, so we know who to blame when Marvell does something
different :-)

	  Andrew

^ permalink raw reply

* Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
From: Antoine Tenart @ 2017-08-24 14:01 UTC (permalink / raw)
  To: Stefan Chulski
  Cc: Andrew Lunn, Antoine Tenart, davem@davemloft.net, kishon@ti.com,
	jason@lakedaemon.net, sebastian.hesselbarth@gmail.com,
	gregory.clement@free-electrons.com,
	thomas.petazzoni@free-electrons.com, Nadav Haklai,
	linux@armlinux.org.uk, linux-kernel@vger.kernel.org,
	mw@semihalf.com, miquel.raynal@free-electrons.com,
	netdev@vger.kernel.org
In-Reply-To: <e83030e8499843e8af9004c2562380dc@IL-EXCH01.marvell.com>

[-- Attachment #1: Type: text/plain, Size: 642 bytes --]

Hi Stefan,

On Thu, Aug 24, 2017 at 01:57:04PM +0000, Stefan Chulski wrote:
> 
> > > How would you name it if not "comphy-cp110"?
> > 
> > Good question...
> > 
> > '7000-cpmphy-cp110'
> > '8000-cpmphy-cp110'
> > 
> > ??
> > 
> > 	Andrew
> 
> A8K Marvell SoC has two South Bridge communication controllers
> and A7K only one communication controllers, but its identical
> communication controllers 110. Next generation will has another number 1xx.

OK, so I guess we can keep 'comphy-cp110' then.

Thanks!
Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 12/13] arm64: dts: marvell: mcbin: add comphy references to Ethernet ports
From: Andrew Lunn @ 2017-08-24 13:58 UTC (permalink / raw)
  To: Antoine Tenart
  Cc: davem, kishon, jason, sebastian.hesselbarth, gregory.clement,
	thomas.petazzoni, nadavh, linux, linux-kernel, mw, stefanc,
	miquel.raynal, netdev
In-Reply-To: <20170824083823.16826-13-antoine.tenart@free-electrons.com>

> @@ -189,6 +191,7 @@
>  	status = "okay";
>  	phy = <&ge_phy>;
>  	phy-mode = "sgmii";
> +	phys = <&cps_comphy0 1>;

Does the binding document describe the meaning of the specifier?

     Andrew

^ permalink raw reply

* Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
From: Antoine Tenart @ 2017-08-24 13:58 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Antoine Tenart, davem, kishon, jason, sebastian.hesselbarth,
	gregory.clement, thomas.petazzoni, nadavh, linux, linux-kernel,
	mw, stefanc, miquel.raynal, netdev
In-Reply-To: <20170824134504.GD8022@lunn.ch>

[-- Attachment #1: Type: text/plain, Size: 1033 bytes --]

Hi Andrew,

On Thu, Aug 24, 2017 at 03:45:04PM +0200, Andrew Lunn wrote:
> > +	for_each_available_child_of_node(pdev->dev.of_node, child) {
> > +		struct mvebu_comphy_lane *lane;
> > +		struct phy *phy;
> > +		int ret;
> > +		u32 val;
> > +
> > +		ret = of_property_read_u32(child, "reg", &val);
> > +		if (ret < 0) {
> > +			dev_err(&pdev->dev, "missing 'reg' property (%d)\n",
> > +				ret);
> > +			continue;
> > +		}
> 
> I'm just wondering why we need this. We know how many lanes there are
> from the table. So just create a generic PHY for each lane?

At first I did this statically. I moved to this solution because:
1. It represents the h/w correctly (there are 6 lanes duplicated here).
2. It eases the dt representation, we would have something like:
   <&cpm_comphy 0 1>; otherwise.
3. If the number of lanes changes in future revisions it'll be quite
   easy to handle.

Thanks!
Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* RE: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
From: Stefan Chulski @ 2017-08-24 13:57 UTC (permalink / raw)
  To: Andrew Lunn, Antoine Tenart
  Cc: davem@davemloft.net, kishon@ti.com, jason@lakedaemon.net,
	sebastian.hesselbarth@gmail.com,
	gregory.clement@free-electrons.com,
	thomas.petazzoni@free-electrons.com, Nadav Haklai,
	linux@armlinux.org.uk, linux-kernel@vger.kernel.org,
	mw@semihalf.com, miquel.raynal@free-electrons.com,
	netdev@vger.kernel.org
In-Reply-To: <20170824135127.GE8022@lunn.ch>



> -----Original Message-----
> From: Andrew Lunn [mailto:andrew@lunn.ch]
> Sent: Thursday, August 24, 2017 4:51 PM
> To: Antoine Tenart <antoine.tenart@free-electrons.com>
> Cc: davem@davemloft.net; kishon@ti.com; jason@lakedaemon.net;
> sebastian.hesselbarth@gmail.com; gregory.clement@free-electrons.com;
> thomas.petazzoni@free-electrons.com; Nadav Haklai <nadavh@marvell.com>;
> linux@armlinux.org.uk; linux-kernel@vger.kernel.org; mw@semihalf.com;
> Stefan Chulski <stefanc@marvell.com>; miquel.raynal@free-electrons.com;
> netdev@vger.kernel.org
> Subject: Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
> 
> Hi Antione.
> 
> > How would you name it if not "comphy-cp110"?
> 
> Good question...
> 
> '7000-cpmphy-cp110'
> '8000-cpmphy-cp110'
> 
> ??
> 
> 	Andrew

A8K Marvell SoC has two South Bridge communication controllers
and A7K only one communication controllers, but its identical
communication controllers 110. Next generation will has another number 1xx.

Stefan,
Regards.

^ permalink raw reply

* Re: Question about ip_defrag
From: Jesper Dangaard Brouer @ 2017-08-24 13:53 UTC (permalink / raw)
  To: liujian (CE)
  Cc: davem@davemloft.net, kuznet@ms2.inr.ac.ru,
	yoshfuji@linux-ipv6.org, elena.reshetova@intel.com,
	edumazet@google.com, netdev@vger.kernel.org, brouer
In-Reply-To: <4F88C5DDA1E80143B232E89585ACE27D018F07E2@DGGEMA502-MBX.china.huawei.com>


On Thu, 24 Aug 2017 13:15:33 +0000 "liujian (CE)" <liujian56@huawei.com> wrote:
> Hello,
> 
> With below patch we met one issue.
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v4.13-rc6&id=6d7b857d541e
> 
> the issue:
> Ip_defrag fail caused by frag_mem_limit reached 4M(frags.high_thresh).
> At this moment,sum_frag_mem_limit is about 10K.
> and my test machine's cpu num is 64.
> 
> Can i only change frag_mem_limit to sum_ frag_mem_limit?
> 
> 
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 96e95e8..f09c00b 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -120,7 +120,7 @@ static void inet_frag_secret_rebuild(struct inet_frags *f)
>  static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
>  {
>         return q->net->low_thresh == 0 ||
> -              frag_mem_limit(q->net) >= q->net->low_thresh;
> +              sum_frag_mem_limit(q->net) >= q->net->low_thresh;
>  }
> 
>  static unsigned int
> @@ -355,7 +355,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
>  {
>         struct inet_frag_queue *q;
> 
> -       if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
> +       if (!nf->high_thresh || sum_frag_mem_limit(nf) > nf->high_thresh) {
>                 inet_frag_schedule_worker(f);
>                 return NULL;
>         }
> @@ -396,7 +396,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
>         struct inet_frag_queue *q;
>         int depth = 0;
> 
> -       if (frag_mem_limit(nf) > nf->low_thresh)
> +       if (sum_frag_mem_limit(nf) > nf->low_thresh)
>                 inet_frag_schedule_worker(f);
> 
>         hash &= (INETFRAGS_HASHSZ - 1);
> --
> 
> Thank you for your time.

What kernel version have you seen this issue with?

As far as I remember, this issue have been fixed before...

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
From: Andrew Lunn @ 2017-08-24 13:51 UTC (permalink / raw)
  To: Antoine Tenart
  Cc: davem, kishon, jason, sebastian.hesselbarth, gregory.clement,
	thomas.petazzoni, nadavh, linux, linux-kernel, mw, stefanc,
	miquel.raynal, netdev
In-Reply-To: <20170824134444.GA28443@kwain>

Hi Antione.

> How would you name it if not "comphy-cp110"?

Good question...

'7000-cpmphy-cp110'
'8000-cpmphy-cp110'

??

	Andrew

^ permalink raw reply

* Re: [PATCH net-next] virtio-net: invoke zerocopy callback on xmit path if no tx napi
From: Michael S. Tsirkin @ 2017-08-24 13:50 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Koichiro Den, virtualization
In-Reply-To: <CAF=yD-KSek+LmZu0X0TXetmFEQ59iF3NpjZ4KugbwLo1BGfhaA@mail.gmail.com>

On Wed, Aug 23, 2017 at 11:28:24PM -0400, Willem de Bruijn wrote:
> >> > * as a generic solution, if we were to somehow overcome the safety issue, track
> >> > the delay and do copy if some threshold is reached could be an answer, but it's
> >> > hard for now.> * so things like the current vhost-net implementation of deciding whether or not
> >> > to do zerocopy beforehand referring the zerocopy tx error ratio is a point of
> >> > practical compromise.
> >>
> >> The fragility of this mechanism is another argument for switching to tx napi
> >> as default.
> >>
> >> Is there any more data about the windows guest issues when completions
> >> are not queued within a reasonable timeframe? What is this timescale and
> >> do we really need to work around this.
> >
> > I think it's pretty large, many milliseconds.
> >
> > But I wonder what do you mean by "work around". Using buffers within
> > limited time frame sounds like a reasonable requirement to me.
> 
> Vhost-net zerocopy delays completions until the skb is really
> sent.

This is fundamental in any solution. Guest/application can not
write over a memory buffer as long as hardware might be reading it.

> Traffic shaping can introduce msec timescale latencies.
> 
> The delay may actually be a useful signal. If the guest does not
> orphan skbs early, TSQ will throttle the socket causing host
> queue build up.
> 
> But, if completions are queued in-order, unrelated flows may be
> throttled as well. Allowing out of order completions would resolve
> this HoL blocking.

We can allow out of order, no guests that follow virtio spec
will break. But this won't help in all cases
- a single slow flow can occupy the whole ring, you will not
  be able to make any new buffers available for the fast flow
- what host considers a single flow can be multiple flows for guest

There are many other examples.

> > Neither
> > do I see why would using tx interrupts within guest be a work around -
> > AFAIK windows driver uses tx interrupts.
> 
> It does not address completion latency itself. What I meant was
> that in an interrupt-driver model, additional starvation issues,
> such as the potential deadlock raised at the start of this thread,
> or the timer delay observed before packets were orphaned in
> virtio-net in commit b0c39dbdc204, are mitigated.
> 
> Specifically, it breaks the potential deadlock where sockets are
> blocked waiting for completions (to free up budget in sndbuf, tsq, ..),
> yet completion handling is blocked waiting for a new packet to
> trigger free_old_xmit_skbs from start_xmit.

This talk of potential deadlock confuses me - I think you mean we would
deadlock if we did not orphan skbs in !use_napi - is that right?  If you
mean that you can drop skb orphan and this won't lead to a deadlock if
free skbs upon a tx interrupt, I agree, for sure.

> >> That is the only thing keeping us from removing the HoL blocking in vhost-net zerocopy.
> >
> > We don't enable network watchdog on virtio but we could and maybe
> > should.
> 
> Can you elaborate?

The issue is that holding onto buffers for very long times makes guests
think they are stuck. This is funamentally because from guest point of
view this is a NIC, so it is supposed to transmit things out in
a timely manner. If host backs the virtual NIC by something that is not
a NIC, with traffic shaping etc introducing unbounded latencies,
guest will be confused.

For example, we could set ndo_tx_timeout within guest. Then
if tx queue is stopped for too long, a watchdog would fire.

We worked around most of the issues by introducing guest/host
copy. This copy, done by vhost-net, allows us to pretend that
a not-nic backend (e.g. a qdisc) is a nic (virtio-net).
This way you can both do traffic shaping in host with
unbounded latencies and limit latency from guest point of view.

Cost is both data copies and loss of end to end credit accounting.

Changing Linux as a host to limit latencies while not doing copies will
not be an easy task but that's the only fix that comes to mind.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
From: Andrew Lunn @ 2017-08-24 13:45 UTC (permalink / raw)
  To: Antoine Tenart
  Cc: davem, kishon, jason, sebastian.hesselbarth, gregory.clement,
	thomas.petazzoni, nadavh, linux, linux-kernel, mw, stefanc,
	miquel.raynal, netdev
In-Reply-To: <20170824083823.16826-3-antoine.tenart@free-electrons.com>

> +	for_each_available_child_of_node(pdev->dev.of_node, child) {
> +		struct mvebu_comphy_lane *lane;
> +		struct phy *phy;
> +		int ret;
> +		u32 val;
> +
> +		ret = of_property_read_u32(child, "reg", &val);
> +		if (ret < 0) {
> +			dev_err(&pdev->dev, "missing 'reg' property (%d)\n",
> +				ret);
> +			continue;
> +		}

I'm just wondering why we need this. We know how many lanes there are
from the table. So just create a generic PHY for each lane?

     Andrew

^ permalink raw reply

* Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
From: Antoine Tenart @ 2017-08-24 13:44 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Antoine Tenart, davem, kishon, jason, sebastian.hesselbarth,
	gregory.clement, thomas.petazzoni, nadavh, linux, linux-kernel,
	mw, stefanc, miquel.raynal, netdev
In-Reply-To: <20170824133922.GC8022@lunn.ch>

[-- Attachment #1: Type: text/plain, Size: 1685 bytes --]

Hi Andrew,

On Thu, Aug 24, 2017 at 03:39:22PM +0200, Andrew Lunn wrote:
> > +static const struct mvebu_comhy_conf mvebu_comphy_modes[] = {
> > +	/* lane 0 */
> > +	MVEBU_COMPHY_CONF(0, 1, PHY_MODE_SGMII, 0x1),
> > +	/* lane 1 */
> > +	MVEBU_COMPHY_CONF(1, 2, PHY_MODE_SGMII, 0x1),
> > +	/* lane 2 */
> > +	MVEBU_COMPHY_CONF(2, 0, PHY_MODE_SGMII, 0x1),
> > +	MVEBU_COMPHY_CONF(2, 0, PHY_MODE_10GKR, 0x1),
> > +	/* lane 3 */
> > +	MVEBU_COMPHY_CONF(3, 1, PHY_MODE_SGMII, 0x2),
> > +	/* lane 4 */
> > +	MVEBU_COMPHY_CONF(4, 0, PHY_MODE_SGMII, 0x2),
> > +	MVEBU_COMPHY_CONF(4, 0, PHY_MODE_10GKR, 0x2),
> > +	MVEBU_COMPHY_CONF(4, 1, PHY_MODE_SGMII, 0x1),
> > +	/* lane 5 */
> > +	MVEBU_COMPHY_CONF(5, 2, PHY_MODE_SGMII, 0x1),
> > +};
> 
> Do other Marvell SoCs re-use this IP? Maybe add cp110 to the name here
> to indicate what SoC this configuration belongs to? I guess at some
> point, the compatible string will be used to select the correct table
> for the hardware variant.

OK, I'll rename the variable mvebu_comphy_cp110_modes.

> > +static const struct of_device_id mvebu_comphy_of_match_table[] = {
> > +	{ .compatible = "marvell,comphy-cp110" },
> 
> Is that specific enough? It seems like this table is easy to change in
> the VHDL. Could there be another cp110 with a different configuration?

As far as I understand CP110 is the name of the CP, should there be
another one it should be named differently. But I can't be 100% sure,
you never know what comes next :)

How would you name it if not "comphy-cp110"?

Thanks!
Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 02/13] phy: add the mvebu cp110 comphy driver
From: Andrew Lunn @ 2017-08-24 13:39 UTC (permalink / raw)
  To: Antoine Tenart
  Cc: davem, kishon, jason, sebastian.hesselbarth, gregory.clement,
	thomas.petazzoni, nadavh, linux, linux-kernel, mw, stefanc,
	miquel.raynal, netdev
In-Reply-To: <20170824083823.16826-3-antoine.tenart@free-electrons.com>

> +static const struct mvebu_comhy_conf mvebu_comphy_modes[] = {
> +	/* lane 0 */
> +	MVEBU_COMPHY_CONF(0, 1, PHY_MODE_SGMII, 0x1),
> +	/* lane 1 */
> +	MVEBU_COMPHY_CONF(1, 2, PHY_MODE_SGMII, 0x1),
> +	/* lane 2 */
> +	MVEBU_COMPHY_CONF(2, 0, PHY_MODE_SGMII, 0x1),
> +	MVEBU_COMPHY_CONF(2, 0, PHY_MODE_10GKR, 0x1),
> +	/* lane 3 */
> +	MVEBU_COMPHY_CONF(3, 1, PHY_MODE_SGMII, 0x2),
> +	/* lane 4 */
> +	MVEBU_COMPHY_CONF(4, 0, PHY_MODE_SGMII, 0x2),
> +	MVEBU_COMPHY_CONF(4, 0, PHY_MODE_10GKR, 0x2),
> +	MVEBU_COMPHY_CONF(4, 1, PHY_MODE_SGMII, 0x1),
> +	/* lane 5 */
> +	MVEBU_COMPHY_CONF(5, 2, PHY_MODE_SGMII, 0x1),
> +};

Do other Marvell SoCs re-use this IP? Maybe add cp110 to the name here
to indicate what SoC this configuration belongs to? I guess at some
point, the compatible string will be used to select the correct table
for the hardware variant.

> +static const struct of_device_id mvebu_comphy_of_match_table[] = {
> +	{ .compatible = "marvell,comphy-cp110" },

Is that specific enough? It seems like this table is easy to change in
the VHDL. Could there be another cp110 with a different configuration?

    Andrew

^ permalink raw reply

* Re: [PATCH net-next 01/13] phy: add sgmii and 10gkr modes to the phy_mode enum
From: Antoine Tenart @ 2017-08-24 13:31 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Antoine Tenart, davem, kishon, jason, sebastian.hesselbarth,
	gregory.clement, thomas.petazzoni, nadavh, linux, linux-kernel,
	mw, stefanc, miquel.raynal, netdev
In-Reply-To: <20170824131938.GB8022@lunn.ch>

[-- Attachment #1: Type: text/plain, Size: 741 bytes --]

Hi Andrew,

On Thu, Aug 24, 2017 at 03:19:38PM +0200, Andrew Lunn wrote:
> On Thu, Aug 24, 2017 at 10:38:11AM +0200, Antoine Tenart wrote:
> > This patch adds more PHY modes to the phy_mode enum, to allow
> > configuring PHYs to the SGMII and/or the 10GKR mode by using the
> > set_mode callback.
> 
> You had me confused for a while here. If there is need for a respin,
> could you change the commit log and prefix PHY with either generic or
> Ethernet, just to make it clear which PHY subsystem we are talking
> about.

I understand the confusion :) I'll update the commit log if there is a
respin.

Thanks!
Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [net-next 7/8] net/mlx5: Add hash table for flow groups in flow table
From: Saeed Mahameed @ 2017-08-24 13:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Leon Romanovsky, Matan Barak, Saeed Mahameed
In-Reply-To: <20170824132152.29538-1-saeedm@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

When adding a flow table entry (fte) to a flow table (ft), we first
need to find its flow group (fg). Currently, this is done by
traversing a linear list of all flow groups in the flow table.
Furthermore, since multiple flow groups which correspond to the same
fte mask may exist in the same ft, we can't just stop at the first
match. Converting the linear list to rhltable in order to speed things
up.

The last four patches increases the steering rules update rate by a
factor of more than 7 (for insertion of 50K steering rules).

Signed-off-by: Matan Barak <matanb@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 187 +++++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |   2 +
 2 files changed, 152 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index d8d45b006996..9704c2eb82a1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -158,6 +158,15 @@ static const struct rhashtable_params rhash_fte = {
 	.min_size = 1,
 };
 
+static const struct rhashtable_params rhash_fg = {
+	.key_len = FIELD_SIZEOF(struct mlx5_flow_group, mask),
+	.key_offset = offsetof(struct mlx5_flow_group, mask),
+	.head_offset = offsetof(struct mlx5_flow_group, hash),
+	.automatic_shrinking = true,
+	.min_size = 1,
+
+};
+
 static void del_rule(struct fs_node *node);
 static void del_flow_table(struct fs_node *node);
 static void del_flow_group(struct fs_node *node);
@@ -318,12 +327,22 @@ static bool check_valid_mask(u8 match_criteria_enable, const u32 *match_criteria
 	return check_last_reserved(match_criteria);
 }
 
-static bool compare_match_criteria(u8 match_criteria_enable1,
-				   u8 match_criteria_enable2,
-				   void *mask1, void *mask2)
+static bool check_valid_spec(const struct mlx5_flow_spec *spec)
 {
-	return match_criteria_enable1 == match_criteria_enable2 &&
-		!memcmp(mask1, mask2, MLX5_ST_SZ_BYTES(fte_match_param));
+	int i;
+
+	if (!check_valid_mask(spec->match_criteria_enable, spec->match_criteria)) {
+		pr_warn("mlx5_core: Match criteria given mismatches match_criteria_enable\n");
+		return false;
+	}
+
+	for (i = 0; i < MLX5_ST_SZ_DW_MATCH_PARAM; i++)
+		if (spec->match_value[i] & ~spec->match_criteria[i]) {
+			pr_warn("mlx5_core: match_value differs from match_criteria\n");
+			return false;
+		}
+
+	return check_last_reserved(spec->match_value);
 }
 
 static struct mlx5_flow_root_namespace *find_root(struct fs_node *node)
@@ -365,6 +384,7 @@ static void del_flow_table(struct fs_node *node)
 	if (err)
 		mlx5_core_warn(dev, "flow steering can't destroy ft\n");
 	ida_destroy(&ft->fte_allocator);
+	rhltable_destroy(&ft->fgs_hash);
 	fs_get_obj(prio, ft->node.parent);
 	prio->num_ft--;
 }
@@ -454,6 +474,7 @@ static void del_flow_group(struct fs_node *node)
 	struct mlx5_flow_group *fg;
 	struct mlx5_flow_table *ft;
 	struct mlx5_core_dev *dev;
+	int err;
 
 	fs_get_obj(fg, node);
 	fs_get_obj(ft, fg->node.parent);
@@ -463,6 +484,10 @@ static void del_flow_group(struct fs_node *node)
 		ft->autogroup.num_groups--;
 
 	rhashtable_destroy(&fg->ftes_hash);
+	err = rhltable_remove(&ft->fgs_hash,
+			      &fg->hash,
+			      rhash_fg);
+	WARN_ON(err);
 	if (mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
 		mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
 			       fg->id, ft->id);
@@ -525,10 +550,17 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_ft
 						u32 flags)
 {
 	struct mlx5_flow_table *ft;
+	int ret;
 
 	ft  = kzalloc(sizeof(*ft), GFP_KERNEL);
 	if (!ft)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
+
+	ret = rhltable_init(&ft->fgs_hash, &rhash_fg);
+	if (ret) {
+		kfree(ft);
+		return ERR_PTR(ret);
+	}
 
 	ft->level = level;
 	ft->node.type = FS_TYPE_FLOW_TABLE;
@@ -829,8 +861,8 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 			      ft_attr->max_fte ? roundup_pow_of_two(ft_attr->max_fte) : 0,
 			      root->table_type,
 			      op_mod, ft_attr->flags);
-	if (!ft) {
-		err = -ENOMEM;
+	if (IS_ERR(ft)) {
+		err = PTR_ERR(ft);
 		goto unlock_root;
 	}
 
@@ -942,10 +974,14 @@ static struct mlx5_flow_group *create_flow_group_common(struct mlx5_flow_table *
 	if (IS_ERR(fg))
 		return fg;
 
-	err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id);
+	err = rhltable_insert(&ft->fgs_hash, &fg->hash, rhash_fg);
 	if (err)
 		goto err_free_fg;
 
+	err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id);
+	if (err)
+		goto err_remove_fg;
+
 	if (ft->autogroup.active)
 		ft->autogroup.num_groups++;
 	/* Add node to tree */
@@ -956,6 +992,10 @@ static struct mlx5_flow_group *create_flow_group_common(struct mlx5_flow_table *
 
 	return fg;
 
+err_remove_fg:
+	WARN_ON(rhltable_remove(&ft->fgs_hash,
+				&fg->hash,
+				rhash_fg));
 err_free_fg:
 	rhashtable_destroy(&fg->ftes_hash);
 	kfree(fg);
@@ -1291,18 +1331,13 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 					    u32 *match_value,
 					    struct mlx5_flow_act *flow_act,
 					    struct mlx5_flow_destination *dest,
-					    int dest_num)
+					    int dest_num,
+					    struct fs_fte *fte)
 {
-	u32 masked_val[sizeof(fg->mask.match_criteria)];
 	struct mlx5_flow_handle *handle;
 	struct mlx5_flow_table *ft;
-	struct fs_fte *fte;
 	int i;
 
-	nested_lock_ref_node(&fg->node, FS_MUTEX_PARENT);
-	for (i = 0; i < sizeof(masked_val); i++)
-		masked_val[i] = match_value[i] & fg->mask.match_criteria[i];
-	fte = rhashtable_lookup_fast(&fg->ftes_hash, masked_val, rhash_fte);
 	if (fte) {
 		int old_action;
 		int ret;
@@ -1324,15 +1359,12 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 		} else {
 			goto add_rules;
 		}
-		unlock_ref_node(&fte->node);
 	}
 	fs_get_obj(ft, fg->node.parent);
 
 	fte = create_fte(fg, match_value, flow_act);
-	if (IS_ERR(fte)) {
-		handle = (void *)fte;
-		goto unlock_fg;
-	}
+	if (IS_ERR(fte))
+		return (void *)fte;
 	tree_init_node(&fte->node, 0, del_fte);
 	nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
 	handle = add_rule_fte(fte, fg, dest, dest_num, false);
@@ -1340,7 +1372,7 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 		unlock_ref_node(&fte->node);
 		destroy_fte(fte, fg);
 		kfree(fte);
-		goto unlock_fg;
+		return handle;
 	}
 
 	tree_add_node(&fte->node, &fg->node);
@@ -1353,8 +1385,6 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	}
 unlock_fte:
 	unlock_ref_node(&fte->node);
-unlock_fg:
-	unlock_ref_node(&fg->node);
 	return handle;
 }
 
@@ -1403,6 +1433,96 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 }
 
 static struct mlx5_flow_handle *
+try_add_to_existing_fg(struct mlx5_flow_table *ft,
+		       struct mlx5_flow_spec *spec,
+		       struct mlx5_flow_act *flow_act,
+		       struct mlx5_flow_destination *dest,
+		       int dest_num)
+{
+	struct mlx5_flow_group *g;
+	struct mlx5_flow_handle *rule = ERR_PTR(-ENOENT);
+	struct rhlist_head *tmp, *list;
+	struct match_list {
+		struct list_head	list;
+		struct mlx5_flow_group *g;
+	} match_list, *iter;
+	LIST_HEAD(match_head);
+
+	rcu_read_lock();
+	/* Collect all fgs which has a matching match_criteria */
+	list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg);
+	rhl_for_each_entry_rcu(g, tmp, list, hash) {
+		struct match_list *curr_match;
+
+		if (likely(list_empty(&match_head))) {
+			match_list.g = g;
+			list_add_tail(&match_list.list, &match_head);
+			continue;
+		}
+		curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
+
+		if (!curr_match) {
+			rcu_read_unlock();
+			rule = ERR_PTR(-ENOMEM);
+			goto free_list;
+		}
+		curr_match->g = g;
+		list_add_tail(&curr_match->list, &match_head);
+	}
+	rcu_read_unlock();
+
+	/* Try to find a fg that already contains a matching fte */
+	list_for_each_entry(iter, &match_head, list) {
+		struct fs_fte *fte;
+
+		g = iter->g;
+		nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
+		fte = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value,
+					     rhash_fte);
+		if (fte) {
+			rule = add_rule_fg(g, spec->match_value,
+					   flow_act, dest, dest_num, fte);
+			unlock_ref_node(&g->node);
+			goto free_list;
+		}
+		unlock_ref_node(&g->node);
+	}
+
+	/* No group with matching fte found. Try to add a new fte to any
+	 * matching fg.
+	 */
+	list_for_each_entry(iter, &match_head, list) {
+		g = iter->g;
+
+		nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
+		rule = add_rule_fg(g, spec->match_value,
+				   flow_act, dest, dest_num, NULL);
+		if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC) {
+			unlock_ref_node(&g->node);
+			goto free_list;
+		}
+		unlock_ref_node(&g->node);
+	}
+
+free_list:
+	if (!list_empty(&match_head)) {
+		struct match_list *match_tmp;
+
+		/* The most common case is having one FG. Since we want to
+		 * optimize this case, we save the first on the stack.
+		 * Therefore, no need to free it.
+		 */
+		list_del(&list_first_entry(&match_head, typeof(*iter), list)->list);
+		list_for_each_entry_safe(iter, match_tmp, &match_head, list) {
+			list_del(&iter->list);
+			kfree(iter);
+		}
+	}
+
+	return rule;
+}
+
+static struct mlx5_flow_handle *
 _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		     struct mlx5_flow_spec *spec,
 		     struct mlx5_flow_act *flow_act,
@@ -1414,8 +1534,7 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 	struct mlx5_flow_handle *rule;
 	int i;
 
-	if (!check_valid_mask(spec->match_criteria_enable,
-			      spec->match_criteria))
+	if (!check_valid_spec(spec))
 		return ERR_PTR(-EINVAL);
 
 	for (i = 0; i < dest_num; i++) {
@@ -1424,16 +1543,9 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 	}
 
 	nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
-	fs_for_each_fg(g, ft)
-		if (compare_match_criteria(g->mask.match_criteria_enable,
-					   spec->match_criteria_enable,
-					   g->mask.match_criteria,
-					   spec->match_criteria)) {
-			rule = add_rule_fg(g, spec->match_value,
-					   flow_act, dest, dest_num);
-			if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC)
-				goto unlock;
-		}
+	rule = try_add_to_existing_fg(ft, spec, flow_act, dest, dest_num);
+	if (!IS_ERR(rule))
+		goto unlock;
 
 	g = create_autogroup(ft, spec->match_criteria_enable,
 			     spec->match_criteria);
@@ -1442,7 +1554,8 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		goto unlock;
 	}
 
-	rule = add_rule_fg(g, spec->match_value, flow_act, dest, dest_num);
+	rule = add_rule_fg(g, spec->match_value, flow_act, dest,
+			   dest_num, NULL);
 	if (IS_ERR(rule)) {
 		/* Remove assumes refcount > 0 and autogroup creates a group
 		 * with a refcount = 0.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 62709a3865d2..5509a752f98e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -120,6 +120,7 @@ struct mlx5_flow_table {
 	struct list_head		fwd_rules;
 	u32				flags;
 	struct ida			fte_allocator;
+	struct rhltable			fgs_hash;
 };
 
 struct mlx5_fc_cache {
@@ -200,6 +201,7 @@ struct mlx5_flow_group {
 	u32				max_ftes;
 	u32				id;
 	struct rhashtable		ftes_hash;
+	struct rhlist_head		hash;
 };
 
 struct mlx5_flow_root_namespace {
-- 
2.13.0

^ permalink raw reply related

* [net-next 8/8] net/mlx5: Add tracepoints
From: Saeed Mahameed @ 2017-08-24 13:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Leon Romanovsky, Matan Barak, Saeed Mahameed
In-Reply-To: <20170824132152.29538-1-saeedm@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

Add a tracepoint infrastructure for mlx5_core driver.
Implemented flow steering tracepoints:
1. Add flow group
2. Remove flow group
3. Add flow table entry
4. Remove flow table entry
5. Add flow table rule
6. Remove flow table rule

Signed-off-by: Matan Barak <matanb@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   5 +-
 .../net/ethernet/mellanox/mlx5/core/diag/Makefile  |   1 +
 .../mellanox/mlx5/core/diag/fs_tracepoint.c        | 261 +++++++++++++++++++
 .../mellanox/mlx5/core/diag/fs_tracepoint.h        | 282 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |  11 +-
 5 files changed, 558 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 22ed657d263a..87a3099808f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -4,7 +4,8 @@ subdir-ccflags-y += -I$(src)
 mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \
 		mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
-		fs_counters.o rl.o lag.o dev.o wq.o lib/gid.o
+		fs_counters.o rl.o lag.o dev.o wq.o lib/gid.o \
+		diag/fs_tracepoint.o
 
 mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o
 
@@ -25,3 +26,5 @@ mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o
 
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 		en_accel/ipsec_stats.o
+
+CFLAGS_tracepoint.o := -I$(src)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile
new file mode 100644
index 000000000000..d8e17110f25d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
new file mode 100644
index 000000000000..0be4575b58a2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define CREATE_TRACE_POINTS
+
+#include "fs_tracepoint.h"
+#include <linux/stringify.h>
+
+#define DECLARE_MASK_VAL(type, name) struct {type m; type v; } name
+#define MASK_VAL(type, spec, name, mask, val, fld)	\
+		DECLARE_MASK_VAL(type, name) =		\
+			{.m = MLX5_GET(spec, mask, fld),\
+			 .v = MLX5_GET(spec, val, fld)}
+#define MASK_VAL_BE(type, spec, name, mask, val, fld)	\
+		    DECLARE_MASK_VAL(type, name) =	\
+			{.m = MLX5_GET_BE(type, spec, mask, fld),\
+			 .v = MLX5_GET_BE(type, spec, val, fld)}
+#define GET_MASKED_VAL(name) (name.m & name.v)
+
+#define GET_MASK_VAL(name, type, mask, val, fld)	\
+		(name.m = MLX5_GET(type, mask, fld),	\
+		 name.v = MLX5_GET(type, val, fld),	\
+		 name.m & name.v)
+#define PRINT_MASKED_VAL(name, p, format) {		\
+	if (name.m)			\
+		trace_seq_printf(p, __stringify(name) "=" format " ", name.v); \
+	}
+#define PRINT_MASKED_VALP(name, cast, p, format) {	\
+	if (name.m)			\
+		trace_seq_printf(p, __stringify(name) "=" format " ",	       \
+				 (cast)&name.v);\
+	}
+
+static void print_lyr_2_4_hdrs(struct trace_seq *p,
+			       const u32 *mask, const u32 *value)
+{
+#define MASK_VAL_L2(type, name, fld) \
+	MASK_VAL(type, fte_match_set_lyr_2_4, name, mask, value, fld)
+	DECLARE_MASK_VAL(u64, smac) = {
+		.m = MLX5_GET(fte_match_set_lyr_2_4, mask, smac_47_16) << 16 |
+		     MLX5_GET(fte_match_set_lyr_2_4, mask, smac_15_0),
+		.v = MLX5_GET(fte_match_set_lyr_2_4, value, smac_47_16) << 16 |
+		     MLX5_GET(fte_match_set_lyr_2_4, value, smac_15_0)};
+	DECLARE_MASK_VAL(u64, dmac) = {
+		.m = MLX5_GET(fte_match_set_lyr_2_4, mask, dmac_47_16) << 16 |
+		     MLX5_GET(fte_match_set_lyr_2_4, mask, dmac_15_0),
+		.v = MLX5_GET(fte_match_set_lyr_2_4, value, dmac_47_16) << 16 |
+		     MLX5_GET(fte_match_set_lyr_2_4, value, dmac_15_0)};
+	MASK_VAL_L2(u16, ethertype, ethertype);
+
+	PRINT_MASKED_VALP(smac, u8 *, p, "%pM");
+	PRINT_MASKED_VALP(dmac, u8 *, p, "%pM");
+	PRINT_MASKED_VAL(ethertype, p, "%04x");
+
+	if (ethertype.m == 0xffff) {
+		if (ethertype.v == ETH_P_IP) {
+#define MASK_VAL_L2_BE(type, name, fld) \
+	MASK_VAL_BE(type, fte_match_set_lyr_2_4, name, mask, value, fld)
+			MASK_VAL_L2_BE(u32, src_ipv4,
+				       src_ipv4_src_ipv6.ipv4_layout.ipv4);
+			MASK_VAL_L2_BE(u32, dst_ipv4,
+				       dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+
+			PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p,
+					  "%pI4");
+			PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p,
+					  "%pI4");
+		} else if (ethertype.v == ETH_P_IPV6) {
+			static const struct in6_addr full_ones = {
+				.in6_u.u6_addr32 = {htonl(0xffffffff),
+						    htonl(0xffffffff),
+						    htonl(0xffffffff),
+						    htonl(0xffffffff)},
+			};
+			DECLARE_MASK_VAL(struct in6_addr, src_ipv6);
+			DECLARE_MASK_VAL(struct in6_addr, dst_ipv6);
+
+			memcpy(src_ipv6.m.in6_u.u6_addr8,
+			       MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask,
+					    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+			       sizeof(src_ipv6.m));
+			memcpy(dst_ipv6.m.in6_u.u6_addr8,
+			       MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask,
+					    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+			       sizeof(dst_ipv6.m));
+			memcpy(src_ipv6.v.in6_u.u6_addr8,
+			       MLX5_ADDR_OF(fte_match_set_lyr_2_4, value,
+					    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+			       sizeof(src_ipv6.v));
+			memcpy(dst_ipv6.v.in6_u.u6_addr8,
+			       MLX5_ADDR_OF(fte_match_set_lyr_2_4, value,
+					    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+			       sizeof(dst_ipv6.v));
+
+			if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones)))
+				trace_seq_printf(p, "src_ipv6=%pI6 ",
+						 &src_ipv6.v);
+			if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones)))
+				trace_seq_printf(p, "dst_ipv6=%pI6 ",
+						 &dst_ipv6.v);
+		}
+	}
+
+#define PRINT_MASKED_VAL_L2(type, name, fld, p, format) {\
+	MASK_VAL_L2(type, name, fld);		         \
+	PRINT_MASKED_VAL(name, p, format);		 \
+}
+
+	PRINT_MASKED_VAL_L2(u8, ip_protocol, ip_protocol, p, "%02x");
+	PRINT_MASKED_VAL_L2(u16, tcp_flags, tcp_flags, p, "%x");
+	PRINT_MASKED_VAL_L2(u16, tcp_sport, tcp_sport, p, "%u");
+	PRINT_MASKED_VAL_L2(u16, tcp_dport, tcp_dport, p, "%u");
+	PRINT_MASKED_VAL_L2(u16, udp_sport, udp_sport, p, "%u");
+	PRINT_MASKED_VAL_L2(u16, udp_dport, udp_dport, p, "%u");
+	PRINT_MASKED_VAL_L2(u16, first_vid, first_vid, p, "%04x");
+	PRINT_MASKED_VAL_L2(u8, first_prio, first_prio, p, "%x");
+	PRINT_MASKED_VAL_L2(u8, first_cfi, first_cfi, p, "%d");
+	PRINT_MASKED_VAL_L2(u8, ip_dscp, ip_dscp, p, "%02x");
+	PRINT_MASKED_VAL_L2(u8, ip_ecn, ip_ecn, p, "%x");
+	PRINT_MASKED_VAL_L2(u8, cvlan_tag, cvlan_tag, p, "%d");
+	PRINT_MASKED_VAL_L2(u8, svlan_tag, svlan_tag, p, "%d");
+	PRINT_MASKED_VAL_L2(u8, frag, frag, p, "%d");
+}
+
+static void print_misc_parameters_hdrs(struct trace_seq *p,
+				       const u32 *mask, const u32 *value)
+{
+#define MASK_VAL_MISC(type, name, fld) \
+	MASK_VAL(type, fte_match_set_misc, name, mask, value, fld)
+#define PRINT_MASKED_VAL_MISC(type, name, fld, p, format) {\
+	MASK_VAL_MISC(type, name, fld);			   \
+	PRINT_MASKED_VAL(name, p, format);		   \
+}
+	DECLARE_MASK_VAL(u64, gre_key) = {
+		.m = MLX5_GET(fte_match_set_misc, mask, gre_key_h) << 8 |
+		     MLX5_GET(fte_match_set_misc, mask, gre_key_l),
+		.v = MLX5_GET(fte_match_set_misc, value, gre_key_h) << 8 |
+		     MLX5_GET(fte_match_set_misc, value, gre_key_l)};
+
+	PRINT_MASKED_VAL(gre_key, p, "%llu");
+	PRINT_MASKED_VAL_MISC(u32, source_sqn, source_sqn, p, "%u");
+	PRINT_MASKED_VAL_MISC(u16, source_port, source_port, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, outer_second_prio, outer_second_prio,
+			      p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, outer_second_cfi, outer_second_cfi, p, "%u");
+	PRINT_MASKED_VAL_MISC(u16, outer_second_vid, outer_second_vid, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, inner_second_prio, inner_second_prio,
+			      p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, inner_second_cfi, inner_second_cfi, p, "%u");
+	PRINT_MASKED_VAL_MISC(u16, inner_second_vid, inner_second_vid, p, "%u");
+
+	PRINT_MASKED_VAL_MISC(u8, outer_second_cvlan_tag,
+			      outer_second_cvlan_tag, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, inner_second_cvlan_tag,
+			      inner_second_cvlan_tag, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, outer_second_svlan_tag,
+			      outer_second_svlan_tag, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, inner_second_svlan_tag,
+			      inner_second_svlan_tag, p, "%u");
+
+	PRINT_MASKED_VAL_MISC(u8, gre_protocol, gre_protocol, p, "%u");
+
+	PRINT_MASKED_VAL_MISC(u32, vxlan_vni, vxlan_vni, p, "%u");
+	PRINT_MASKED_VAL_MISC(u32, outer_ipv6_flow_label, outer_ipv6_flow_label,
+			      p, "%x");
+	PRINT_MASKED_VAL_MISC(u32, inner_ipv6_flow_label, inner_ipv6_flow_label,
+			      p, "%x");
+}
+
+const char *parse_fs_hdrs(struct trace_seq *p,
+			  u8 match_criteria_enable,
+			  const u32 *mask_outer,
+			  const u32 *mask_misc,
+			  const u32 *mask_inner,
+			  const u32 *value_outer,
+			  const u32 *value_misc,
+			  const u32 *value_inner)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	if (match_criteria_enable &
+	    1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS) {
+		trace_seq_printf(p, "[outer] ");
+		print_lyr_2_4_hdrs(p, mask_outer, value_outer);
+	}
+
+	if (match_criteria_enable &
+	    1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS) {
+		trace_seq_printf(p, "[misc] ");
+		print_misc_parameters_hdrs(p, mask_misc, value_misc);
+	}
+	if (match_criteria_enable &
+	    1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS) {
+		trace_seq_printf(p, "[inner] ");
+		print_lyr_2_4_hdrs(p, mask_inner, value_inner);
+	}
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+const char *parse_fs_dst(struct trace_seq *p,
+			 const struct mlx5_flow_destination *dst,
+			 u32 counter_id)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	switch (dst->type) {
+	case MLX5_FLOW_DESTINATION_TYPE_VPORT:
+		trace_seq_printf(p, "vport=%u\n", dst->vport_num);
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE:
+		trace_seq_printf(p, "ft=%p\n", dst->ft);
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_TIR:
+		trace_seq_printf(p, "tir=%u\n", dst->tir_num);
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_COUNTER:
+		trace_seq_printf(p, "counter_id=%u\n", counter_id);
+		break;
+	}
+
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_fg);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fg);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_set_fte);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fte);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_rule);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_rule);
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
new file mode 100644
index 000000000000..1e3a6c3e4132
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(_MLX5_FS_TP_) || defined(TRACE_HEADER_MULTI_READ)
+#define _MLX5_FS_TP_
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+#include "../fs_core.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mlx5
+
+#define __parse_fs_hdrs(match_criteria_enable, mouter, mmisc, minner, vouter, \
+			vinner, vmisc)					      \
+	parse_fs_hdrs(p, match_criteria_enable, mouter, mmisc, minner, vouter,\
+		      vinner, vmisc)
+
+const char *parse_fs_hdrs(struct trace_seq *p,
+			  u8 match_criteria_enable,
+			  const u32 *mask_outer,
+			  const u32 *mask_misc,
+			  const u32 *mask_inner,
+			  const u32 *value_outer,
+			  const u32 *value_misc,
+			  const u32 *value_inner);
+
+#define __parse_fs_dst(dst, counter_id) \
+	parse_fs_dst(p, (const struct mlx5_flow_destination *)dst, counter_id)
+
+const char *parse_fs_dst(struct trace_seq *p,
+			 const struct mlx5_flow_destination *dst,
+			 u32 counter_id);
+
+TRACE_EVENT(mlx5_fs_add_fg,
+	    TP_PROTO(const struct mlx5_flow_group *fg),
+	    TP_ARGS(fg),
+	    TP_STRUCT__entry(
+		__field(const struct mlx5_flow_group *, fg)
+		__field(const struct mlx5_flow_table *, ft)
+		__field(u32, start_index)
+		__field(u32, end_index)
+		__field(u32, id)
+		__field(u8, mask_enable)
+		__array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc))
+	    ),
+	    TP_fast_assign(
+			   __entry->fg = fg;
+			   fs_get_obj(__entry->ft, fg->node.parent);
+			   __entry->start_index = fg->start_index;
+			   __entry->end_index = fg->start_index + fg->max_ftes;
+			   __entry->id = fg->id;
+			   __entry->mask_enable = fg->mask.match_criteria_enable;
+			   memcpy(__entry->mask_outer,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fg->mask.match_criteria,
+					       outer_headers),
+				  sizeof(__entry->mask_outer));
+			   memcpy(__entry->mask_inner,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fg->mask.match_criteria,
+					       inner_headers),
+				  sizeof(__entry->mask_inner));
+			   memcpy(__entry->mask_misc,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fg->mask.match_criteria,
+					       misc_parameters),
+				  sizeof(__entry->mask_misc));
+
+	    ),
+	    TP_printk("fg=%p ft=%p id=%u start=%u end=%u bit_mask=%02x %s\n",
+		      __entry->fg, __entry->ft, __entry->id,
+		      __entry->start_index, __entry->end_index,
+		      __entry->mask_enable,
+		      __parse_fs_hdrs(__entry->mask_enable,
+				      __entry->mask_outer,
+				      __entry->mask_misc,
+				      __entry->mask_inner,
+				      __entry->mask_outer,
+				      __entry->mask_misc,
+				      __entry->mask_inner))
+	    );
+
+TRACE_EVENT(mlx5_fs_del_fg,
+	    TP_PROTO(const struct mlx5_flow_group *fg),
+	    TP_ARGS(fg),
+	    TP_STRUCT__entry(
+		__field(const struct mlx5_flow_group *, fg)
+		__field(u32, id)
+	    ),
+	    TP_fast_assign(
+			   __entry->fg = fg;
+			   __entry->id = fg->id;
+
+	    ),
+	    TP_printk("fg=%p id=%u\n",
+		      __entry->fg, __entry->id)
+	    );
+
+#define ACTION_FLAGS \
+	{MLX5_FLOW_CONTEXT_ACTION_ALLOW,	 "ALLOW"},\
+	{MLX5_FLOW_CONTEXT_ACTION_DROP,		 "DROP"},\
+	{MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,	 "FWD"},\
+	{MLX5_FLOW_CONTEXT_ACTION_COUNT,	 "CNT"},\
+	{MLX5_FLOW_CONTEXT_ACTION_ENCAP,	 "ENCAP"},\
+	{MLX5_FLOW_CONTEXT_ACTION_DECAP,	 "DECAP"},\
+	{MLX5_FLOW_CONTEXT_ACTION_MOD_HDR,	 "MOD_HDR"},\
+	{MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO, "NEXT_PRIO"}
+
+TRACE_EVENT(mlx5_fs_set_fte,
+	    TP_PROTO(const struct fs_fte *fte, bool new_fte),
+	    TP_ARGS(fte, new_fte),
+	    TP_STRUCT__entry(
+		__field(const struct fs_fte *, fte)
+		__field(const struct mlx5_flow_group *, fg)
+		__field(u32, group_index)
+		__field(u32, index)
+		__field(u32, action)
+		__field(u32, flow_tag)
+		__field(u8,  mask_enable)
+		__field(bool, new_fte)
+		__array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc))
+		__array(u32, value_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, value_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, value_misc, MLX5_ST_SZ_DW(fte_match_set_misc))
+	    ),
+	    TP_fast_assign(
+			   __entry->fte = fte;
+			   __entry->new_fte = new_fte;
+			   fs_get_obj(__entry->fg, fte->node.parent);
+			   __entry->group_index = __entry->fg->id;
+			   __entry->index = fte->index;
+			   __entry->action = fte->action;
+			   __entry->mask_enable = __entry->fg->mask.match_criteria_enable;
+			   __entry->flow_tag = fte->flow_tag;
+			   memcpy(__entry->mask_outer,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &__entry->fg->mask.match_criteria,
+					       outer_headers),
+				  sizeof(__entry->mask_outer));
+			   memcpy(__entry->mask_inner,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &__entry->fg->mask.match_criteria,
+					       inner_headers),
+				  sizeof(__entry->mask_inner));
+			   memcpy(__entry->mask_misc,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &__entry->fg->mask.match_criteria,
+					       misc_parameters),
+				  sizeof(__entry->mask_misc));
+			   memcpy(__entry->value_outer,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fte->val,
+					       outer_headers),
+				  sizeof(__entry->value_outer));
+			   memcpy(__entry->value_inner,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fte->val,
+					       inner_headers),
+				  sizeof(__entry->value_inner));
+			   memcpy(__entry->value_misc,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fte->val,
+					       misc_parameters),
+				  sizeof(__entry->value_misc));
+
+	    ),
+	    TP_printk("op=%s fte=%p fg=%p index=%u group_index=%u action=<%s> flow_tag=%x %s\n",
+		      __entry->new_fte ? "add" : "set",
+		      __entry->fte, __entry->fg, __entry->index,
+		      __entry->group_index, __print_flags(__entry->action, "|",
+							  ACTION_FLAGS),
+		      __entry->flow_tag,
+		      __parse_fs_hdrs(__entry->mask_enable,
+				      __entry->mask_outer,
+				      __entry->mask_misc,
+				      __entry->mask_inner,
+				      __entry->value_outer,
+				      __entry->value_misc,
+				      __entry->value_inner))
+	    );
+
+TRACE_EVENT(mlx5_fs_del_fte,
+	    TP_PROTO(const struct fs_fte *fte),
+	    TP_ARGS(fte),
+	    TP_STRUCT__entry(
+		__field(const struct fs_fte *, fte)
+		__field(u32, index)
+	    ),
+	    TP_fast_assign(
+			   __entry->fte = fte;
+			   __entry->index = fte->index;
+
+	    ),
+	    TP_printk("fte=%p index=%u\n",
+		      __entry->fte, __entry->index)
+	    );
+
+TRACE_EVENT(mlx5_fs_add_rule,
+	    TP_PROTO(const struct mlx5_flow_rule *rule),
+	    TP_ARGS(rule),
+	    TP_STRUCT__entry(
+		__field(const struct mlx5_flow_rule *, rule)
+		__field(const struct fs_fte *, fte)
+		__field(u32, sw_action)
+		__field(u32, index)
+		__field(u32, counter_id)
+		__array(u8, destination, sizeof(struct mlx5_flow_destination))
+	    ),
+	    TP_fast_assign(
+			   __entry->rule = rule;
+			   fs_get_obj(__entry->fte, rule->node.parent);
+			   __entry->index = __entry->fte->dests_size - 1;
+			   __entry->sw_action = rule->sw_action;
+			   memcpy(__entry->destination,
+				  &rule->dest_attr,
+				  sizeof(__entry->destination));
+			   if (rule->dest_attr.type & MLX5_FLOW_DESTINATION_TYPE_COUNTER &&
+			       rule->dest_attr.counter)
+				__entry->counter_id =
+				rule->dest_attr.counter->id;
+	    ),
+	    TP_printk("rule=%p fte=%p index=%u sw_action=<%s> [dst] %s\n",
+		      __entry->rule, __entry->fte, __entry->index,
+		      __print_flags(__entry->sw_action, "|", ACTION_FLAGS),
+		      __parse_fs_dst(__entry->destination, __entry->counter_id))
+	    );
+
+TRACE_EVENT(mlx5_fs_del_rule,
+	    TP_PROTO(const struct mlx5_flow_rule *rule),
+	    TP_ARGS(rule),
+	    TP_STRUCT__entry(
+		__field(const struct mlx5_flow_rule *, rule)
+		__field(const struct fs_fte *, fte)
+	    ),
+	    TP_fast_assign(
+			   __entry->rule = rule;
+			   fs_get_obj(__entry->fte, rule->node.parent);
+	    ),
+	    TP_printk("rule=%p fte=%p\n",
+		      __entry->rule, __entry->fte)
+	    );
+#endif
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ./diag
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE fs_tracepoint
+#include <trace/define_trace.h>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9704c2eb82a1..d731d57a996a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -36,6 +36,7 @@
 #include "mlx5_core.h"
 #include "fs_core.h"
 #include "fs_cmd.h"
+#include "diag/fs_tracepoint.h"
 
 #define INIT_TREE_NODE_ARRAY_SIZE(...)	(sizeof((struct init_tree_node[]){__VA_ARGS__}) /\
 					 sizeof(struct init_tree_node))
@@ -404,6 +405,7 @@ static void del_rule(struct fs_node *node)
 	fs_get_obj(fte, rule->node.parent);
 	fs_get_obj(fg, fte->node.parent);
 	fs_get_obj(ft, fg->node.parent);
+	trace_mlx5_fs_del_rule(rule);
 	list_del(&rule->node.list);
 	if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
 		mutex_lock(&rule->dest_attr.ft->lock);
@@ -457,6 +459,7 @@ static void del_fte(struct fs_node *node)
 	fs_get_obj(fte, node);
 	fs_get_obj(fg, fte->node.parent);
 	fs_get_obj(ft, fg->node.parent);
+	trace_mlx5_fs_del_fte(fte);
 
 	dev = get_dev(&ft->node);
 	err = mlx5_cmd_delete_fte(dev, ft,
@@ -479,6 +482,7 @@ static void del_flow_group(struct fs_node *node)
 	fs_get_obj(fg, node);
 	fs_get_obj(ft, fg->node.parent);
 	dev = get_dev(&ft->node);
+	trace_mlx5_fs_del_fg(fg);
 
 	if (ft->autogroup.active)
 		ft->autogroup.num_groups--;
@@ -990,6 +994,7 @@ static struct mlx5_flow_group *create_flow_group_common(struct mlx5_flow_table *
 	/* Add node to group list */
 	list_add(&fg->node.list, prev_fg);
 
+	trace_mlx5_fs_add_fg(fg);
 	return fg;
 
 err_remove_fg:
@@ -1357,6 +1362,7 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 			fte->action = old_action;
 			goto unlock_fte;
 		} else {
+			trace_mlx5_fs_set_fte(fte, false);
 			goto add_rules;
 		}
 	}
@@ -1378,10 +1384,13 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	tree_add_node(&fte->node, &fg->node);
 	/* fte list isn't sorted */
 	list_add_tail(&fte->node.list, &fg->node.children);
+	trace_mlx5_fs_set_fte(fte, true);
 add_rules:
 	for (i = 0; i < handle->num_rules; i++) {
-		if (atomic_read(&handle->rule[i]->node.refcount) == 1)
+		if (atomic_read(&handle->rule[i]->node.refcount) == 1) {
 			tree_add_node(&handle->rule[i]->node, &fte->node);
+			trace_mlx5_fs_add_rule(handle->rule[i]);
+		}
 	}
 unlock_fte:
 	unlock_ref_node(&fte->node);
-- 
2.13.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox