Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 1/4] net: Add SRIOV VGT+ support
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eugenia Emantayev, Mohamad Haj Yahia, Saeed Mahameed
In-Reply-To: <20170827110618.20599-1-saeedm@mellanox.com>

From: Mohamad Haj Yahia <mohamad@mellanox.com>

VGT+ is a security feature that gives the administrator the ability of
controlling the allowed vlan-ids list that can be transmitted/received
from/to the VF.
The allowed vlan-ids list is called "trunk".
Admin can add/remove a range of allowed vlan-ids via iptool.
Example:
After this series of configuration :
1) ip link set eth3 vf 0 trunk add 10 100 (allow vlan-id 10-100, default tpid 0x8100)
2) ip link set eth3 vf 0 trunk add 105 proto 802.1q (allow vlan-id 105 tpid 0x8100)
3) ip link set eth3 vf 0 trunk add 105 proto 802.1ad (allow vlan-id 105 tpid 0x88a8)
4) ip link set eth3 vf 0 trunk rem 90 (block vlan-id 90)
5) ip link set eth3 vf 0 trunk rem 50 60 (block vlan-ids 50-60)

The VF 0 can only communicate on vlan-ids: 10-49,61-89,91-100,105 with
tpid 0x8100 and vlan-id 105 with tpid 0x88a8.

For this purpose we added the following netlink sr-iov commands:

1) IFLA_VF_VLAN_RANGE: used to add/remove allowed vlan-ids range.
We added the ifla_vf_vlan_range struct to specify the range we want to
add/remove from the userspace.
We added ndo_add_vf_vlan_trunk_range and ndo_del_vf_vlan_trunk_range
netdev ops to add/remove allowed vlan-ids range in the netdev.

2) IFLA_VF_VLAN_TRUNK: used to query the allowed vlan-ids trunk.
We added trunk bitmap to the ifla_vf_info struct to get the current
allowed vlan-ids trunk from the netdev.
We added ifla_vf_vlan_trunk struct for sending the allowed vlan-ids
trunk to the userspace.

Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/if_link.h      |   2 +
 include/linux/netdevice.h    |  12 +++++
 include/uapi/linux/if_link.h |  20 ++++++++
 net/core/rtnetlink.c         | 109 +++++++++++++++++++++++++++++++------------
 4 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 0b17c585b5cd..da70af27e42e 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -25,6 +25,8 @@ struct ifla_vf_info {
 	__u32 max_tx_rate;
 	__u32 rss_query_en;
 	__u32 trusted;
+	__u64 trunk_8021q[VF_VLAN_BITMAP];
+	__u64 trunk_8021ad[VF_VLAN_BITMAP];
 	__be16 vlan_proto;
 };
 #endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c5475b37a631..10633cabc58f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -959,6 +959,10 @@ struct xfrmdev_ops {
  *      Hash Key. This is needed since on some devices VF share this information
  *      with PF and querying it may introduce a theoretical security risk.
  * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
+ * int (*ndo_add_vf_vlan_trunk_range)(struct net_device *dev, int vf,
+ *				      u16 start_vid, u16 end_vid, __be16 proto);
+ * int (*ndo_del_vf_vlan_trunk_range)(struct net_device *dev, int vf,
+ *				      u16 start_vid, u16 end_vid, __be16 proto);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
  * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
  *		       void *type_data);
@@ -1208,6 +1212,14 @@ struct net_device_ops {
 	int			(*ndo_set_vf_rss_query_en)(
 						   struct net_device *dev,
 						   int vf, bool setting);
+	int			(*ndo_add_vf_vlan_trunk_range)(
+						   struct net_device *dev,
+						   int vf, u16 start_vid,
+						   u16 end_vid, __be16 proto);
+	int			(*ndo_del_vf_vlan_trunk_range)(
+						   struct net_device *dev,
+						   int vf, u16 start_vid,
+						   u16 end_vid, __be16 proto);
 	int			(*ndo_setup_tc)(struct net_device *dev,
 						enum tc_setup_type type,
 						void *type_data);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d062c58d5cb..3aa895c5fbc1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -168,6 +168,8 @@ enum {
 #ifndef __KERNEL__
 #define IFLA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg))))
 #define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg))
+#define BITS_PER_BYTE 8
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
 #endif
 
 enum {
@@ -645,6 +647,8 @@ enum {
 	IFLA_VF_IB_NODE_GUID,	/* VF Infiniband node GUID */
 	IFLA_VF_IB_PORT_GUID,	/* VF Infiniband port GUID */
 	IFLA_VF_VLAN_LIST,	/* nested list of vlans, option for QinQ */
+	IFLA_VF_VLAN_RANGE,	/* add/delete vlan range filtering */
+	IFLA_VF_VLAN_TRUNK,	/* vlan trunk filtering */
 	__IFLA_VF_MAX,
 };
 
@@ -669,6 +673,7 @@ enum {
 
 #define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1)
 #define MAX_VLAN_LIST_LEN 1
+#define VF_VLAN_N_VID 4096
 
 struct ifla_vf_vlan_info {
 	__u32 vf;
@@ -677,6 +682,21 @@ struct ifla_vf_vlan_info {
 	__be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */
 };
 
+struct ifla_vf_vlan_range {
+	__u32 vf;
+	__u32 start_vid;   /* 1 - 4095 */
+	__u32 end_vid;     /* 1 - 4095 */
+	__u32 setting;
+	__be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */
+};
+
+#define VF_VLAN_BITMAP	DIV_ROUND_UP(VF_VLAN_N_VID, sizeof(__u64) * BITS_PER_BYTE)
+struct ifla_vf_vlan_trunk {
+	__u32 vf;
+	__u64 allowed_vlans_8021q_bm[VF_VLAN_BITMAP];
+	__u64 allowed_vlans_8021ad_bm[VF_VLAN_BITMAP];
+};
+
 struct ifla_vf_tx_rate {
 	__u32 vf;
 	__u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a78fd61da0ec..56909f11d88e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -827,6 +827,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 			 nla_total_size(MAX_VLAN_LIST_LEN *
 					sizeof(struct ifla_vf_vlan_info)) +
 			 nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+			 nla_total_size(sizeof(struct ifla_vf_vlan_trunk)) +
 			 nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
 			 nla_total_size(sizeof(struct ifla_vf_rate)) +
 			 nla_total_size(sizeof(struct ifla_vf_link_state)) +
@@ -1098,31 +1099,43 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	struct ifla_vf_link_state vf_linkstate;
 	struct ifla_vf_vlan_info vf_vlan_info;
 	struct ifla_vf_spoofchk vf_spoofchk;
+	struct ifla_vf_vlan_trunk *vf_trunk;
 	struct ifla_vf_tx_rate vf_tx_rate;
 	struct ifla_vf_stats vf_stats;
 	struct ifla_vf_trust vf_trust;
 	struct ifla_vf_vlan vf_vlan;
 	struct ifla_vf_rate vf_rate;
 	struct ifla_vf_mac vf_mac;
-	struct ifla_vf_info ivi;
+	struct ifla_vf_info *ivi;
 
-	memset(&ivi, 0, sizeof(ivi));
+	ivi = kzalloc(sizeof(*ivi), GFP_KERNEL);
+	if (!ivi)
+		return -ENOMEM;
+
+	vf_trunk = kzalloc(sizeof(*vf_trunk), GFP_KERNEL);
+	if (!vf_trunk) {
+		kfree(ivi);
+		return -ENOMEM;
+	}
 
 	/* Not all SR-IOV capable drivers support the
 	 * spoofcheck and "RSS query enable" query.  Preset to
 	 * -1 so the user space tool can detect that the driver
 	 * didn't report anything.
 	 */
-	ivi.spoofchk = -1;
-	ivi.rss_query_en = -1;
-	ivi.trusted = -1;
+	ivi->spoofchk = -1;
+	ivi->rss_query_en = -1;
+	ivi->trusted = -1;
+	memset(ivi->mac, 0, sizeof(ivi->mac));
+	memset(ivi->trunk_8021q, 0, sizeof(ivi->trunk_8021q));
+	memset(ivi->trunk_8021ad, 0, sizeof(ivi->trunk_8021ad));
 	/* The default value for VF link state is "auto"
 	 * IFLA_VF_LINK_STATE_AUTO which equals zero
 	 */
-	ivi.linkstate = 0;
+	ivi->linkstate = 0;
 	/* VLAN Protocol by default is 802.1Q */
-	ivi.vlan_proto = htons(ETH_P_8021Q);
-	if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
+	ivi->vlan_proto = htons(ETH_P_8021Q);
+	if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, ivi))
 		return 0;
 
 	memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
@@ -1135,21 +1148,24 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 		vf_spoofchk.vf =
 		vf_linkstate.vf =
 		vf_rss_query_en.vf =
-		vf_trust.vf = ivi.vf;
-
-	memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
-	vf_vlan.vlan = ivi.vlan;
-	vf_vlan.qos = ivi.qos;
-	vf_vlan_info.vlan = ivi.vlan;
-	vf_vlan_info.qos = ivi.qos;
-	vf_vlan_info.vlan_proto = ivi.vlan_proto;
-	vf_tx_rate.rate = ivi.max_tx_rate;
-	vf_rate.min_tx_rate = ivi.min_tx_rate;
-	vf_rate.max_tx_rate = ivi.max_tx_rate;
-	vf_spoofchk.setting = ivi.spoofchk;
-	vf_linkstate.link_state = ivi.linkstate;
-	vf_rss_query_en.setting = ivi.rss_query_en;
-	vf_trust.setting = ivi.trusted;
+		vf_trunk->vf =
+		vf_trust.vf = ivi->vf;
+
+	memcpy(vf_mac.mac, ivi->mac, sizeof(ivi->mac));
+	memcpy(vf_trunk->allowed_vlans_8021q_bm, ivi->trunk_8021q, sizeof(ivi->trunk_8021q));
+	memcpy(vf_trunk->allowed_vlans_8021ad_bm, ivi->trunk_8021ad, sizeof(ivi->trunk_8021ad));
+	vf_vlan.vlan = ivi->vlan;
+	vf_vlan.qos = ivi->qos;
+	vf_vlan_info.vlan = ivi->vlan;
+	vf_vlan_info.qos = ivi->qos;
+	vf_vlan_info.vlan_proto = ivi->vlan_proto;
+	vf_tx_rate.rate = ivi->max_tx_rate;
+	vf_rate.min_tx_rate = ivi->min_tx_rate;
+	vf_rate.max_tx_rate = ivi->max_tx_rate;
+	vf_spoofchk.setting = ivi->spoofchk;
+	vf_linkstate.link_state = ivi->linkstate;
+	vf_rss_query_en.setting = ivi->rss_query_en;
+	vf_trust.setting = ivi->trusted;
 	vf = nla_nest_start(skb, IFLA_VF_INFO);
 	if (!vf)
 		goto nla_put_vfinfo_failure;
@@ -1167,7 +1183,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 		    sizeof(vf_rss_query_en),
 		    &vf_rss_query_en) ||
 	    nla_put(skb, IFLA_VF_TRUST,
-		    sizeof(vf_trust), &vf_trust))
+		    sizeof(vf_trust), &vf_trust) ||
+	    nla_put(skb, IFLA_VF_VLAN_TRUNK,
+		    sizeof(*vf_trunk), vf_trunk))
 		goto nla_put_vf_failure;
 	vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
 	if (!vfvlanlist)
@@ -1202,12 +1220,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	}
 	nla_nest_end(skb, vfstats);
 	nla_nest_end(skb, vf);
+	kfree(vf_trunk);
+	kfree(ivi);
 	return 0;
 
 nla_put_vf_failure:
 	nla_nest_cancel(skb, vf);
 nla_put_vfinfo_failure:
 	nla_nest_cancel(skb, vfinfo);
+	kfree(vf_trunk);
+	kfree(ivi);
 	return -EMSGSIZE;
 }
 
@@ -1784,6 +1806,26 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
 			return err;
 	}
 
+	if (tb[IFLA_VF_VLAN_RANGE]) {
+		struct ifla_vf_vlan_range *ivvr =
+					nla_data(tb[IFLA_VF_VLAN_RANGE]);
+		bool add = !!ivvr->setting;
+
+		err = -EOPNOTSUPP;
+		if (add && ops->ndo_add_vf_vlan_trunk_range)
+			err = ops->ndo_add_vf_vlan_trunk_range(dev, ivvr->vf,
+							       ivvr->start_vid,
+							       ivvr->end_vid,
+							       ivvr->vlan_proto);
+		else if (!add && ops->ndo_del_vf_vlan_trunk_range)
+			err = ops->ndo_del_vf_vlan_trunk_range(dev, ivvr->vf,
+							       ivvr->start_vid,
+							       ivvr->end_vid,
+							       ivvr->vlan_proto);
+		if (err < 0)
+			return err;
+	}
+
 	if (tb[IFLA_VF_VLAN_LIST]) {
 		struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
 		struct nlattr *attr;
@@ -1815,21 +1857,30 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
 
 	if (tb[IFLA_VF_TX_RATE]) {
 		struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
-		struct ifla_vf_info ivf;
+		struct ifla_vf_info *ivf;
+
+		ivf = kzalloc(sizeof(*ivf), GFP_KERNEL);
+		if (!ivf)
+			return -ENOMEM;
 
 		err = -EOPNOTSUPP;
 		if (ops->ndo_get_vf_config)
-			err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
-		if (err < 0)
+			err = ops->ndo_get_vf_config(dev, ivt->vf, ivf);
+		if (err < 0) {
+			kfree(ivf);
 			return err;
+		}
 
 		err = -EOPNOTSUPP;
 		if (ops->ndo_set_vf_rate)
 			err = ops->ndo_set_vf_rate(dev, ivt->vf,
-						   ivf.min_tx_rate,
+						   ivf->min_tx_rate,
 						   ivt->rate);
-		if (err < 0)
+		if (err < 0) {
+			kfree(ivf);
 			return err;
+		}
+		kfree(ivf);
 	}
 
 	if (tb[IFLA_VF_RATE]) {
-- 
2.13.0

^ permalink raw reply related

* [PATCH net-next 4/4] net/mlx5e: E-switch, Add steering drop counters
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eugenia Emantayev, Saeed Mahameed
In-Reply-To: <20170827110618.20599-1-saeedm@mellanox.com>

From: Eugenia Emantayev <eugenia@mellanox.com>

Add flow counters to count packets dropped due to drop rules
configured in eswitch egress and ingress ACLs.
These counters will count VFs violations and incoming traffic drops.
Will be presented on hypervisor via standard 'ip -s link show' command.

Example: "ip -s link show dev enp5s0f0"

6: enp5s0f0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000
    link/ether 24:8a:07:a5:28:f0 brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast
    0          0        0       0       0       2
    TX: bytes  packets  errors  dropped carrier collsns
    1406       17       0       0       0       0
    vf 0 MAC 00:00:ca:fe:ca:fe, vlan 5, spoof checking off, link-state auto, trust off, query_rss off
    RX: bytes  packets  mcast   bcast   dropped
    1666       29       14         32      0
    TX: bytes  packets   dropped
    2880       44       2412

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 97 ++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  2 +
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  |  6 ++
 3 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index a8e8670c7c8d..6c992e43e397 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -37,6 +37,7 @@
 #include <linux/mlx5/fs.h>
 #include "mlx5_core.h"
 #include "eswitch.h"
+#include "fs_core.h"
 
 #define UPLINK_VPORT 0xFFFF
 
@@ -1007,8 +1008,14 @@ static void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw,
 		kfree(trunk_vlan_rule);
 	}
 
-	if (!IS_ERR_OR_NULL(vport->egress.drop_rule))
+	if (!IS_ERR_OR_NULL(vport->egress.drop_rule)) {
+		struct mlx5_fc *drop_counter =
+			mlx5_flow_rule_counter(vport->egress.drop_rule);
+
 		mlx5_del_flow_rules(vport->egress.drop_rule);
+		if (drop_counter)
+			mlx5_fc_destroy(vport->dev, drop_counter);
+	}
 
 	if (!IS_ERR_OR_NULL(vport->egress.allow_untagged_rule))
 		mlx5_del_flow_rules(vport->egress.allow_untagged_rule);
@@ -1174,8 +1181,14 @@ static void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw,
 {
 	struct mlx5_acl_vlan *trunk_vlan_rule, *tmp;
 
-	if (!IS_ERR_OR_NULL(vport->ingress.drop_rule))
+	if (!IS_ERR_OR_NULL(vport->ingress.drop_rule)) {
+		struct mlx5_fc *drop_counter =
+			mlx5_flow_rule_counter(vport->ingress.drop_rule);
+
 		mlx5_del_flow_rules(vport->ingress.drop_rule);
+		if (drop_counter)
+			mlx5_fc_destroy(vport->dev, drop_counter);
+	}
 
 	list_for_each_entry_safe(trunk_vlan_rule, tmp,
 				 &vport->ingress.allowed_vlans_rules, list) {
@@ -1222,6 +1235,8 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 	bool need_vlan_filter = !!bitmap_weight(vport->info.vlan_trunk_8021q_bitmap,
 						VLAN_N_VID);
 	struct mlx5_acl_vlan *trunk_vlan_rule;
+	struct mlx5_flow_destination dest;
+	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_spec *spec;
 	bool need_acl_table = true;
@@ -1333,18 +1348,33 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 	}
 
 drop_rule:
+	/* Alloc ingress drop flow counter */
+	counter = mlx5_fc_create(esw->dev, false);
+	if (IS_ERR(counter)) {
+		esw_warn(esw->dev,
+			 "vport[%d] configure ingress drop rule counter failed\n",
+			 vport->vport);
+		counter = NULL;
+	} else {
+		dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dest.counter = counter;
+	}
+
+	/* Drop others rule (star rule) */
 	memset(spec, 0, sizeof(*spec));
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+	if (counter)
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
 	vport->ingress.drop_rule =
-		mlx5_add_flow_rules(vport->ingress.acl, spec,
-				    &flow_act, NULL, 0);
+		mlx5_add_flow_rules(vport->ingress.acl, spec, &flow_act, &dest, 1);
 	if (IS_ERR(vport->ingress.drop_rule)) {
 		err = PTR_ERR(vport->ingress.drop_rule);
 		esw_warn(esw->dev,
 			 "vport[%d] configure ingress drop rule, err(%d)\n",
 			 vport->vport, err);
 		vport->ingress.drop_rule = NULL;
-		goto out;
+		if (counter)
+			mlx5_fc_destroy(vport->dev, counter);
 	}
 
 out:
@@ -1362,6 +1392,8 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 	bool need_acl_table = vport->info.vlan || vport->info.qos ||
 			      need_vlan_filter;
 	struct mlx5_acl_vlan *trunk_vlan_rule;
+	struct mlx5_flow_destination dest;
+	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_spec *spec;
 	u16 vlan_id = 0;
@@ -1454,18 +1486,33 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 		list_add(&trunk_vlan_rule->list, &vport->egress.allowed_vlans_rules);
 	}
 
+	/* Alloc egress drop flow counter */
+	counter = mlx5_fc_create(esw->dev, false);
+	if (IS_ERR(counter)) {
+		esw_warn(esw->dev,
+			 "vport[%d] configure egress drop rule counter failed\n",
+			 vport->vport);
+		counter = NULL;
+	} else {
+		dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dest.counter = counter;
+	}
+
 	/* Drop others rule (star rule) */
 	memset(spec, 0, sizeof(*spec));
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+	if (counter)
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
 	vport->egress.drop_rule =
-		mlx5_add_flow_rules(vport->egress.acl, spec,
-				    &flow_act, NULL, 0);
+		mlx5_add_flow_rules(vport->egress.acl, spec, &flow_act, &dest, 1);
 	if (IS_ERR(vport->egress.drop_rule)) {
 		err = PTR_ERR(vport->egress.drop_rule);
 		esw_warn(esw->dev,
 			 "vport[%d] configure egress drop rule failed, err(%d)\n",
 			 vport->vport, err);
 		vport->egress.drop_rule = NULL;
+		if (counter)
+			mlx5_fc_destroy(vport->dev, counter);
 	}
 out:
 	if (err)
@@ -2316,6 +2363,38 @@ int mlx5_eswitch_del_vport_trunk_range(struct mlx5_eswitch *esw,
 	return err;
 }
 
+static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev,
+					       int vport_idx,
+					       u64 *rx_dropped,
+					       u64 *tx_dropped)
+{
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	struct mlx5_vport *vport = &esw->vports[vport_idx];
+	struct mlx5_fc *drop_counter;
+	u16 idx = 0;
+	u64 dummy;
+
+	if (!vport->enabled)
+		return 0;
+
+	if (vport->egress.drop_rule) {
+		drop_counter = mlx5_flow_rule_counter(vport->egress.drop_rule);
+		if (drop_counter) {
+			idx = drop_counter->id;
+			mlx5_fc_query(dev, idx, rx_dropped, &dummy);
+		}
+	}
+
+	if (vport->ingress.drop_rule) {
+		drop_counter = mlx5_flow_rule_counter(vport->ingress.drop_rule);
+		if (drop_counter) {
+			idx = drop_counter->id;
+			mlx5_fc_query(dev, idx, tx_dropped, &dummy);
+		}
+	}
+	return 0;
+}
+
 int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
 				 int vport,
 				 struct ifla_vf_stats *vf_stats)
@@ -2376,6 +2455,10 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
 	vf_stats->broadcast =
 		MLX5_GET_CTR(out, received_eth_broadcast.packets);
 
+	mlx5_eswitch_query_vport_drop_stats(esw->dev, vport,
+					    &vf_stats->rx_dropped,
+					    &vf_stats->tx_dropped);
+
 free_out:
 	kvfree(out);
 	return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 5509a752f98e..e86d75fbc0f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -221,6 +221,8 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
 			      unsigned long delay);
 void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
 				      unsigned long interval);
+int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id,
+		  u64 *packets, u64 *bytes);
 
 int mlx5_init_fs(struct mlx5_core_dev *dev);
 void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 89d1f8650033..b7ab929d5f8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -312,6 +312,12 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 	}
 }
 
+int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id,
+		  u64 *packets, u64 *bytes)
+{
+	return mlx5_cmd_fc_query(dev, id, packets, bytes);
+}
+
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
 			  u64 *bytes, u64 *packets, u64 *lastuse)
 {
-- 
2.13.0

^ permalink raw reply related

* [PATCH net-next 0/4] SRIOV VF VGT+ and violation counters support
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eugenia Emantayev, Saeed Mahameed

Hi Dave

This series provides two security SRIOV related features (VGT+ and VF violation counters).

VGT+ is a security feature that gives the administrator the ability of controlling
the allowed VGT vlan IDs list that can be transmitted/received from/to the VF.
The allowed VGT vlan IDs list is called "trunk".

Admin can add/remove a range of allowed vlan-ids via iptool:
ip link set { DEVICE } [ vf NUM [ trunk { add | rem } START-VLAN-ID [ END-VLAN-ID ] [ proto VLAN-PROTO ] ] ]

Example:
After this series of configuration :
1) ip link set eth3 vf 0 trunk add 10 100 (allow vlan-id 10-100, default tpid 0x8100)
2) ip link set eth3 vf 0 trunk add 105 proto 802.1q (allow vlan-id 105 tpid 0x8100)
3) ip link set eth3 vf 0 trunk add 105 proto 802.1ad (allow vlan-id 105 tpid 0x88a8)
4) ip link set eth3 vf 0 trunk rem 90 (block vlan-id 90)
5) ip link set eth3 vf 0 trunk rem 50 60 (block vlan-ids 50-60)

VF 0 can only communicate on vlan-ids: 10-49,61-89,91-100,105 with tpid 0x8100 and vlan-id 105 with tpid 0x88a8.

For this purpose following net_device callbacks were added:
int (*ndo_add_vf_vlan_trunk_range)(struct net_device *dev, int vf, u16 start_vid, u16 end_vid, __be16 proto);
int (*ndo_del_vf_vlan_trunk_range)(struct net_device *dev, int vf, u16 start_vid, u16 end_vid, __be16 proto);

This feature is implemented and demonstrated in mlx5 via ACL steering tables and vlan rules attached to the VF's
corresponding E-Switch vport.

I addition to VGT+ we introduce new set of counter to VF statistics, to collect counters for traffic violating
VF ACL rules (such as VGT+ violation), for that we extend the current ifla_vf_stats to include rx_dropped/tx_dropped
to be reported per VF.

Example:
> ip link set eth3 vf 0 trunk add 10 100
VF 0 transmits 2412 packets on a vlan id not in [10,100] range will be dropped and reported in hypervisor
via:
> ip -s link show dev enp5s0f0"
      6: enp5s0f0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000
        [...]
	vf 0 MAC 00:00:ca:fe:ca:fe, vlan 5, spoof checking off, link-state auto, trust off, query_rss off
        RX: bytes  packets  mcast   bcast   dropped
        1666       29       14         32      0
        TX: bytes  packets   dropped
        2880       44       2412

Thanks,
Saeed.

Eugenia Emantayev (2):
  net/core: Add violation counters to VF statisctics
  net/mlx5e: E-switch, Add steering drop counters

Mohamad Haj Yahia (2):
  net: Add SRIOV VGT+ support
  net/mlx5: Add SRIOV VGT+ support

 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  28 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 589 +++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  31 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |   2 +
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    |  19 +-
 include/linux/if_link.h                            |   4 +
 include/linux/mlx5/vport.h                         |   6 +-
 include/linux/netdevice.h                          |  12 +
 include/uapi/linux/if_link.h                       |  22 +
 net/core/rtnetlink.c                               | 119 +++--
 11 files changed, 681 insertions(+), 157 deletions(-)

-- 
2.13.0

^ permalink raw reply

* [PATCH net-next 3/4] net/core: Add violation counters to VF statisctics
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eugenia Emantayev, Saeed Mahameed
In-Reply-To: <20170827110618.20599-1-saeedm@mellanox.com>

From: Eugenia Emantayev <eugenia@mellanox.com>

Add receive and transmit violation counters to be
displayed in iproute2 VF statistics.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/if_link.h      |  2 ++
 include/uapi/linux/if_link.h |  2 ++
 net/core/rtnetlink.c         | 10 +++++++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index da70af27e42e..ebf3448acb5b 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -12,6 +12,8 @@ struct ifla_vf_stats {
 	__u64 tx_bytes;
 	__u64 broadcast;
 	__u64 multicast;
+	__u64 rx_dropped;
+	__u64 tx_dropped;
 };
 
 struct ifla_vf_info {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3aa895c5fbc1..68cd31b281a1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -743,6 +743,8 @@ enum {
 	IFLA_VF_STATS_BROADCAST,
 	IFLA_VF_STATS_MULTICAST,
 	IFLA_VF_STATS_PAD,
+	IFLA_VF_STATS_RX_DROPPED,
+	IFLA_VF_STATS_TX_DROPPED,
 	__IFLA_VF_STATS_MAX,
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 56909f11d88e..1a653bb00d6e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -845,6 +845,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 			 nla_total_size_64bit(sizeof(__u64)) +
 			 /* IFLA_VF_STATS_MULTICAST */
 			 nla_total_size_64bit(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_RX_DROPPED */
+			 nla_total_size_64bit(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_TX_DROPPED */
+			 nla_total_size_64bit(sizeof(__u64)) +
 			 nla_total_size(sizeof(struct ifla_vf_trust)));
 		return size;
 	} else
@@ -1214,7 +1218,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
 			      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
-			      vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+			      vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+			      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+			      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
 		nla_nest_cancel(skb, vfstats);
 		goto nla_put_vf_failure;
 	}
-- 
2.13.0

^ permalink raw reply related

* (unknown), 
From: agar2000 @ 2017-08-27 10:55 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: MAIL_34929959_netdev.zip --]
[-- Type: application/zip, Size: 72397 bytes --]

^ permalink raw reply

* Re: [PATCH net] bridge: check for null fdb->dst before notifying switchdev drivers
From: Arkadi Sharshevsky @ 2017-08-27  8:46 UTC (permalink / raw)
  To: Roopa Prabhu, davem; +Cc: netdev
In-Reply-To: <1503807228-16281-1-git-send-email-roopa@cumulusnetworks.com>



On 08/27/2017 07:13 AM, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> 
> current switchdev drivers dont seem to support offloading fdb
> entries pointing to the bridge device which have fdb->dst
> not set to any port. This patch adds a NULL fdb->dst check in
> the switchdev notifier code.
> 
> This patch fixes the below NULL ptr dereference:
> $bridge fdb add 00:02:00:00:00:33 dev br0 self
> 
> [   69.953374] BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000008
> [   69.954044] IP: br_switchdev_fdb_notify+0x29/0x80
> [   69.954044] PGD 66527067
> [   69.954044] P4D 66527067
> [   69.954044] PUD 7899c067
> [   69.954044] PMD 0
> [   69.954044]
> [   69.954044] Oops: 0000 [#1] SMP
> [   69.954044] Modules linked in:
> [   69.954044] CPU: 1 PID: 3074 Comm: bridge Not tainted 4.13.0-rc6+ #1
> [   69.954044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org
> 04/01/2014
> [   69.954044] task: ffff88007b827140 task.stack: ffffc90001564000
> [   69.954044] RIP: 0010:br_switchdev_fdb_notify+0x29/0x80
> [   69.954044] RSP: 0018:ffffc90001567918 EFLAGS: 00010246
> [   69.954044] RAX: 0000000000000000 RBX: ffff8800795e0880 RCX:
> 00000000000000c0
> [   69.954044] RDX: ffffc90001567920 RSI: 000000000000001c RDI:
> ffff8800795d0600
> [   69.954044] RBP: ffffc90001567938 R08: ffff8800795d0600 R09:
> 0000000000000000
> [   69.954044] R10: ffffc90001567a88 R11: ffff88007b849400 R12:
> ffff8800795e0880
> [   69.954044] R13: ffff8800795d0600 R14: ffffffff81ef8880 R15:
> 000000000000001c
> [   69.954044] FS:  00007f93d3085700(0000) GS:ffff88007fd00000(0000)
> knlGS:0000000000000000
> [   69.954044] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   69.954044] CR2: 0000000000000008 CR3: 0000000066551000 CR4:
> 00000000000006e0
> [   69.954044] Call Trace:
> [   69.954044]  fdb_notify+0x3f/0xf0
> [   69.954044]  __br_fdb_add.isra.12+0x1a7/0x370
> [   69.954044]  br_fdb_add+0x178/0x280
> [   69.954044]  rtnl_fdb_add+0x10a/0x200
> [   69.954044]  rtnetlink_rcv_msg+0x1b4/0x240
> [   69.954044]  ? skb_free_head+0x21/0x40
> [   69.954044]  ? rtnl_calcit.isra.18+0xf0/0xf0
> [   69.954044]  netlink_rcv_skb+0xed/0x120
> [   69.954044]  rtnetlink_rcv+0x15/0x20
> [   69.954044]  netlink_unicast+0x180/0x200
> [   69.954044]  netlink_sendmsg+0x291/0x370
> [   69.954044]  ___sys_sendmsg+0x180/0x2e0
> [   69.954044]  ? filemap_map_pages+0x2db/0x370
> [   69.954044]  ? do_wp_page+0x11d/0x420
> [   69.954044]  ? __handle_mm_fault+0x794/0xd80
> [   69.954044]  ? vma_link+0xcb/0xd0
> [   69.954044]  __sys_sendmsg+0x4c/0x90
> [   69.954044]  SyS_sendmsg+0x12/0x20
> [   69.954044]  do_syscall_64+0x63/0xe0
> [   69.954044]  entry_SYSCALL64_slow_path+0x25/0x25
> [   69.954044] RIP: 0033:0x7f93d2bad690
> [   69.954044] RSP: 002b:00007ffc7217a638 EFLAGS: 00000246 ORIG_RAX:
> 000000000000002e
> [   69.954044] RAX: ffffffffffffffda RBX: 00007ffc72182eac RCX:
> 00007f93d2bad690
> [   69.954044] RDX: 0000000000000000 RSI: 00007ffc7217a670 RDI:
> 0000000000000003
> [   69.954044] RBP: 0000000059a1f7f8 R08: 0000000000000006 R09:
> 000000000000000a
> [   69.954044] R10: 00007ffc7217a400 R11: 0000000000000246 R12:
> 00007ffc7217a670
> [   69.954044] R13: 00007ffc72182a98 R14: 00000000006114c0 R15:
> 00007ffc72182aa0
> [   69.954044] Code: 1f 00 66 66 66 66 90 55 48 89 e5 48 83 ec 20 f6 47
> 20 04 74 0a 83 fe 1c 74 09 83 fe 1d 74 2c c9 66 90 c3 48 8b 47 10 48 8d
> 55 e8 <48> 8b 70 08 0f b7 47 1e 48 83 c7 18 48 89 7d f0 bf 03 00 00 00
> [   69.954044] RIP: br_switchdev_fdb_notify+0x29/0x80 RSP:
> ffffc90001567918
> [   69.954044] CR2: 0000000000000008
> [   69.954044] ---[ end trace 03e9eec4a82c238b ]---
> 
> Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about FDB add/del")
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> ---
>  net/bridge/br_switchdev.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
> index 181a44d..f6b1c7d 100644
> --- a/net/bridge/br_switchdev.c
> +++ b/net/bridge/br_switchdev.c
> @@ -115,7 +115,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
>  void
>  br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
>  {
> -	if (!fdb->added_by_user)
> +	if (!fdb->added_by_user || !fdb->dst)
>  		return;
>  
>  	switch (type) {
> 

Thanks, missed that.
Arkadi

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH] e1000e: apply burst mode settings only on default
From: Neftin, Sasha @ 2017-08-27  8:34 UTC (permalink / raw)
  To: Willem de Bruijn, jeffrey.t.kirsher, alexander.h.duyck,
	raanan.avargil, dima.ruinskiy
  Cc: netdev, Willem de Bruijn, intel-wired-lan
In-Reply-To: <e589e146-3536-9ef3-486c-f5d115eb83cf@intel.com>

On 8/27/2017 11:32, Neftin, Sasha wrote:
> On 8/27/2017 11:30, Neftin, Sasha wrote:
>> On 8/25/2017 18:06, Willem de Bruijn wrote:
>>> From: Willem de Bruijn <willemb@google.com>
>>>
>>> Devices that support FLAG2_DMA_BURST have different default values
>>> for RDTR and RADV. Apply burst mode default settings only when no
>>> explicit value was passed at module load.
>>>
>>> The RDTR default is zero. If the module is loaded for low latency
>>> operation with RxIntDelay=0, do not override this value with a burst
>>> default of 32.
>>>
>>> Move the decision to apply burst values earlier, where explicitly
>>> initialized module variables can be distinguished from defaults.
>>>
>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>> ---
>>>   drivers/net/ethernet/intel/e1000e/e1000.h  |  4 ----
>>>   drivers/net/ethernet/intel/e1000e/netdev.c |  8 --------
>>>   drivers/net/ethernet/intel/e1000e/param.c  | 16 +++++++++++++++-
>>>   3 files changed, 15 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h 
>>> b/drivers/net/ethernet/intel/e1000e/e1000.h
>>> index 98e68888abb1..2311b31bdcac 100644
>>> --- a/drivers/net/ethernet/intel/e1000e/e1000.h
>>> +++ b/drivers/net/ethernet/intel/e1000e/e1000.h
>>> @@ -94,10 +94,6 @@ struct e1000_info;
>>>    */
>>>   #define E1000_CHECK_RESET_COUNT        25
>>>   -#define DEFAULT_RDTR            0
>>> -#define DEFAULT_RADV            8
>>> -#define BURST_RDTR            0x20
>>> -#define BURST_RADV            0x20
>>>   #define PCICFG_DESC_RING_STATUS        0xe4
>>>   #define FLUSH_DESC_REQUIRED        0x100
>>>   diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
>>> b/drivers/net/ethernet/intel/e1000e/netdev.c
>>> index 327dfe5bedc0..47b89aac7969 100644
>>> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
>>> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
>>> @@ -3223,14 +3223,6 @@ static void e1000_configure_rx(struct 
>>> e1000_adapter *adapter)
>>>            */
>>>           ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
>>>           ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
>>> -
>>> -        /* override the delay timers for enabling bursting, only if
>>> -         * the value was not set by the user via module options
>>> -         */
>>> -        if (adapter->rx_int_delay == DEFAULT_RDTR)
>>> -            adapter->rx_int_delay = BURST_RDTR;
>>> -        if (adapter->rx_abs_int_delay == DEFAULT_RADV)
>>> -            adapter->rx_abs_int_delay = BURST_RADV;
>>>       }
>>>         /* set the Receive Delay Timer Register */
>>> diff --git a/drivers/net/ethernet/intel/e1000e/param.c 
>>> b/drivers/net/ethernet/intel/e1000e/param.c
>>> index 6d8c39abee16..bb696c98f9b0 100644
>>> --- a/drivers/net/ethernet/intel/e1000e/param.c
>>> +++ b/drivers/net/ethernet/intel/e1000e/param.c
>>> @@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute 
>>> Interrupt Delay");
>>>   /* Receive Interrupt Delay in units of 1.024 microseconds
>>>    * hardware will likely hang if you set this to anything but zero.
>>>    *
>>> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>>> + *
>>>    * Valid Range: 0-65535
>>>    */
>>>   E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
>>> +#define DEFAULT_RDTR            0
>>> +#define BURST_RDTR            0x20
>>>   #define MAX_RXDELAY 0xFFFF
>>>   #define MIN_RXDELAY 0
>>>     /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
>>> + *
>>> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>>>    *
>>>    * Valid Range: 0-65535
>>>    */
>>>   E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
>>> +#define DEFAULT_RADV            8
>>> +#define BURST_RADV            0x20
>>>   #define MAX_RXABSDELAY 0xFFFF
>>>   #define MIN_RXABSDELAY 0
>>>   @@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter 
>>> *adapter)
>>>                        .max = MAX_RXDELAY } }
>>>           };
>>>   +        if (adapter->flags2 & FLAG2_DMA_BURST)
>>> +            opt.def = BURST_RDTR;
>>> +
>>>           if (num_RxIntDelay > bd) {
>>>               adapter->rx_int_delay = RxIntDelay[bd];
>>> e1000_validate_option(&adapter->rx_int_delay, &opt,
>>> @@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter 
>>> *adapter)
>>>       }
>>>       /* Receive Absolute Interrupt Delay */
>>>       {
>>> -        static const struct e1000_option opt = {
>>> +        static struct e1000_option opt = {
>>>               .type = range_option,
>>>               .name = "Receive Absolute Interrupt Delay",
>>>               .err  = "using default of "
>>> @@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter 
>>> *adapter)
>>>                        .max = MAX_RXABSDELAY } }
>>>           };
>>>   +        if (adapter->flags2 & FLAG2_DMA_BURST)
>>> +            opt.def = BURST_RADV;
>>> +
>>>           if (num_RxAbsIntDelay > bd) {
>>>               adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
>>> e1000_validate_option(&adapter->rx_abs_int_delay, &opt,
>>
>> This patch looks good for me, but I would like hear second opinion.
>>
>> _______________________________________________
>> Intel-wired-lan mailing list
>> Intel-wired-lan@osuosl.org
>> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan
>
>
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH] e1000e: apply burst mode settings only on default
From: Neftin, Sasha @ 2017-08-27  8:32 UTC (permalink / raw)
  To: Willem de Bruijn, "jeffrey.t.kirsher
  Cc: netdev, Willem de Bruijn, intel-wired-lan
In-Reply-To: <291b863f-552e-2f3b-f658-e812d0848949@intel.com>

On 8/27/2017 11:30, Neftin, Sasha wrote:
> On 8/25/2017 18:06, Willem de Bruijn wrote:
>> From: Willem de Bruijn <willemb@google.com>
>>
>> Devices that support FLAG2_DMA_BURST have different default values
>> for RDTR and RADV. Apply burst mode default settings only when no
>> explicit value was passed at module load.
>>
>> The RDTR default is zero. If the module is loaded for low latency
>> operation with RxIntDelay=0, do not override this value with a burst
>> default of 32.
>>
>> Move the decision to apply burst values earlier, where explicitly
>> initialized module variables can be distinguished from defaults.
>>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>> ---
>>   drivers/net/ethernet/intel/e1000e/e1000.h  |  4 ----
>>   drivers/net/ethernet/intel/e1000e/netdev.c |  8 --------
>>   drivers/net/ethernet/intel/e1000e/param.c  | 16 +++++++++++++++-
>>   3 files changed, 15 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h 
>> b/drivers/net/ethernet/intel/e1000e/e1000.h
>> index 98e68888abb1..2311b31bdcac 100644
>> --- a/drivers/net/ethernet/intel/e1000e/e1000.h
>> +++ b/drivers/net/ethernet/intel/e1000e/e1000.h
>> @@ -94,10 +94,6 @@ struct e1000_info;
>>    */
>>   #define E1000_CHECK_RESET_COUNT        25
>>   -#define DEFAULT_RDTR            0
>> -#define DEFAULT_RADV            8
>> -#define BURST_RDTR            0x20
>> -#define BURST_RADV            0x20
>>   #define PCICFG_DESC_RING_STATUS        0xe4
>>   #define FLUSH_DESC_REQUIRED        0x100
>>   diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
>> b/drivers/net/ethernet/intel/e1000e/netdev.c
>> index 327dfe5bedc0..47b89aac7969 100644
>> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
>> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
>> @@ -3223,14 +3223,6 @@ static void e1000_configure_rx(struct 
>> e1000_adapter *adapter)
>>            */
>>           ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
>>           ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
>> -
>> -        /* override the delay timers for enabling bursting, only if
>> -         * the value was not set by the user via module options
>> -         */
>> -        if (adapter->rx_int_delay == DEFAULT_RDTR)
>> -            adapter->rx_int_delay = BURST_RDTR;
>> -        if (adapter->rx_abs_int_delay == DEFAULT_RADV)
>> -            adapter->rx_abs_int_delay = BURST_RADV;
>>       }
>>         /* set the Receive Delay Timer Register */
>> diff --git a/drivers/net/ethernet/intel/e1000e/param.c 
>> b/drivers/net/ethernet/intel/e1000e/param.c
>> index 6d8c39abee16..bb696c98f9b0 100644
>> --- a/drivers/net/ethernet/intel/e1000e/param.c
>> +++ b/drivers/net/ethernet/intel/e1000e/param.c
>> @@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute 
>> Interrupt Delay");
>>   /* Receive Interrupt Delay in units of 1.024 microseconds
>>    * hardware will likely hang if you set this to anything but zero.
>>    *
>> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>> + *
>>    * Valid Range: 0-65535
>>    */
>>   E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
>> +#define DEFAULT_RDTR            0
>> +#define BURST_RDTR            0x20
>>   #define MAX_RXDELAY 0xFFFF
>>   #define MIN_RXDELAY 0
>>     /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
>> + *
>> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>>    *
>>    * Valid Range: 0-65535
>>    */
>>   E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
>> +#define DEFAULT_RADV            8
>> +#define BURST_RADV            0x20
>>   #define MAX_RXABSDELAY 0xFFFF
>>   #define MIN_RXABSDELAY 0
>>   @@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter 
>> *adapter)
>>                        .max = MAX_RXDELAY } }
>>           };
>>   +        if (adapter->flags2 & FLAG2_DMA_BURST)
>> +            opt.def = BURST_RDTR;
>> +
>>           if (num_RxIntDelay > bd) {
>>               adapter->rx_int_delay = RxIntDelay[bd];
>> e1000_validate_option(&adapter->rx_int_delay, &opt,
>> @@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter 
>> *adapter)
>>       }
>>       /* Receive Absolute Interrupt Delay */
>>       {
>> -        static const struct e1000_option opt = {
>> +        static struct e1000_option opt = {
>>               .type = range_option,
>>               .name = "Receive Absolute Interrupt Delay",
>>               .err  = "using default of "
>> @@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter 
>> *adapter)
>>                        .max = MAX_RXABSDELAY } }
>>           };
>>   +        if (adapter->flags2 & FLAG2_DMA_BURST)
>> +            opt.def = BURST_RADV;
>> +
>>           if (num_RxAbsIntDelay > bd) {
>>               adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
>> e1000_validate_option(&adapter->rx_abs_int_delay, &opt,
>
> This patch looks good for me, but I would like hear second opinion.
>
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply

* Re: [patch net-next 11/12] mlxsw: spectrum_dpipe: Add support for IPv4 host table dump
From: Arkadi Sharshevsky @ 2017-08-27  8:31 UTC (permalink / raw)
  To: David Ahern, Jiri Pirko, netdev; +Cc: davem, idosch, mlxsw
In-Reply-To: <a4335a01-a98b-573c-9747-b4039188340c@gmail.com>



On 08/25/2017 10:51 PM, David Ahern wrote:
> On 8/25/17 2:26 AM, Arkadi Sharshevsky wrote:
>>
>>
>> On 08/24/2017 10:26 PM, David Ahern wrote:
>>> On 8/23/17 11:40 PM, Jiri Pirko wrote:
>>>> +static int
>>>> +mlxsw_sp_dpipe_table_host_entries_get(struct mlxsw_sp *mlxsw_sp,
>>>> +				      struct devlink_dpipe_entry *entry,
>>>> +				      bool counters_enabled,
>>>> +				      struct devlink_dpipe_dump_ctx *dump_ctx,
>>>> +				      int type)
>>>> +{
>>>> +	int rif_neigh_count = 0;
>>>> +	int rif_neigh_skip = 0;
>>>> +	int neigh_count = 0;
>>>> +	int rif_count;
>>>> +	int i, j;
>>>> +	int err;
>>>> +
>>>> +	rtnl_lock();
>>>
>>> Why does a h/w driver dumping its tables need the rtnl lock?
>>>
>>
>> This table represents the hw IPv4 arp table, and the
>> driver depends on rtnl to be held.
>>
> 
> Meaning mlxsw does not have its own locks protecting data structures --
> e.g., rif adds and deletes, so it is relying on rtnl?
> 
> Also, this dpipe capability seems to be just dumping data structures
> maintained by the driver. ie., you can compare the mlxsw view of
> networking state to IPv4 and IPv6 level tables. Any plans to offer a
> command that reads data from the h/w and passes that back to the user?
> i.e, a command to compare kernel tables to h/w state?
> 

So this infra should provide several things-

1) Reveal the interactions between various hardware tables
2) Counters for this tables
3) Debugabillity

The first two can be achieved right now. Regarding debugabillity, which
is a bit vague, the current assumption is that the drivers internal data
structures are synced with hardware (which is no always true), and maybe
are not synced with the kernel, so this can be achieved right now by
dumping the internal state of the driver. Furthermore, the counters are
dumped from the hardware and give the user additional indication.

I completely agree that the hardware should be dumped in order to
validate the internal data structures are really synced with HW. This
could be usable for observing data corruptions inside the ASIC and
various complex bugs.

In order to address that I though about maybe add a flag called
"validate_hw" so that during the dump the driver<-->hw state could be
validated.

What do you think about it?

Thanks,
Arkadi

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH] e1000e: apply burst mode settings only on default
From: Neftin, Sasha @ 2017-08-27  8:30 UTC (permalink / raw)
  To: Willem de Bruijn, "jeffrey.t.kirsher
  Cc: netdev, Willem de Bruijn, intel-wired-lan
In-Reply-To: <20170825150626.2843-1-willemdebruijn.kernel@gmail.com>

On 8/25/2017 18:06, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
>
> Devices that support FLAG2_DMA_BURST have different default values
> for RDTR and RADV. Apply burst mode default settings only when no
> explicit value was passed at module load.
>
> The RDTR default is zero. If the module is loaded for low latency
> operation with RxIntDelay=0, do not override this value with a burst
> default of 32.
>
> Move the decision to apply burst values earlier, where explicitly
> initialized module variables can be distinguished from defaults.
>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>   drivers/net/ethernet/intel/e1000e/e1000.h  |  4 ----
>   drivers/net/ethernet/intel/e1000e/netdev.c |  8 --------
>   drivers/net/ethernet/intel/e1000e/param.c  | 16 +++++++++++++++-
>   3 files changed, 15 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
> index 98e68888abb1..2311b31bdcac 100644
> --- a/drivers/net/ethernet/intel/e1000e/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000e/e1000.h
> @@ -94,10 +94,6 @@ struct e1000_info;
>    */
>   #define E1000_CHECK_RESET_COUNT		25
>   
> -#define DEFAULT_RDTR			0
> -#define DEFAULT_RADV			8
> -#define BURST_RDTR			0x20
> -#define BURST_RADV			0x20
>   #define PCICFG_DESC_RING_STATUS		0xe4
>   #define FLUSH_DESC_REQUIRED		0x100
>   
> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
> index 327dfe5bedc0..47b89aac7969 100644
> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
> @@ -3223,14 +3223,6 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
>   		 */
>   		ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
>   		ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
> -
> -		/* override the delay timers for enabling bursting, only if
> -		 * the value was not set by the user via module options
> -		 */
> -		if (adapter->rx_int_delay == DEFAULT_RDTR)
> -			adapter->rx_int_delay = BURST_RDTR;
> -		if (adapter->rx_abs_int_delay == DEFAULT_RADV)
> -			adapter->rx_abs_int_delay = BURST_RADV;
>   	}
>   
>   	/* set the Receive Delay Timer Register */
> diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c
> index 6d8c39abee16..bb696c98f9b0 100644
> --- a/drivers/net/ethernet/intel/e1000e/param.c
> +++ b/drivers/net/ethernet/intel/e1000e/param.c
> @@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute Interrupt Delay");
>   /* Receive Interrupt Delay in units of 1.024 microseconds
>    * hardware will likely hang if you set this to anything but zero.
>    *
> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
> + *
>    * Valid Range: 0-65535
>    */
>   E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
> +#define DEFAULT_RDTR			0
> +#define BURST_RDTR			0x20
>   #define MAX_RXDELAY 0xFFFF
>   #define MIN_RXDELAY 0
>   
>   /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
> + *
> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>    *
>    * Valid Range: 0-65535
>    */
>   E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
> +#define DEFAULT_RADV			8
> +#define BURST_RADV			0x20
>   #define MAX_RXABSDELAY 0xFFFF
>   #define MIN_RXABSDELAY 0
>   
> @@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
>   					 .max = MAX_RXDELAY } }
>   		};
>   
> +		if (adapter->flags2 & FLAG2_DMA_BURST)
> +			opt.def = BURST_RDTR;
> +
>   		if (num_RxIntDelay > bd) {
>   			adapter->rx_int_delay = RxIntDelay[bd];
>   			e1000_validate_option(&adapter->rx_int_delay, &opt,
> @@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter *adapter)
>   	}
>   	/* Receive Absolute Interrupt Delay */
>   	{
> -		static const struct e1000_option opt = {
> +		static struct e1000_option opt = {
>   			.type = range_option,
>   			.name = "Receive Absolute Interrupt Delay",
>   			.err  = "using default of "
> @@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
>   					 .max = MAX_RXABSDELAY } }
>   		};
>   
> +		if (adapter->flags2 & FLAG2_DMA_BURST)
> +			opt.def = BURST_RADV;
> +
>   		if (num_RxAbsIntDelay > bd) {
>   			adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
>   			e1000_validate_option(&adapter->rx_abs_int_delay, &opt,

This patch looks good for me, but I would like hear second opinion.

^ permalink raw reply

* Re: [PATCH] pktgen: add a new sample script for 40G and above link testing
From: Tariq Toukan @ 2017-08-27  8:25 UTC (permalink / raw)
  To: Robert Hoo, davem, tariqt, brouer, kyle.leet; +Cc: netdev, robert.hu
In-Reply-To: <1503653196-64418-1-git-send-email-robert.hu@linux.intel.com>



On 25/08/2017 12:26 PM, Robert Hoo wrote:
> (Sorry for yesterday's wrong sending, I finally fixed my MTA and git
> send-email settings.)
> 
> It's hard to benchmark 40G+ network bandwidth using ordinary
> tools like iperf, netperf (see reference 1).
> Pktgen, packet generator from Kernel sapce, shall be a candidate.
> I then tried with pktgen multiqueue sample scripts, but still
> cannot reach line rate.

Try samples 03 and 04.

> I then derived this NUMA awared irq affinity sample script from
> multi-queue sample one, successfully benchmarked 40G link. I think this can
> also be useful for 100G reference, though I haven't got device to test yet.
> 
> This script simply does:
> Detect $DEV's NUMA node belonging.
> Bind each thread (processor from that NUMA node) with each $DEV queue's
> irq affinity, 1:1 mapping.
> How many '-t' threads input determines how many queues will be
> utilized.

I agree this is an essential capability.
This was the main reason I added support for the -f argument.
Using it, I could choose cores of local NUMA, especially for single 
thread, or when cores of the NUMA are sequential.

> 
> Tested with Intel XL710 NIC with Cisco 3172 switch.
> 
> It would be even slightly better if the irqbalance service is turned
> off outside.
> 
> Referrences:
> https://people.netfilter.org/hawk/presentations/LCA2015/net_stack_challenges_100G_LCA2015.pdf
> http://www.intel.cn/content/dam/www/public/us/en/documents/reference-guides/xl710-x710-performance-tuning-linux-guide.pdf
> 
> Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
> ---

Regards,
Tariq Toukan

^ permalink raw reply

* [PATCH] be2net: Fix some u16 fields appropriately
From: Haishuang Yan @ 2017-08-27  7:24 UTC (permalink / raw)
  To: Sathya Perla, jit Khaparde, Sriharsha Basavapatna, Somnath Kotur
  Cc: netdev, linux-kernel, Haishuang Yan

In be_tx_compl_process, frag_index declared as u32, so it's better to
declare last_index as u32 also.

CC: Ajit Khaparde <ajit.khaparde@broadcom.com>
Fixes: b0fd2eb28bd4 ("be2net: Declare some u16 fields as u32 to improve
performance")
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
---
 drivers/net/ethernet/emulex/benet/be.h      | 2 +-
 drivers/net/ethernet/emulex/benet/be_main.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 674cf9d..2ba4d61 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -255,7 +255,7 @@ struct be_tx_stats {
 /* Structure to hold some data of interest obtained from a TX CQE */
 struct be_tx_compl_info {
 	u8 status;		/* Completion status */
-	u16 end_index;		/* Completed TXQ Index */
+	u32 end_index;		/* Completed TXQ Index */
 };
 
 struct be_tx_obj {
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 319eee3..3645344 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2606,7 +2606,7 @@ static struct be_tx_compl_info *be_tx_compl_get(struct be_tx_obj *txo)
 }
 
 static u16 be_tx_compl_process(struct be_adapter *adapter,
-			       struct be_tx_obj *txo, u16 last_index)
+			       struct be_tx_obj *txo, u32 last_index)
 {
 	struct sk_buff **sent_skbs = txo->sent_skb_list;
 	struct be_queue_info *txq = &txo->q;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH] igb: check memory allocation failure
From: Christophe JAILLET @ 2017-08-27  6:39 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: intel-wired-lan, netdev, linux-kernel, kernel-janitors,
	Christophe JAILLET

Check memory allocation failures and return -ENOMEM in such cases, as
already done for other memory allocations in this function.

This avoids NULL pointers dereference.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fd4a46b03cc8..837d9b46a390 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3162,6 +3162,8 @@ static int igb_sw_init(struct igb_adapter *adapter)
 	/* Setup and initialize a copy of the hw vlan table array */
 	adapter->shadow_vfta = kcalloc(E1000_VLAN_FILTER_TBL_SIZE, sizeof(u32),
 				       GFP_ATOMIC);
+	if (!adapter->shadow_vfta)
+		return -ENOMEM;
 
 	/* This call may decrease the number of queues */
 	if (igb_init_interrupt_scheme(adapter, true)) {
-- 
2.11.0

^ permalink raw reply related

* [PATCH net] bridge: check for null fdb->dst before notifying switchdev drivers
From: Roopa Prabhu @ 2017-08-27  4:13 UTC (permalink / raw)
  To: davem; +Cc: netdev, arkadis

From: Roopa Prabhu <roopa@cumulusnetworks.com>

current switchdev drivers dont seem to support offloading fdb
entries pointing to the bridge device which have fdb->dst
not set to any port. This patch adds a NULL fdb->dst check in
the switchdev notifier code.

This patch fixes the below NULL ptr dereference:
$bridge fdb add 00:02:00:00:00:33 dev br0 self

[   69.953374] BUG: unable to handle kernel NULL pointer dereference at
0000000000000008
[   69.954044] IP: br_switchdev_fdb_notify+0x29/0x80
[   69.954044] PGD 66527067
[   69.954044] P4D 66527067
[   69.954044] PUD 7899c067
[   69.954044] PMD 0
[   69.954044]
[   69.954044] Oops: 0000 [#1] SMP
[   69.954044] Modules linked in:
[   69.954044] CPU: 1 PID: 3074 Comm: bridge Not tainted 4.13.0-rc6+ #1
[   69.954044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org
04/01/2014
[   69.954044] task: ffff88007b827140 task.stack: ffffc90001564000
[   69.954044] RIP: 0010:br_switchdev_fdb_notify+0x29/0x80
[   69.954044] RSP: 0018:ffffc90001567918 EFLAGS: 00010246
[   69.954044] RAX: 0000000000000000 RBX: ffff8800795e0880 RCX:
00000000000000c0
[   69.954044] RDX: ffffc90001567920 RSI: 000000000000001c RDI:
ffff8800795d0600
[   69.954044] RBP: ffffc90001567938 R08: ffff8800795d0600 R09:
0000000000000000
[   69.954044] R10: ffffc90001567a88 R11: ffff88007b849400 R12:
ffff8800795e0880
[   69.954044] R13: ffff8800795d0600 R14: ffffffff81ef8880 R15:
000000000000001c
[   69.954044] FS:  00007f93d3085700(0000) GS:ffff88007fd00000(0000)
knlGS:0000000000000000
[   69.954044] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   69.954044] CR2: 0000000000000008 CR3: 0000000066551000 CR4:
00000000000006e0
[   69.954044] Call Trace:
[   69.954044]  fdb_notify+0x3f/0xf0
[   69.954044]  __br_fdb_add.isra.12+0x1a7/0x370
[   69.954044]  br_fdb_add+0x178/0x280
[   69.954044]  rtnl_fdb_add+0x10a/0x200
[   69.954044]  rtnetlink_rcv_msg+0x1b4/0x240
[   69.954044]  ? skb_free_head+0x21/0x40
[   69.954044]  ? rtnl_calcit.isra.18+0xf0/0xf0
[   69.954044]  netlink_rcv_skb+0xed/0x120
[   69.954044]  rtnetlink_rcv+0x15/0x20
[   69.954044]  netlink_unicast+0x180/0x200
[   69.954044]  netlink_sendmsg+0x291/0x370
[   69.954044]  ___sys_sendmsg+0x180/0x2e0
[   69.954044]  ? filemap_map_pages+0x2db/0x370
[   69.954044]  ? do_wp_page+0x11d/0x420
[   69.954044]  ? __handle_mm_fault+0x794/0xd80
[   69.954044]  ? vma_link+0xcb/0xd0
[   69.954044]  __sys_sendmsg+0x4c/0x90
[   69.954044]  SyS_sendmsg+0x12/0x20
[   69.954044]  do_syscall_64+0x63/0xe0
[   69.954044]  entry_SYSCALL64_slow_path+0x25/0x25
[   69.954044] RIP: 0033:0x7f93d2bad690
[   69.954044] RSP: 002b:00007ffc7217a638 EFLAGS: 00000246 ORIG_RAX:
000000000000002e
[   69.954044] RAX: ffffffffffffffda RBX: 00007ffc72182eac RCX:
00007f93d2bad690
[   69.954044] RDX: 0000000000000000 RSI: 00007ffc7217a670 RDI:
0000000000000003
[   69.954044] RBP: 0000000059a1f7f8 R08: 0000000000000006 R09:
000000000000000a
[   69.954044] R10: 00007ffc7217a400 R11: 0000000000000246 R12:
00007ffc7217a670
[   69.954044] R13: 00007ffc72182a98 R14: 00000000006114c0 R15:
00007ffc72182aa0
[   69.954044] Code: 1f 00 66 66 66 66 90 55 48 89 e5 48 83 ec 20 f6 47
20 04 74 0a 83 fe 1c 74 09 83 fe 1d 74 2c c9 66 90 c3 48 8b 47 10 48 8d
55 e8 <48> 8b 70 08 0f b7 47 1e 48 83 c7 18 48 89 7d f0 bf 03 00 00 00
[   69.954044] RIP: br_switchdev_fdb_notify+0x29/0x80 RSP:
ffffc90001567918
[   69.954044] CR2: 0000000000000008
[   69.954044] ---[ end trace 03e9eec4a82c238b ]---

Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about FDB add/del")
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 net/bridge/br_switchdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 181a44d..f6b1c7d 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -115,7 +115,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
 void
 br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 {
-	if (!fdb->added_by_user)
+	if (!fdb->added_by_user || !fdb->dst)
 		return;
 
 	switch (type) {
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH RFC WIP 0/5] IGMP snooping for local traffic
From: Andrew Lunn @ 2017-08-26 23:02 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: netdev, Vivien Didelot, Florian Fainelli, jiri, roopa, stephen,
	bridge
In-Reply-To: <c3f5050f-7a38-c28c-31d0-df5909259fb1@cumulusnetworks.com>

> Hi Andrew,
> 
> Have you taken a look at mglist (the boolean, probably needs a rename) ? It is for
> exactly that purpose, to track which groups the bridge is interested in.
> I assume I'm forgetting or missing something here.
> 
> > performed. With a pure software bridge, it is not required. All
> > mulitcast frames are passed to the brX interface, and the network
> 
> If mglist (again the boolean) is false then they won't be passed up.
> 
> > stack filters them, as it does for any interface. However, when
> > hardware offload is involved, things change. We should program the
> > hardware to only send multcast packets to the host when the host has
> > in interest in them.
> 
> Granted the boolean mglist might need some changes (esp. with host group leave)
> but I think it can be used to program switchdev for host join/leave, can't
> we adjust its behaviour instead of introducing this complexity and avoid many
> headaches ?

I would like to avoid this complexity as well. I will take a look at
mglist. Thanks for the hint.

	Andrew

^ permalink raw reply

* Re: [PATCH RFC WIP 0/5] IGMP snooping for local traffic
From: Nikolay Aleksandrov @ 2017-08-26 22:40 UTC (permalink / raw)
  To: Andrew Lunn, netdev; +Cc: Florian Fainelli, Vivien Didelot, roopa, bridge, jiri
In-Reply-To: <c3f5050f-7a38-c28c-31d0-df5909259fb1@cumulusnetworks.com>

On 27.08.2017 01:17, Nikolay Aleksandrov wrote:
> On 26/08/17 23:56, Andrew Lunn wrote:
>> This is a WIP patchset i would like comments on from bridge, switchdev
>> and hardware offload people.
>>
>> The linux bridge supports IGMP snooping. It will listen to IGMP
>> reports on bridge ports and keep track of which groups have been
>> joined on an interface. It will then forward multicast based on this
>> group membership.
>>
>> When the bridge adds or removed groups from an interface, it uses
>> switchdev to request the hardware add an mdb to a port, so the
>> hardware can perform the selective forwarding between ports.
>>
>> What is not covered by the current bridge code, is IGMP joins/leaves
>> from the host on the brX interface. No such monitoring is
> 
> Hi Andrew,
> 
> Have you taken a look at mglist (the boolean, probably needs a rename) ? It is for
> exactly that purpose, to track which groups the bridge is interested in.
> I assume I'm forgetting or missing something here.
> 
>> performed. With a pure software bridge, it is not required. All
>> mulitcast frames are passed to the brX interface, and the network
> 
> If mglist (again the boolean) is false then they won't be passed up.
> 
>> stack filters them, as it does for any interface. However, when
>> hardware offload is involved, things change. We should program the
>> hardware to only send multcast packets to the host when the host has
>> in interest in them.
> 
> Granted the boolean mglist might need some changes (esp. with host group leave)
> but I think it can be used to program switchdev for host join/leave, can't
> we adjust its behaviour instead of introducing this complexity and avoid many
> headaches ?
> 
>>
>> Thus we need to perform IGMP snooping on the brX interface, just like
>> any other interface of the bridge. However, currently the brX
>> interface is missing all the needed data structures to do this. There
>> is no net_bridge_port structure for the brX interface. This strucuture
>> is created when an interface is added to the bridge. But the brX
>> interface is not a member of the bridge. So this patchset makes the
>> brX interface a first class member of the bridge. When the brX
>> interface is opened, the interface is added to the bridge. A
>> net_bridge_port is allocated for it, and IGMP snooping is performed as
>> usual.
> 
> I have actually discussed this idea long time ago with Vlad and it has very nice
> upsides (most important one removing br/port checks everywhere) but it blows up
> fast with special cases for the bridge and things look very similar. You'll need
> to rework the whole bridge and turn every bridge special case into either a port 
> generic one or again bridge-specific special case but with a check for the new flag.
> I will not point out every bug that comes out of this, but registering the bridge
> rx handler to itself is simply wrong on many levels and breaks many setups.

This was a digression about making the bridge a proper port of itself
(e.g. port 0, linked and all), it is only tangential to this
implementation as it doesn't link the new port.

> 
>>
>> There are some complexities here. Some assumptions are broken, like
>> the master interface of a port interface is the bridge interface. The
>> brX interface cannot be its own master. The use of
>> netdev_master_upper_dev_get() within the bridge code has been changed
>> to reflecit this. The bridge receive handler needs to not process
>> frames for the brX interface, etc.
>>
>> The interface downward to the hardware is also an issue. The code
>> presented here is a hack and needs to change. But that is secondary
>> and can be solved once it is agreed how the bridge needs to change to
>> support this use case.
> 
> Definitely agree with this statement. :-)
> 
>>
>> Comment welcome and wanted.
>>
>> 	Andrew
>>
>> Andrew Lunn (5):
>>   net: rtnetlink: Handle bridge port without upper device
>>   net: bridge: Skip receive handler on brX interface
>>   net: bridge: Make the brX interface a member of the bridge
>>   net: dsa: HACK: Handle MDB add/remove for none-switch ports
>>   net: dsa: Don't include CPU port when adding MDB to a port
>>
>>  include/linux/if_bridge.h |  1 +
>>  net/bridge/br_device.c    | 12 ++++++++++--
>>  net/bridge/br_if.c        | 37 ++++++++++++++++++++++++-------------
>>  net/bridge/br_input.c     |  4 ++++
>>  net/bridge/br_mdb.c       |  2 --
>>  net/bridge/br_multicast.c |  7 ++++---
>>  net/bridge/br_private.h   |  1 +
>>  net/core/rtnetlink.c      | 23 +++++++++++++++++++++--
>>  net/dsa/port.c            | 19 +++++++++++++++++--
>>  net/dsa/switch.c          |  2 +-
>>  10 files changed, 83 insertions(+), 25 deletions(-)
>>
> 

^ permalink raw reply

* Re: [PATCH RFC WIP 0/5] IGMP snooping for local traffic
From: Nikolay Aleksandrov @ 2017-08-26 22:17 UTC (permalink / raw)
  To: Andrew Lunn, netdev
  Cc: Vivien Didelot, Florian Fainelli, jiri, roopa, stephen, bridge
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>

On 26/08/17 23:56, Andrew Lunn wrote:
> This is a WIP patchset i would like comments on from bridge, switchdev
> and hardware offload people.
> 
> The linux bridge supports IGMP snooping. It will listen to IGMP
> reports on bridge ports and keep track of which groups have been
> joined on an interface. It will then forward multicast based on this
> group membership.
> 
> When the bridge adds or removed groups from an interface, it uses
> switchdev to request the hardware add an mdb to a port, so the
> hardware can perform the selective forwarding between ports.
> 
> What is not covered by the current bridge code, is IGMP joins/leaves
> from the host on the brX interface. No such monitoring is

Hi Andrew,

Have you taken a look at mglist (the boolean, probably needs a rename) ? It is for
exactly that purpose, to track which groups the bridge is interested in.
I assume I'm forgetting or missing something here.

> performed. With a pure software bridge, it is not required. All
> mulitcast frames are passed to the brX interface, and the network

If mglist (again the boolean) is false then they won't be passed up.

> stack filters them, as it does for any interface. However, when
> hardware offload is involved, things change. We should program the
> hardware to only send multcast packets to the host when the host has
> in interest in them.

Granted the boolean mglist might need some changes (esp. with host group leave)
but I think it can be used to program switchdev for host join/leave, can't
we adjust its behaviour instead of introducing this complexity and avoid many
headaches ?

> 
> Thus we need to perform IGMP snooping on the brX interface, just like
> any other interface of the bridge. However, currently the brX
> interface is missing all the needed data structures to do this. There
> is no net_bridge_port structure for the brX interface. This strucuture
> is created when an interface is added to the bridge. But the brX
> interface is not a member of the bridge. So this patchset makes the
> brX interface a first class member of the bridge. When the brX
> interface is opened, the interface is added to the bridge. A
> net_bridge_port is allocated for it, and IGMP snooping is performed as
> usual.

I have actually discussed this idea long time ago with Vlad and it has very nice
upsides (most important one removing br/port checks everywhere) but it blows up
fast with special cases for the bridge and things look very similar. You'll need
to rework the whole bridge and turn every bridge special case into either a port 
generic one or again bridge-specific special case but with a check for the new flag.
I will not point out every bug that comes out of this, but registering the bridge
rx handler to itself is simply wrong on many levels and breaks many setups.

> 
> There are some complexities here. Some assumptions are broken, like
> the master interface of a port interface is the bridge interface. The
> brX interface cannot be its own master. The use of
> netdev_master_upper_dev_get() within the bridge code has been changed
> to reflecit this. The bridge receive handler needs to not process
> frames for the brX interface, etc.
> 
> The interface downward to the hardware is also an issue. The code
> presented here is a hack and needs to change. But that is secondary
> and can be solved once it is agreed how the bridge needs to change to
> support this use case.

Definitely agree with this statement. :-)

> 
> Comment welcome and wanted.
> 
> 	Andrew
> 
> Andrew Lunn (5):
>   net: rtnetlink: Handle bridge port without upper device
>   net: bridge: Skip receive handler on brX interface
>   net: bridge: Make the brX interface a member of the bridge
>   net: dsa: HACK: Handle MDB add/remove for none-switch ports
>   net: dsa: Don't include CPU port when adding MDB to a port
> 
>  include/linux/if_bridge.h |  1 +
>  net/bridge/br_device.c    | 12 ++++++++++--
>  net/bridge/br_if.c        | 37 ++++++++++++++++++++++++-------------
>  net/bridge/br_input.c     |  4 ++++
>  net/bridge/br_mdb.c       |  2 --
>  net/bridge/br_multicast.c |  7 ++++---
>  net/bridge/br_private.h   |  1 +
>  net/core/rtnetlink.c      | 23 +++++++++++++++++++++--
>  net/dsa/port.c            | 19 +++++++++++++++++--
>  net/dsa/switch.c          |  2 +-
>  10 files changed, 83 insertions(+), 25 deletions(-)
> 

^ permalink raw reply

* Re: [PATCH v4 4/5] net: stmmac: dwmac-sun8i: choose internal PHY via phy-is-integrated
From: Andrew Lunn @ 2017-08-26 21:20 UTC (permalink / raw)
  To: Corentin Labbe
  Cc: robh+dt, mark.rutland, maxime.ripard, wens, linux,
	peppe.cavallaro, alexandre.torgue, f.fainelli, icenowy, netdev,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <20170826073311.25612-5-clabbe.montjoie@gmail.com>

Hi Corentin

I think we have now all agreed this is an mdio-mux, plus it is also an
MII mux. We should represent that in device tree. This patchset does
this. However, as it is now, the mux structure in DT is ignored. All
it does is search for the phy-is-integrated flags and goes on that.

I made the comment that the device tree representation cannot be
implemented using an MDIO mux driver, because of driver loading
issues.  However, the core of the MDIO mux code is just a library,
symbols exported as GPL, free for anything to use.

What i think should happen is the mdio-mux is implemented inside the
MAC driver, using the mux-core as a library. The device tree structure
of a mix is then reflected within Linux. The mux switch callback is
implemented within the MAC driver. So it can reset the MAC when the
mux is switched. The 'phy-is-integrated' property is then no longer
needed.

I would suggest a binding something like:

emac: ethernet@1c0b000 {
        compatible = "allwinner,sun8i-h3-emac";
        syscon = <&syscon>;
        reg = <0x01c0b000 0x104>;
        interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
        interrupt-names = "macirq";
        resets = <&ccu RST_BUS_EMAC>;
        reset-names = "stmmaceth";
        clocks = <&ccu CLK_BUS_EMAC>;
        clock-names = "stmmaceth";
        #address-cells = <1>;
        #size-cells = <0>;

        phy-handle = <&int_mii_phy>;
        phy-mode = "mii";
        allwinner,leds-active-low;

        mdio: mdio {
                #address-cells = <1>;
                #size-cells = <0>;
	}

	mdio-mux {
                #address-cells = <1>;
                #size-cells = <0>;

		mdio@0 {
			reg = <0>;
                        #address-cells = <1>;
                        #size-cells = <0>;

                        int_mii_phy: ethernet-phy@1 {
                                reg = <1>;
                                clocks = <&ccu CLK_BUS_EPHY>;
                                resets = <&ccu RST_BUS_EPHY>;
                        };
                };
                ext_mdio: mdio@0 {
                        #address-cells = <1>;
                        #size-cells = <0>;

                        ext_rgmii_phy: ethernet-phy@1 {
                                reg = <1>;
                        };
                };
       };
};

	Andrew

^ permalink raw reply

* [PATCH RFC WIP 4/5] net: dsa: HACK: Handle MDB add/remove for none-switch ports
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
  To: netdev
  Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
	bridge, Andrew Lunn
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>

When there is a mdb added to a port which is not in the switch, we
need the switch to forward traffic for the group to the software
bridge, so it can forward it out the none-switch port.

The current implementation is a hack and will be replaced. Currently
only the bridge soft interface is supported. When there is a
join/leave on the soft interface, switchdev calls are made on the soft
interface device, brX. This does not have a switchdev ops structure
registered, so all lower interfaces of brX get there switchdev
function called. These are switch ports, and do have switchdev ops. By
comparing the original interface to the called interface, we can
determine this is not for a switch port, and add/remove the mdb to the
CPU port.
---
 net/dsa/port.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/net/dsa/port.c b/net/dsa/port.c
index d6e07176df3f..d8e4bfefd97d 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -194,8 +194,15 @@ int dsa_port_mdb_add(struct dsa_port *dp,
 		.mdb = mdb,
 	};
 
-	pr_info("dsa_port_mdb_add: %d %d", info.sw_index, info.port);
-
+	if (dp->netdev != mdb->obj.orig_dev) {
+		/* Not a port for this switch, so forward
+		 * multicast out the CPU port to the bridge.
+		 */
+		struct dsa_switch_tree *dst = dp->ds->dst;
+		struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
+		info.port = cpu_dp->index;
+		return dsa_port_notify(cpu_dp, DSA_NOTIFIER_MDB_ADD, &info);
+	}
 	return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info);
 }
 
@@ -208,6 +215,14 @@ int dsa_port_mdb_del(struct dsa_port *dp,
 		.mdb = mdb,
 	};
 
+	if (dp->netdev != mdb->obj.orig_dev) {
+		struct dsa_switch_tree *dst = dp->ds->dst;
+		struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
+
+		info.port = cpu_dp->index;
+		return dsa_port_notify(cpu_dp, DSA_NOTIFIER_MDB_DEL, &info);
+	}
+
 	return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info);
 }
 
-- 
2.14.1

^ permalink raw reply related

* [PATCH RFC WIP 3/5] net: bridge: Make the brX interface a member of the bridge
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
  To: netdev
  Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
	bridge, Andrew Lunn
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>

In order to perform IGMP snooping on the brX interface, it has to be
part of the bridge, so that the code snooping on normal bridge ports
keeps track of IGMP joins and leaves.

When the brX interface is opened, add the interface to the bridge.
When the brX interface is closed, remove it from the bridge.

This port does however need some special handling. So add a bridge
port flag, BR_SOFT_INTERFACE, indicating a port is the sort interface
of the bridge.

When the port is added to the bridge, the netdev for this port cannot
be linked to the master device, since it is the master device.
Similarly when removing the port, it cannot be unlinked from the
master device.

With the brX interface now being a member of the bridge, and having
all associated structures, we can process IGMP messages sent by the
interface. This is done by the br_multicast_rcv() function, which
takes the bridge_port structure as a parameter. This cannot be easily
found, so keep track of it in the net_bridge structure.
---
 include/linux/if_bridge.h |  1 +
 net/bridge/br_device.c    | 12 ++++++++++--
 net/bridge/br_if.c        | 37 ++++++++++++++++++++++++-------------
 net/bridge/br_mdb.c       |  2 --
 net/bridge/br_multicast.c |  7 ++++---
 net/bridge/br_private.h   |  1 +
 6 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 3cd18ac0697f..8a03821d1827 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -49,6 +49,7 @@ struct br_ip_list {
 #define BR_MULTICAST_TO_UNICAST	BIT(12)
 #define BR_VLAN_TUNNEL		BIT(13)
 #define BR_BCAST_FLOOD		BIT(14)
+#define BR_SOFT_INTERFACE	BIT(15)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 861ae2a165f4..f27ca62fd4a5 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -69,7 +69,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
 			goto out;
 		}
-		if (br_multicast_rcv(br, NULL, skb, vid)) {
+		if (br_multicast_rcv(br, br->local_port, skb, vid)) {
 			kfree_skb(skb);
 			goto out;
 		}
@@ -133,6 +133,14 @@ static void br_dev_uninit(struct net_device *dev)
 static int br_dev_open(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
+	int err;
+
+	err = br_add_if(br, br->dev);
+	if (err)
+		return err;
+
+	br->local_port = list_first_or_null_rcu(&br->port_list,
+						struct net_bridge_port, list);
 
 	netdev_update_features(dev);
 	netif_start_queue(dev);
@@ -161,7 +169,7 @@ static int br_dev_stop(struct net_device *dev)
 
 	netif_stop_queue(dev);
 
-	return 0;
+	return br_del_if(br, br->dev);
 }
 
 static void br_get_stats64(struct net_device *dev,
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f3aef22931ab..49208e774191 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -284,7 +284,8 @@ static void del_nbp(struct net_bridge_port *p)
 
 	nbp_update_port_count(br);
 
-	netdev_upper_dev_unlink(dev, br->dev);
+	if (!(p->flags & BR_SOFT_INTERFACE))
+		netdev_upper_dev_unlink(dev, br->dev);
 
 	dev->priv_flags &= ~IFF_BRIDGE_PORT;
 
@@ -362,6 +363,8 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	p->priority = 0x8000 >> BR_PORT_BITS;
 	p->port_no = index;
 	p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
+	if (br->dev == dev)
+		p->flags |= BR_SOFT_INTERFACE;
 	br_init_port(p);
 	br_set_state(p, BR_STATE_DISABLED);
 	br_stp_port_timer_init(p);
@@ -500,8 +503,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 		return -EINVAL;
 
 	/* No bridging of bridges */
-	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
-		return -ELOOP;
+	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) {
+		/* Unless it is our own soft interface */
+		if (br->dev != dev)
+			return -ELOOP;
+	}
 
 	/* Device is already being bridged */
 	if (br_port_exists(dev))
@@ -540,9 +546,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 
 	dev->priv_flags |= IFF_BRIDGE_PORT;
 
-	err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL);
-	if (err)
-		goto err5;
+	if (!(p->flags & BR_SOFT_INTERFACE)) {
+		err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL);
+		if (err)
+			goto err5;
+	}
 
 	err = nbp_switchdev_mark_set(p);
 	if (err)
@@ -563,13 +571,15 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	else
 		netdev_set_rx_headroom(dev, br_hr);
 
-	if (br_fdb_insert(br, p, dev->dev_addr, 0))
-		netdev_err(dev, "failed insert local address bridge forwarding table\n");
+	if (!(p->flags & BR_SOFT_INTERFACE)) {
+		if (br_fdb_insert(br, p, dev->dev_addr, 0))
+			netdev_err(dev, "failed insert local address bridge forwarding table\n");
 
-	err = nbp_vlan_init(p);
-	if (err) {
-		netdev_err(dev, "failed to initialize vlan filtering on this port\n");
-		goto err7;
+		err = nbp_vlan_init(p);
+		if (err) {
+			netdev_err(dev, "failed to initialize vlan filtering on this port\n");
+			goto err7;
+		}
 	}
 
 	spin_lock_bh(&br->lock);
@@ -597,7 +607,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	br_fdb_delete_by_port(br, p, 0, 1);
 	nbp_update_port_count(br);
 err6:
-	netdev_upper_dev_unlink(dev, br->dev);
+	if (!(p->flags & BR_SOFT_INTERFACE))
+		netdev_upper_dev_unlink(dev, br->dev);
 err5:
 	dev->priv_flags &= ~IFF_BRIDGE_PORT;
 	netdev_rx_handler_unregister(dev);
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index a0b11e7d67d9..47f0d9b4221d 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -117,8 +117,6 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 				struct br_mdb_entry e;
 
 				port = p->port;
-				if (!port)
-					continue;
 
 				memset(&e, 0, sizeof(e));
 				e.ifindex = port->dev->ifindex;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index dae3af1f531a..f1bf9ec15de8 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -915,7 +915,7 @@ static void __br_multicast_send_query(struct net_bridge *br,
 	if (!skb)
 		return;
 
-	if (port) {
+	if (port && !(port->flags & BR_SOFT_INTERFACE)) {
 		skb->dev = port->dev;
 		br_multicast_count(br, port, skb, igmp_type,
 				   BR_MCAST_DIR_TX);
@@ -944,8 +944,9 @@ static void br_multicast_send_query(struct net_bridge *br,
 
 	memset(&br_group.u, 0, sizeof(br_group.u));
 
-	if (port ? (own_query == &port->ip4_own_query) :
-		   (own_query == &br->ip4_own_query)) {
+	if (port && !(port->flags & BR_SOFT_INTERFACE) ?
+	    (own_query == &port->ip4_own_query) :
+	    (own_query == &br->ip4_own_query)) {
 		other_query = &br->ip4_other_query;
 		br_group.proto = htons(ETH_P_IP);
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index fd9ee73e0a6d..c4b99a35abb0 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -296,6 +296,7 @@ struct net_bridge {
 	spinlock_t			lock;
 	spinlock_t			hash_lock;
 	struct list_head		port_list;
+	struct net_bridge_port		*local_port;
 	struct net_device		*dev;
 	struct pcpu_sw_netstats		__percpu *stats;
 	/* These fields are accessed on each packet */
-- 
2.14.1

^ permalink raw reply related

* [PATCH RFC WIP 2/5] net: bridge: Skip receive handler on brX interface
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
  To: netdev
  Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
	bridge, Andrew Lunn
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>

The brX interface will soon become a member of the bridge. As such, it
will get a receiver handler assigned. However, we don't want to handle
packets received on this soft interfaces. So detect the condition and
say all the packets pass.
---
 net/bridge/br_input.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7637f58c1226..38c2a41968f2 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -267,6 +267,10 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 		return RX_HANDLER_CONSUMED;
 
 	p = br_port_get_rcu(skb->dev);
+
+	if (p->dev == p->br->dev)
+		return RX_HANDLER_PASS;
+
 	if (p->flags & BR_VLAN_TUNNEL) {
 		if (br_handle_ingress_vlan_tunnel(skb, p,
 						  nbp_vlan_group_rcu(p)))
-- 
2.14.1

^ permalink raw reply related

* [PATCH RFC WIP 1/5] net: rtnetlink: Handle bridge port without upper device
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
  To: netdev
  Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
	bridge, Andrew Lunn
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>

The brX interface will with a following patch becomes a member of the
bridge. It however cannot be a slave interface, since it would have to
be a slave of itself. netdev_master_upper_dev_get() returns NULL as a
result. Handle this NULL, by knowing this bridge slave must also be
the master, i.e. what we are looking for.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
---
 net/core/rtnetlink.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9201e3621351..2673eb430b6f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3093,8 +3093,12 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
 	    (dev->priv_flags & IFF_BRIDGE_PORT)) {
 		struct net_device *br_dev = netdev_master_upper_dev_get(dev);
-		const struct net_device_ops *ops = br_dev->netdev_ops;
+		const struct net_device_ops *ops;
 
+		if (!br_dev)
+			br_dev = dev;
+
+		ops = br_dev->netdev_ops;
 		err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
 				       nlh->nlmsg_flags);
 		if (err)
@@ -3197,7 +3201,12 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
 	    (dev->priv_flags & IFF_BRIDGE_PORT)) {
 		struct net_device *br_dev = netdev_master_upper_dev_get(dev);
-		const struct net_device_ops *ops = br_dev->netdev_ops;
+		const struct net_device_ops *ops;
+
+		if (!br_dev)
+			br_dev = dev;
+
+		ops = br_dev->netdev_ops;
 
 		if (ops->ndo_fdb_del)
 			err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
@@ -3332,6 +3341,8 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			if (!br_idx) { /* user did not specify a specific bridge */
 				if (dev->priv_flags & IFF_BRIDGE_PORT) {
 					br_dev = netdev_master_upper_dev_get(dev);
+					if (!br_dev)
+						br_dev = dev;
 					cops = br_dev->netdev_ops;
 				}
 			} else {
@@ -3410,6 +3421,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	struct net_device *br_dev = netdev_master_upper_dev_get(dev);
 	int err = 0;
 
+	if (!br_dev)
+		br_dev = dev;
+
 	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
@@ -3647,6 +3661,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
 		struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+		if (!br_dev)
+			br_dev = dev;
 
 		if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
 			err = -EOPNOTSUPP;
@@ -3723,6 +3739,9 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
 		struct net_device *br_dev = netdev_master_upper_dev_get(dev);
 
+		if (!br_dev)
+			br_dev = dev;
+
 		if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) {
 			err = -EOPNOTSUPP;
 			goto out;
-- 
2.14.1

^ permalink raw reply related

* [PATCH RFC WIP 0/5] IGMP snooping for local traffic
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
  To: netdev
  Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
	bridge, Andrew Lunn

This is a WIP patchset i would like comments on from bridge, switchdev
and hardware offload people.

The linux bridge supports IGMP snooping. It will listen to IGMP
reports on bridge ports and keep track of which groups have been
joined on an interface. It will then forward multicast based on this
group membership.

When the bridge adds or removed groups from an interface, it uses
switchdev to request the hardware add an mdb to a port, so the
hardware can perform the selective forwarding between ports.

What is not covered by the current bridge code, is IGMP joins/leaves
from the host on the brX interface. No such monitoring is
performed. With a pure software bridge, it is not required. All
mulitcast frames are passed to the brX interface, and the network
stack filters them, as it does for any interface. However, when
hardware offload is involved, things change. We should program the
hardware to only send multcast packets to the host when the host has
in interest in them.

Thus we need to perform IGMP snooping on the brX interface, just like
any other interface of the bridge. However, currently the brX
interface is missing all the needed data structures to do this. There
is no net_bridge_port structure for the brX interface. This strucuture
is created when an interface is added to the bridge. But the brX
interface is not a member of the bridge. So this patchset makes the
brX interface a first class member of the bridge. When the brX
interface is opened, the interface is added to the bridge. A
net_bridge_port is allocated for it, and IGMP snooping is performed as
usual.

There are some complexities here. Some assumptions are broken, like
the master interface of a port interface is the bridge interface. The
brX interface cannot be its own master. The use of
netdev_master_upper_dev_get() within the bridge code has been changed
to reflecit this. The bridge receive handler needs to not process
frames for the brX interface, etc.

The interface downward to the hardware is also an issue. The code
presented here is a hack and needs to change. But that is secondary
and can be solved once it is agreed how the bridge needs to change to
support this use case.

Comment welcome and wanted.

	Andrew

Andrew Lunn (5):
  net: rtnetlink: Handle bridge port without upper device
  net: bridge: Skip receive handler on brX interface
  net: bridge: Make the brX interface a member of the bridge
  net: dsa: HACK: Handle MDB add/remove for none-switch ports
  net: dsa: Don't include CPU port when adding MDB to a port

 include/linux/if_bridge.h |  1 +
 net/bridge/br_device.c    | 12 ++++++++++--
 net/bridge/br_if.c        | 37 ++++++++++++++++++++++++-------------
 net/bridge/br_input.c     |  4 ++++
 net/bridge/br_mdb.c       |  2 --
 net/bridge/br_multicast.c |  7 ++++---
 net/bridge/br_private.h   |  1 +
 net/core/rtnetlink.c      | 23 +++++++++++++++++++++--
 net/dsa/port.c            | 19 +++++++++++++++++--
 net/dsa/switch.c          |  2 +-
 10 files changed, 83 insertions(+), 25 deletions(-)

-- 
2.14.1

^ permalink raw reply

* [PATCH RFC WIP 5/5] net: dsa: Don't include CPU port when adding MDB to a port
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, Florian Fainelli, Vivien Didelot, nikolay, roopa,
	bridge, jiri
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>

Now that the MDB are explicitly added to the CPU port when required,
don't add the CPU port adding an MDB to a switch port.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
---
 net/dsa/switch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 97e2e9c8cf3f..c178e2b86a9a 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -130,7 +130,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
 	if (ds->index == info->sw_index)
 		set_bit(info->port, group);
 	for (port = 0; port < ds->num_ports; port++)
-		if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
+		if (dsa_is_dsa_port(ds, port))
 			set_bit(port, group);
 
 	if (switchdev_trans_ph_prepare(trans)) {
-- 
2.14.1

^ permalink raw reply related

* [PATCH] net: ethernet: broadcom: Remove null check before kfree
From: Himanshu Jha @ 2017-08-26 20:17 UTC (permalink / raw)
  To: davem; +Cc: jarod, netdev, linux-kernel, Himanshu Jha

Kfree on NULL pointer is a no-op and therefore checking is redundant.

Signed-off-by: Himanshu Jha <himanshujha199640@gmail.com>
---
 drivers/net/ethernet/broadcom/sb1250-mac.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c b/drivers/net/ethernet/broadcom/sb1250-mac.c
index 16a0f19..ecdef42 100644
--- a/drivers/net/ethernet/broadcom/sb1250-mac.c
+++ b/drivers/net/ethernet/broadcom/sb1250-mac.c
@@ -1367,15 +1367,11 @@ static int sbmac_initctx(struct sbmac_softc *s)
 
 static void sbdma_uninitctx(struct sbmacdma *d)
 {
-	if (d->sbdma_dscrtable_unaligned) {
-		kfree(d->sbdma_dscrtable_unaligned);
-		d->sbdma_dscrtable_unaligned = d->sbdma_dscrtable = NULL;
-	}
+	kfree(d->sbdma_dscrtable_unaligned);
+	d->sbdma_dscrtable_unaligned = d->sbdma_dscrtable = NULL;
 
-	if (d->sbdma_ctxtable) {
-		kfree(d->sbdma_ctxtable);
-		d->sbdma_ctxtable = NULL;
-	}
+	kfree(d->sbdma_ctxtable);
+	d->sbdma_ctxtable = NULL;
 }
 
 
-- 
2.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox