Netdev List

Netdev List
 help / color / mirror / Atom feed

* [patch net-next v9 2/3] net: core: Add offload stats to if_stats_msg
From: Jiri Pirko @ 2016-09-14  9:28 UTC (permalink / raw)
  To: netdev
  Cc: davem, nogahf, idosch, eladr, yotamg, ogerlitz, roopa, nikolay,
	linville, tgraf, gospo, sfeldma, sd, eranbe, ast, edumazet,
	hannes, f.fainelli, dsa
In-Reply-To: <1473845322-16679-1-git-send-email-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

Add a nested attribute of offload stats to if_stats_msg
named IFLA_STATS_LINK_OFFLOAD_XSTATS.
Under it, add SW stats, meaning stats only per packets that went via
slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/uapi/linux/if_link.h |   9 ++++
 net/core/rtnetlink.c         | 111 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 116 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9bf3aec..2351776 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -826,6 +826,7 @@ enum {
 	IFLA_STATS_LINK_64,
 	IFLA_STATS_LINK_XSTATS,
 	IFLA_STATS_LINK_XSTATS_SLAVE,
+	IFLA_STATS_LINK_OFFLOAD_XSTATS,
 	__IFLA_STATS_MAX,
 };
 
@@ -845,6 +846,14 @@ enum {
 };
 #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1)
 
+/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */
+enum {
+	IFLA_OFFLOAD_XSTATS_UNSPEC,
+	IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */
+	__IFLA_OFFLOAD_XSTATS_MAX
+};
+#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1)
+
 /* XDP section */
 
 enum {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 937e459..ae2048a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3577,6 +3577,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
 	       (!idxattr || idxattr == attrid);
 }
 
+#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
+static int rtnl_get_offload_stats_attr_size(int attr_id)
+{
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		return sizeof(struct rtnl_link_stats64);
+	}
+
+	return 0;
+}
+
+static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
+				  int *prividx)
+{
+	struct nlattr *attr = NULL;
+	int attr_id, size;
+	void *attr_data;
+	int err;
+
+	if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+	      dev->netdev_ops->ndo_get_offload_stats))
+		return -ENODATA;
+
+	for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+	     attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+		if (attr_id < *prividx)
+			continue;
+
+		size = rtnl_get_offload_stats_attr_size(attr_id);
+		if (!size)
+			continue;
+
+		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+			continue;
+
+		attr = nla_reserve_64bit(skb, attr_id, size,
+					 IFLA_OFFLOAD_XSTATS_UNSPEC);
+		if (!attr)
+			goto nla_put_failure;
+
+		attr_data = nla_data(attr);
+		memset(attr_data, 0, size);
+		err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
+							     attr_data);
+		if (err)
+			goto get_offload_stats_failure;
+	}
+
+	if (!attr)
+		return -ENODATA;
+
+	*prividx = 0;
+	return 0;
+
+nla_put_failure:
+	err = -EMSGSIZE;
+get_offload_stats_failure:
+	*prividx = attr_id;
+	return err;
+}
+
+static int rtnl_get_offload_stats_size(const struct net_device *dev)
+{
+	int nla_size = 0;
+	int attr_id;
+	int size;
+
+	if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+	      dev->netdev_ops->ndo_get_offload_stats))
+		return 0;
+
+	for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+	     attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+			continue;
+		size = rtnl_get_offload_stats_attr_size(attr_id);
+		nla_size += nla_total_size_64bit(size);
+	}
+
+	if (nla_size != 0)
+		nla_size += nla_total_size(0);
+
+	return nla_size;
+}
+
 static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 			       int type, u32 pid, u32 seq, u32 change,
 			       unsigned int flags, unsigned int filter_mask,
@@ -3586,6 +3671,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 	struct nlmsghdr *nlh;
 	struct nlattr *attr;
 	int s_prividx = *prividx;
+	int err;
 
 	ASSERT_RTNL();
 
@@ -3614,8 +3700,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
 
 		if (ops && ops->fill_linkxstats) {
-			int err;
-
 			*idxattr = IFLA_STATS_LINK_XSTATS;
 			attr = nla_nest_start(skb,
 					      IFLA_STATS_LINK_XSTATS);
@@ -3639,8 +3723,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		if (master)
 			ops = master->rtnl_link_ops;
 		if (ops && ops->fill_linkxstats) {
-			int err;
-
 			*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
 			attr = nla_nest_start(skb,
 					      IFLA_STATS_LINK_XSTATS_SLAVE);
@@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+			     *idxattr)) {
+		*idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+		attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
+		if (!attr)
+			goto nla_put_failure;
+
+		err = rtnl_get_offload_stats(skb, dev, prividx);
+		if (err == -ENODATA)
+			nla_nest_cancel(skb, attr);
+		else
+			nla_nest_end(skb, attr);
+
+		if ((err) && (err != -ENODATA))
+			goto nla_put_failure;
+		*idxattr = 0;
+	}
+
 	nlmsg_end(skb, nlh);
 
 	return 0;
@@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
 		}
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
+		size += rtnl_get_offload_stats_size(dev);
+
 	return size;
 }
 
-- 
2.5.5

^ permalink raw reply related

* [patch net-next v9 3/3] mlxsw: spectrum: Implement offload stats ndo and expose HW stats by default
From: Jiri Pirko @ 2016-09-14  9:28 UTC (permalink / raw)
  To: netdev
  Cc: davem, nogahf, idosch, eladr, yotamg, ogerlitz, roopa, nikolay,
	linville, tgraf, gospo, sfeldma, sd, eranbe, ast, edumazet,
	hannes, f.fainelli, dsa
In-Reply-To: <1473845322-16679-1-git-send-email-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

Change the default statistics ndo to return HW statistics
(like the one returned by ethtool_ops).
The HW stats are collected to a cache by delayed work every 1 sec.
Implement the offload stat ndo.
Add a function to get SW statistics, to be called from this function.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 129 +++++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |   5 +
 2 files changed, 127 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 27bbcaf..171f8dd 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -819,9 +819,9 @@ err_span_port_mtu_update:
 	return err;
 }
 
-static struct rtnl_link_stats64 *
-mlxsw_sp_port_get_stats64(struct net_device *dev,
-			  struct rtnl_link_stats64 *stats)
+int
+mlxsw_sp_port_get_sw_stats64(const struct net_device *dev,
+			     struct rtnl_link_stats64 *stats)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
 	struct mlxsw_sp_port_pcpu_stats *p;
@@ -848,6 +848,107 @@ mlxsw_sp_port_get_stats64(struct net_device *dev,
 		tx_dropped	+= p->tx_dropped;
 	}
 	stats->tx_dropped	= tx_dropped;
+	return 0;
+}
+
+bool mlxsw_sp_port_has_offload_stats(int attr_id)
+{
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		return true;
+	}
+
+	return false;
+}
+
+int mlxsw_sp_port_get_offload_stats(int attr_id, const struct net_device *dev,
+				    void *sp)
+{
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		return mlxsw_sp_port_get_sw_stats64(dev, sp);
+	}
+
+	return -EINVAL;
+}
+
+static int mlxsw_sp_port_get_stats_raw(struct net_device *dev, int grp,
+				       int prio, char *ppcnt_pl)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+
+	mlxsw_reg_ppcnt_pack(ppcnt_pl, mlxsw_sp_port->local_port, grp, prio);
+	return mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ppcnt), ppcnt_pl);
+}
+
+static int mlxsw_sp_port_get_hw_stats(struct net_device *dev,
+				      struct rtnl_link_stats64 *stats)
+{
+	char ppcnt_pl[MLXSW_REG_PPCNT_LEN];
+	int err;
+
+	err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_IEEE_8023_CNT,
+					  0, ppcnt_pl);
+	if (err)
+		goto out;
+
+	stats->tx_packets =
+		mlxsw_reg_ppcnt_a_frames_transmitted_ok_get(ppcnt_pl);
+	stats->rx_packets =
+		mlxsw_reg_ppcnt_a_frames_received_ok_get(ppcnt_pl);
+	stats->tx_bytes =
+		mlxsw_reg_ppcnt_a_octets_transmitted_ok_get(ppcnt_pl);
+	stats->rx_bytes =
+		mlxsw_reg_ppcnt_a_octets_received_ok_get(ppcnt_pl);
+	stats->multicast =
+		mlxsw_reg_ppcnt_a_multicast_frames_received_ok_get(ppcnt_pl);
+
+	stats->rx_crc_errors =
+		mlxsw_reg_ppcnt_a_frame_check_sequence_errors_get(ppcnt_pl);
+	stats->rx_frame_errors =
+		mlxsw_reg_ppcnt_a_alignment_errors_get(ppcnt_pl);
+
+	stats->rx_length_errors = (
+		mlxsw_reg_ppcnt_a_in_range_length_errors_get(ppcnt_pl) +
+		mlxsw_reg_ppcnt_a_out_of_range_length_field_get(ppcnt_pl) +
+		mlxsw_reg_ppcnt_a_frame_too_long_errors_get(ppcnt_pl));
+
+	stats->rx_errors = (stats->rx_crc_errors +
+		stats->rx_frame_errors + stats->rx_length_errors);
+
+out:
+	return err;
+}
+
+static void update_stats_cache(struct work_struct *work)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+		container_of(work, struct mlxsw_sp_port,
+			     hw_stats.update_dw.work);
+
+	if (!netif_carrier_ok(mlxsw_sp_port->dev))
+		goto out;
+
+	mlxsw_sp_port_get_hw_stats(mlxsw_sp_port->dev,
+				   mlxsw_sp_port->hw_stats.cache);
+
+out:
+	mlxsw_core_schedule_dw(&mlxsw_sp_port->hw_stats.update_dw,
+			       MLXSW_HW_STATS_UPDATE_TIME);
+}
+
+/* Return the stats from a cache that is updated periodically,
+ * as this function might get called in an atomic context.
+ */
+static struct rtnl_link_stats64 *
+mlxsw_sp_port_get_stats64(struct net_device *dev,
+			  struct rtnl_link_stats64 *stats)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	memcpy(stats, mlxsw_sp_port->hw_stats.cache, sizeof(*stats));
+
 	return stats;
 }
 
@@ -1209,6 +1310,8 @@ static const struct net_device_ops mlxsw_sp_port_netdev_ops = {
 	.ndo_set_mac_address	= mlxsw_sp_port_set_mac_address,
 	.ndo_change_mtu		= mlxsw_sp_port_change_mtu,
 	.ndo_get_stats64	= mlxsw_sp_port_get_stats64,
+	.ndo_has_offload_stats	= mlxsw_sp_port_has_offload_stats,
+	.ndo_get_offload_stats	= mlxsw_sp_port_get_offload_stats,
 	.ndo_vlan_rx_add_vid	= mlxsw_sp_port_add_vid,
 	.ndo_vlan_rx_kill_vid	= mlxsw_sp_port_kill_vid,
 	.ndo_neigh_construct	= mlxsw_sp_router_neigh_construct,
@@ -1547,8 +1650,6 @@ static void __mlxsw_sp_port_get_stats(struct net_device *dev,
 				      enum mlxsw_reg_ppcnt_grp grp, int prio,
 				      u64 *data, int data_index)
 {
-	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct mlxsw_sp_port_hw_stats *hw_stats;
 	char ppcnt_pl[MLXSW_REG_PPCNT_LEN];
 	int i, len;
@@ -1557,8 +1658,7 @@ static void __mlxsw_sp_port_get_stats(struct net_device *dev,
 	err = mlxsw_sp_get_hw_stats_by_group(&hw_stats, &len, grp);
 	if (err)
 		return;
-	mlxsw_reg_ppcnt_pack(ppcnt_pl, mlxsw_sp_port->local_port, grp, prio);
-	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ppcnt), ppcnt_pl);
+	mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl);
 	for (i = 0; i < len; i++)
 		data[data_index + i] = !err ? hw_stats[i].getter(ppcnt_pl) : 0;
 }
@@ -2145,6 +2245,16 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
 		goto err_alloc_stats;
 	}
 
+	mlxsw_sp_port->hw_stats.cache =
+		kzalloc(sizeof(*mlxsw_sp_port->hw_stats.cache), GFP_KERNEL);
+
+	if (!mlxsw_sp_port->hw_stats.cache) {
+		err = -ENOMEM;
+		goto err_alloc_hw_stats;
+	}
+	INIT_DELAYED_WORK(&mlxsw_sp_port->hw_stats.update_dw,
+			  &update_stats_cache);
+
 	dev->netdev_ops = &mlxsw_sp_port_netdev_ops;
 	dev->ethtool_ops = &mlxsw_sp_port_ethtool_ops;
 
@@ -2245,6 +2355,7 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
 		goto err_core_port_init;
 	}
 
+	mlxsw_core_schedule_dw(&mlxsw_sp_port->hw_stats.update_dw, 0);
 	return 0;
 
 err_core_port_init:
@@ -2265,6 +2376,8 @@ err_port_system_port_mapping_set:
 err_dev_addr_init:
 	mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT);
 err_port_swid_set:
+	kfree(mlxsw_sp_port->hw_stats.cache);
+err_alloc_hw_stats:
 	free_percpu(mlxsw_sp_port->pcpu_stats);
 err_alloc_stats:
 	kfree(mlxsw_sp_port->untagged_vlans);
@@ -2281,6 +2394,7 @@ static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port)
 
 	if (!mlxsw_sp_port)
 		return;
+	cancel_delayed_work_sync(&mlxsw_sp_port->hw_stats.update_dw);
 	mlxsw_core_port_fini(&mlxsw_sp_port->core_port);
 	unregister_netdev(mlxsw_sp_port->dev); /* This calls ndo_stop */
 	mlxsw_sp->ports[local_port] = NULL;
@@ -2290,6 +2404,7 @@ static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port)
 	mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT);
 	mlxsw_sp_port_module_unmap(mlxsw_sp, mlxsw_sp_port->local_port);
 	free_percpu(mlxsw_sp_port->pcpu_stats);
+	kfree(mlxsw_sp_port->hw_stats.cache);
 	kfree(mlxsw_sp_port->untagged_vlans);
 	kfree(mlxsw_sp_port->active_vlans);
 	WARN_ON_ONCE(!list_empty(&mlxsw_sp_port->vports_list));
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 969c250..49f4caf 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -361,6 +361,11 @@ struct mlxsw_sp_port {
 	struct list_head vports_list;
 	/* TC handles */
 	struct list_head mall_tc_list;
+	struct {
+		#define MLXSW_HW_STATS_UPDATE_TIME HZ
+		struct rtnl_link_stats64 *cache;
+		struct delayed_work update_dw;
+	} hw_stats;
 };
 
 struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev);
-- 
2.5.5

^ permalink raw reply related

* Re: [PATCH RFC 5/6] net: Generic resolver backend
From: Thomas Graf @ 2016-09-14  9:49 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, kernel-team
In-Reply-To: <1473463197-3076903-6-git-send-email-tom@herbertland.com>

On 09/09/16 at 04:19pm, Tom Herbert wrote:
> diff --git a/net/core/resolver.c b/net/core/resolver.c
> new file mode 100644
> index 0000000..61b36c5
> --- /dev/null
> +++ b/net/core/resolver.c
> @@ -0,0 +1,267 @@
> +#include <linux/errno.h>
> +#include <linux/ip.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/netlink.h>
> +#include <linux/skbuff.h>
> +#include <linux/socket.h>
> +#include <linux/types.h>
> +#include <linux/vmalloc.h>
> +#include <net/checksum.h>
> +#include <net/ip.h>
> +#include <net/ip6_fib.h>
> +#include <net/lwtunnel.h>
> +#include <net/protocol.h>
> +#include <net/resolver.h>
> +#include <uapi/linux/ila.h>

This include list could be stripped down a bit. ila, lwt, fib, ...

> +
> +static struct net_rslv_ent *net_rslv_new_ent(struct net_rslv *nrslv,
> +					     void *key)

Comment above that net_rslv_get_lock() must be held?

> +{
> +	struct net_rslv_ent *nrent;
> +	int err;
> +
> +	nrent = kzalloc(sizeof(*nrent) + nrslv->obj_size, GFP_KERNEL);

GFP_ATOMIC since you typically hold net_rslv_get_lock() spinlock?

> +	if (!nrent)
> +		return ERR_PTR(-EAGAIN);
> +
> +	/* Key is always at beginning of object data */
> +	memcpy(nrent->object, key, nrslv->params.key_len);
> +
> +	/* Initialize user data */
> +	if (nrslv->rslv_init)
> +		nrslv->rslv_init(nrslv, nrent);
> +
> +	/* Put in hash table */
> +	err = rhashtable_lookup_insert_fast(&nrslv->rhash_table,
> +					    &nrent->node, nrslv->params);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	if (nrslv->timeout) {
> +		/* Schedule timeout for resolver */
> +		INIT_DELAYED_WORK(&nrent->timeout_work, net_rslv_delayed_work);

Should this be done before inserting into rhashtable?

> +		schedule_delayed_work(&nrent->timeout_work, nrslv->timeout);
> +	}
> +
> +	nrent->nrslv = nrslv;

Same here.  net_rslv_cancel_all_delayed_work() walking the rhashtable could
see ->nrslv as NULL.

^ permalink raw reply

* Re: [RFC PATCH] xen-netback: fix error handling on netback_probe()
From: Wei Liu @ 2016-09-14 10:10 UTC (permalink / raw)
  To: Filipe Manco; +Cc: netdev, wei.liu2, Xen-devel
In-Reply-To: <1473768687-27384-1-git-send-email-filipe.manco@neclab.eu>

CC xen-devel as well.

On Tue, Sep 13, 2016 at 02:11:27PM +0200, Filipe Manco wrote:
> In case of error during netback_probe() (e.g. an entry missing on the
> xenstore) netback_remove() is called on the new device, which will set
> the device backend state to XenbusStateClosed by calling
> set_backend_state(). However, the backend state wasn't initialized by
> netback_probe() at this point, which will cause and invalid transaction
> and set_backend_state() to BUG().
> 
> Initialize the backend state at the beginning of netback_probe() to
> XenbusStateInitialising, and create a new valid state transaction on
> set_backend_state(), from XenbusStateInitialising to XenbusStateClosed.
> 
> Signed-off-by: Filipe Manco <filipe.manco@neclab.eu>

There is a state machine right before set_backend_state. You would also
need to update that.

According to the definition of XenbusStateInitialising, this patch looks
plausible to me.

Wei.

> ---
>  drivers/net/xen-netback/xenbus.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
> index 6a31f2610c23..c0e5f6994d01 100644
> --- a/drivers/net/xen-netback/xenbus.c
> +++ b/drivers/net/xen-netback/xenbus.c
> @@ -270,6 +270,7 @@ static int netback_probe(struct xenbus_device *dev,
>  
>  	be->dev = dev;
>  	dev_set_drvdata(&dev->dev, be);
> +	be->state = XenbusStateInitialising;
>  
>  	sg = 1;
>  
> @@ -515,6 +516,15 @@ static void set_backend_state(struct backend_info *be,
>  {
>  	while (be->state != state) {
>  		switch (be->state) {
> +		case XenbusStateInitialising:
> +			switch (state) {
> +			case XenbusStateClosed:
> +				backend_switch_state(be, XenbusStateClosed);
> +				break;
> +			default:
> +				BUG();
> +			}
> +			break;
>  		case XenbusStateClosed:
>  			switch (state) {
>  			case XenbusStateInitWait:
> -- 
> 2.7.4
> 

^ permalink raw reply

* RE: [RFC 03/11] Add support for RoCE HW init
From: Amrani, Ram @ 2016-09-14 10:13 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Yuval.Mintz@qlogic.com, Ariel.Elior@qlogic.com,
	Michal.Kalderon@qlogic.com, rajesh.borundia@qlogic.com,
	linux-rdma@vger.kernel.org, netdev@vger.kernel.org,
	dledford@redhat.com, davem@davemloft.net
In-Reply-To: <29e0d88d-63f2-2b30-2856-a0e94cf31f8e@grimberg.me>

> > +	dev->max_sge = min_t(u32, RDMA_MAX_SGE_PER_SQ_WQE,
> > +			     RDMA_MAX_SGE_PER_RQ_WQE);
> 
> Our kernel target mode consumers sort of rely on max_sge_rd, you need to
> make sure to set it too.
Good catch. Thanks!

^ permalink raw reply

* RE: [RFC 08/11] Add support for data path
From: Amrani, Ram @ 2016-09-14 10:16 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Yuval.Mintz@qlogic.com, Ariel.Elior@qlogic.com,
	Michal.Kalderon@qlogic.com, rajesh.borundia@qlogic.com,
	linux-rdma@vger.kernel.org, netdev@vger.kernel.org,
	dledford@redhat.com, davem@davemloft.net
In-Reply-To: <40a9df9e-a0aa-9020-c523-6042baf729d1@grimberg.me>

> > +	pbe = (struct regpair *)pbl_table->va;
> > +	num_pbes = 0;
> > +
> > +	for (i = 0; i < mr->npages &&
> > +	     (total_num_pbes != mr->info.pbl_info.num_pbes); i++) {
> > +		u64 buf_addr = mr->pages[i];
> > +
> > +		pbe->lo = cpu_to_le32((u32)buf_addr);
> > +		pbe->hi = cpu_to_le32((u32)upper_32_bits(buf_addr));
> 
> Thats a shame... you could have easily set the buf_addr correctly in
> qedr_set_page...
> 
> I think you could have also set the pbe directly from set_page if you have access
> to pbl_table from your mr context (and if I understand correctly I think you do,
> mr->info.pbl_table)...
I see what you mean, we can surely improve here and will. Thanks.

^ permalink raw reply

* RE: [RFC 07/11] Add support for memory registeration verbs
From: Kalderon, Michal @ 2016-09-14 10:02 UTC (permalink / raw)
  To: Sagi Grimberg, Ram Amrani, dledford@redhat.com,
	davem@davemloft.net
  Cc: Yuval.Mintz@qlogic.com, Ariel.Elior@qlogic.com,
	Michal.Kalderon@qlogic.com, rajesh.borundia@qlogic.com,
	linux-rdma@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <4b392963-c14c-d793-147a-67b08e97ee06@grimberg.me>

> >>> +struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, int
> >>> +max_page_list_len) {
> >>> +	struct qedr_pd *pd = get_qedr_pd(ibpd);
> >>> +	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
> >>> +	struct qedr_mr *mr;
> >>> +	int rc = -ENOMEM;
> >>> +
> >>> +	DP_VERBOSE(dev, QEDR_MSG_MR,
> >>> +		   "qedr_alloc_frmr pd = %d max_page_list_len= %d\n", pd-
> >>> pd_id,
> >>> +		   max_page_list_len);
> >>> +
> >>> +	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
> >>> +	if (!mr)
> >>> +		return ERR_PTR(rc);
> >>> +
> >>> +	mr->dev = dev;
> >>> +	mr->type = QEDR_MR_FRMR;
> >>> +
> >>> +	rc = init_mr_info(dev, &mr->info, max_page_list_len, 1);
> >>> +	if (rc)
> >>> +		goto err0;
> >>> +
> >>> +	rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid);
> >>> +	if (rc) {
> >>> +		DP_ERR(dev, "roce alloc tid returned an error %d\n", rc);
> >>> +		goto err0;
> >>> +	}
> >>> +
> >>> +	/* Index only, 18 bit long, lkey = itid << 8 | key */
> >>> +	mr->hw_mr.tid_type = QED_RDMA_TID_FMR;
> >>> +	mr->hw_mr.key = 0;
> >>> +	mr->hw_mr.pd = pd->pd_id;
> >>
> >> Do you have a real MR<->PD association in HW? If so, can you point me
> >> to the code that binds it? If not, any reason not to expose the
> local_dma_lkey?
> >>
> > Yes, we send the pd id to the FW in function qed_rdma_register_tid.
> 
> Right, thanks.
> 
> > In any case, if we didn't have the association in HW Wouldn't the
> > local_dma_lkey be relevant only to dma_mr ? ( code snippet above
> > refers to FMR)
> 
> I was just sticking to a location in the code where you associate MR<->PD...
> 
> The local_dma_lkey is a special key that spans the entire memory space and
> unlike the notorious dma_mr, its not associated with a PD.
> 
> See the code in ib_alloc_pd(), if the device does not support a single device
> local dma lkey, the core allocates a dma mr associated with the pd. If your
> device has such a key, you can save a dma mr allocation for each pd in the
> system.
Managed to miss this. Our device supports such a key, we'll add support. 

^ permalink raw reply

* Re: [PATCH v5 0/6] Add eBPF hooks for cgroups
From: Pablo Neira Ayuso @ 2016-09-14 10:30 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Mack, htejun-b10kYP2dOMg, daniel-FeC+5ew28dpmcu3hnIyYJQ,
	ast-b10kYP2dOMg, davem-fT/PcQaiUtIeIZ0/mPfg9Q, kafai-b10kYP2dOMg,
	fw-HFFVJYpyMKqzQB+pC5nmwQ, harald-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA, sargun-GaZTRHToo+CzQB+pC5nmwQ,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20160914044217.GA44742-+o4/htvd0TDFYCXBM6kdu7fOX0fSgVTm@public.gmane.org>

On Tue, Sep 13, 2016 at 09:42:19PM -0700, Alexei Starovoitov wrote:
[...]
> For us this cgroup+bpf is _not_ for filterting and _not_ for security.

If your goal is monitoring, then convert these hooks not to allow to
issue a verdict on the packet, so this becomes inoquous in the same
fashion as the tracing infrastructure.

[...]
> I'd really love to have an alternative to bpf for such tasks,
> but you seem to spend all the energy arguing against bpf whereas
> nft still has a lot to be desired.

Please Alexei, stop that FUD. Anyone that has spent just one day using
the bpf tooling and infrastructure knows you have problems to
resolve...

^ permalink raw reply

* [PATCH V2 0/3] net-next: dsa: add QCA8K support
From: John Crispin @ 2016-09-14 10:38 UTC (permalink / raw)
  To: David S. Miller, Andrew Lunn, Florian Fainelli
  Cc: netdev, linux-kernel, qsdk-review, John Crispin

This series is based on the AR8xxx series posted by Matthieu Olivari in may
2015. The following changes were made since then

* fixed the nitpicks from the previous review
* updated to latest API
* turned it into an mdio device
* added callbacks for fdb, bridge offloading, stp, eee, port status
* fixed several minor issues to the port setup and arp learning
* changed the namespacing as this driver to qca8k

The driver has so far only been tested on qca8337/N. It should work on other QCA
switches such as the qca8327 with minor changes.

John Crispin (3):
  Documentation: devicetree: add qca8k binding
  net-next: dsa: add Qualcomm tag RX/TX handler
  net-next: dsa: add new driver for qca8xxx family

 .../devicetree/bindings/net/dsa/qca8k.txt          |   88 ++
 drivers/net/dsa/Kconfig                            |    9 +
 drivers/net/dsa/Makefile                           |    1 +
 drivers/net/dsa/qca8k.c                            |  968 ++++++++++++++++++++
 drivers/net/dsa/qca8k.h                            |  180 ++++
 include/net/dsa.h                                  |    1 +
 net/dsa/Kconfig                                    |    3 +
 net/dsa/Makefile                                   |    1 +
 net/dsa/dsa.c                                      |    3 +
 net/dsa/dsa_priv.h                                 |    2 +
 net/dsa/tag_qca.c                                  |  138 +++
 11 files changed, 1394 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/dsa/qca8k.txt
 create mode 100644 drivers/net/dsa/qca8k.c
 create mode 100644 drivers/net/dsa/qca8k.h
 create mode 100644 net/dsa/tag_qca.c

-- 
1.7.10.4

^ permalink raw reply

* [PATCH V2 1/3] Documentation: devicetree: add qca8k binding
From: John Crispin @ 2016-09-14 10:39 UTC (permalink / raw)
  To: David S. Miller, Andrew Lunn, Florian Fainelli
  Cc: netdev, linux-kernel, qsdk-review, John Crispin, devicetree
In-Reply-To: <1473849542-3298-1-git-send-email-john@phrozen.org>

Add device-tree binding for ar8xxx switch families.

Cc: devicetree@vger.kernel.org
Signed-off-by: John Crispin <john@phrozen.org>
---
Changes in V2
* fixup ecample to include phy nodes and corresponding phandles
* add a note explaining why we need to phy nodes

 .../devicetree/bindings/net/dsa/qca8k.txt          |   88 ++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/dsa/qca8k.txt

diff --git a/Documentation/devicetree/bindings/net/dsa/qca8k.txt b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
new file mode 100644
index 0000000..2c1582a
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
@@ -0,0 +1,88 @@
+* Qualcomm Atheros QCA8xxx switch family
+
+Required properties:
+
+- compatible: should be "qca,qca8337"
+- #size-cells: must be 0
+- #address-cells: must be 1
+
+Subnodes:
+
+The integrated switch subnode should be specified according to the binding
+described in dsa/dsa.txt. As the QCA8K switches do not have a N:N mapping of
+port and PHY id, each subnode describing a port needs to have a valid phandle
+referencing the internal PHY connected to it.
+
+Example:
+
+
+	&mdio0 {
+		phy_port1: phy@0 {
+			reg = <0>;
+		};
+
+		phy_port2: phy@1 {
+			reg = <1>;
+		};
+
+		phy_port3: phy@2 {
+			reg = <2>;
+		};
+
+		phy_port4: phy@3 {
+			reg = <3>;
+		};
+
+		phy_port5: phy@4 {
+			reg = <4>;
+		};
+
+		switch0@0 {
+			compatible = "qca,qca8337";
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			reg = <0>;
+
+			ports {
+				#address-cells = <1>;
+				#size-cells = <0>;
+				port@0 {
+					reg = <0>;
+					label = "cpu";
+					ethernet = <&gmac1>;
+					phy-mode = "rgmii";
+				};
+
+				port@1 {
+					reg = <1>;
+					label = "lan1";
+					phy-handle = <&phy_port1>;
+				};
+
+				port@2 {
+					reg = <2>;
+					label = "lan2";
+					phy-handle = <&phy_port2>;
+				};
+
+				port@3 {
+					reg = <3>;
+					label = "lan3";
+					phy-handle = <&phy_port3>;
+				};
+
+				port@4 {
+					reg = <4>;
+					label = "lan4";
+					phy-handle = <&phy_port4>;
+				};
+
+				port@5 {
+					reg = <5>;
+					label = "wan";
+					phy-handle = <&phy_port5>;
+				};
+			};
+		};
+	};
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH V2 2/3] net-next: dsa: add Qualcomm tag RX/TX handler
From: John Crispin @ 2016-09-14 10:39 UTC (permalink / raw)
  To: David S. Miller, Andrew Lunn, Florian Fainelli
  Cc: netdev, linux-kernel, qsdk-review, John Crispin
In-Reply-To: <1473849542-3298-1-git-send-email-john@phrozen.org>

Add support for the 2-bytes Qualcomm tag that gigabit switches such as
the QCA8337/N might insert when receiving packets, or that we need
to insert while targeting specific switch ports. The tag is inserted
directly behind the ethernet header.

Signed-off-by: John Crispin <john@phrozen.org>
---
* fix some comments
* remove dead code
* rename variable from phy->reg

 include/net/dsa.h  |    1 +
 net/dsa/Kconfig    |    3 ++
 net/dsa/Makefile   |    1 +
 net/dsa/dsa.c      |    3 ++
 net/dsa/dsa_priv.h |    2 +
 net/dsa/tag_qca.c  |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 148 insertions(+)
 create mode 100644 net/dsa/tag_qca.c

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 24ee961..7fdd63e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -26,6 +26,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_TRAILER,
 	DSA_TAG_PROTO_EDSA,
 	DSA_TAG_PROTO_BRCM,
+	DSA_TAG_PROTO_QCA,
 	DSA_TAG_LAST,		/* MUST BE LAST */
 };
 
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index ff7736f..96e47c5 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -38,4 +38,7 @@ config NET_DSA_TAG_EDSA
 config NET_DSA_TAG_TRAILER
 	bool
 
+config NET_DSA_TAG_QCA
+	bool
+
 endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 8af4ded..a3380ed 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -7,3 +7,4 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
 dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
 dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
 dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
+dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index d8d267e..66e31ac 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -54,6 +54,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
 #ifdef CONFIG_NET_DSA_TAG_BRCM
 	[DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops,
 #endif
+#ifdef CONFIG_NET_DSA_TAG_QCA
+	[DSA_TAG_PROTO_QCA] = &qca_netdev_ops,
+#endif
 	[DSA_TAG_PROTO_NONE] = &none_ops,
 };
 
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 00077a9..6cfd738 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -81,5 +81,7 @@ extern const struct dsa_device_ops trailer_netdev_ops;
 /* tag_brcm.c */
 extern const struct dsa_device_ops brcm_netdev_ops;
 
+/* tag_qca.c */
+extern const struct dsa_device_ops qca_netdev_ops;
 
 #endif
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
new file mode 100644
index 0000000..0c90cac
--- /dev/null
+++ b/net/dsa/tag_qca.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/etherdevice.h>
+#include "dsa_priv.h"
+
+#define QCA_HDR_LEN	2
+#define QCA_HDR_VERSION	0x2
+
+#define QCA_HDR_RECV_VERSION_MASK	GENMASK(15, 14)
+#define QCA_HDR_RECV_VERSION_S		14
+#define QCA_HDR_RECV_PRIORITY_MASK	GENMASK(13, 11)
+#define QCA_HDR_RECV_PRIORITY_S		11
+#define QCA_HDR_RECV_TYPE_MASK		GENMASK(10, 6)
+#define QCA_HDR_RECV_TYPE_S		6
+#define QCA_HDR_RECV_FRAME_IS_TAGGED	BIT(3)
+#define QCA_HDR_RECV_SOURCE_PORT_MASK	GENMASK(2, 0)
+
+#define QCA_HDR_XMIT_VERSION_MASK	GENMASK(15, 14)
+#define QCA_HDR_XMIT_VERSION_S		14
+#define QCA_HDR_XMIT_PRIORITY_MASK	GENMASK(13, 11)
+#define QCA_HDR_XMIT_PRIORITY_S		11
+#define QCA_HDR_XMIT_CONTROL_MASK	GENMASK(10, 8)
+#define QCA_HDR_XMIT_CONTROL_S		8
+#define QCA_HDR_XMIT_FROM_CPU		BIT(7)
+#define QCA_HDR_XMIT_DP_BIT_MASK	GENMASK(6, 0)
+
+static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	u16 *phdr, hdr;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	if (skb_cow_head(skb, 0) < 0)
+		goto out_free;
+
+	skb_push(skb, QCA_HDR_LEN);
+
+	memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN);
+	phdr = (u16 *)(skb->data + 2 * ETH_ALEN);
+
+	/* Set the version field, and set destination port information */
+	hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
+		QCA_HDR_XMIT_FROM_CPU |
+		BIT(p->port);
+
+	*phdr = htons(hdr);
+
+	return skb;
+
+out_free:
+	kfree_skb(skb);
+	return NULL;
+}
+
+static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
+		       struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dsa_switch_tree *dst = dev->dsa_ptr;
+	struct dsa_switch *ds;
+	u8 ver;
+	int port;
+	__be16 *phdr, hdr;
+
+	if (unlikely(!dst))
+		goto out_drop;
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (!skb)
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
+		goto out_drop;
+
+	/* The QCA header is added by the switch between src addr and Ethertype
+	 * At this point, skb->data points to ethertype so header should be
+	 * right before
+	 */
+	phdr = (__be16 *)(skb->data - 2);
+	hdr = ntohs(*phdr);
+
+	/* Make sure the version is correct */
+	ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S;
+	if (unlikely(ver != QCA_HDR_VERSION))
+		goto out_drop;
+
+	/* Remove QCA tag and recalculate checksum */
+	skb_pull_rcsum(skb, QCA_HDR_LEN);
+	memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN,
+		ETH_HLEN - QCA_HDR_LEN);
+
+	/* This protocol doesn't support cascading multiple switches so it's
+	 * safe to assume the switch is first in the tree
+	 */
+	ds = dst->ds[0];
+	if (!ds)
+		goto out_drop;
+
+	/* Get source port information */
+	port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
+	if (!ds->ports[port].netdev)
+		goto out_drop;
+
+	/* Update skb & forward the frame accordingly */
+	skb_push(skb, ETH_HLEN);
+	skb->pkt_type = PACKET_HOST;
+	skb->dev = ds->ports[port].netdev;
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+
+	netif_receive_skb(skb);
+
+	return 0;
+
+out_drop:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+const struct dsa_device_ops qca_netdev_ops = {
+	.xmit	= qca_tag_xmit,
+	.rcv	= qca_tag_rcv,
+};
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH V2 3/3] net-next: dsa: add new driver for qca8xxx family
From: John Crispin @ 2016-09-14 10:39 UTC (permalink / raw)
  To: David S. Miller, Andrew Lunn, Florian Fainelli
  Cc: netdev, linux-kernel, qsdk-review, John Crispin
In-Reply-To: <1473849542-3298-1-git-send-email-john@phrozen.org>

This patch contains initial support for the QCA8337 switch. It
will detect a QCA8337 switch, if present and declared in the DT.

Each port will be represented through a standalone net_device interface,
as for other DSA switches. CPU can communicate with any of the ports by
setting an IP@ on ethN interface. Most of the extra callbacks of the DSA
subsystem are already supported, such as bridge offloading, stp, fdb.

Signed-off-by: John Crispin <john@phrozen.org>
---
Changes in V2
* add proper locking for the FDB table
* remove udelay when changing the page. neither datasheet nore SDK code
  requires a sleep
* add a cond_resched to the busy wait loop
* use nested locking when accessing the mdio bus
* remove the phy_to_port() wrappers
* remove mmd access function and use existing phy helpers
* fix a copy/paste bug breaking the eee callbacks
* use default vid 1 when fdb entries are added fro vid 0
* remove the phy id check and add a switch id check instead
* add error handling to the mdio read/write functions
* remove inline usage

 drivers/net/dsa/Kconfig  |    9 +
 drivers/net/dsa/Makefile |    1 +
 drivers/net/dsa/qca8k.c  |  968 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/qca8k.h  |  180 +++++++++
 4 files changed, 1158 insertions(+)
 create mode 100644 drivers/net/dsa/qca8k.c
 create mode 100644 drivers/net/dsa/qca8k.h

diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index de6d044..0659846 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -25,4 +25,13 @@ source "drivers/net/dsa/b53/Kconfig"
 
 source "drivers/net/dsa/mv88e6xxx/Kconfig"
 
+config NET_DSA_QCA8K
+	tristate "Qualcomm Atheros QCA8K Ethernet switch family support"
+	depends on NET_DSA
+	select NET_DSA_TAG_QCA
+	select REGMAP
+	---help---
+	  This enables support for the Qualcomm Atheros QCA8K Ethernet
+	  switch chips.
+
 endmenu
diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile
index ca1e71b..8346e4f 100644
--- a/drivers/net/dsa/Makefile
+++ b/drivers/net/dsa/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_NET_DSA_MV88E6060) += mv88e6060.o
 obj-$(CONFIG_NET_DSA_BCM_SF2)	+= bcm_sf2.o
+obj-$(CONFIG_NET_DSA_QCA8K)	+= qca8k.o
 
 obj-y				+= b53/
 obj-y				+= mv88e6xxx/
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
new file mode 100644
index 0000000..76a550f
--- /dev/null
+++ b/drivers/net/dsa/qca8k.c
@@ -0,0 +1,968 @@
+/*
+ * Copyright (C) 2009 Felix Fietkau <nbd@nbd.name>
+ * Copyright (C) 2011-2012 Gabor Juhos <juhosg@openwrt.org>
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2016 John Crispin <john@phrozen.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/phy.h>
+#include <linux/netdevice.h>
+#include <net/dsa.h>
+#include <net/switchdev.h>
+#include <linux/of_net.h>
+#include <linux/of_platform.h>
+#include <linux/if_bridge.h>
+#include <linux/mdio.h>
+#include <linux/etherdevice.h>
+
+#include "qca8k.h"
+
+#define MIB_DESC(_s, _o, _n)	\
+	{			\
+		.size = (_s),	\
+		.offset = (_o),	\
+		.name = (_n),	\
+	}
+
+static const struct qca8k_mib_desc ar8327_mib[] = {
+	MIB_DESC(1, 0x00, "RxBroad"),
+	MIB_DESC(1, 0x04, "RxPause"),
+	MIB_DESC(1, 0x08, "RxMulti"),
+	MIB_DESC(1, 0x0c, "RxFcsErr"),
+	MIB_DESC(1, 0x10, "RxAlignErr"),
+	MIB_DESC(1, 0x14, "RxRunt"),
+	MIB_DESC(1, 0x18, "RxFragment"),
+	MIB_DESC(1, 0x1c, "Rx64Byte"),
+	MIB_DESC(1, 0x20, "Rx128Byte"),
+	MIB_DESC(1, 0x24, "Rx256Byte"),
+	MIB_DESC(1, 0x28, "Rx512Byte"),
+	MIB_DESC(1, 0x2c, "Rx1024Byte"),
+	MIB_DESC(1, 0x30, "Rx1518Byte"),
+	MIB_DESC(1, 0x34, "RxMaxByte"),
+	MIB_DESC(1, 0x38, "RxTooLong"),
+	MIB_DESC(2, 0x3c, "RxGoodByte"),
+	MIB_DESC(2, 0x44, "RxBadByte"),
+	MIB_DESC(1, 0x4c, "RxOverFlow"),
+	MIB_DESC(1, 0x50, "Filtered"),
+	MIB_DESC(1, 0x54, "TxBroad"),
+	MIB_DESC(1, 0x58, "TxPause"),
+	MIB_DESC(1, 0x5c, "TxMulti"),
+	MIB_DESC(1, 0x60, "TxUnderRun"),
+	MIB_DESC(1, 0x64, "Tx64Byte"),
+	MIB_DESC(1, 0x68, "Tx128Byte"),
+	MIB_DESC(1, 0x6c, "Tx256Byte"),
+	MIB_DESC(1, 0x70, "Tx512Byte"),
+	MIB_DESC(1, 0x74, "Tx1024Byte"),
+	MIB_DESC(1, 0x78, "Tx1518Byte"),
+	MIB_DESC(1, 0x7c, "TxMaxByte"),
+	MIB_DESC(1, 0x80, "TxOverSize"),
+	MIB_DESC(2, 0x84, "TxByte"),
+	MIB_DESC(1, 0x8c, "TxCollision"),
+	MIB_DESC(1, 0x90, "TxAbortCol"),
+	MIB_DESC(1, 0x94, "TxMultiCol"),
+	MIB_DESC(1, 0x98, "TxSingleCol"),
+	MIB_DESC(1, 0x9c, "TxExcDefer"),
+	MIB_DESC(1, 0xa0, "TxDefer"),
+	MIB_DESC(1, 0xa4, "TxLateCol"),
+};
+
+/* The 32bit switch registers are accessed indirectly. To achieve this we need
+ * to set the page of the register. Track the last page that was set to reduce
+ * mdio writes
+ */
+static u16 qca8k_current_page = 0xffff;
+
+static struct
+qca8k_priv *qca8k_to_priv(struct dsa_switch *ds)
+{
+	struct qca8k_priv *priv = ds->priv;
+
+	return priv;
+}
+
+static void
+qca8k_split_addr(u32 regaddr, u16 *r1, u16 *r2, u16 *page)
+{
+	regaddr >>= 1;
+	*r1 = regaddr & 0x1e;
+
+	regaddr >>= 5;
+	*r2 = regaddr & 0x7;
+
+	regaddr >>= 3;
+	*page = regaddr & 0x3ff;
+}
+
+static u32
+qca8k_mii_read32(struct mii_bus *bus, int phy_id, u32 regnum)
+{
+	u32 val;
+	int ret;
+
+	ret = bus->read(bus, phy_id, regnum);
+	if (ret >= 0) {
+		val = ret;
+		ret = bus->read(bus, phy_id, regnum + 1);
+		val |= ret << 16;
+	}
+
+	if (ret < 0) {
+		dev_err_ratelimited(&bus->dev,
+				    "failed to read qca8k 32bit register\n");
+		return ret;
+	}
+
+	return val;
+}
+
+static void
+qca8k_mii_write32(struct mii_bus *bus, int phy_id, u32 regnum, u32 val)
+{
+	u16 lo, hi;
+	int ret;
+
+	lo = val & 0xffff;
+	hi = (u16)(val >> 16);
+
+	ret = bus->write(bus, phy_id, regnum, lo);
+	if (ret >= 0)
+		ret = bus->write(bus, phy_id, regnum + 1, hi);
+	if (ret < 0)
+		dev_err_ratelimited(&bus->dev,
+				    "failed to write qca8k 32bit register\n");
+}
+
+static void
+qca8k_set_page(struct mii_bus *bus, u16 page)
+{
+	if (page == qca8k_current_page)
+		return;
+
+	if (bus->write(bus, 0x18, 0, page) < 0)
+		dev_err_ratelimited(&bus->dev,
+				    "failed to set qca8k page\n");
+	qca8k_current_page = page;
+}
+
+static u32
+qca8k_read(struct qca8k_priv *priv, u32 reg)
+{
+	u16 r1, r2, page;
+	u32 val;
+
+	qca8k_split_addr(reg, &r1, &r2, &page);
+
+	mutex_lock_nested(&priv->bus->mdio_lock, MDIO_MUTEX_NESTED);
+
+	qca8k_set_page(priv->bus, page);
+	val = qca8k_mii_read32(priv->bus, 0x10 | r2, r1);
+
+	mutex_unlock(&priv->bus->mdio_lock);
+
+	return val;
+}
+
+static void
+qca8k_write(struct qca8k_priv *priv, u32 reg, u32 val)
+{
+	u16 r1, r2, page;
+
+	qca8k_split_addr(reg, &r1, &r2, &page);
+
+	mutex_lock_nested(&priv->bus->mdio_lock, MDIO_MUTEX_NESTED);
+
+	qca8k_set_page(priv->bus, page);
+	qca8k_mii_write32(priv->bus, 0x10 | r2, r1, val);
+
+	mutex_unlock(&priv->bus->mdio_lock);
+}
+
+static u32
+qca8k_rmw(struct qca8k_priv *priv, u32 reg, u32 mask, u32 val)
+{
+	u16 r1, r2, page;
+	u32 ret;
+
+	qca8k_split_addr(reg, &r1, &r2, &page);
+
+	mutex_lock_nested(&priv->bus->mdio_lock, MDIO_MUTEX_NESTED);
+
+	qca8k_set_page(priv->bus, page);
+	ret = qca8k_mii_read32(priv->bus, 0x10 | r2, r1);
+	ret &= ~mask;
+	ret |= val;
+	qca8k_mii_write32(priv->bus, 0x10 | r2, r1, ret);
+
+	mutex_unlock(&priv->bus->mdio_lock);
+
+	return ret;
+}
+
+static void
+qca8k_reg_set(struct qca8k_priv *priv, u32 reg, u32 val)
+{
+	qca8k_rmw(priv, reg, 0, val);
+}
+
+static void
+qca8k_reg_clear(struct qca8k_priv *priv, u32 reg, u32 val)
+{
+	qca8k_rmw(priv, reg, val, 0);
+}
+
+static int
+qca8k_regmap_read(void *ctx, uint32_t reg, uint32_t *val)
+{
+	struct qca8k_priv *priv = (struct qca8k_priv *)ctx;
+
+	*val = qca8k_read(priv, reg);
+
+	return 0;
+}
+
+static int
+qca8k_regmap_write(void *ctx, uint32_t reg, uint32_t val)
+{
+	struct qca8k_priv *priv = (struct qca8k_priv *)ctx;
+
+	qca8k_write(priv, reg, val);
+
+	return 0;
+}
+
+static const struct regmap_range qca8k_readable_ranges[] = {
+	regmap_reg_range(0x0000, 0x00e4), /* Global control */
+	regmap_reg_range(0x0100, 0x0168), /* EEE control */
+	regmap_reg_range(0x0200, 0x0270), /* Parser control */
+	regmap_reg_range(0x0400, 0x0454), /* ACL */
+	regmap_reg_range(0x0600, 0x0718), /* Lookup */
+	regmap_reg_range(0x0800, 0x0b70), /* QM */
+	regmap_reg_range(0x0c00, 0x0c80), /* PKT */
+	regmap_reg_range(0x0e00, 0x0e98), /* L3 */
+	regmap_reg_range(0x1000, 0x10ac), /* MIB - Port0 */
+	regmap_reg_range(0x1100, 0x11ac), /* MIB - Port1 */
+	regmap_reg_range(0x1200, 0x12ac), /* MIB - Port2 */
+	regmap_reg_range(0x1300, 0x13ac), /* MIB - Port3 */
+	regmap_reg_range(0x1400, 0x14ac), /* MIB - Port4 */
+	regmap_reg_range(0x1500, 0x15ac), /* MIB - Port5 */
+	regmap_reg_range(0x1600, 0x16ac), /* MIB - Port6 */
+
+};
+
+static struct regmap_access_table qca8k_readable_table = {
+	.yes_ranges = qca8k_readable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(qca8k_readable_ranges),
+};
+
+struct regmap_config qca8k_regmap_config = {
+	.reg_bits = 16,
+	.val_bits = 32,
+	.reg_stride = 4,
+	.max_register = 0x16ac, /* end MIB - Port6 range */
+	.reg_read = qca8k_regmap_read,
+	.reg_write = qca8k_regmap_write,
+	.rd_table = &qca8k_readable_table,
+};
+
+static int
+qca8k_fdb_busy_wait(struct qca8k_priv *priv)
+{
+	unsigned long timeout;
+
+	timeout = jiffies + msecs_to_jiffies(20);
+
+	/* loop until the busy flag has cleared */
+	do {
+		u32 reg = qca8k_read(priv, QCA8K_REG_ATU_FUNC);
+		int busy = reg & QCA8K_ATU_FUNC_BUSY;
+
+		if (!busy)
+			break;
+		cond_resched();
+	} while (!time_after_eq(jiffies, timeout));
+
+	return time_after_eq(jiffies, timeout);
+}
+
+static void
+qca8k_fdb_read(struct qca8k_priv *priv, struct qca8k_fdb *fdb)
+{
+	u32 reg[4];
+	int i;
+
+	/* load the ARL table into an array */
+	for (i = 0; i < 4; i++)
+		reg[i] = qca8k_read(priv, QCA8K_REG_ATU_DATA0 + (i * 4));
+
+	/* vid - 83:72 */
+	fdb->vid = (reg[2] >> QCA8K_ATU_VID_S) & QCA8K_ATU_VID_M;
+	/* aging - 67:64 */
+	fdb->aging = reg[2] & QCA8K_ATU_STATUS_M;
+	/* portmask - 54:48 */
+	fdb->port_mask = (reg[1] >> QCA8K_ATU_PORT_S) & QCA8K_ATU_PORT_M;
+	/* mac - 47:0 */
+	fdb->mac[0] = (reg[1] >> QCA8K_ATU_ADDR0_S) & 0xff;
+	fdb->mac[1] = reg[1] & 0xff;
+	fdb->mac[2] = (reg[0] >> QCA8K_ATU_ADDR2_S) & 0xff;
+	fdb->mac[3] = (reg[0] >> QCA8K_ATU_ADDR3_S) & 0xff;
+	fdb->mac[4] = (reg[0] >> QCA8K_ATU_ADDR4_S) & 0xff;
+	fdb->mac[5] = reg[0] & 0xff;
+}
+
+static void
+qca8k_fdb_write(struct qca8k_priv *priv, u16 vid, u8 port_mask, const u8 *mac,
+		u8 aging)
+{
+	u32 reg[3] = { 0 };
+	int i;
+
+	/* vid - 83:72 */
+	reg[2] = (vid & QCA8K_ATU_VID_M) << QCA8K_ATU_VID_S;
+	/* aging - 67:64 */
+	reg[2] |= aging & QCA8K_ATU_STATUS_M;
+	/* portmask - 54:48 */
+	reg[1] = (port_mask & QCA8K_ATU_PORT_M) << QCA8K_ATU_PORT_S;
+	/* mac - 47:0 */
+	reg[1] |= mac[0] << QCA8K_ATU_ADDR0_S;
+	reg[1] |= mac[1];
+	reg[0] |= mac[2] << QCA8K_ATU_ADDR2_S;
+	reg[0] |= mac[3] << QCA8K_ATU_ADDR3_S;
+	reg[0] |= mac[4] << QCA8K_ATU_ADDR4_S;
+	reg[0] |= mac[5];
+
+	/* load the array into the ARL table */
+	for (i = 0; i < 3; i++)
+		qca8k_write(priv, QCA8K_REG_ATU_DATA0 + (i * 4), reg[i]);
+}
+
+static int
+qca8k_fdb_access(struct qca8k_priv *priv, enum qca8k_fdb_cmd cmd, int port)
+{
+	u32 reg;
+
+	/* Set the command and FDB index */
+	reg = QCA8K_ATU_FUNC_BUSY;
+	reg |= cmd;
+	if (port >= 0) {
+		reg |= QCA8K_ATU_FUNC_PORT_EN;
+		reg |= (port && QCA8K_ATU_FUNC_PORT_M) << QCA8K_ATU_FUNC_PORT_S;
+	}
+
+	/* Write the function register triggering the table access */
+	qca8k_write(priv, QCA8K_REG_ATU_FUNC, reg);
+
+	/* wait for completion */
+	if (qca8k_fdb_busy_wait(priv))
+		return -1;
+
+	return 0;
+}
+
+static int
+qca8k_fdb_next(struct qca8k_priv *priv, struct qca8k_fdb *fdb, int port)
+{
+	int ret;
+
+	mutex_lock(&priv->fdb_mutex);
+	qca8k_fdb_write(priv, fdb->vid, fdb->port_mask, fdb->mac, fdb->aging);
+	ret = qca8k_fdb_access(priv, QCA8K_FDB_NEXT, port);
+	if (ret >= 0)
+		qca8k_fdb_read(priv, fdb);
+	mutex_unlock(&priv->fdb_mutex);
+
+	return ret;
+}
+
+static void
+qca8k_fdb_flush(struct qca8k_priv *priv)
+{
+	mutex_lock(&priv->fdb_mutex);
+	qca8k_fdb_access(priv, QCA8K_FDB_FLUSH, -1);
+	mutex_unlock(&priv->fdb_mutex);
+}
+
+/* The switch has 2 CPU ports. These can alternatively be configured to
+ * connected directly to one of the PHYs, bypassing the switching core
+ */
+static int
+qca8k_set_pad_ctrl(struct qca8k_priv *priv, int port, int mode)
+{
+	u32 reg;
+
+	switch (port) {
+	case 0:
+		reg = QCA8K_REG_PORT0_PAD_CTRL;
+		break;
+	case 6:
+		reg = QCA8K_REG_PORT6_PAD_CTRL;
+		break;
+	default:
+		pr_err("Can't set PAD_CTRL on port %d\n", port);
+		return -EINVAL;
+	}
+
+	/* Configure a port to be directly connected to a PHY */
+	switch (mode) {
+	case PHY_INTERFACE_MODE_RGMII:
+		qca8k_write(priv, reg,
+			    QCA8K_PORT_PAD_RGMII_EN |
+			    QCA8K_PORT_PAD_RGMII_TX_DELAY(3) |
+			    QCA8K_PORT_PAD_RGMII_RX_DELAY(3));
+
+		/* According to the datasheet, RGMII delay is enabled through
+		 * PORT5_PAD_CTRL for all ports, rather than individual port
+		 * registers
+		 */
+		qca8k_write(priv, QCA8K_REG_PORT5_PAD_CTRL,
+			    QCA8K_PORT_PAD_RGMII_RX_DELAY_EN);
+		break;
+	case PHY_INTERFACE_MODE_SGMII:
+		qca8k_write(priv, reg, QCA8K_PORT_PAD_SGMII_EN);
+		break;
+	default:
+		pr_err("xMII mode %d not supported\n", mode);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+qca8k_setup(struct dsa_switch *ds)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	int ret, i, phy_mode = -1;
+
+	/* Start by setting up the register mapping */
+	priv->regmap = devm_regmap_init(ds->dev, NULL, priv,
+					&qca8k_regmap_config);
+
+	if (IS_ERR(priv->regmap))
+		pr_warn("regmap initialization failed");
+
+	/* Initialize CPU port pad mode (xMII type, delays...) */
+	phy_mode = of_get_phy_mode(ds->ports[ds->dst->cpu_port].dn);
+	if (phy_mode < 0) {
+		pr_err("Can't find phy-mode for master device\n");
+		return phy_mode;
+	}
+	ret = qca8k_set_pad_ctrl(priv, QCA8K_CPU_PORT, phy_mode);
+	if (ret < 0)
+		return ret;
+
+	/* Enable CPU Port */
+	qca8k_reg_set(priv, QCA8K_REG_GLOBAL_FW_CTRL0,
+		      QCA8K_GLOBAL_FW_CTRL0_CPU_PORT_EN);
+
+	/* Enable MIB counters */
+	qca8k_reg_set(priv, QCA8K_REG_MIB, QCA8K_MIB_CPU_KEEP);
+	qca8k_write(priv, QCA8K_REG_MODULE_EN, QCA8K_MODULE_EN_MIB);
+
+	/* Enable QCA header mode on the cpu port */
+	qca8k_write(priv, QCA8K_REG_PORT_HDR_CTRL(QCA8K_CPU_PORT),
+		    QCA8K_PORT_HDR_CTRL_ALL << QCA8K_PORT_HDR_CTRL_TX_S |
+		    QCA8K_PORT_HDR_CTRL_ALL << QCA8K_PORT_HDR_CTRL_RX_S);
+
+	/* Disable forwarding by default on all ports */
+	for (i = 0; i < QCA8K_NUM_PORTS; i++)
+		qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(i),
+			  QCA8K_PORT_LOOKUP_MEMBER, 0);
+
+	/* Disable MAC by default on all ports */
+	for (i = 0; i < QCA8K_NUM_PORTS; i++) {
+		if (ds->enabled_port_mask & BIT(i))
+			qca8k_rmw(priv, QCA8K_REG_PORT_STATUS(i),
+				  QCA8K_PORT_STATUS_LINK_AUTO |
+				  QCA8K_PORT_STATUS_TXMAC, 0);
+	}
+
+	/* Forward all unknown frames to CPU port for Linux processing */
+	qca8k_write(priv, QCA8K_REG_GLOBAL_FW_CTRL1,
+		    BIT(0) << QCA8K_GLOBAL_FW_CTRL1_IGMP_DP_S |
+		    BIT(0) << QCA8K_GLOBAL_FW_CTRL1_BC_DP_S |
+		    BIT(0) << QCA8K_GLOBAL_FW_CTRL1_MC_DP_S |
+		    BIT(0) << QCA8K_GLOBAL_FW_CTRL1_UC_DP_S);
+
+	/* Setup connection between CPU ports & PHYs */
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		/* CPU port gets connected to all PHYs in the switch */
+		if (dsa_is_cpu_port(ds, i)) {
+			qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(QCA8K_CPU_PORT),
+				  QCA8K_PORT_LOOKUP_MEMBER,
+				  ds->enabled_port_mask);
+		}
+
+		/* Invividual PHYs gets connected to CPU port only */
+		if (ds->enabled_port_mask & BIT(i)) {
+			int shift = 16 * (i % 2);
+
+			qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(i),
+				  QCA8K_PORT_LOOKUP_MEMBER,
+				  BIT(QCA8K_CPU_PORT));
+
+			/* Enable ARP Auto-learning by default */
+			qca8k_reg_set(priv, QCA8K_PORT_LOOKUP_CTRL(i),
+				      QCA8K_PORT_LOOKUP_LEARN);
+
+			/* For port based vlans to work we need to set the
+			 * default egress vid
+			 */
+			qca8k_rmw(priv, QCA8K_EGRESS_VLAN(i),
+				  0xffff << shift, 1 << shift);
+			qca8k_write(priv, QCA8K_REG_PORT_VLAN_CTRL0(i),
+				    QCA8K_PORT_VLAN_CVID(1) |
+				    QCA8K_PORT_VLAN_SVID(1));
+		}
+	}
+
+	/* Flush the FDB table */
+	qca8k_fdb_flush(priv);
+
+	return 0;
+}
+
+static int
+qca8k_set_addr(struct dsa_switch *ds, u8 *addr)
+{
+	/* The subsystem always calls this function so add an empty stub */
+	return 0;
+}
+
+static int
+qca8k_phy_read(struct dsa_switch *ds, int phy, int regnum)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+
+	return mdiobus_read(priv->bus, phy, regnum);
+}
+
+static int
+qca8k_phy_write(struct dsa_switch *ds, int phy, int regnum, u16 val)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+
+	return mdiobus_write(priv->bus, phy, regnum, val);
+}
+
+static void
+qca8k_get_strings(struct dsa_switch *ds, int phy, uint8_t *data)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ar8327_mib); i++)
+		strncpy(data + i * ETH_GSTRING_LEN, ar8327_mib[i].name,
+			ETH_GSTRING_LEN);
+}
+
+static void
+qca8k_get_ethtool_stats(struct dsa_switch *ds, int port,
+			uint64_t *data)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	const struct qca8k_mib_desc *mib;
+	u32 reg, i;
+	u64 hi;
+
+	for (i = 0; i < ARRAY_SIZE(ar8327_mib); i++) {
+		mib = &ar8327_mib[i];
+		reg = QCA8K_PORT_MIB_COUNTER(port) + mib->offset;
+
+		data[i] = qca8k_read(priv, reg);
+		if (mib->size == 2) {
+			hi = qca8k_read(priv, reg + 4);
+			data[i] |= hi << 32;
+		}
+	}
+}
+
+static int
+qca8k_get_sset_count(struct dsa_switch *ds)
+{
+	return ARRAY_SIZE(ar8327_mib);
+}
+
+static void
+qca8k_eee_enable_set(struct dsa_switch *ds, int port, bool enable)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	u32 lpi_en = QCA8K_REG_EEE_CTRL_LPI_EN(port);
+	u32 reg;
+
+	reg = qca8k_read(priv, QCA8K_REG_EEE_CTRL);
+	if (enable)
+		reg |= lpi_en;
+	else
+		reg &= ~lpi_en;
+	qca8k_write(priv, QCA8K_REG_EEE_CTRL, reg);
+}
+
+static int
+qca8k_eee_init(struct dsa_switch *ds, int port,
+	       struct phy_device *phy)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	struct ethtool_eee *p = &priv->port_sts[port].eee;
+	int ret;
+
+	p->supported = (SUPPORTED_1000baseT_Full | SUPPORTED_100baseT_Full);
+
+	ret = phy_init_eee(phy, 0);
+	if (ret)
+		return ret;
+
+	qca8k_eee_enable_set(ds, port, true);
+
+	return 0;
+}
+
+static int
+qca8k_set_eee(struct dsa_switch *ds, int port,
+	      struct phy_device *phydev,
+	      struct ethtool_eee *e)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	struct ethtool_eee *p = &priv->port_sts[port].eee;
+	int ret = 0;
+
+	p->eee_enabled = e->eee_enabled;
+
+	if (e->eee_enabled) {
+		p->eee_enabled = qca8k_eee_init(ds, port, phydev);
+		if (!p->eee_enabled)
+			ret = -EOPNOTSUPP;
+	}
+	qca8k_eee_enable_set(ds, port, p->eee_enabled);
+
+	return ret;
+}
+
+static int
+qca8k_get_eee(struct dsa_switch *ds, int port,
+	      struct ethtool_eee *e)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	struct ethtool_eee *p = &priv->port_sts[port].eee;
+	struct net_device *netdev = ds->ports[port].netdev;
+	int ret;
+
+	ret = phy_ethtool_get_eee(netdev->phydev, p);
+	if (!ret)
+		e->eee_active =
+			!!(p->supported & p->advertised & p->lp_advertised);
+	else
+		e->eee_active = 0;
+
+	e->eee_enabled = p->eee_enabled;
+
+	return ret;
+}
+
+static void
+qca8k_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	u32 stp_state;
+
+	switch (state) {
+	case BR_STATE_DISABLED:
+		stp_state = QCA8K_PORT_LOOKUP_STATE_DISABLED;
+		break;
+	case BR_STATE_BLOCKING:
+		stp_state = QCA8K_PORT_LOOKUP_STATE_BLOCKING;
+		break;
+	case BR_STATE_LISTENING:
+		stp_state = QCA8K_PORT_LOOKUP_STATE_LISTENING;
+		break;
+	case BR_STATE_LEARNING:
+		stp_state = QCA8K_PORT_LOOKUP_STATE_LEARNING;
+		break;
+	case BR_STATE_FORWARDING:
+	default:
+		stp_state = QCA8K_PORT_LOOKUP_STATE_FORWARD;
+		break;
+	}
+
+	qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(port),
+		  QCA8K_PORT_LOOKUP_STATE_MASK, stp_state);
+}
+
+static int
+qca8k_port_bridge_join(struct dsa_switch *ds, int port,
+		       struct net_device *bridge)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	int port_mask = BIT(QCA8K_CPU_PORT);
+	int i;
+
+	priv->port_sts[port].bridge_dev = bridge;
+
+	for (i = 1; i < QCA8K_NUM_PORTS; i++) {
+		if (priv->port_sts[i].bridge_dev != bridge)
+			continue;
+		/* Add this port to the portvlan mask of the other ports
+		 * in the bridge
+		 */
+		qca8k_reg_set(priv,
+			      QCA8K_PORT_LOOKUP_CTRL(i),
+			      BIT(port));
+		if (i != port)
+			port_mask |= BIT(i);
+	}
+	/* Add all other ports to this ports portvlan mask */
+	qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(port),
+		  QCA8K_PORT_LOOKUP_MEMBER, port_mask);
+
+	return 0;
+}
+
+static void
+qca8k_port_bridge_leave(struct dsa_switch *ds, int port)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	int i;
+
+	for (i = 1; i < QCA8K_NUM_PORTS; i++) {
+		if (priv->port_sts[i].bridge_dev !=
+		    priv->port_sts[port].bridge_dev)
+			continue;
+		/* Remove this port to the portvlan mask of the other ports
+		 * in the bridge
+		 */
+		qca8k_reg_clear(priv,
+				QCA8K_PORT_LOOKUP_CTRL(i),
+				BIT(port));
+	}
+	priv->port_sts[port].bridge_dev = NULL;
+	/* Set the cpu port to be the only one in the portvlan mask of
+	 * this port
+	 */
+	qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(port),
+		  QCA8K_PORT_LOOKUP_MEMBER, BIT(QCA8K_CPU_PORT));
+}
+
+static int
+qca8k_port_enable(struct dsa_switch *ds, int port,
+		  struct phy_device *phy)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+
+	qca8k_reg_set(priv, QCA8K_REG_PORT_STATUS(port),
+		      QCA8K_PORT_STATUS_LINK_AUTO | QCA8K_PORT_STATUS_TXMAC);
+
+	return 0;
+}
+
+static void
+qca8k_port_disable(struct dsa_switch *ds, int port,
+		   struct phy_device *phy)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+
+	qca8k_reg_clear(priv, QCA8K_REG_PORT_STATUS(port),
+			QCA8K_PORT_STATUS_TXMAC | QCA8K_PORT_STATUS_LINK_AUTO);
+}
+
+static int
+qca8k_fdb_prepare(struct dsa_switch *ds, int port,
+		  const struct switchdev_obj_port_fdb *fdb,
+		  struct switchdev_trans *trans)
+{
+	/* We do not need to do anything specific here yet */
+	return 0;
+}
+
+static void
+qca8k_fdb_add(struct dsa_switch *ds, int port,
+	      const struct switchdev_obj_port_fdb *fdb,
+	      struct switchdev_trans *trans)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	u16 port_mask = BIT(port);
+	u16 vid = fdb->vid;
+
+	if (!vid)
+		vid = 1;
+
+	qca8k_fdb_write(priv, vid, port_mask, fdb->addr,
+			QCA8K_ATU_STATUS_STATIC);
+
+	qca8k_fdb_access(priv, QCA8K_FDB_LOAD, -1);
+}
+
+static int
+qca8k_fdb_del(struct dsa_switch *ds, int port,
+	      const struct switchdev_obj_port_fdb *fdb)
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	u16 port_mask = BIT(port);
+
+	qca8k_fdb_write(priv, fdb->vid, port_mask, fdb->addr, 0);
+
+	return qca8k_fdb_access(priv, QCA8K_FDB_PURGE, -1);
+}
+
+static int
+qca8k_fdb_dump(struct dsa_switch *ds, int port,
+	       struct switchdev_obj_port_fdb *fdb,
+	       int (*cb)(struct switchdev_obj *obj))
+{
+	struct qca8k_priv *priv = qca8k_to_priv(ds);
+	struct qca8k_fdb _fdb = { 0 };
+	int cnt = QCA8K_NUM_FDB_RECORDS;
+
+	while (cnt-- && !qca8k_fdb_next(priv, &_fdb, port)) {
+		int ret;
+
+		if (!_fdb.aging)
+			break;
+
+		ether_addr_copy(fdb->addr, _fdb.mac);
+		fdb->vid = _fdb.vid;
+		if (_fdb.aging == QCA8K_ATU_STATUS_STATIC)
+			fdb->ndm_state = NUD_NOARP;
+		else
+			fdb->ndm_state = NUD_REACHABLE;
+
+		ret = cb(&fdb->obj);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static enum dsa_tag_protocol
+qca8k_get_tag_protocol(struct dsa_switch *ds)
+{
+	return DSA_TAG_PROTO_QCA;
+}
+
+static struct dsa_switch_ops qca8k_switch_ops = {
+	.get_tag_protocol	= qca8k_get_tag_protocol,
+	.setup			= qca8k_setup,
+	.set_addr		= qca8k_set_addr,
+	.get_strings		= qca8k_get_strings,
+	.phy_read		= qca8k_phy_read,
+	.phy_write		= qca8k_phy_write,
+	.get_ethtool_stats	= qca8k_get_ethtool_stats,
+	.get_sset_count		= qca8k_get_sset_count,
+	.get_eee		= qca8k_get_eee,
+	.set_eee		= qca8k_set_eee,
+	.port_enable		= qca8k_port_enable,
+	.port_disable		= qca8k_port_disable,
+	.port_stp_state_set	= qca8k_port_stp_state_set,
+	.port_bridge_join	= qca8k_port_bridge_join,
+	.port_bridge_leave	= qca8k_port_bridge_leave,
+	.port_fdb_prepare	= qca8k_fdb_prepare,
+	.port_fdb_add		= qca8k_fdb_add,
+	.port_fdb_del		= qca8k_fdb_del,
+	.port_fdb_dump		= qca8k_fdb_dump,
+};
+
+static int
+qca8k_sw_probe(struct mdio_device *mdiodev)
+{
+	struct qca8k_priv *priv;
+	u32 id;
+
+	/* allocate the private data struct so that we can probe the switches
+	 * ID register
+	 */
+	priv = devm_kzalloc(&mdiodev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->bus = mdiodev->bus;
+
+	/* read the switches ID register */
+	id = qca8k_read(priv, QCA8K_REG_MASK_CTRL);
+	id >>= QCA8K_MASK_CTRL_ID_S;
+	id &= QCA8K_MASK_CTRL_ID_M;
+	if (id != QCA8K_ID_QCA8337)
+		return -ENODEV;
+
+	priv->ds = devm_kzalloc(&mdiodev->dev, sizeof(*priv->ds), GFP_KERNEL);
+	if (!priv->ds)
+		return -ENOMEM;
+
+	priv->ds->priv = priv;
+	priv->ds->dev = &mdiodev->dev;
+	priv->ds->ops = &qca8k_switch_ops;
+	mutex_init(&priv->fdb_mutex);
+	dev_set_drvdata(&mdiodev->dev, priv);
+
+	return dsa_register_switch(priv->ds, priv->ds->dev->of_node);
+}
+
+static void
+qca8k_sw_remove(struct mdio_device *mdiodev)
+{
+	struct qca8k_priv *priv = dev_get_drvdata(&mdiodev->dev);
+
+	dsa_unregister_switch(priv->ds);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int qca8k_suspend(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct qca8k_priv *priv = platform_get_drvdata(pdev);
+
+	return dsa_switch_suspend(priv->ds);
+}
+
+static int qca8k_resume(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct qca8k_priv *priv = platform_get_drvdata(pdev);
+
+	return dsa_switch_resume(priv->ds);
+}
+#endif /* CONFIG_PM_SLEEP */
+
+static SIMPLE_DEV_PM_OPS(qca8k_pm_ops,
+			 qca8k_suspend, qca8k_resume);
+
+static const struct of_device_id qca8k_of_match[] = {
+	{ .compatible = "qca,qca8337" },
+	{ /* sentinel */ },
+};
+
+static struct mdio_driver qca8kmdio_driver = {
+	.probe  = qca8k_sw_probe,
+	.remove = qca8k_sw_remove,
+	.mdiodrv.driver = {
+		.name = "qca8k",
+		.of_match_table = qca8k_of_match,
+		.pm = &qca8k_pm_ops,
+	},
+};
+
+static int __init
+qca8kmdio_driver_register(void)
+{
+	return mdio_driver_register(&qca8kmdio_driver);
+}
+module_init(qca8kmdio_driver_register);
+
+static void __exit
+qca8kmdio_driver_unregister(void)
+{
+	mdio_driver_unregister(&qca8kmdio_driver);
+}
+module_exit(qca8kmdio_driver_unregister);
+
+MODULE_AUTHOR("Mathieu Olivari, John Crispin <john@phrozen.org>");
+MODULE_DESCRIPTION("Driver for QCA8K ethernet switch family");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:qca8k");
diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
new file mode 100644
index 0000000..374471f
--- /dev/null
+++ b/drivers/net/dsa/qca8k.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2009 Felix Fietkau <nbd@nbd.name>
+ * Copyright (C) 2011-2012 Gabor Juhos <juhosg@openwrt.org>
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __QCA8K_H
+#define __QCA8K_H
+
+#include <linux/delay.h>
+#include <linux/regmap.h>
+
+#define QCA8K_NUM_PORTS					7
+
+#define PHY_ID_QCA8337					0x004dd036
+#define QCA8K_ID_QCA8337				0x13
+
+#define QCA8K_NUM_FDB_RECORDS				2048
+
+#define QCA8K_CPU_PORT					0
+
+/* Global control registers */
+#define QCA8K_REG_MASK_CTRL				0x000
+#define   QCA8K_MASK_CTRL_ID_M				0xff
+#define   QCA8K_MASK_CTRL_ID_S				8
+#define QCA8K_REG_PORT0_PAD_CTRL			0x004
+#define QCA8K_REG_PORT5_PAD_CTRL			0x008
+#define QCA8K_REG_PORT6_PAD_CTRL			0x00c
+#define   QCA8K_PORT_PAD_RGMII_EN			BIT(26)
+#define   QCA8K_PORT_PAD_RGMII_TX_DELAY(x)		\
+						((0x8 + (x & 0x3)) << 22)
+#define   QCA8K_PORT_PAD_RGMII_RX_DELAY(x)		\
+						((0x10 + (x & 0x3)) << 20)
+#define   QCA8K_PORT_PAD_RGMII_RX_DELAY_EN		BIT(24)
+#define   QCA8K_PORT_PAD_SGMII_EN			BIT(7)
+#define QCA8K_REG_MODULE_EN				0x030
+#define   QCA8K_MODULE_EN_MIB				BIT(0)
+#define QCA8K_REG_MIB					0x034
+#define   QCA8K_MIB_CPU_KEEP				BIT(20)
+#define QCA8K_GOL_MAC_ADDR0				0x60
+#define QCA8K_GOL_MAC_ADDR1				0x64
+#define QCA8K_REG_PORT_STATUS(_i)			(0x07c + (_i) * 4)
+#define   QCA8K_PORT_STATUS_SPEED			GENMASK(2, 0)
+#define   QCA8K_PORT_STATUS_SPEED_S			0
+#define   QCA8K_PORT_STATUS_TXMAC			BIT(2)
+#define   QCA8K_PORT_STATUS_RXMAC			BIT(3)
+#define   QCA8K_PORT_STATUS_TXFLOW			BIT(4)
+#define   QCA8K_PORT_STATUS_RXFLOW			BIT(5)
+#define   QCA8K_PORT_STATUS_DUPLEX			BIT(6)
+#define   QCA8K_PORT_STATUS_LINK_UP			BIT(8)
+#define   QCA8K_PORT_STATUS_LINK_AUTO			BIT(9)
+#define   QCA8K_PORT_STATUS_LINK_PAUSE			BIT(10)
+#define QCA8K_REG_PORT_HDR_CTRL(_i)			(0x9c + (_i * 4))
+#define   QCA8K_PORT_HDR_CTRL_RX_MASK			GENMASK(3, 2)
+#define   QCA8K_PORT_HDR_CTRL_RX_S			2
+#define   QCA8K_PORT_HDR_CTRL_TX_MASK			GENMASK(1, 0)
+#define   QCA8K_PORT_HDR_CTRL_TX_S			0
+#define   QCA8K_PORT_HDR_CTRL_ALL			2
+#define   QCA8K_PORT_HDR_CTRL_MGMT			1
+#define   QCA8K_PORT_HDR_CTRL_NONE			0
+
+/* EEE control registers */
+#define QCA8K_REG_EEE_CTRL				0x100
+#define  QCA8K_REG_EEE_CTRL_LPI_EN(_i)			((_i + 1) * 2)
+
+/* ACL registers */
+#define QCA8K_REG_PORT_VLAN_CTRL0(_i)			(0x420 + (_i * 8))
+#define   QCA8K_PORT_VLAN_CVID(x)			(x << 16)
+#define   QCA8K_PORT_VLAN_SVID(x)			x
+#define QCA8K_REG_PORT_VLAN_CTRL1(_i)			(0x424 + (_i * 8))
+#define QCA8K_REG_IPV4_PRI_BASE_ADDR			0x470
+#define QCA8K_REG_IPV4_PRI_ADDR_MASK			0x474
+
+/* Lookup registers */
+#define QCA8K_REG_ATU_DATA0				0x600
+#define   QCA8K_ATU_ADDR2_S				24
+#define   QCA8K_ATU_ADDR3_S				16
+#define   QCA8K_ATU_ADDR4_S				8
+#define QCA8K_REG_ATU_DATA1				0x604
+#define   QCA8K_ATU_PORT_M				0x7f
+#define   QCA8K_ATU_PORT_S				16
+#define   QCA8K_ATU_ADDR0_S				8
+#define QCA8K_REG_ATU_DATA2				0x608
+#define   QCA8K_ATU_VID_M				0xfff
+#define   QCA8K_ATU_VID_S				8
+#define   QCA8K_ATU_STATUS_M				0xf
+#define   QCA8K_ATU_STATUS_STATIC			0xf
+#define QCA8K_REG_ATU_FUNC				0x60c
+#define   QCA8K_ATU_FUNC_BUSY				BIT(31)
+#define   QCA8K_ATU_FUNC_PORT_EN			BIT(14)
+#define   QCA8K_ATU_FUNC_PORT_M				0xf
+#define   QCA8K_ATU_FUNC_PORT_S				8
+#define QCA8K_REG_GLOBAL_FW_CTRL0			0x620
+#define   QCA8K_GLOBAL_FW_CTRL0_CPU_PORT_EN		BIT(10)
+#define QCA8K_REG_GLOBAL_FW_CTRL1			0x624
+#define   QCA8K_GLOBAL_FW_CTRL1_IGMP_DP_S		24
+#define   QCA8K_GLOBAL_FW_CTRL1_BC_DP_S			16
+#define   QCA8K_GLOBAL_FW_CTRL1_MC_DP_S			8
+#define   QCA8K_GLOBAL_FW_CTRL1_UC_DP_S			0
+#define QCA8K_PORT_LOOKUP_CTRL(_i)			(0x660 + (_i) * 0xc)
+#define   QCA8K_PORT_LOOKUP_MEMBER			GENMASK(6, 0)
+#define   QCA8K_PORT_LOOKUP_STATE_MASK			GENMASK(18, 16)
+#define   QCA8K_PORT_LOOKUP_STATE_DISABLED		(0 << 16)
+#define   QCA8K_PORT_LOOKUP_STATE_BLOCKING		(1 << 16)
+#define   QCA8K_PORT_LOOKUP_STATE_LISTENING		(2 << 16)
+#define   QCA8K_PORT_LOOKUP_STATE_LEARNING		(3 << 16)
+#define   QCA8K_PORT_LOOKUP_STATE_FORWARD		(4 << 16)
+#define   QCA8K_PORT_LOOKUP_STATE			GENMASK(18, 16)
+#define   QCA8K_PORT_LOOKUP_LEARN			BIT(20)
+
+/* Pkt edit registers */
+#define QCA8K_EGRESS_VLAN(x)				(0x0c70 + (4 * (x / 2)))
+
+/* L3 registers */
+#define QCA8K_HROUTER_CONTROL				0xe00
+#define   QCA8K_HROUTER_CONTROL_GLB_LOCKTIME_M		GENMASK(17, 16)
+#define   QCA8K_HROUTER_CONTROL_GLB_LOCKTIME_S		16
+#define   QCA8K_HROUTER_CONTROL_ARP_AGE_MODE		1
+#define QCA8K_HROUTER_PBASED_CONTROL1			0xe08
+#define QCA8K_HROUTER_PBASED_CONTROL2			0xe0c
+#define QCA8K_HNAT_CONTROL				0xe38
+
+/* MIB registers */
+#define QCA8K_PORT_MIB_COUNTER(_i)			(0x1000 + (_i) * 0x100)
+
+/* QCA specific MII registers */
+#define MII_ATH_MMD_ADDR				0x0d
+#define MII_ATH_MMD_DATA				0x0e
+
+enum {
+	QCA8K_PORT_SPEED_10M = 0,
+	QCA8K_PORT_SPEED_100M = 1,
+	QCA8K_PORT_SPEED_1000M = 2,
+	QCA8K_PORT_SPEED_ERR = 3,
+};
+
+enum qca8k_fdb_cmd {
+	QCA8K_FDB_FLUSH	= 1,
+	QCA8K_FDB_LOAD = 2,
+	QCA8K_FDB_PURGE = 3,
+	QCA8K_FDB_NEXT = 6,
+	QCA8K_FDB_SEARCH = 7,
+};
+
+struct ar8xxx_port_status {
+	struct ethtool_eee eee;
+	struct net_device *bridge_dev;
+};
+
+struct qca8k_priv {
+	struct regmap *regmap;
+	struct mii_bus *bus;
+	struct ar8xxx_port_status port_sts[QCA8K_NUM_PORTS];
+	struct dsa_switch *ds;
+	struct mutex fdb_mutex;
+};
+
+struct qca8k_mib_desc {
+	unsigned int size;
+	unsigned int offset;
+	const char *name;
+};
+
+struct qca8k_fdb {
+	u16 vid;
+	u8 port_mask;
+	u8 aging;
+	u8 mac[6];
+};
+
+#endif /* __QCA8K_H */
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH v5 0/6] Add eBPF hooks for cgroups
From: Thomas Graf @ 2016-09-14 11:06 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Alexei Starovoitov, Daniel Mack, htejun-b10kYP2dOMg,
	daniel-FeC+5ew28dpmcu3hnIyYJQ, ast-b10kYP2dOMg,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, kafai-b10kYP2dOMg,
	fw-HFFVJYpyMKqzQB+pC5nmwQ, harald-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA, sargun-GaZTRHToo+CzQB+pC5nmwQ,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20160914103038.GA910@salvia>

On 09/14/16 at 12:30pm, Pablo Neira Ayuso wrote:
> On Tue, Sep 13, 2016 at 09:42:19PM -0700, Alexei Starovoitov wrote:
> [...]
> > For us this cgroup+bpf is _not_ for filterting and _not_ for security.
> 
> If your goal is monitoring, then convert these hooks not to allow to
> issue a verdict on the packet, so this becomes inoquous in the same
> fashion as the tracing infrastructure.

Why? How is this at all offensive? We have three parties voicing
interest in this work for both monitoring and security. At least
two specific use cases have been described.  It builds on top of
existing infrastructure and nicely complements other ongoing work.
Why not both?

^ permalink raw reply

* Re: [PATCH v4 01/16] vmxnet3: Move PCI Id to pci_ids.h
From: Yuval Shaia @ 2016-09-14 11:08 UTC (permalink / raw)
  To: Adit Ranadive
  Cc: dledford, linux-rdma, pv-drivers, netdev, linux-pci, jhansen,
	asarwade, georgezhang, bryantan
In-Reply-To: <1473655766-31628-2-git-send-email-aditr@vmware.com>

Please update vmxnet3_drv.c accordingly.

Yuval

On Sun, Sep 11, 2016 at 09:49:11PM -0700, Adit Ranadive wrote:
> The VMXNet3 PCI Id will be shared with our paravirtual RDMA driver.
> Moved it to the shared location in pci_ids.h.
> 
> Suggested-by: Leon Romanovsky <leon@kernel.org>
> Signed-off-by: Adit Ranadive <aditr@vmware.com>
> ---
> ---
>  drivers/net/vmxnet3/vmxnet3_int.h | 3 +--
>  include/linux/pci_ids.h           | 1 +
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
> index 74fc030..2bd6bf8 100644
> --- a/drivers/net/vmxnet3/vmxnet3_int.h
> +++ b/drivers/net/vmxnet3/vmxnet3_int.h
> @@ -119,9 +119,8 @@ enum {
>  };
>  
>  /*
> - * PCI vendor and device IDs.
> + * Maximum devices supported.
>   */
> -#define PCI_DEVICE_ID_VMWARE_VMXNET3    0x07B0
>  #define MAX_ETHERNET_CARDS		10
>  #define MAX_PCI_PASSTHRU_DEVICE		6
>  
> diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
> index c58752f..98bb455 100644
> --- a/include/linux/pci_ids.h
> +++ b/include/linux/pci_ids.h
> @@ -2251,6 +2251,7 @@
>  #define PCI_DEVICE_ID_RASTEL_2PORT	0x2000
>  
>  #define PCI_VENDOR_ID_VMWARE		0x15ad
> +#define PCI_DEVICE_ID_VMWARE_VMXNET3	0x07b0
>  
>  #define PCI_VENDOR_ID_ZOLTRIX		0x15b0
>  #define PCI_DEVICE_ID_ZOLTRIX_2BD0	0x2bd0
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] net/mlx4_en: fix off by one in error handling
From: Sebastian Ott @ 2016-09-14 11:09 UTC (permalink / raw)
  To: Yishai Hadas, Tariq Toukan
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

If an error occurs in mlx4_init_eq_table the index used in the
err_out_unmap label is one too big which results in a panic in
mlx4_free_eq. This patch fixes the index in the error path.

Signed-off-by: Sebastian Ott <sebott-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx4/eq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
index f613977..cf8f8a7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -1305,8 +1305,8 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
 	return 0;
 
 err_out_unmap:
-	while (i >= 0)
-		mlx4_free_eq(dev, &priv->eq_table.eq[i--]);
+	while (i > 0)
+		mlx4_free_eq(dev, &priv->eq_table.eq[--i]);
 #ifdef CONFIG_RFS_ACCEL
 	for (i = 1; i <= dev->caps.num_ports; i++) {
 		if (mlx4_priv(dev)->port[i].rmap) {
-- 
2.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH v5 0/6] Add eBPF hooks for cgroups
From: Daniel Mack @ 2016-09-14 11:13 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: htejun, daniel, ast, davem, kafai, fw, harald, netdev, sargun,
	cgroups
In-Reply-To: <20160913172408.GC6138@salvia>

Hi Pablo,

On 09/13/2016 07:24 PM, Pablo Neira Ayuso wrote:
> On Tue, Sep 13, 2016 at 03:31:20PM +0200, Daniel Mack wrote:
>> On 09/13/2016 01:56 PM, Pablo Neira Ayuso wrote:
>>> On Mon, Sep 12, 2016 at 06:12:09PM +0200, Daniel Mack wrote:
>>>> This is v5 of the patch set to allow eBPF programs for network
>>>> filtering and accounting to be attached to cgroups, so that they apply
>>>> to all sockets of all tasks placed in that cgroup. The logic also
>>>> allows to be extendeded for other cgroup based eBPF logic.
>>>
>>> 1) This infrastructure can only be useful to systemd, or any similar
>>>    orchestration daemon. Look, you can only apply filtering policies
>>>    to processes that are launched by systemd, so this only works
>>>    for server processes.
>>
>> Sorry, but both statements aren't true. The eBPF policies apply to every
>> process that is placed in a cgroup, and my example program in 6/6 shows
>> how that can be done from the command line.
> 
> Then you have to explain me how can anyone else than systemd use this
> infrastructure?

I have no idea what makes you think this is limited to systemd. As I
said, I provided an example for userspace that works from the command
line. The same limitation apply as for all other users of cgroups.

> My main point is that those processes *need* to be launched by the
> orchestrator, which is was refering as 'server processes'.

Yes, that's right. But as I said, this rule applies to many other kernel
concepts, so I don't see any real issue.

>> That's a limitation that applies to many more control mechanisms in the
>> kernel, and it's something that can easily be solved with fork+exec.
> 
> As long as you have control to launch the processes yes, but this
> will not work in other scenarios. Just like cgroup net_cls and friends
> are broken for filtering for things that you have no control to
> fork+exec.

Probably, but that's only solvable with rules that store the full cgroup
path then, and do a string comparison (!) for each packet flying by.

>> That's just as transparent as SO_ATTACH_FILTER. What kind of
>> introspection mechanism do you have in mind?
> 
> SO_ATTACH_FILTER is called from the process itself, so this is a local
> filtering policy that you apply to your own process.

Not necessarily. You can as well do it the inetd way, and pass the
socket to a process that is launched on demand, but do SO_ATTACH_FILTER
+ SO_LOCK_FILTER  in the middle. What happens with payload on the socket
is not transparent to the launched binary at all. The proposed cgroup
eBPF solution implements a very similar behavior in that regard.

>> It's about filtering outgoing network packets of applications, and
>> providing them with L2 information for filtering purposes. I don't think
>> that's a very specific use-case.
>>
>> When the feature is not used at all, the added costs on the output path
>> are close to zero, due to the use of static branches.
> 
> *You're proposing a socket filtering facility that hooks layer 2
> output path*!

As I said, I'm open to discussing that. In order to make it work for L3,
the LL_OFF issues need to be solved, as Daniel explained. Daniel,
Alexei, any idea how much work that would be?

> That is only a rough ~30 lines kernel patchset to support this in
> netfilter and only one extra input hook, with potential access to
> conntrack and better integration with other existing subsystems.

Care to share the patches for that? I'd really like to have a look.

And FWIW, I agree with Thomas - there is nothing wrong with having
multiple options to use for such use-cases.


Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH v4 06/16] IB/pvrdma: Add paravirtual rdma device
From: Yuval Shaia @ 2016-09-14 11:17 UTC (permalink / raw)
  To: Adit Ranadive
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	pv-drivers-pghWNbHTmq7QT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-pci-u79uwXL29TY76Z2rM5mHXA, jhansen-pghWNbHTmq7QT0dZR+AlfA,
	asarwade-pghWNbHTmq7QT0dZR+AlfA,
	georgezhang-pghWNbHTmq7QT0dZR+AlfA,
	bryantan-pghWNbHTmq7QT0dZR+AlfA
In-Reply-To: <1473655766-31628-7-git-send-email-aditr-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>

No more comments.
Reviewed-by: Yuval Shaia <yuval.shaia-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

On Sun, Sep 11, 2016 at 09:49:16PM -0700, Adit Ranadive wrote:
> This patch adds the main device-level structures and functions to be used
> to provide RDMA functionality. Also, we define conversion functions from
> the IB core stack structures to the device-specific ones.
> 
> Reviewed-by: Jorgen Hansen <jhansen-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> Reviewed-by: George Zhang <georgezhang-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> Reviewed-by: Aditya Sarwade <asarwade-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> Reviewed-by: Bryan Tan <bryantan-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> Signed-off-by: Adit Ranadive <aditr-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> ---
> Changes v3->v4:
>  - Renamed pvrdma_flush_cqe to _pvrdma_flush_cqe since we hold a lock
>  to call it.
>  - Added wrapper functions for writing to UARs for CQ/QP.
>  - The conversion functions are updated as func_name(dst, src) format.
>  - Renamed max_gs to max_sg.
>  - Added work struct for net device events.
>  - priviledged -> privileged.
> 
> Changes v2->v3:
>  - Removed VMware vendor id redefinition.
>  - Removed the boolean in pvrdma_cmd_post.
> ---
>  drivers/infiniband/hw/pvrdma/pvrdma.h | 473 ++++++++++++++++++++++++++++++++++
>  1 file changed, 473 insertions(+)
>  create mode 100644 drivers/infiniband/hw/pvrdma/pvrdma.h
> 
> diff --git a/drivers/infiniband/hw/pvrdma/pvrdma.h b/drivers/infiniband/hw/pvrdma/pvrdma.h
> new file mode 100644
> index 0000000..fedd7cb
> --- /dev/null
> +++ b/drivers/infiniband/hw/pvrdma/pvrdma.h
> @@ -0,0 +1,473 @@
> +/*
> + * Copyright (c) 2012-2016 VMware, Inc.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of EITHER the GNU General Public License
> + * version 2 as published by the Free Software Foundation or the BSD
> + * 2-Clause License. This program is distributed in the hope that it
> + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED
> + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
> + * See the GNU General Public License version 2 for more details at
> + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program available in the file COPYING in the main
> + * directory of this source tree.
> + *
> + * The BSD 2-Clause License
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
> + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
> + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
> + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
> + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
> + * OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef __PVRDMA_H__
> +#define __PVRDMA_H__
> +
> +#include <linux/compiler.h>
> +#include <linux/interrupt.h>
> +#include <linux/list.h>
> +#include <linux/mutex.h>
> +#include <linux/pci.h>
> +#include <linux/semaphore.h>
> +#include <linux/workqueue.h>
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_verbs.h>
> +
> +#include "pvrdma_defs.h"
> +#include "pvrdma_dev_api.h"
> +#include "pvrdma_verbs.h"
> +
> +/* NOT the same as BIT_MASK(). */
> +#define PVRDMA_MASK(n) ((n << 1) - 1)
> +
> +/*
> + * VMware PVRDMA PCI device id.
> + */
> +#define PCI_DEVICE_ID_VMWARE_PVRDMA	0x0820
> +
> +struct pvrdma_dev;
> +
> +struct pvrdma_page_dir {
> +	dma_addr_t dir_dma;
> +	u64 *dir;
> +	int ntables;
> +	u64 **tables;
> +	u64 npages;
> +	void **pages;
> +};
> +
> +struct pvrdma_cq {
> +	struct ib_cq ibcq;
> +	int offset;
> +	spinlock_t cq_lock; /* Poll lock. */
> +	struct pvrdma_uar_map *uar;
> +	struct ib_umem *umem;
> +	struct pvrdma_ring_state *ring_state;
> +	struct pvrdma_page_dir pdir;
> +	u32 cq_handle;
> +	bool is_kernel;
> +	atomic_t refcnt;
> +	wait_queue_head_t wait;
> +};
> +
> +struct pvrdma_id_table {
> +	u32 last;
> +	u32 top;
> +	u32 max;
> +	u32 mask;
> +	spinlock_t lock; /* Table lock. */
> +	unsigned long *table;
> +};
> +
> +struct pvrdma_uar_map {
> +	unsigned long pfn;
> +	void __iomem *map;
> +	int index;
> +};
> +
> +struct pvrdma_uar_table {
> +	struct pvrdma_id_table tbl;
> +	int size;
> +};
> +
> +struct pvrdma_ucontext {
> +	struct ib_ucontext ibucontext;
> +	struct pvrdma_dev *dev;
> +	struct pvrdma_uar_map uar;
> +	u64 ctx_handle;
> +};
> +
> +struct pvrdma_pd {
> +	struct ib_pd ibpd;
> +	u32 pdn;
> +	u32 pd_handle;
> +	int privileged;
> +};
> +
> +struct pvrdma_mr {
> +	u32 mr_handle;
> +	u64 iova;
> +	u64 size;
> +};
> +
> +struct pvrdma_user_mr {
> +	struct ib_mr ibmr;
> +	struct ib_umem *umem;
> +	struct pvrdma_mr mmr;
> +	struct pvrdma_page_dir pdir;
> +	u64 *pages;
> +	u32 npages;
> +	u32 max_pages;
> +	u32 page_shift;
> +};
> +
> +struct pvrdma_wq {
> +	struct pvrdma_ring *ring;
> +	spinlock_t lock; /* Work queue lock. */
> +	int wqe_cnt;
> +	int wqe_size;
> +	int max_sg;
> +	int offset;
> +};
> +
> +struct pvrdma_ah {
> +	struct ib_ah ibah;
> +	struct pvrdma_av av;
> +};
> +
> +struct pvrdma_qp {
> +	struct ib_qp ibqp;
> +	u32 qp_handle;
> +	u32 qkey;
> +	struct pvrdma_wq sq;
> +	struct pvrdma_wq rq;
> +	struct ib_umem *rumem;
> +	struct ib_umem *sumem;
> +	struct pvrdma_page_dir pdir;
> +	int npages;
> +	int npages_send;
> +	int npages_recv;
> +	u32 flags;
> +	u8 port;
> +	u8 state;
> +	bool is_kernel;
> +	struct mutex mutex; /* QP state mutex. */
> +	atomic_t refcnt;
> +	wait_queue_head_t wait;
> +};
> +
> +struct pvrdma_dev {
> +	/* PCI device-related information. */
> +	struct ib_device ib_dev;
> +	struct pci_dev *pdev;
> +	void __iomem *regs;
> +	struct pvrdma_device_shared_region *dsr; /* Shared region pointer */
> +	dma_addr_t dsrbase; /* Shared region base address */
> +	void *cmd_slot;
> +	void *resp_slot;
> +	unsigned long flags;
> +	struct list_head device_link;
> +
> +	/* Locking and interrupt information. */
> +	spinlock_t cmd_lock; /* Command lock. */
> +	struct semaphore cmd_sema;
> +	struct completion cmd_done;
> +	struct {
> +		enum pvrdma_intr_type type; /* Intr type */
> +		struct msix_entry msix_entry[PVRDMA_MAX_INTERRUPTS];
> +		irq_handler_t handler[PVRDMA_MAX_INTERRUPTS];
> +		u8 enabled[PVRDMA_MAX_INTERRUPTS];
> +		u8 size;
> +	} intr;
> +
> +	/* RDMA-related device information. */
> +	union ib_gid *sgid_tbl;
> +	struct pvrdma_ring_state *async_ring_state;
> +	struct pvrdma_page_dir async_pdir;
> +	struct pvrdma_ring_state *cq_ring_state;
> +	struct pvrdma_page_dir cq_pdir;
> +	struct pvrdma_cq **cq_tbl;
> +	spinlock_t cq_tbl_lock;
> +	struct pvrdma_qp **qp_tbl;
> +	spinlock_t qp_tbl_lock;
> +	struct pvrdma_uar_table uar_table;
> +	struct pvrdma_uar_map driver_uar;
> +	__be64 sys_image_guid;
> +	spinlock_t desc_lock; /* Device modification lock. */
> +	u32 port_cap_mask;
> +	struct mutex port_mutex; /* Port modification mutex. */
> +	bool ib_active;
> +	atomic_t num_qps;
> +	atomic_t num_cqs;
> +	atomic_t num_pds;
> +	atomic_t num_ahs;
> +
> +	/* Network device information. */
> +	struct net_device *netdev;
> +	struct notifier_block nb_netdev;
> +};
> +
> +struct pvrdma_netdevice_work {
> +	struct work_struct work;
> +	struct net_device *event_netdev;
> +	unsigned long event;
> +};
> +
> +static inline struct pvrdma_dev *to_vdev(struct ib_device *ibdev)
> +{
> +	return container_of(ibdev, struct pvrdma_dev, ib_dev);
> +}
> +
> +static inline struct
> +pvrdma_ucontext *to_vucontext(struct ib_ucontext *ibucontext)
> +{
> +	return container_of(ibucontext, struct pvrdma_ucontext, ibucontext);
> +}
> +
> +static inline struct pvrdma_pd *to_vpd(struct ib_pd *ibpd)
> +{
> +	return container_of(ibpd, struct pvrdma_pd, ibpd);
> +}
> +
> +static inline struct pvrdma_cq *to_vcq(struct ib_cq *ibcq)
> +{
> +	return container_of(ibcq, struct pvrdma_cq, ibcq);
> +}
> +
> +static inline struct pvrdma_user_mr *to_vmr(struct ib_mr *ibmr)
> +{
> +	return container_of(ibmr, struct pvrdma_user_mr, ibmr);
> +}
> +
> +static inline struct pvrdma_qp *to_vqp(struct ib_qp *ibqp)
> +{
> +	return container_of(ibqp, struct pvrdma_qp, ibqp);
> +}
> +
> +static inline struct pvrdma_ah *to_vah(struct ib_ah *ibah)
> +{
> +	return container_of(ibah, struct pvrdma_ah, ibah);
> +}
> +
> +static inline void pvrdma_write_reg(struct pvrdma_dev *dev, u32 reg, u32 val)
> +{
> +	writel(cpu_to_le32(val), dev->regs + reg);
> +}
> +
> +static inline u32 pvrdma_read_reg(struct pvrdma_dev *dev, u32 reg)
> +{
> +	return le32_to_cpu(readl(dev->regs + reg));
> +}
> +
> +static inline void pvrdma_write_uar_cq(struct pvrdma_dev *dev, u32 val)
> +{
> +	writel(cpu_to_le32(val), dev->driver_uar.map + PVRDMA_UAR_CQ_OFFSET);
> +}
> +
> +static inline void pvrdma_write_uar_qp(struct pvrdma_dev *dev, u32 val)
> +{
> +	writel(cpu_to_le32(val), dev->driver_uar.map + PVRDMA_UAR_QP_OFFSET);
> +}
> +
> +static inline void *pvrdma_page_dir_get_ptr(struct pvrdma_page_dir *pdir,
> +					    u64 offset)
> +{
> +	return pdir->pages[offset / PAGE_SIZE] + (offset % PAGE_SIZE);
> +}
> +
> +static inline enum pvrdma_mtu ib_mtu_to_pvrdma(enum ib_mtu mtu)
> +{
> +	return (enum pvrdma_mtu)mtu;
> +}
> +
> +static inline enum ib_mtu pvrdma_mtu_to_ib(enum pvrdma_mtu mtu)
> +{
> +	return (enum ib_mtu)mtu;
> +}
> +
> +static inline enum pvrdma_port_state ib_port_state_to_pvrdma(
> +					enum ib_port_state state)
> +{
> +	return (enum pvrdma_port_state)state;
> +}
> +
> +static inline enum ib_port_state pvrdma_port_state_to_ib(
> +					enum pvrdma_port_state state)
> +{
> +	return (enum ib_port_state)state;
> +}
> +
> +static inline int ib_port_cap_flags_to_pvrdma(int flags)
> +{
> +	return flags & PVRDMA_MASK(PVRDMA_PORT_CAP_FLAGS_MAX);
> +}
> +
> +static inline int pvrdma_port_cap_flags_to_ib(int flags)
> +{
> +	return flags;
> +}
> +
> +static inline enum pvrdma_port_width ib_port_width_to_pvrdma(
> +					enum ib_port_width width)
> +{
> +	return (enum pvrdma_port_width)width;
> +}
> +
> +static inline enum ib_port_width pvrdma_port_width_to_ib(
> +					enum pvrdma_port_width width)
> +{
> +	return (enum ib_port_width)width;
> +}
> +
> +static inline enum pvrdma_port_speed ib_port_speed_to_pvrdma(
> +					enum ib_port_speed speed)
> +{
> +	return (enum pvrdma_port_speed)speed;
> +}
> +
> +static inline enum ib_port_speed pvrdma_port_speed_to_ib(
> +					enum pvrdma_port_speed speed)
> +{
> +	return (enum ib_port_speed)speed;
> +}
> +
> +static inline int pvrdma_qp_attr_mask_to_ib(int attr_mask)
> +{
> +	return attr_mask;
> +}
> +
> +static inline int ib_qp_attr_mask_to_pvrdma(int attr_mask)
> +{
> +	return attr_mask & PVRDMA_MASK(PVRDMA_QP_ATTR_MASK_MAX);
> +}
> +
> +static inline enum pvrdma_mig_state ib_mig_state_to_pvrdma(
> +					enum ib_mig_state state)
> +{
> +	return (enum pvrdma_mig_state)state;
> +}
> +
> +static inline enum ib_mig_state pvrdma_mig_state_to_ib(
> +					enum pvrdma_mig_state state)
> +{
> +	return (enum ib_mig_state)state;
> +}
> +
> +static inline int ib_access_flags_to_pvrdma(int flags)
> +{
> +	return flags;
> +}
> +
> +static inline int pvrdma_access_flags_to_ib(int flags)
> +{
> +	return flags & PVRDMA_MASK(PVRDMA_ACCESS_FLAGS_MAX);
> +}
> +
> +static inline enum pvrdma_qp_type ib_qp_type_to_pvrdma(enum ib_qp_type type)
> +{
> +	return (enum pvrdma_qp_type)type;
> +}
> +
> +static inline enum ib_qp_type pvrdma_qp_type_to_ib(enum pvrdma_qp_type type)
> +{
> +	return (enum ib_qp_type)type;
> +}
> +
> +static inline enum pvrdma_qp_state ib_qp_state_to_pvrdma(enum ib_qp_state state)
> +{
> +	return (enum pvrdma_qp_state)state;
> +}
> +
> +static inline enum ib_qp_state pvrdma_qp_state_to_ib(enum pvrdma_qp_state state)
> +{
> +	return (enum ib_qp_state)state;
> +}
> +
> +static inline enum pvrdma_wr_opcode ib_wr_opcode_to_pvrdma(enum ib_wr_opcode op)
> +{
> +	return (enum pvrdma_wr_opcode)op;
> +}
> +
> +static inline enum ib_wc_status pvrdma_wc_status_to_ib(
> +					enum pvrdma_wc_status status)
> +{
> +	return (enum ib_wc_status)status;
> +}
> +
> +static inline int pvrdma_wc_opcode_to_ib(int opcode)
> +{
> +	return opcode;
> +}
> +
> +static inline int pvrdma_wc_flags_to_ib(int flags)
> +{
> +	return flags;
> +}
> +
> +static inline int ib_send_flags_to_pvrdma(int flags)
> +{
> +	return flags & PVRDMA_MASK(PVRDMA_SEND_FLAGS_MAX);
> +}
> +
> +void pvrdma_qp_cap_to_ib(struct ib_qp_cap *dst,
> +			 const struct pvrdma_qp_cap *src);
> +void ib_qp_cap_to_pvrdma(struct pvrdma_qp_cap *dst,
> +			 const struct ib_qp_cap *src);
> +void pvrdma_gid_to_ib(union ib_gid *dst, const union pvrdma_gid *src);
> +void ib_gid_to_pvrdma(union pvrdma_gid *dst, const union ib_gid *src);
> +void pvrdma_global_route_to_ib(struct ib_global_route *dst,
> +			       const struct pvrdma_global_route *src);
> +void ib_global_route_to_pvrdma(struct pvrdma_global_route *dst,
> +			       const struct ib_global_route *src);
> +void pvrdma_ah_attr_to_ib(struct ib_ah_attr *dst,
> +			  const struct pvrdma_ah_attr *src);
> +void ib_ah_attr_to_pvrdma(struct pvrdma_ah_attr *dst,
> +			  const struct ib_ah_attr *src);
> +
> +int pvrdma_uar_table_init(struct pvrdma_dev *dev);
> +void pvrdma_uar_table_cleanup(struct pvrdma_dev *dev);
> +
> +int pvrdma_uar_alloc(struct pvrdma_dev *dev, struct pvrdma_uar_map *uar);
> +void pvrdma_uar_free(struct pvrdma_dev *dev, struct pvrdma_uar_map *uar);
> +
> +void _pvrdma_flush_cqe(struct pvrdma_qp *qp, struct pvrdma_cq *cq);
> +
> +int pvrdma_page_dir_init(struct pvrdma_dev *dev, struct pvrdma_page_dir *pdir,
> +			 u64 npages, bool alloc_pages);
> +void pvrdma_page_dir_cleanup(struct pvrdma_dev *dev,
> +			     struct pvrdma_page_dir *pdir);
> +int pvrdma_page_dir_insert_dma(struct pvrdma_page_dir *pdir, u64 idx,
> +			       dma_addr_t daddr);
> +int pvrdma_page_dir_insert_umem(struct pvrdma_page_dir *pdir,
> +				struct ib_umem *umem, u64 offset);
> +dma_addr_t pvrdma_page_dir_get_dma(struct pvrdma_page_dir *pdir, u64 idx);
> +int pvrdma_page_dir_insert_page_list(struct pvrdma_page_dir *pdir,
> +				     u64 *page_list, int num_pages);
> +
> +int pvrdma_cmd_post(struct pvrdma_dev *dev, union pvrdma_cmd_req *req,
> +		    union pvrdma_cmd_resp *rsp);
> +
> +#endif /* __PVRDMA_H__ */
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v4 07/16] IB/pvrdma: Add helper functions
From: Yuval Shaia @ 2016-09-14 11:21 UTC (permalink / raw)
  To: Adit Ranadive
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	pv-drivers-pghWNbHTmq7QT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-pci-u79uwXL29TY76Z2rM5mHXA, jhansen-pghWNbHTmq7QT0dZR+AlfA,
	asarwade-pghWNbHTmq7QT0dZR+AlfA,
	georgezhang-pghWNbHTmq7QT0dZR+AlfA,
	bryantan-pghWNbHTmq7QT0dZR+AlfA
In-Reply-To: <1473655766-31628-8-git-send-email-aditr-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>

No more comments.
Reviewed-by: Yuval Shaia <yuval.shaia-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

On Sun, Sep 11, 2016 at 09:49:17PM -0700, Adit Ranadive wrote:
> This patch adds helper functions to store guest page addresses in a page
> directory structure. The page directory pointer is passed down to the
> backend which then maps the entire memory for the RDMA object by
> traversing the directory. We add some more helper functions for converting
> to/from RDMA stack address handles from/to PVRDMA ones.
> 
> Reviewed-by: Jorgen Hansen <jhansen-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> Reviewed-by: George Zhang <georgezhang-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> Reviewed-by: Aditya Sarwade <asarwade-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> Reviewed-by: Bryan Tan <bryantan-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> Signed-off-by: Adit Ranadive <aditr-pghWNbHTmq7QT0dZR+AlfA@public.gmane.org>
> ---
> Changes v3->v4:
>  - Updated conversion functions to func_name(dst, src) format.
>  - Removed unneeded local variables.
> ---
>  drivers/infiniband/hw/pvrdma/pvrdma_misc.c | 303 +++++++++++++++++++++++++++++
>  1 file changed, 303 insertions(+)
>  create mode 100644 drivers/infiniband/hw/pvrdma/pvrdma_misc.c
> 
> diff --git a/drivers/infiniband/hw/pvrdma/pvrdma_misc.c b/drivers/infiniband/hw/pvrdma/pvrdma_misc.c
> new file mode 100644
> index 0000000..1f12cd6
> --- /dev/null
> +++ b/drivers/infiniband/hw/pvrdma/pvrdma_misc.c
> @@ -0,0 +1,303 @@
> +/*
> + * Copyright (c) 2012-2016 VMware, Inc.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of EITHER the GNU General Public License
> + * version 2 as published by the Free Software Foundation or the BSD
> + * 2-Clause License. This program is distributed in the hope that it
> + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED
> + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
> + * See the GNU General Public License version 2 for more details at
> + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program available in the file COPYING in the main
> + * directory of this source tree.
> + *
> + * The BSD 2-Clause License
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
> + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
> + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
> + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
> + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
> + * OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <linux/errno.h>
> +#include <linux/slab.h>
> +#include <linux/bitmap.h>
> +
> +#include "pvrdma.h"
> +
> +int pvrdma_page_dir_init(struct pvrdma_dev *dev, struct pvrdma_page_dir *pdir,
> +			 u64 npages, bool alloc_pages)
> +{
> +	u64 i;
> +
> +	if (npages > PVRDMA_PAGE_DIR_MAX_PAGES)
> +		return -EINVAL;
> +
> +	memset(pdir, 0, sizeof(*pdir));
> +
> +	pdir->dir = dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
> +				       &pdir->dir_dma, GFP_KERNEL);
> +	if (!pdir->dir)
> +		goto err;
> +
> +	pdir->ntables = PVRDMA_PAGE_DIR_TABLE(npages - 1) + 1;
> +	pdir->tables = kcalloc(pdir->ntables, sizeof(*pdir->tables),
> +			       GFP_KERNEL);
> +	if (!pdir->tables)
> +		goto err;
> +
> +	for (i = 0; i < pdir->ntables; i++) {
> +		pdir->tables[i] = dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
> +						     &pdir->dir[i], GFP_KERNEL);
> +		if (!pdir->tables[i])
> +			goto err;
> +	}
> +
> +	pdir->npages = npages;
> +
> +	if (alloc_pages) {
> +		pdir->pages = kcalloc(npages, sizeof(*pdir->pages),
> +				      GFP_KERNEL);
> +		if (!pdir->pages)
> +			goto err;
> +
> +		for (i = 0; i < pdir->npages; i++) {
> +			dma_addr_t page_dma;
> +
> +			pdir->pages[i] = dma_alloc_coherent(&dev->pdev->dev,
> +							    PAGE_SIZE,
> +							    &page_dma,
> +							    GFP_KERNEL);
> +			if (!pdir->pages[i])
> +				goto err;
> +
> +			pvrdma_page_dir_insert_dma(pdir, i, page_dma);
> +		}
> +	}
> +
> +	return 0;
> +
> +err:
> +	pvrdma_page_dir_cleanup(dev, pdir);
> +
> +	return -ENOMEM;
> +}
> +
> +static u64 *pvrdma_page_dir_table(struct pvrdma_page_dir *pdir, u64 idx)
> +{
> +	return pdir->tables[PVRDMA_PAGE_DIR_TABLE(idx)];
> +}
> +
> +dma_addr_t pvrdma_page_dir_get_dma(struct pvrdma_page_dir *pdir, u64 idx)
> +{
> +	return pvrdma_page_dir_table(pdir, idx)[PVRDMA_PAGE_DIR_PAGE(idx)];
> +}
> +
> +static void pvrdma_page_dir_cleanup_pages(struct pvrdma_dev *dev,
> +					  struct pvrdma_page_dir *pdir)
> +{
> +	if (pdir->pages) {
> +		u64 i;
> +
> +		for (i = 0; i < pdir->npages && pdir->pages[i]; i++) {
> +			dma_addr_t page_dma = pvrdma_page_dir_get_dma(pdir, i);
> +
> +			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
> +					  pdir->pages[i], page_dma);
> +		}
> +
> +		kfree(pdir->pages);
> +	}
> +}
> +
> +static void pvrdma_page_dir_cleanup_tables(struct pvrdma_dev *dev,
> +					   struct pvrdma_page_dir *pdir)
> +{
> +	if (pdir->tables) {
> +		int i;
> +
> +		pvrdma_page_dir_cleanup_pages(dev, pdir);
> +
> +		for (i = 0; i < pdir->ntables; i++) {
> +			u64 *table = pdir->tables[i];
> +
> +			if (table)
> +				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
> +						  table, pdir->dir[i]);
> +		}
> +
> +		kfree(pdir->tables);
> +	}
> +}
> +
> +void pvrdma_page_dir_cleanup(struct pvrdma_dev *dev,
> +			     struct pvrdma_page_dir *pdir)
> +{
> +	if (pdir->dir) {
> +		pvrdma_page_dir_cleanup_tables(dev, pdir);
> +		dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
> +				  pdir->dir, pdir->dir_dma);
> +	}
> +}
> +
> +int pvrdma_page_dir_insert_dma(struct pvrdma_page_dir *pdir, u64 idx,
> +			       dma_addr_t daddr)
> +{
> +	u64 *table;
> +
> +	if (idx >= pdir->npages)
> +		return -EINVAL;
> +
> +	table = pvrdma_page_dir_table(pdir, idx);
> +	table[PVRDMA_PAGE_DIR_PAGE(idx)] = daddr;
> +
> +	return 0;
> +}
> +
> +int pvrdma_page_dir_insert_umem(struct pvrdma_page_dir *pdir,
> +				struct ib_umem *umem, u64 offset)
> +{
> +	u64 i = offset;
> +	int j, entry;
> +	int ret = 0, len = 0;
> +	struct scatterlist *sg;
> +
> +	if (offset >= pdir->npages)
> +		return -EINVAL;
> +
> +	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
> +		len = sg_dma_len(sg) >> PAGE_SHIFT;
> +		for (j = 0; j < len; j++) {
> +			dma_addr_t addr = sg_dma_address(sg) +
> +					  umem->page_size * j;
> +
> +			ret = pvrdma_page_dir_insert_dma(pdir, i, addr);
> +			if (ret)
> +				goto exit;
> +
> +			i++;
> +		}
> +	}
> +
> +exit:
> +	return ret;
> +}
> +
> +int pvrdma_page_dir_insert_page_list(struct pvrdma_page_dir *pdir,
> +				     u64 *page_list,
> +				     int num_pages)
> +{
> +	int i;
> +	int ret;
> +
> +	if (num_pages > pdir->npages)
> +		return -EINVAL;
> +
> +	for (i = 0; i < num_pages; i++) {
> +		ret = pvrdma_page_dir_insert_dma(pdir, i, page_list[i]);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +void pvrdma_qp_cap_to_ib(struct ib_qp_cap *dst, const struct pvrdma_qp_cap *src)
> +{
> +	dst->max_send_wr = src->max_send_wr;
> +	dst->max_recv_wr = src->max_recv_wr;
> +	dst->max_send_sge = src->max_send_sge;
> +	dst->max_recv_sge = src->max_recv_sge;
> +	dst->max_inline_data = src->max_inline_data;
> +}
> +
> +void ib_qp_cap_to_pvrdma(struct pvrdma_qp_cap *dst, const struct ib_qp_cap *src)
> +{
> +	dst->max_send_wr = src->max_send_wr;
> +	dst->max_recv_wr = src->max_recv_wr;
> +	dst->max_send_sge = src->max_send_sge;
> +	dst->max_recv_sge = src->max_recv_sge;
> +	dst->max_inline_data = src->max_inline_data;
> +}
> +
> +void pvrdma_gid_to_ib(union ib_gid *dst, const union pvrdma_gid *src)
> +{
> +	BUILD_BUG_ON(sizeof(union pvrdma_gid) != sizeof(union ib_gid));
> +	memcpy(dst, src, sizeof(*src));
> +}
> +
> +void ib_gid_to_pvrdma(union pvrdma_gid *dst, const union ib_gid *src)
> +{
> +	BUILD_BUG_ON(sizeof(union pvrdma_gid) != sizeof(union ib_gid));
> +	memcpy(dst, src, sizeof(*src));
> +}
> +
> +void pvrdma_global_route_to_ib(struct ib_global_route *dst,
> +			       const struct pvrdma_global_route *src)
> +{
> +	pvrdma_gid_to_ib(&dst->dgid, &src->dgid);
> +	dst->flow_label = src->flow_label;
> +	dst->sgid_index = src->sgid_index;
> +	dst->hop_limit = src->hop_limit;
> +	dst->traffic_class = src->traffic_class;
> +}
> +
> +void ib_global_route_to_pvrdma(struct pvrdma_global_route *dst,
> +			       const struct ib_global_route *src)
> +{
> +	ib_gid_to_pvrdma(&dst->dgid, &src->dgid);
> +	dst->flow_label = src->flow_label;
> +	dst->sgid_index = src->sgid_index;
> +	dst->hop_limit = src->hop_limit;
> +	dst->traffic_class = src->traffic_class;
> +}
> +
> +void pvrdma_ah_attr_to_ib(struct ib_ah_attr *dst,
> +			  const struct pvrdma_ah_attr *src)
> +{
> +	pvrdma_global_route_to_ib(&dst->grh, &src->grh);
> +	dst->dlid = src->dlid;
> +	dst->sl = src->sl;
> +	dst->src_path_bits = src->src_path_bits;
> +	dst->static_rate = src->static_rate;
> +	dst->ah_flags = src->ah_flags;
> +	dst->port_num = src->port_num;
> +	memcpy(&dst->dmac, &src->dmac, sizeof(dst->dmac));
> +}
> +
> +void ib_ah_attr_to_pvrdma(struct pvrdma_ah_attr *dst,
> +			  const struct ib_ah_attr *src)
> +{
> +	ib_global_route_to_pvrdma(&dst->grh, &src->grh);
> +	dst->dlid = src->dlid;
> +	dst->sl = src->sl;
> +	dst->src_path_bits = src->src_path_bits;
> +	dst->static_rate = src->static_rate;
> +	dst->ah_flags = src->ah_flags;
> +	dst->port_num = src->port_num;
> +	memcpy(&dst->dmac, &src->dmac, sizeof(dst->dmac));
> +}
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v3 net 1/1] net sched actions: fix GETing actions
From: Jamal Hadi Salim @ 2016-09-14 11:33 UTC (permalink / raw)
  To: Cong Wang; +Cc: David Miller, Linux Kernel Network Developers
In-Reply-To: <f24e1121-84c8-01d1-23cd-1288ba1402aa@mojatatu.com>

On 16-09-13 03:47 PM, Jamal Hadi Salim wrote:
> On 16-09-13 12:20 PM, Cong Wang wrote:
>> On Mon, Sep 12, 2016 at 4:07 PM, Jamal Hadi Salim <jhs@mojatatu.com>
>> wrote:

[..]
>> I am still trying to understand this piece, so here you hold the refcnt
>> for the same action used by the later iteration? Otherwise there is
>> almost none user inbetween hold and release...
>>
>> The comment you add is not clear to me, we use RTNL/RCU to
>> sync destroy and replace, so how could that happen?
>>
>
> I was worried about the destroy() hitting an error in that function.
> If an action already existed and all we asked for was to
> replace some attribute it would be deleted. It was the way the code was
> before your changes so i just restored it to its original form.
>

And I have verified this is needed. I went and made gact return
a failure if you replace something. I added a gact action; then
when i replaced it failed. And when it failed it replace the existing
action.
I then tried another experiment and batch replaced several actions
including the one i know would fail. I placed the failing action in
the middle and hallelujah, all the actions before the middle one got
deleted.

So please ACK this so we can move forward.

cheers,
jamal

^ permalink raw reply

* Re: [PATCH v5 0/6] Add eBPF hooks for cgroups
From: Daniel Borkmann @ 2016-09-14 11:36 UTC (permalink / raw)
  To: Pablo Neira Ayuso, Alexei Starovoitov
  Cc: Daniel Mack, htejun, ast, davem, kafai, fw, harald, netdev,
	sargun, cgroups
In-Reply-To: <20160914103038.GA910@salvia>

On 09/14/2016 12:30 PM, Pablo Neira Ayuso wrote:
> On Tue, Sep 13, 2016 at 09:42:19PM -0700, Alexei Starovoitov wrote:
> [...]
>> For us this cgroup+bpf is _not_ for filterting and _not_ for security.
>
> If your goal is monitoring, then convert these hooks not to allow to
> issue a verdict on the packet, so this becomes inoquous in the same
> fashion as the tracing infrastructure.
>
> [...]
>> I'd really love to have an alternative to bpf for such tasks,
>> but you seem to spend all the energy arguing against bpf whereas
>> nft still has a lot to be desired.
>
> Please Alexei, stop that FUD. Anyone that has spent just one day using
> the bpf tooling and infrastructure knows you have problems to
> resolve...

Not quite sure on the spreading of FUD, but sounds like we should all
get back to technical things to resolve. ;)

^ permalink raw reply

* Re: [PATCH v5 0/6] Add eBPF hooks for cgroups
From: Daniel Borkmann @ 2016-09-14 11:42 UTC (permalink / raw)
  To: Daniel Mack, Pablo Neira Ayuso
  Cc: htejun-b10kYP2dOMg, ast-b10kYP2dOMg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	kafai-b10kYP2dOMg, fw-HFFVJYpyMKqzQB+pC5nmwQ,
	harald-H+wXaHxf7aLQT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
	sargun-GaZTRHToo+CzQB+pC5nmwQ, cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <6de6809a-13f5-4000-5639-c760dde30223-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>

On 09/14/2016 01:13 PM, Daniel Mack wrote:
> On 09/13/2016 07:24 PM, Pablo Neira Ayuso wrote:
>> On Tue, Sep 13, 2016 at 03:31:20PM +0200, Daniel Mack wrote:
>>> On 09/13/2016 01:56 PM, Pablo Neira Ayuso wrote:
>>>> On Mon, Sep 12, 2016 at 06:12:09PM +0200, Daniel Mack wrote:
>>>>> This is v5 of the patch set to allow eBPF programs for network
>>>>> filtering and accounting to be attached to cgroups, so that they apply
>>>>> to all sockets of all tasks placed in that cgroup. The logic also
>>>>> allows to be extendeded for other cgroup based eBPF logic.
>>>>
>>>> 1) This infrastructure can only be useful to systemd, or any similar
>>>>     orchestration daemon. Look, you can only apply filtering policies
>>>>     to processes that are launched by systemd, so this only works
>>>>     for server processes.
>>>
>>> Sorry, but both statements aren't true. The eBPF policies apply to every
>>> process that is placed in a cgroup, and my example program in 6/6 shows
>>> how that can be done from the command line.
>>
>> Then you have to explain me how can anyone else than systemd use this
>> infrastructure?
>
> I have no idea what makes you think this is limited to systemd. As I
> said, I provided an example for userspace that works from the command
> line. The same limitation apply as for all other users of cgroups.
>
>> My main point is that those processes *need* to be launched by the
>> orchestrator, which is was refering as 'server processes'.
>
> Yes, that's right. But as I said, this rule applies to many other kernel
> concepts, so I don't see any real issue.
>
>>> That's a limitation that applies to many more control mechanisms in the
>>> kernel, and it's something that can easily be solved with fork+exec.
>>
>> As long as you have control to launch the processes yes, but this
>> will not work in other scenarios. Just like cgroup net_cls and friends
>> are broken for filtering for things that you have no control to
>> fork+exec.
>
> Probably, but that's only solvable with rules that store the full cgroup
> path then, and do a string comparison (!) for each packet flying by.
>
>>> That's just as transparent as SO_ATTACH_FILTER. What kind of
>>> introspection mechanism do you have in mind?
>>
>> SO_ATTACH_FILTER is called from the process itself, so this is a local
>> filtering policy that you apply to your own process.
>
> Not necessarily. You can as well do it the inetd way, and pass the
> socket to a process that is launched on demand, but do SO_ATTACH_FILTER
> + SO_LOCK_FILTER  in the middle. What happens with payload on the socket
> is not transparent to the launched binary at all. The proposed cgroup
> eBPF solution implements a very similar behavior in that regard.
>
>>> It's about filtering outgoing network packets of applications, and
>>> providing them with L2 information for filtering purposes. I don't think
>>> that's a very specific use-case.
>>>
>>> When the feature is not used at all, the added costs on the output path
>>> are close to zero, due to the use of static branches.
>>
>> *You're proposing a socket filtering facility that hooks layer 2
>> output path*!
>
> As I said, I'm open to discussing that. In order to make it work for L3,
> the LL_OFF issues need to be solved, as Daniel explained. Daniel,
> Alexei, any idea how much work that would be?

Not much. You simply need to declare your own struct bpf_verifier_ops
with a get_func_proto() handler that handles BPF_FUNC_skb_load_bytes,
and verifier in do_check() loop would need to handle that these ld_abs/
ld_ind are rejected for BPF_PROG_TYPE_CGROUP_SOCKET.

>> That is only a rough ~30 lines kernel patchset to support this in
>> netfilter and only one extra input hook, with potential access to
>> conntrack and better integration with other existing subsystems.
>
> Care to share the patches for that? I'd really like to have a look.
>
> And FWIW, I agree with Thomas - there is nothing wrong with having
> multiple options to use for such use-cases.
>
>
> Thanks,
> Daniel
>

^ permalink raw reply

* Re: [RFC PATCH v3 2/7] proc: Reduce cache miss in {snmp,netstat}_seq_show
From: Marcelo @ 2016-09-14 11:55 UTC (permalink / raw)
  To: hejianet
  Cc: netdev, linux-sctp, linux-kernel, davem, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Vlad Yasevich,
	Neil Horman, Steffen Klassert, Herbert Xu
In-Reply-To: <22d87ec3-608f-fe41-5eb7-fe1104f133dd@gmail.com>

Hi Jia,

On Wed, Sep 14, 2016 at 01:58:42PM +0800, hejianet wrote:
> Hi Marcelo
> 
> 
> On 9/13/16 2:57 AM, Marcelo wrote:
> > On Fri, Sep 09, 2016 at 02:33:57PM +0800, Jia He wrote:
> > > This is to use the generic interface snmp_get_cpu_field{,64}_batch to
> > > aggregate the data by going through all the items of each cpu sequentially.
> > > Then snmp_seq_show and netstat_seq_show are split into 2 parts to avoid build
> > > warning "the frame size" larger than 1024 on s390.
> > Yeah about that, did you test it with stack overflow detection?
> > These arrays can be quite large.
> > 
> > One more below..
> Do you think it is acceptable if the stack usage is a little larger than 1024?
> e.g. 1120
> I can't find any other way to reduce the stack usage except use "static" before
> unsigned long buff[TCP_MIB_MAX]
> 
> PS. sizeof buff is about TCP_MIB_MAX(116)*8=928
> B.R.

That's pretty much the question. Linux has the option on some archs to
run with 4Kb (4KSTACKS option), so this function alone would be using
25% of it in this last case. While on x86_64, it uses 16Kb (6538b8ea886e
("x86_64: expand kernel stack to 16K")).

Adding static to it is not an option as it actually makes the variable
shared amongst the CPUs (and then you have concurrency issues), plus the
fact that it's always allocated, even while not in use.

Others here certainly know better than me if it's okay to make such
usage of the stach.

> > > +static int netstat_seq_show_ipext(struct seq_file *seq, void *v)
> > > +{
> > > +	int i;
> > > +	u64 buff64[IPSTATS_MIB_MAX];
> > > +	struct net *net = seq->private;
> > >   	seq_puts(seq, "\nIpExt:");
> > >   	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
> > >   		seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
> > >   	seq_puts(seq, "\nIpExt:");
> > You're missing a memset() call here.

Not sure if you missed this one or not..

Thanks,
Marcelo

^ permalink raw reply

* Re: [PATCH v4 05/16] IB/pvrdma: Add functions for Verbs support
From: Yuval Shaia @ 2016-09-14 12:28 UTC (permalink / raw)
  To: Adit Ranadive
  Cc: dledford, linux-rdma, pv-drivers, netdev, linux-pci, jhansen,
	asarwade, georgezhang, bryantan
In-Reply-To: <1473655766-31628-6-git-send-email-aditr@vmware.com>

On Sun, Sep 11, 2016 at 09:49:15PM -0700, Adit Ranadive wrote:
> +
> +/**
> + * pvrdma_alloc_pd - allocate protection domain
> + * @ibdev: the IB device
> + * @context: user context
> + * @udata: user data
> + *
> + * @return: the ib_pd protection domain pointer on success, otherwise errno.
> + */
> +struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
> +			      struct ib_ucontext *context,
> +			      struct ib_udata *udata)
> +{
> +	struct pvrdma_pd *pd;
> +	struct pvrdma_dev *dev = to_vdev(ibdev);
> +	union pvrdma_cmd_req req;
> +	union pvrdma_cmd_resp rsp;
> +	struct pvrdma_cmd_create_pd *cmd = &req.create_pd;
> +	struct pvrdma_cmd_create_pd_resp *resp = &rsp.create_pd_resp;
> +	int ret;
> +	void *ptr;
> +
> +	/* Check allowed max pds */
> +	if (!atomic_add_unless(&dev->num_pds, 1, dev->dsr->caps.max_pd))
> +		return ERR_PTR(-EINVAL);
> +
> +	memset(cmd, 0, sizeof(*cmd));
> +	cmd->hdr.cmd = PVRDMA_CMD_CREATE_PD;
> +	cmd->ctx_handle = (context) ? to_vucontext(context)->ctx_handle : 0;
> +	ret = pvrdma_cmd_post(dev, &req, &rsp);
> +	if (ret < 0) {
> +		dev_warn(&dev->pdev->dev,
> +			 "failed to allocate protection domain, error: %d\n",
> +			 ret);
> +		ptr = ERR_PTR(ret);
> +		goto err;
> +	} else if (resp->hdr.ack != PVRDMA_CMD_CREATE_PD_RESP) {
> +		dev_warn(&dev->pdev->dev,
> +			 "unknown response for allocate protection domain\n");
> +		ptr = ERR_PTR(-EFAULT);
> +		goto err;
> +	}
> +
> +	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
> +	if (!pd) {
> +		ptr = ERR_PTR(-ENOMEM);
> +		goto err;
> +	}

I know that this was my suggestion but also remember that you raised a
(correct) argument that it is preferred to first do other allocation and
free them if command fails then the other way around where failure of
memory allocation (like here) will force us to do the opposite command
(pvrdma_dealloc_pd in this case).

So either accept your way (better) or call pvrdma_dealloc_pd when kmalloc
fails.

> +
> +	pd->privileged = !context;
> +	pd->pd_handle = resp->pd_handle;
> +	pd->pdn = resp->pd_handle;
> +
> +	if (context) {
> +		if (ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) {
> +			dev_warn(&dev->pdev->dev,
> +				 "failed to copy back protection domain\n");
> +			pvrdma_dealloc_pd(&pd->ibpd);
> +			return ERR_PTR(-EFAULT);
> +		}
> +	}
> +
> +	/* u32 pd handle */
> +	return  &pd->ibpd;
> +
> +err:
> +	atomic_dec(&dev->num_pds);
> +	return ptr;
> +}

^ permalink raw reply

* [PATCH 0/2] pull request for net: batman-adv 2016-09-14
From: Simon Wunderlich @ 2016-09-14 12:37 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r

Hi David,

here are some more bugfix patches which we would like to have integrated
into net.

Please pull or let me know of any problem!

Thank you,
      Simon

The following changes since commit 1fe323aa1b2390a0c57fb0b06a782f128d49094c:

  sctp: use event->chunk when it's valid (2016-08-08 14:31:23 -0700)

are available in the git repository at:

  git://git.open-mesh.org/linux-merge.git tags/batadv-net-for-davem-20160914

for you to fetch changes up to 1e5d343b8f23770e8ac5d31f5c439826bdb35148:

  batman-adv: fix elp packet data reservation (2016-08-26 15:22:31 +0200)

----------------------------------------------------------------
Here are two batman-adv bugfix patches:

 - Fix reference counting for last_bonding_candidate, by Sven Eckelmann

 - Fix head room reservation for ELP packets, by Linus Luessing

----------------------------------------------------------------
Linus Lüssing (1):
      batman-adv: fix elp packet data reservation

Sven Eckelmann (1):
      batman-adv: Add missing refcnt for last_candidate

 net/batman-adv/bat_v_elp.c |  2 +-
 net/batman-adv/routing.c   | 28 +++++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

^ permalink raw reply

* [PATCH 1/2] batman-adv: Add missing refcnt for last_candidate
From: Simon Wunderlich @ 2016-09-14 12:37 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r, Marek Lindner
In-Reply-To: <20160914123735.9411-1-sw-2YrNx6rUIHYiY0qSoAWiAoQuADTiUCJX@public.gmane.org>

From: Sven Eckelmann <sven-KaDOiPu9UxWEi8DpZVb4nw@public.gmane.org>

batadv_find_router dereferences last_bonding_candidate from
orig_node without making sure that it has a valid reference. This reference
has to be retrieved by increasing the reference counter while holding
neigh_list_lock. The lock is required to avoid that
batadv_last_bonding_replace removes the current last_bonding_candidate,
reduces the reference counter and maybe destroys the object in this
process.

Fixes: f3b3d9018975 ("batman-adv: add bonding again")
Signed-off-by: Sven Eckelmann <sven-KaDOiPu9UxWEi8DpZVb4nw@public.gmane.org>
Signed-off-by: Marek Lindner <mareklindner-rVWd3aGhH2z5bpWLKbzFeg@public.gmane.org>
Signed-off-by: Simon Wunderlich <sw-2YrNx6rUIHYiY0qSoAWiAoQuADTiUCJX@public.gmane.org>
---
 net/batman-adv/routing.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 7602c00..3d19947 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -470,6 +470,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
 }
 
 /**
+ * batadv_last_bonding_get - Get last_bonding_candidate of orig_node
+ * @orig_node: originator node whose last bonding candidate should be retrieved
+ *
+ * Return: last bonding candidate of router or NULL if not found
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+static struct batadv_orig_ifinfo *
+batadv_last_bonding_get(struct batadv_orig_node *orig_node)
+{
+	struct batadv_orig_ifinfo *last_bonding_candidate;
+
+	spin_lock_bh(&orig_node->neigh_list_lock);
+	last_bonding_candidate = orig_node->last_bonding_candidate;
+
+	if (last_bonding_candidate)
+		kref_get(&last_bonding_candidate->refcount);
+	spin_unlock_bh(&orig_node->neigh_list_lock);
+
+	return last_bonding_candidate;
+}
+
+/**
  * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node
  * @orig_node: originator node whose bonding candidates should be replaced
  * @new_candidate: new bonding candidate or NULL
@@ -539,7 +562,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
 	 * router - obviously there are no other candidates.
 	 */
 	rcu_read_lock();
-	last_candidate = orig_node->last_bonding_candidate;
+	last_candidate = batadv_last_bonding_get(orig_node);
 	if (last_candidate)
 		last_cand_router = rcu_dereference(last_candidate->router);
 
@@ -631,6 +654,9 @@ next:
 		batadv_orig_ifinfo_put(next_candidate);
 	}
 
+	if (last_candidate)
+		batadv_orig_ifinfo_put(last_candidate);
+
 	return router;
 }
 
-- 
2.9.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox