Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 10/15] nfp: add RTsym access helpers
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

RTsyms may have special encodings for more complex symbol types.
For example symbols which are placed in external memory unit's
cache directly, constants or local memory.  Add set of helpers
which will check for those special encodings and handle them
correctly.

For now only add direct cache accesses, we don't have a need to
access the other ones in foreseeable future.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Francois H. Theron <francois.theron@netronome.com>
---
 .../ethernet/netronome/nfp/nfpcore/nfp_nffw.h |  25 +++
 .../netronome/nfp/nfpcore/nfp_rtsym.c         | 146 ++++++++++++++++++
 2 files changed, 171 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
index df599d5b6bb3..04700278d00d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
@@ -98,6 +98,31 @@ const struct nfp_rtsym *nfp_rtsym_get(struct nfp_rtsym_table *rtbl, int idx);
 const struct nfp_rtsym *
 nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name);
 
+int __nfp_rtsym_read(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		     u8 action, u8 token, u64 off, void *buf, size_t len);
+int nfp_rtsym_read(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		   void *buf, size_t len);
+int __nfp_rtsym_readl(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		      u8 action, u8 token, u64 off, u32 *value);
+int nfp_rtsym_readl(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		    u32 *value);
+int __nfp_rtsym_readq(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		      u8 action, u8 token, u64 off, u64 *value);
+int nfp_rtsym_readq(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		    u64 *value);
+int __nfp_rtsym_write(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		      u8 action, u8 token, u64 off, void *buf, size_t len);
+int nfp_rtsym_write(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		    void *buf, size_t len);
+int __nfp_rtsym_writel(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		       u8 action, u8 token, u64 off, u32 value);
+int nfp_rtsym_writel(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		     u32 value);
+int __nfp_rtsym_writeq(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		       u8 action, u8 token, u64 off, u64 value);
+int nfp_rtsym_writeq(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		     u64 value);
+
 u64 nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name,
 		      int *error);
 int nfp_rtsym_write_le(struct nfp_rtsym_table *rtbl, const char *name,
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
index 9e34216578da..1c0b1b11b69f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
@@ -233,6 +233,152 @@ nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name)
 	return NULL;
 }
 
+static int
+nfp_rtsym_to_dest(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		  u8 action, u8 token, u64 off, u32 *cpp_id, u64 *addr)
+{
+	*addr = sym->addr + off;
+
+	if (sym->target == NFP_RTSYM_TARGET_EMU_CACHE) {
+		int locality_off = nfp_cpp_mu_locality_lsb(cpp);
+
+		*addr &= ~(NFP_MU_ADDR_ACCESS_TYPE_MASK << locality_off);
+		*addr |= NFP_MU_ADDR_ACCESS_TYPE_DIRECT << locality_off;
+
+		*cpp_id = NFP_CPP_ISLAND_ID(NFP_CPP_TARGET_MU, action, token,
+					    sym->domain);
+	} else if (sym->target < 0) {
+		nfp_err(cpp, "Unhandled RTsym target encoding: %d\n",
+			sym->target);
+		return -EINVAL;
+	} else {
+		*cpp_id = NFP_CPP_ISLAND_ID(sym->target, action, token,
+					    sym->domain);
+	}
+
+	return 0;
+}
+
+int __nfp_rtsym_read(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		     u8 action, u8 token, u64 off, void *buf, size_t len)
+{
+	u32 cpp_id;
+	u64 addr;
+	int err;
+
+	err = nfp_rtsym_to_dest(cpp, sym, action, token, off, &cpp_id, &addr);
+	if (err)
+		return err;
+
+	return nfp_cpp_read(cpp, cpp_id, addr, buf, len);
+}
+
+int nfp_rtsym_read(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		   void *buf, size_t len)
+{
+	return __nfp_rtsym_read(cpp, sym, NFP_CPP_ACTION_RW, 0, off, buf, len);
+}
+
+int __nfp_rtsym_readl(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		      u8 action, u8 token, u64 off, u32 *value)
+{
+	u32 cpp_id;
+	u64 addr;
+	int err;
+
+	err = nfp_rtsym_to_dest(cpp, sym, action, token, off, &cpp_id, &addr);
+	if (err)
+		return err;
+
+	return nfp_cpp_readl(cpp, cpp_id, addr, value);
+}
+
+int nfp_rtsym_readl(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		    u32 *value)
+{
+	return __nfp_rtsym_readl(cpp, sym, NFP_CPP_ACTION_RW, 0, off, value);
+}
+
+int __nfp_rtsym_readq(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		      u8 action, u8 token, u64 off, u64 *value)
+{
+	u32 cpp_id;
+	u64 addr;
+	int err;
+
+	err = nfp_rtsym_to_dest(cpp, sym, action, token, off, &cpp_id, &addr);
+	if (err)
+		return err;
+
+	return nfp_cpp_readq(cpp, cpp_id, addr, value);
+}
+
+int nfp_rtsym_readq(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		    u64 *value)
+{
+	return __nfp_rtsym_readq(cpp, sym, NFP_CPP_ACTION_RW, 0, off, value);
+}
+
+int __nfp_rtsym_write(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		      u8 action, u8 token, u64 off, void *buf, size_t len)
+{
+	u32 cpp_id;
+	u64 addr;
+	int err;
+
+	err = nfp_rtsym_to_dest(cpp, sym, action, token, off, &cpp_id, &addr);
+	if (err)
+		return err;
+
+	return nfp_cpp_write(cpp, cpp_id, addr, buf, len);
+}
+
+int nfp_rtsym_write(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		    void *buf, size_t len)
+{
+	return __nfp_rtsym_write(cpp, sym, NFP_CPP_ACTION_RW, 0, off, buf, len);
+}
+
+int __nfp_rtsym_writel(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		       u8 action, u8 token, u64 off, u32 value)
+{
+	u32 cpp_id;
+	u64 addr;
+	int err;
+
+	err = nfp_rtsym_to_dest(cpp, sym, action, token, off, &cpp_id, &addr);
+	if (err)
+		return err;
+
+	return nfp_cpp_writel(cpp, cpp_id, addr, value);
+}
+
+int nfp_rtsym_writel(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		     u32 value)
+{
+	return __nfp_rtsym_writel(cpp, sym, NFP_CPP_ACTION_RW, 0, off, value);
+}
+
+int __nfp_rtsym_writeq(struct nfp_cpp *cpp, const struct nfp_rtsym *sym,
+		       u8 action, u8 token, u64 off, u64 value)
+{
+	u32 cpp_id;
+	u64 addr;
+	int err;
+
+	err = nfp_rtsym_to_dest(cpp, sym, action, token, off, &cpp_id, &addr);
+	if (err)
+		return err;
+
+	return nfp_cpp_writeq(cpp, cpp_id, addr, value);
+}
+
+int nfp_rtsym_writeq(struct nfp_cpp *cpp, const struct nfp_rtsym *sym, u64 off,
+		     u64 value)
+{
+	return __nfp_rtsym_writeq(cpp, sym, NFP_CPP_ACTION_RW, 0, off, value);
+}
+
 /**
  * nfp_rtsym_read_le() - Read a simple unsigned scalar value from symbol
  * @rtbl:	NFP RTsym table
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 09/15] nfp: add basic errors messages to target logic
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

Add error prints to CPP target encoding/decoding logic, otherwise
it's quite hard to pin point the reasons why read or write
operations fail.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Francois H. Theron <francois.theron@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfpcore/nfp_target.c  | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_target.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_target.c
index 4ea1e585d945..f691c6587c76 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_target.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_target.c
@@ -39,7 +39,11 @@
  *          Francois H. Theron <francois.theron@netronome.com>
  */
 
+#define pr_fmt(fmt)       "NFP target: " fmt
+
 #include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
 
 #include "nfp_cpp.h"
 
@@ -733,8 +737,10 @@ int nfp_target_cpp(u32 cpp_island_id, u64 cpp_island_address,
 	u32 imb;
 	int err;
 
-	if (target < 0 || target >= 16)
+	if (target < 0 || target >= 16) {
+		pr_err("Invalid CPP target: %d\n", target);
 		return -EINVAL;
+	}
 
 	if (island == 0) {
 		/* Already translated */
@@ -753,8 +759,10 @@ int nfp_target_cpp(u32 cpp_island_id, u64 cpp_island_address,
 	err = nfp_cppat_addr_encode(cpp_target_address, island, target,
 				    ((imb >> 13) & 7), ((imb >> 12) & 1),
 				    ((imb >> 6)  & 0x3f), ((imb >> 0)  & 0x3f));
-	if (err)
+	if (err) {
+		pr_err("Can't encode CPP address: %d\n", err);
 		return err;
+	}
 
 	*cpp_target_id = NFP_CPP_ID(target,
 				    NFP_CPP_ID_ACTION_of(cpp_island_id),
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 08/15] nfp: save the MU locality field offset
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

We will soon need the MU locality field offset much more
often than just for decoding MIP address.  Save it in nfp_cpp
for quick access.  Note that we can already reuse the target
config from nfp_cpp, no need to do the XPB read.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Francois H. Theron <francois.theron@netronome.com>
---
 .../ethernet/netronome/nfp/nfpcore/nfp_cpp.h  |  1 +
 .../netronome/nfp/nfpcore/nfp_cppcore.c       | 36 +++++++++++++++++++
 .../ethernet/netronome/nfp/nfpcore/nfp_nffw.c | 32 +----------------
 3 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
index af19fe9f4934..991b8ed7e036 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
@@ -233,6 +233,7 @@ void nfp_cpp_free(struct nfp_cpp *cpp);
 u32 nfp_cpp_model(struct nfp_cpp *cpp);
 u16 nfp_cpp_interface(struct nfp_cpp *cpp);
 int nfp_cpp_serial(struct nfp_cpp *cpp, const u8 **serial);
+unsigned int nfp_cpp_mu_locality_lsb(struct nfp_cpp *cpp);
 
 struct nfp_cpp_area *nfp_cpp_area_alloc_with_name(struct nfp_cpp *cpp,
 						  u32 cpp_id,
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
index 73de57a09800..f7e1d79e735f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
@@ -75,6 +75,7 @@ struct nfp_cpp_resource {
  * @interface:		chip interface id we are using to reach it
  * @serial:		chip serial number
  * @imb_cat_table:	CPP Mapping Table
+ * @mu_locality_lsb:	MU access type bit offset
  *
  * Following fields use explicit locking:
  * @resource_list:	NFP CPP resource list
@@ -100,6 +101,7 @@ struct nfp_cpp {
 	wait_queue_head_t waitq;
 
 	u32 imb_cat_table[16];
+	unsigned int mu_locality_lsb;
 
 	struct mutex area_cache_mutex;
 	struct list_head area_cache_list;
@@ -266,6 +268,34 @@ int nfp_cpp_serial(struct nfp_cpp *cpp, const u8 **serial)
 	return sizeof(cpp->serial);
 }
 
+#define NFP_IMB_TGTADDRESSMODECFG_MODE_of(_x)		(((_x) >> 13) & 0x7)
+#define NFP_IMB_TGTADDRESSMODECFG_ADDRMODE		BIT(12)
+#define   NFP_IMB_TGTADDRESSMODECFG_ADDRMODE_32_BIT	0
+#define   NFP_IMB_TGTADDRESSMODECFG_ADDRMODE_40_BIT	BIT(12)
+
+static int nfp_cpp_set_mu_locality_lsb(struct nfp_cpp *cpp)
+{
+	unsigned int mode, addr40;
+	u32 imbcppat;
+	int res;
+
+	imbcppat = cpp->imb_cat_table[NFP_CPP_TARGET_MU];
+	mode = NFP_IMB_TGTADDRESSMODECFG_MODE_of(imbcppat);
+	addr40 = !!(imbcppat & NFP_IMB_TGTADDRESSMODECFG_ADDRMODE);
+
+	res = nfp_cppat_mu_locality_lsb(mode, addr40);
+	if (res < 0)
+		return res;
+	cpp->mu_locality_lsb = res;
+
+	return 0;
+}
+
+unsigned int nfp_cpp_mu_locality_lsb(struct nfp_cpp *cpp)
+{
+	return cpp->mu_locality_lsb;
+}
+
 /**
  * nfp_cpp_area_alloc_with_name() - allocate a new CPP area
  * @cpp:	CPP device handle
@@ -1241,6 +1271,12 @@ nfp_cpp_from_operations(const struct nfp_cpp_operations *ops,
 	nfp_cpp_readl(cpp, arm, NFP_ARM_GCSR + NFP_ARM_GCSR_SOFTMODEL3,
 		      &mask[1]);
 
+	err = nfp_cpp_set_mu_locality_lsb(cpp);
+	if (err < 0) {
+		dev_err(parent,	"Can't calculate MU locality bit offset\n");
+		goto err_out;
+	}
+
 	dev_info(cpp->dev.parent, "Model: 0x%08x, SN: %pM, Ifc: 0x%04x\n",
 		 nfp_cpp_model(cpp), cpp->serial, nfp_cpp_interface(cpp));
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.c
index 40510860341b..a164fbc85cd3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.c
@@ -156,29 +156,6 @@ static u64 nffw_fwinfo_mip_offset_get(const struct nffw_fwinfo *fi)
 	return (mip_off_hi & 0xFF) << 32 | le32_to_cpu(fi->mip_offset_lo);
 }
 
-#define NFP_IMB_TGTADDRESSMODECFG_MODE_of(_x)		(((_x) >> 13) & 0x7)
-#define NFP_IMB_TGTADDRESSMODECFG_ADDRMODE		BIT(12)
-#define   NFP_IMB_TGTADDRESSMODECFG_ADDRMODE_32_BIT	0
-#define   NFP_IMB_TGTADDRESSMODECFG_ADDRMODE_40_BIT	BIT(12)
-
-static int nfp_mip_mu_locality_lsb(struct nfp_cpp *cpp)
-{
-	unsigned int mode, addr40;
-	u32 xpbaddr, imbcppat;
-	int err;
-
-	/* Hardcoded XPB IMB Base, island 0 */
-	xpbaddr = 0x000a0000 + NFP_CPP_TARGET_MU * 4;
-	err = nfp_xpb_readl(cpp, xpbaddr, &imbcppat);
-	if (err < 0)
-		return err;
-
-	mode = NFP_IMB_TGTADDRESSMODECFG_MODE_of(imbcppat);
-	addr40 = !!(imbcppat & NFP_IMB_TGTADDRESSMODECFG_ADDRMODE);
-
-	return nfp_cppat_mu_locality_lsb(mode, addr40);
-}
-
 static unsigned int
 nffw_res_fwinfos(struct nfp_nffw_info_data *fwinf, struct nffw_fwinfo **arr)
 {
@@ -304,14 +281,7 @@ int nfp_nffw_info_mip_first(struct nfp_nffw_info *state, u32 *cpp_id, u64 *off)
 	*off = nffw_fwinfo_mip_offset_get(fwinfo);
 
 	if (nffw_fwinfo_mip_mu_da_get(fwinfo)) {
-		int locality_off;
-
-		if (NFP_CPP_ID_TARGET_of(*cpp_id) != NFP_CPP_TARGET_MU)
-			return 0;
-
-		locality_off = nfp_mip_mu_locality_lsb(state->cpp);
-		if (locality_off < 0)
-			return locality_off;
+		int locality_off = nfp_cpp_mu_locality_lsb(state->cpp);
 
 		*off &= ~(NFP_MU_ADDR_ACCESS_TYPE_MASK << locality_off);
 		*off |= NFP_MU_ADDR_ACCESS_TYPE_DIRECT << locality_off;
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 07/15] nfp: refactor the per-chip PCIe config
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

Use a switch statement instead of ifs for code dependent
on chip version.  While at it make sure we fail for unknown
chip revisions.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../netronome/nfp/nfpcore/nfp6000_pcie.c      | 50 ++++++++++++++-----
 .../ethernet/netronome/nfp/nfpcore/nfp_cpp.h  |  4 ++
 2 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
index 6ef5ac2d0827..fd63d83bdea5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
@@ -138,6 +138,7 @@
 
 /* The number of explicit BARs to reserve.
  * Minimum is 0, maximum is 4 on the NFP6000.
+ * The NFP3800 can have only one per PF.
  */
 #define NFP_PCIE_EXPLICIT_BARS		2
 
@@ -589,8 +590,8 @@ static int enable_bars(struct nfp6000_pcie *nfp, u16 interface)
 			NFP_PCIE_BAR_PCIE2CPP_MapType_EXPLICIT3),
 	};
 	char status_msg[196] = {};
+	int i, err, bars_free;
 	struct nfp_bar *bar;
-	int i, bars_free;
 	int expl_groups;
 	char *msg, *end;
 
@@ -643,6 +644,8 @@ static int enable_bars(struct nfp6000_pcie *nfp, u16 interface)
 		bar->iomem = ioremap_nocache(nfp_bar_resource_start(bar),
 					     nfp_bar_resource_len(bar));
 	if (bar->iomem) {
+		int pf;
+
 		msg += snprintf(msg, end - msg,	"0.0: General/MSI-X SRAM, ");
 		atomic_inc(&bar->refcnt);
 		bars_free--;
@@ -651,24 +654,40 @@ static int enable_bars(struct nfp6000_pcie *nfp, u16 interface)
 
 		nfp->expl.data = bar->iomem + NFP_PCIE_SRAM + 0x1000;
 
-		if (nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP4000 ||
-		    nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP5000 ||
-		    nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP6000) {
-			nfp->iomem.csr = bar->iomem + NFP_PCIE_BAR(0);
-		} else {
-			int pf = nfp->pdev->devfn & 7;
-
+		switch (nfp->pdev->device) {
+		case PCI_DEVICE_ID_NETRONOME_NFP3800:
+			pf = nfp->pdev->devfn & 7;
 			nfp->iomem.csr = bar->iomem + NFP_PCIE_BAR(pf);
+			break;
+		case PCI_DEVICE_ID_NETRONOME_NFP4000:
+		case PCI_DEVICE_ID_NETRONOME_NFP5000:
+		case PCI_DEVICE_ID_NETRONOME_NFP6000:
+			nfp->iomem.csr = bar->iomem + NFP_PCIE_BAR(0);
+			break;
+		default:
+			dev_err(nfp->dev, "Unsupported device ID: %04hx!\n",
+				nfp->pdev->device);
+			err = -EINVAL;
+			goto err_unmap_bar0;
 		}
 		nfp->iomem.em = bar->iomem + NFP_PCIE_EM;
 	}
 
-	if (nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP4000 ||
-	    nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP5000 ||
-	    nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP6000)
-		expl_groups = 4;
-	else
+	switch (nfp->pdev->device) {
+	case PCI_DEVICE_ID_NETRONOME_NFP3800:
 		expl_groups = 1;
+		break;
+	case PCI_DEVICE_ID_NETRONOME_NFP4000:
+	case PCI_DEVICE_ID_NETRONOME_NFP5000:
+	case PCI_DEVICE_ID_NETRONOME_NFP6000:
+		expl_groups = 4;
+		break;
+	default:
+		dev_err(nfp->dev, "Unsupported device ID: %04hx!\n",
+			nfp->pdev->device);
+		err = -EINVAL;
+		goto err_unmap_bar0;
+	}
 
 	/* Configure, and lock, BAR0.1 for PCIe XPB (MSI-X PBA) */
 	bar = &nfp->bar[1];
@@ -713,6 +732,11 @@ static int enable_bars(struct nfp6000_pcie *nfp, u16 interface)
 	dev_info(nfp->dev, "%sfree: %d/%d\n", status_msg, bars_free, nfp->bars);
 
 	return 0;
+
+err_unmap_bar0:
+	if (nfp->bar[0].iomem)
+		iounmap(nfp->bar[0].iomem);
+	return err;
 }
 
 static void disable_bars(struct nfp6000_pcie *nfp)
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
index 3b5182143ec7..af19fe9f4934 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
@@ -62,6 +62,10 @@
 
 #define PCI_64BIT_BAR_COUNT             3
 
+/* NFP hardware vendor/device ids.
+ */
+#define PCI_DEVICE_ID_NETRONOME_NFP3800	0x3800
+
 #define NFP_CPP_NUM_TARGETS             16
 /* Max size of area it should be safe to request */
 #define NFP_CPP_SAFE_AREA_SIZE		SZ_2M
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 06/15] nfp: add support for NFP5000
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

Add NFP5000 to supported chips, the chip is backward compatible
with NFP4000 and NFP6000, so core PCIe code needs to handle it
the same way as 4k and 6k.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c             | 4 ++++
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 61c22c2935d4..b0f1c313fee0 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -68,6 +68,10 @@ static const struct pci_device_id nfp_pci_device_ids[] = {
 	  PCI_VENDOR_ID_NETRONOME, PCI_ANY_ID,
 	  PCI_ANY_ID, 0,
 	},
+	{ PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP5000,
+	  PCI_VENDOR_ID_NETRONOME, PCI_ANY_ID,
+	  PCI_ANY_ID, 0,
+	},
 	{ PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP4000,
 	  PCI_VENDOR_ID_NETRONOME, PCI_ANY_ID,
 	  PCI_ANY_ID, 0,
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
index c8d0b1016a64..6ef5ac2d0827 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
@@ -652,6 +652,7 @@ static int enable_bars(struct nfp6000_pcie *nfp, u16 interface)
 		nfp->expl.data = bar->iomem + NFP_PCIE_SRAM + 0x1000;
 
 		if (nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP4000 ||
+		    nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP5000 ||
 		    nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP6000) {
 			nfp->iomem.csr = bar->iomem + NFP_PCIE_BAR(0);
 		} else {
@@ -663,6 +664,7 @@ static int enable_bars(struct nfp6000_pcie *nfp, u16 interface)
 	}
 
 	if (nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP4000 ||
+	    nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP5000 ||
 	    nfp->pdev->device == PCI_DEVICE_ID_NETRONOME_NFP6000)
 		expl_groups = 4;
 	else
@@ -1327,7 +1329,7 @@ struct nfp_cpp *nfp_cpp_from_nfp6000_pcie(struct pci_dev *pdev)
 
 	/*  Finished with card initialization. */
 	dev_info(&pdev->dev,
-		 "Netronome Flow Processor NFP4000/NFP6000 PCIe Card Probe\n");
+		 "Netronome Flow Processor NFP4000/NFP5000/NFP6000 PCIe Card Probe\n");
 	pcie_print_link_status(pdev);
 
 	nfp = kzalloc(sizeof(*nfp), GFP_KERNEL);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 05/15] nfp: abm: look up MAC addresses via management FW
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

In multi-host scenarios Management FW may allocate MAC addresses
at runtime, we have to use the indirect lookup to find them.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/abm/main.c | 34 ++++++++++++++-----
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index b84a6c2d387b..305ac07dc1e7 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -540,8 +540,9 @@ nfp_abm_vnic_set_mac(struct nfp_pf *pf, struct nfp_abm *abm, struct nfp_net *nn,
 {
 	struct nfp_eth_table_port *eth_port = &pf->eth_tbl->ports[id];
 	u8 mac_addr[ETH_ALEN];
-	const char *mac_str;
-	char name[32];
+	struct nfp_nsp *nsp;
+	char hwinfo[32];
+	int err;
 
 	if (id > pf->eth_tbl->count) {
 		nfp_warn(pf->cpp, "No entry for persistent MAC address\n");
@@ -549,22 +550,37 @@ nfp_abm_vnic_set_mac(struct nfp_pf *pf, struct nfp_abm *abm, struct nfp_net *nn,
 		return;
 	}
 
-	snprintf(name, sizeof(name), "eth%u.mac.pf%u",
+	snprintf(hwinfo, sizeof(hwinfo), "eth%u.mac.pf%u",
 		 eth_port->eth_index, abm->pf_id);
 
-	mac_str = nfp_hwinfo_lookup(pf->hwinfo, name);
-	if (!mac_str) {
-		nfp_warn(pf->cpp, "Can't lookup persistent MAC address (%s)\n",
-			 name);
+	nsp = nfp_nsp_open(pf->cpp);
+	if (IS_ERR(nsp)) {
+		nfp_warn(pf->cpp, "Failed to access the NSP for persistent MAC address: %ld\n",
+			 PTR_ERR(nsp));
+		eth_hw_addr_random(nn->dp.netdev);
+		return;
+	}
+
+	if (!nfp_nsp_has_hwinfo_lookup(nsp)) {
+		nfp_warn(pf->cpp, "NSP doesn't support PF MAC generation\n");
+		eth_hw_addr_random(nn->dp.netdev);
+		return;
+	}
+
+	err = nfp_nsp_hwinfo_lookup(nsp, hwinfo, sizeof(hwinfo));
+	nfp_nsp_close(nsp);
+	if (err) {
+		nfp_warn(pf->cpp, "Reading persistent MAC address failed: %d\n",
+			 err);
 		eth_hw_addr_random(nn->dp.netdev);
 		return;
 	}
 
-	if (sscanf(mac_str, "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx",
+	if (sscanf(hwinfo, "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx",
 		   &mac_addr[0], &mac_addr[1], &mac_addr[2],
 		   &mac_addr[3], &mac_addr[4], &mac_addr[5]) != 6) {
 		nfp_warn(pf->cpp, "Can't parse persistent MAC address (%s)\n",
-			 mac_str);
+			 hwinfo);
 		eth_hw_addr_random(nn->dp.netdev);
 		return;
 	}
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 04/15] nfp: add support for indirect HWinfo lookup
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

Management FW can adjust some of the information in the HWinfo table
at runtime.  In some cases reading the table directly will not yield
correct results.  Add a NSP command for looking up information.
Up until now we weren't making use of any of the values which may
get adjusted.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp.c  | 38 +++++++++++++++++++
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp.h  |  6 +++
 2 files changed, 44 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
index 9eb7b5a91bb1..bf593a6b26a1 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
@@ -90,6 +90,8 @@
 #define NFP_FW_LOAD_RET_MAJOR	GENMASK(15, 8)
 #define NFP_FW_LOAD_RET_MINOR	GENMASK(23, 16)
 
+#define NFP_HWINFO_LOOKUP_SIZE	GENMASK(11, 0)
+
 enum nfp_nsp_cmd {
 	SPCODE_NOOP		= 0, /* No operation */
 	SPCODE_SOFT_RESET	= 1, /* Soft reset the NFP */
@@ -104,6 +106,7 @@ enum nfp_nsp_cmd {
 	SPCODE_NSP_SENSORS	= 12, /* Read NSP sensor(s) */
 	SPCODE_NSP_IDENTIFY	= 13, /* Read NSP version */
 	SPCODE_FW_STORED	= 16, /* If no FW loaded, load flash app FW */
+	SPCODE_HWINFO_LOOKUP	= 17, /* Lookup HWinfo with overwrites etc. */
 };
 
 static const struct {
@@ -703,3 +706,38 @@ int nfp_nsp_load_stored_fw(struct nfp_nsp *state)
 	nfp_nsp_load_fw_extended_msg(state, ret);
 	return 0;
 }
+
+static int
+__nfp_nsp_hwinfo_lookup(struct nfp_nsp *state, void *buf, unsigned int size)
+{
+	struct nfp_nsp_command_buf_arg hwinfo_lookup = {
+		{
+			.code		= SPCODE_HWINFO_LOOKUP,
+			.option		= size,
+		},
+		.in_buf		= buf,
+		.in_size	= size,
+		.out_buf	= buf,
+		.out_size	= size,
+	};
+
+	return nfp_nsp_command_buf(state, &hwinfo_lookup);
+}
+
+int nfp_nsp_hwinfo_lookup(struct nfp_nsp *state, void *buf, unsigned int size)
+{
+	int err;
+
+	size = min_t(u32, size, NFP_HWINFO_LOOKUP_SIZE);
+
+	err = __nfp_nsp_hwinfo_lookup(state, buf, size);
+	if (err)
+		return err;
+
+	if (strnlen(buf, size) == size) {
+		nfp_err(state->cpp, "NSP HWinfo value not NULL-terminated\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
index 65f2d4a6de02..bd6c9071c8e9 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
@@ -51,6 +51,7 @@ int nfp_nsp_load_fw(struct nfp_nsp *state, const struct firmware *fw);
 int nfp_nsp_write_flash(struct nfp_nsp *state, const struct firmware *fw);
 int nfp_nsp_mac_reinit(struct nfp_nsp *state);
 int nfp_nsp_load_stored_fw(struct nfp_nsp *state);
+int nfp_nsp_hwinfo_lookup(struct nfp_nsp *state, void *buf, unsigned int size);
 
 static inline bool nfp_nsp_has_mac_reinit(struct nfp_nsp *state)
 {
@@ -62,6 +63,11 @@ static inline bool nfp_nsp_has_stored_fw_load(struct nfp_nsp *state)
 	return nfp_nsp_get_abi_ver_minor(state) > 23;
 }
 
+static inline bool nfp_nsp_has_hwinfo_lookup(struct nfp_nsp *state)
+{
+	return nfp_nsp_get_abi_ver_minor(state) > 24;
+}
+
 enum nfp_eth_interface {
 	NFP_INTERFACE_NONE	= 0,
 	NFP_INTERFACE_SFP	= 1,
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 03/15] nfp: interpret extended FW load result codes
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

To enable easier FW distribution NFP can now automatically
select between FW stored on the flash and loaded from the
kernel.

If FW loading policy is set to auto it will compare the
versions of FW from the host and from the flash and load
the newer one.  If FW type doesn't match (e.g. one advanced
application vs another) the FW from the host takes precedence,
unless one of them is the basic NIC firmware, in which case
the non-basic-NIC FW is selected.

This automatic selection mechanism requires we inform user
what the verdict was.  Print a message to the logs explaining
the decision and the reason.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 .../ethernet/netronome/nfp/nfpcore/nfp_cpp.h  |  3 +
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp.c  | 85 ++++++++++++++++++-
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
index c338d539fa96..3b5182143ec7 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
@@ -56,6 +56,9 @@
 	dev_info(nfp_cpp_device(cpp)->parent, NFP_SUBSYS ": " fmt, ## args)
 #define nfp_dbg(cpp, fmt, args...) \
 	dev_dbg(nfp_cpp_device(cpp)->parent, NFP_SUBSYS ": " fmt, ## args)
+#define nfp_printk(level, cpp, fmt, args...) \
+	dev_printk(level, nfp_cpp_device(cpp)->parent,	\
+		   NFP_SUBSYS ": " fmt,	## args)
 
 #define PCI_64BIT_BAR_COUNT             3
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
index 0cdaa1fd6bcf..9eb7b5a91bb1 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
@@ -87,6 +87,9 @@
 #define NSP_CODE_MAJOR		GENMASK(15, 12)
 #define NSP_CODE_MINOR		GENMASK(11, 0)
 
+#define NFP_FW_LOAD_RET_MAJOR	GENMASK(15, 8)
+#define NFP_FW_LOAD_RET_MINOR	GENMASK(23, 16)
+
 enum nfp_nsp_cmd {
 	SPCODE_NOOP		= 0, /* No operation */
 	SPCODE_SOFT_RESET	= 1, /* Soft reset the NFP */
@@ -135,6 +138,7 @@ struct nfp_nsp {
  * @option:	NFP SP Command Argument
  * @buff_cpp:	NFP SP Buffer CPP Address info
  * @buff_addr:	NFP SP Buffer Host address
+ * @error_cb:	Callback for interpreting option if error occurred
  */
 struct nfp_nsp_command_arg {
 	u16 code;
@@ -142,6 +146,7 @@ struct nfp_nsp_command_arg {
 	u32 option;
 	u32 buff_cpp;
 	u64 buff_addr;
+	void (*error_cb)(struct nfp_nsp *state, u32 ret_val);
 };
 
 /**
@@ -401,7 +406,10 @@ __nfp_nsp_command(struct nfp_nsp *state, const struct nfp_nsp_command_arg *arg)
 	if (err) {
 		nfp_warn(cpp, "Result (error) code set: %d (%d) command: %d\n",
 			 -err, (int)ret_val, arg->code);
-		nfp_nsp_print_extended_error(state, ret_val);
+		if (arg->error_cb)
+			arg->error_cb(state, ret_val);
+		else
+			nfp_nsp_print_extended_error(state, ret_val);
 		return -err;
 	}
 
@@ -530,18 +538,78 @@ int nfp_nsp_mac_reinit(struct nfp_nsp *state)
 	return nfp_nsp_command(state, SPCODE_MAC_INIT);
 }
 
+static void nfp_nsp_load_fw_extended_msg(struct nfp_nsp *state, u32 ret_val)
+{
+	static const char * const major_msg[] = {
+		/* 0 */ "Firmware from driver loaded",
+		/* 1 */ "Firmware from flash loaded",
+		/* 2 */ "Firmware loading failure",
+	};
+	static const char * const minor_msg[] = {
+		/*  0 */ "",
+		/*  1 */ "no named partition on flash",
+		/*  2 */ "error reading from flash",
+		/*  3 */ "can not deflate",
+		/*  4 */ "not a trusted file",
+		/*  5 */ "can not parse FW file",
+		/*  6 */ "MIP not found in FW file",
+		/*  7 */ "null firmware name in MIP",
+		/*  8 */ "FW version none",
+		/*  9 */ "FW build number none",
+		/* 10 */ "no FW selection policy HWInfo key found",
+		/* 11 */ "static FW selection policy",
+		/* 12 */ "FW version has precedence",
+		/* 13 */ "different FW application load requested",
+		/* 14 */ "development build",
+	};
+	unsigned int major, minor;
+	const char *level;
+
+	major = FIELD_GET(NFP_FW_LOAD_RET_MAJOR, ret_val);
+	minor = FIELD_GET(NFP_FW_LOAD_RET_MINOR, ret_val);
+
+	if (!nfp_nsp_has_stored_fw_load(state))
+		return;
+
+	/* Lower the message level in legacy case */
+	if (major == 0 && (minor == 0 || minor == 10))
+		level = KERN_DEBUG;
+	else if (major == 2)
+		level = KERN_ERR;
+	else
+		level = KERN_INFO;
+
+	if (major >= ARRAY_SIZE(major_msg))
+		nfp_printk(level, state->cpp, "FW loading status: %x\n",
+			   ret_val);
+	else if (minor >= ARRAY_SIZE(minor_msg))
+		nfp_printk(level, state->cpp, "%s, reason code: %d\n",
+			   major_msg[major], minor);
+	else
+		nfp_printk(level, state->cpp, "%s%c %s\n",
+			   major_msg[major], minor ? ',' : '.',
+			   minor_msg[minor]);
+}
+
 int nfp_nsp_load_fw(struct nfp_nsp *state, const struct firmware *fw)
 {
 	struct nfp_nsp_command_buf_arg load_fw = {
 		{
 			.code		= SPCODE_FW_LOAD,
 			.option		= fw->size,
+			.error_cb	= nfp_nsp_load_fw_extended_msg,
 		},
 		.in_buf		= fw->data,
 		.in_size	= fw->size,
 	};
+	int ret;
 
-	return nfp_nsp_command_buf(state, &load_fw);
+	ret = nfp_nsp_command_buf(state, &load_fw);
+	if (ret < 0)
+		return ret;
+
+	nfp_nsp_load_fw_extended_msg(state, ret);
+	return 0;
 }
 
 int nfp_nsp_write_flash(struct nfp_nsp *state, const struct firmware *fw)
@@ -622,5 +690,16 @@ int nfp_nsp_read_sensors(struct nfp_nsp *state, unsigned int sensor_mask,
 
 int nfp_nsp_load_stored_fw(struct nfp_nsp *state)
 {
-	return nfp_nsp_command(state, SPCODE_FW_STORED);
+	const struct nfp_nsp_command_arg arg = {
+		.code		= SPCODE_FW_STORED,
+		.error_cb	= nfp_nsp_load_fw_extended_msg,
+	};
+	int ret;
+
+	ret = __nfp_nsp_command(state, &arg);
+	if (ret < 0)
+		return ret;
+
+	nfp_nsp_load_fw_extended_msg(state, ret);
+	return 0;
 }
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 02/15] nfp: attempt FW load from flash
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

Flash may contain a default NFP application FW.  This application
can either be put there by the user (with ethtool -f) or shipped
with the card.  If file system FW is not found, attempt to load
this flash stored app FW.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c        | 6 ++++--
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c | 6 ++++++
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h | 6 ++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 4a540c5e27fe..61c22c2935d4 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -441,8 +441,11 @@ nfp_fw_load(struct pci_dev *pdev, struct nfp_pf *pf, struct nfp_nsp *nsp)
 	}
 
 	fw = nfp_net_fw_find(pdev, pf);
-	if (!fw)
+	if (!fw) {
+		if (nfp_nsp_has_stored_fw_load(nsp))
+			nfp_nsp_load_stored_fw(nsp);
 		return 0;
+	}
 
 	dev_info(&pdev->dev, "Soft-reset, loading FW image\n");
 	err = nfp_nsp_device_soft_reset(nsp);
@@ -453,7 +456,6 @@ nfp_fw_load(struct pci_dev *pdev, struct nfp_pf *pf, struct nfp_nsp *nsp)
 	}
 
 	err = nfp_nsp_load_fw(nsp, fw);
-
 	if (err < 0) {
 		dev_err(&pdev->dev, "FW loading failed: %d\n", err);
 		goto exit_release_fw;
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
index e1a14f4e5e71..0cdaa1fd6bcf 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
@@ -100,6 +100,7 @@ enum nfp_nsp_cmd {
 	SPCODE_NSP_WRITE_FLASH	= 11, /* Load and flash image from buffer */
 	SPCODE_NSP_SENSORS	= 12, /* Read NSP sensor(s) */
 	SPCODE_NSP_IDENTIFY	= 13, /* Read NSP version */
+	SPCODE_FW_STORED	= 16, /* If no FW loaded, load flash app FW */
 };
 
 static const struct {
@@ -618,3 +619,8 @@ int nfp_nsp_read_sensors(struct nfp_nsp *state, unsigned int sensor_mask,
 
 	return nfp_nsp_command_buf(state, &sensors);
 }
+
+int nfp_nsp_load_stored_fw(struct nfp_nsp *state)
+{
+	return nfp_nsp_command(state, SPCODE_FW_STORED);
+}
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
index f23d9e06f097..65f2d4a6de02 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
@@ -50,12 +50,18 @@ int nfp_nsp_device_soft_reset(struct nfp_nsp *state);
 int nfp_nsp_load_fw(struct nfp_nsp *state, const struct firmware *fw);
 int nfp_nsp_write_flash(struct nfp_nsp *state, const struct firmware *fw);
 int nfp_nsp_mac_reinit(struct nfp_nsp *state);
+int nfp_nsp_load_stored_fw(struct nfp_nsp *state);
 
 static inline bool nfp_nsp_has_mac_reinit(struct nfp_nsp *state)
 {
 	return nfp_nsp_get_abi_ver_minor(state) > 20;
 }
 
+static inline bool nfp_nsp_has_stored_fw_load(struct nfp_nsp *state)
+{
+	return nfp_nsp_get_abi_ver_minor(state) > 23;
+}
+
 enum nfp_eth_interface {
 	NFP_INTERFACE_NONE	= 0,
 	NFP_INTERFACE_SFP	= 1,
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 01/15] nfp: encapsulate NSP command arguments into structs
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20180828202047.1305-1-jakub.kicinski@netronome.com>

There is already a fair number of arguments to nfp_nsp_command()
family of functions.  Encapsulate them into structures to make
adding new ones easier.  No functional changes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp.c  | 205 ++++++++++++------
 1 file changed, 136 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
index 2abee0fe3a7c..e1a14f4e5e71 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
@@ -127,6 +127,38 @@ struct nfp_nsp {
 	void *entries;
 };
 
+/**
+ * struct nfp_nsp_command_arg - NFP command argument structure
+ * @code:	NFP SP Command Code
+ * @timeout_sec:Timeout value to wait for completion in seconds
+ * @option:	NFP SP Command Argument
+ * @buff_cpp:	NFP SP Buffer CPP Address info
+ * @buff_addr:	NFP SP Buffer Host address
+ */
+struct nfp_nsp_command_arg {
+	u16 code;
+	unsigned int timeout_sec;
+	u32 option;
+	u32 buff_cpp;
+	u64 buff_addr;
+};
+
+/**
+ * struct nfp_nsp_command_buf_arg - NFP command with buffer argument structure
+ * @arg:	NFP command argument structure
+ * @in_buf:	Buffer with data for input
+ * @in_size:	Size of @in_buf
+ * @out_buf:	Buffer for output data
+ * @out_size:	Size of @out_buf
+ */
+struct nfp_nsp_command_buf_arg {
+	struct nfp_nsp_command_arg arg;
+	const void *in_buf;
+	unsigned int in_size;
+	void *out_buf;
+	unsigned int out_size;
+};
+
 struct nfp_cpp *nfp_nsp_cpp(struct nfp_nsp *state)
 {
 	return state->cpp;
@@ -291,11 +323,7 @@ nfp_nsp_wait_reg(struct nfp_cpp *cpp, u64 *reg, u32 nsp_cpp, u64 addr,
 /**
  * __nfp_nsp_command() - Execute a command on the NFP Service Processor
  * @state:	NFP SP state
- * @code:	NFP SP Command Code
- * @option:	NFP SP Command Argument
- * @buff_cpp:	NFP SP Buffer CPP Address info
- * @buff_addr:	NFP SP Buffer Host address
- * @timeout_sec:Timeout value to wait for completion in seconds
+ * @arg:	NFP command argument structure
  *
  * Return: 0 for success with no result
  *
@@ -308,8 +336,7 @@ nfp_nsp_wait_reg(struct nfp_cpp *cpp, u64 *reg, u32 nsp_cpp, u64 addr,
  *	-ETIMEDOUT if the NSP took longer than @timeout_sec seconds to complete
  */
 static int
-__nfp_nsp_command(struct nfp_nsp *state, u16 code, u32 option, u32 buff_cpp,
-		  u64 buff_addr, u32 timeout_sec)
+__nfp_nsp_command(struct nfp_nsp *state, const struct nfp_nsp_command_arg *arg)
 {
 	u64 reg, ret_val, nsp_base, nsp_buffer, nsp_status, nsp_command;
 	struct nfp_cpp *cpp = state->cpp;
@@ -326,22 +353,22 @@ __nfp_nsp_command(struct nfp_nsp *state, u16 code, u32 option, u32 buff_cpp,
 	if (err)
 		return err;
 
-	if (!FIELD_FIT(NSP_BUFFER_CPP, buff_cpp >> 8) ||
-	    !FIELD_FIT(NSP_BUFFER_ADDRESS, buff_addr)) {
+	if (!FIELD_FIT(NSP_BUFFER_CPP, arg->buff_cpp >> 8) ||
+	    !FIELD_FIT(NSP_BUFFER_ADDRESS, arg->buff_addr)) {
 		nfp_err(cpp, "Host buffer out of reach %08x %016llx\n",
-			buff_cpp, buff_addr);
+			arg->buff_cpp, arg->buff_addr);
 		return -EINVAL;
 	}
 
 	err = nfp_cpp_writeq(cpp, nsp_cpp, nsp_buffer,
-			     FIELD_PREP(NSP_BUFFER_CPP, buff_cpp >> 8) |
-			     FIELD_PREP(NSP_BUFFER_ADDRESS, buff_addr));
+			     FIELD_PREP(NSP_BUFFER_CPP, arg->buff_cpp >> 8) |
+			     FIELD_PREP(NSP_BUFFER_ADDRESS, arg->buff_addr));
 	if (err < 0)
 		return err;
 
 	err = nfp_cpp_writeq(cpp, nsp_cpp, nsp_command,
-			     FIELD_PREP(NSP_COMMAND_OPTION, option) |
-			     FIELD_PREP(NSP_COMMAND_CODE, code) |
+			     FIELD_PREP(NSP_COMMAND_OPTION, arg->option) |
+			     FIELD_PREP(NSP_COMMAND_CODE, arg->code) |
 			     FIELD_PREP(NSP_COMMAND_START, 1));
 	if (err < 0)
 		return err;
@@ -351,16 +378,16 @@ __nfp_nsp_command(struct nfp_nsp *state, u16 code, u32 option, u32 buff_cpp,
 			       NSP_COMMAND_START, 0, NFP_NSP_TIMEOUT_DEFAULT);
 	if (err) {
 		nfp_err(cpp, "Error %d waiting for code 0x%04x to start\n",
-			err, code);
+			err, arg->code);
 		return err;
 	}
 
 	/* Wait for NSP_STATUS_BUSY to go to 0 */
 	err = nfp_nsp_wait_reg(cpp, &reg, nsp_cpp, nsp_status, NSP_STATUS_BUSY,
-			       0, timeout_sec);
+			       0, arg->timeout_sec ?: NFP_NSP_TIMEOUT_DEFAULT);
 	if (err) {
 		nfp_err(cpp, "Error %d waiting for code 0x%04x to complete\n",
-			err, code);
+			err, arg->code);
 		return err;
 	}
 
@@ -372,7 +399,7 @@ __nfp_nsp_command(struct nfp_nsp *state, u16 code, u32 option, u32 buff_cpp,
 	err = FIELD_GET(NSP_STATUS_RESULT, reg);
 	if (err) {
 		nfp_warn(cpp, "Result (error) code set: %d (%d) command: %d\n",
-			 -err, (int)ret_val, code);
+			 -err, (int)ret_val, arg->code);
 		nfp_nsp_print_extended_error(state, ret_val);
 		return -err;
 	}
@@ -380,18 +407,17 @@ __nfp_nsp_command(struct nfp_nsp *state, u16 code, u32 option, u32 buff_cpp,
 	return ret_val;
 }
 
-static int
-nfp_nsp_command(struct nfp_nsp *state, u16 code, u32 option, u32 buff_cpp,
-		u64 buff_addr)
+static int nfp_nsp_command(struct nfp_nsp *state, u16 code)
 {
-	return __nfp_nsp_command(state, code, option, buff_cpp, buff_addr,
-				 NFP_NSP_TIMEOUT_DEFAULT);
+	const struct nfp_nsp_command_arg arg = {
+		.code		= code,
+	};
+
+	return __nfp_nsp_command(state, &arg);
 }
 
 static int
-__nfp_nsp_command_buf(struct nfp_nsp *nsp, u16 code, u32 option,
-		      const void *in_buf, unsigned int in_size, void *out_buf,
-		      unsigned int out_size, u32 timeout_sec)
+nfp_nsp_command_buf(struct nfp_nsp *nsp, struct nfp_nsp_command_buf_arg *arg)
 {
 	struct nfp_cpp *cpp = nsp->cpp;
 	unsigned int max_size;
@@ -401,7 +427,7 @@ __nfp_nsp_command_buf(struct nfp_nsp *nsp, u16 code, u32 option,
 
 	if (nsp->ver.minor < 13) {
 		nfp_err(cpp, "NSP: Code 0x%04x with buffer not supported (ABI %hu.%hu)\n",
-			code, nsp->ver.major, nsp->ver.minor);
+			arg->arg.code, nsp->ver.major, nsp->ver.minor);
 		return -EOPNOTSUPP;
 	}
 
@@ -412,10 +438,11 @@ __nfp_nsp_command_buf(struct nfp_nsp *nsp, u16 code, u32 option,
 	if (err < 0)
 		return err;
 
-	max_size = max(in_size, out_size);
+	max_size = max(arg->in_size, arg->out_size);
 	if (FIELD_GET(NSP_DFLT_BUFFER_SIZE_MB, reg) * SZ_1M < max_size) {
 		nfp_err(cpp, "NSP: default buffer too small for command 0x%04x (%llu < %u)\n",
-			code, FIELD_GET(NSP_DFLT_BUFFER_SIZE_MB, reg) * SZ_1M,
+			arg->arg.code,
+			FIELD_GET(NSP_DFLT_BUFFER_SIZE_MB, reg) * SZ_1M,
 			max_size);
 		return -EINVAL;
 	}
@@ -430,27 +457,30 @@ __nfp_nsp_command_buf(struct nfp_nsp *nsp, u16 code, u32 option,
 	cpp_id = FIELD_GET(NSP_DFLT_BUFFER_CPP, reg) << 8;
 	cpp_buf = FIELD_GET(NSP_DFLT_BUFFER_ADDRESS, reg);
 
-	if (in_buf && in_size) {
-		err = nfp_cpp_write(cpp, cpp_id, cpp_buf, in_buf, in_size);
+	if (arg->in_buf && arg->in_size) {
+		err = nfp_cpp_write(cpp, cpp_id, cpp_buf,
+				    arg->in_buf, arg->in_size);
 		if (err < 0)
 			return err;
 	}
 	/* Zero out remaining part of the buffer */
-	if (out_buf && out_size && out_size > in_size) {
-		memset(out_buf, 0, out_size - in_size);
-		err = nfp_cpp_write(cpp, cpp_id, cpp_buf + in_size,
-				    out_buf, out_size - in_size);
+	if (arg->out_buf && arg->out_size && arg->out_size > arg->in_size) {
+		memset(arg->out_buf, 0, arg->out_size - arg->in_size);
+		err = nfp_cpp_write(cpp, cpp_id, cpp_buf + arg->in_size,
+				    arg->out_buf, arg->out_size - arg->in_size);
 		if (err < 0)
 			return err;
 	}
 
-	ret = __nfp_nsp_command(nsp, code, option, cpp_id, cpp_buf,
-				timeout_sec);
+	arg->arg.buff_cpp = cpp_id;
+	arg->arg.buff_addr = cpp_buf;
+	ret = __nfp_nsp_command(nsp, &arg->arg);
 	if (ret < 0)
 		return ret;
 
-	if (out_buf && out_size) {
-		err = nfp_cpp_read(cpp, cpp_id, cpp_buf, out_buf, out_size);
+	if (arg->out_buf && arg->out_size) {
+		err = nfp_cpp_read(cpp, cpp_id, cpp_buf,
+				   arg->out_buf, arg->out_size);
 		if (err < 0)
 			return err;
 	}
@@ -458,16 +488,6 @@ __nfp_nsp_command_buf(struct nfp_nsp *nsp, u16 code, u32 option,
 	return ret;
 }
 
-static int
-nfp_nsp_command_buf(struct nfp_nsp *nsp, u16 code, u32 option,
-		    const void *in_buf, unsigned int in_size, void *out_buf,
-		    unsigned int out_size)
-{
-	return __nfp_nsp_command_buf(nsp, code, option, in_buf, in_size,
-				     out_buf, out_size,
-				     NFP_NSP_TIMEOUT_DEFAULT);
-}
-
 int nfp_nsp_wait(struct nfp_nsp *state)
 {
 	const unsigned long wait_until = jiffies + NFP_NSP_TIMEOUT_BOOT * HZ;
@@ -479,7 +499,7 @@ int nfp_nsp_wait(struct nfp_nsp *state)
 	for (;;) {
 		const unsigned long start_time = jiffies;
 
-		err = nfp_nsp_command(state, SPCODE_NOOP, 0, 0, 0);
+		err = nfp_nsp_command(state, SPCODE_NOOP);
 		if (err != -EAGAIN)
 			break;
 
@@ -501,53 +521,100 @@ int nfp_nsp_wait(struct nfp_nsp *state)
 
 int nfp_nsp_device_soft_reset(struct nfp_nsp *state)
 {
-	return nfp_nsp_command(state, SPCODE_SOFT_RESET, 0, 0, 0);
+	return nfp_nsp_command(state, SPCODE_SOFT_RESET);
 }
 
 int nfp_nsp_mac_reinit(struct nfp_nsp *state)
 {
-	return nfp_nsp_command(state, SPCODE_MAC_INIT, 0, 0, 0);
+	return nfp_nsp_command(state, SPCODE_MAC_INIT);
 }
 
 int nfp_nsp_load_fw(struct nfp_nsp *state, const struct firmware *fw)
 {
-	return nfp_nsp_command_buf(state, SPCODE_FW_LOAD, fw->size, fw->data,
-				   fw->size, NULL, 0);
+	struct nfp_nsp_command_buf_arg load_fw = {
+		{
+			.code		= SPCODE_FW_LOAD,
+			.option		= fw->size,
+		},
+		.in_buf		= fw->data,
+		.in_size	= fw->size,
+	};
+
+	return nfp_nsp_command_buf(state, &load_fw);
 }
 
 int nfp_nsp_write_flash(struct nfp_nsp *state, const struct firmware *fw)
 {
-	/* The flash time is specified to take a maximum of 70s so we add an
-	 * additional factor to this spec time.
-	 */
-	u32 timeout_sec = 2.5 * 70;
-
-	return __nfp_nsp_command_buf(state, SPCODE_NSP_WRITE_FLASH, fw->size,
-				     fw->data, fw->size, NULL, 0, timeout_sec);
+	struct nfp_nsp_command_buf_arg write_flash = {
+		{
+			.code		= SPCODE_NSP_WRITE_FLASH,
+			.option		= fw->size,
+			/* The flash time is specified to take a maximum of 70s
+			 * so we add an additional factor to this spec time.
+			 */
+			.timeout_sec	= 2.5 * 70,
+		},
+		.in_buf		= fw->data,
+		.in_size	= fw->size,
+	};
+
+	return nfp_nsp_command_buf(state, &write_flash);
 }
 
 int nfp_nsp_read_eth_table(struct nfp_nsp *state, void *buf, unsigned int size)
 {
-	return nfp_nsp_command_buf(state, SPCODE_ETH_RESCAN, size, NULL, 0,
-				   buf, size);
+	struct nfp_nsp_command_buf_arg eth_rescan = {
+		{
+			.code		= SPCODE_ETH_RESCAN,
+			.option		= size,
+		},
+		.out_buf	= buf,
+		.out_size	= size,
+	};
+
+	return nfp_nsp_command_buf(state, &eth_rescan);
 }
 
 int nfp_nsp_write_eth_table(struct nfp_nsp *state,
 			    const void *buf, unsigned int size)
 {
-	return nfp_nsp_command_buf(state, SPCODE_ETH_CONTROL, size, buf, size,
-				   NULL, 0);
+	struct nfp_nsp_command_buf_arg eth_ctrl = {
+		{
+			.code		= SPCODE_ETH_CONTROL,
+			.option		= size,
+		},
+		.in_buf		= buf,
+		.in_size	= size,
+	};
+
+	return nfp_nsp_command_buf(state, &eth_ctrl);
 }
 
 int nfp_nsp_read_identify(struct nfp_nsp *state, void *buf, unsigned int size)
 {
-	return nfp_nsp_command_buf(state, SPCODE_NSP_IDENTIFY, size, NULL, 0,
-				   buf, size);
+	struct nfp_nsp_command_buf_arg identify = {
+		{
+			.code		= SPCODE_NSP_IDENTIFY,
+			.option		= size,
+		},
+		.out_buf	= buf,
+		.out_size	= size,
+	};
+
+	return nfp_nsp_command_buf(state, &identify);
 }
 
 int nfp_nsp_read_sensors(struct nfp_nsp *state, unsigned int sensor_mask,
 			 void *buf, unsigned int size)
 {
-	return nfp_nsp_command_buf(state, SPCODE_NSP_SENSORS, sensor_mask,
-				   NULL, 0, buf, size);
+	struct nfp_nsp_command_buf_arg sensors = {
+		{
+			.code		= SPCODE_NSP_SENSORS,
+			.option		= sensor_mask,
+		},
+		.out_buf	= buf,
+		.out_size	= size,
+	};
+
+	return nfp_nsp_command_buf(state, &sensors);
 }
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 00/15] nfp: add NFP5000 support
From: Jakub Kicinski @ 2018-08-28 20:20 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski

Hi!

This series broadly speaking adds support for NFP5000 and
related products.

First we add support for loading FW from flash.  We need to allow
for the management processor to provide extended log messages when
FW is loaded.  This is needed when FW selection policy is to compare
the FW on the disk and in the flash, and load the newer.  User should
be told what FW was selected.

We use this opportunity to add extended errors for normal FW loading
as well.

Next we add support for requesting HW information from the management
processor.  Up until now the driver read the HWinfo as it appears in
card memory, but there can be cases when management processor has
additional information or generates the entries dynamically so
occasionally we will have to consult it.  We use this to look up MAC
addresses for PCIe netdevs.

Next the actual patch with NFP5000 support and a small dose of
refactoring of PCIe init. 

The remaining patches add support for reading RTsymbol types we
didn't need before.  Ones explicitly placed in external memory unit's
cache and absolute ones.

This part begins with a patch moving the logic which figures out
the correct bit offsets to device probe, to avoid redoing the
calculation for each access.  Second patch adds error messages
for easier troubleshooting.  Next patch adds helpers which will
take care of address conversions to reach into EMU cache.
Subsequently users are migrated from the raw CPP API to the new RTsym
helpers.  Finally we add support for reading absolute symbols.

Jakub Kicinski (15):
  nfp: encapsulate NSP command arguments into structs
  nfp: attempt FW load from flash
  nfp: interpret extended FW load result codes
  nfp: add support for indirect HWinfo lookup
  nfp: abm: look up MAC addresses via management FW
  nfp: add support for NFP5000
  nfp: refactor the per-chip PCIe config
  nfp: save the MU locality field offset
  nfp: add basic errors messages to target logic
  nfp: add RTsym access helpers
  nfp: pass cpp_id to nfp_cpp_map_area()
  nfp: convert existing RTsym helpers to full target decoding
  nfp: convert all RTsym users to use new read/write helpers
  nfp: support access to absolute RTsyms
  nfp: make RTsym users handle absolute symbols correctly

 drivers/net/ethernet/netronome/nfp/abm/ctrl.c |  32 +-
 drivers/net/ethernet/netronome/nfp/abm/main.c |  34 +-
 drivers/net/ethernet/netronome/nfp/nfp_main.c |  44 +--
 .../netronome/nfp/nfp_net_debugdump.c         |  50 +--
 .../net/ethernet/netronome/nfp/nfp_net_main.c |   8 +-
 .../netronome/nfp/nfpcore/nfp6000_pcie.c      |  50 ++-
 .../ethernet/netronome/nfp/nfpcore/nfp_cpp.h  |  12 +-
 .../netronome/nfp/nfpcore/nfp_cppcore.c       |  36 ++
 .../netronome/nfp/nfpcore/nfp_cpplib.c        |  12 +-
 .../ethernet/netronome/nfp/nfpcore/nfp_nffw.c |  32 +-
 .../ethernet/netronome/nfp/nfpcore/nfp_nffw.h |  38 +-
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp.c  | 330 ++++++++++++++----
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp.h  |  12 +
 .../netronome/nfp/nfpcore/nfp_rtsym.c         | 216 +++++++++++-
 .../netronome/nfp/nfpcore/nfp_target.c        |  12 +-
 15 files changed, 682 insertions(+), 236 deletions(-)

-- 
2.17.1

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: force_napi_tx module param.
From: Willem de Bruijn @ 2018-08-28 19:57 UTC (permalink / raw)
  To: Jason Wang
  Cc: Jon Olson (Google Drive), Michael S. Tsirkin, caleb.raitto,
	David Miller, Network Development, Caleb Raitto
In-Reply-To: <706da951-ed30-85e8-c0aa-cb9ae8b3deb7@redhat.com>

On Mon, Jul 30, 2018 at 2:06 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
>
> On 2018年07月25日 08:17, Jon Olson wrote:
> > On Tue, Jul 24, 2018 at 3:46 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >> On Tue, Jul 24, 2018 at 06:31:54PM -0400, Willem de Bruijn wrote:
> >>> On Tue, Jul 24, 2018 at 6:23 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>>> On Tue, Jul 24, 2018 at 04:52:53PM -0400, Willem de Bruijn wrote:
> >>>>> >From the above linked patch, I understand that there are yet
> >>>>> other special cases in production, such as a hard cap on #tx queues to
> >>>>> 32 regardless of number of vcpus.
> >>>> I don't think upstream kernels have this limit - we can
> >>>> now use vmalloc for higher number of queues.
> >>> Yes. that patch* mentioned it as a google compute engine imposed
> >>> limit. It is exactly such cloud provider imposed rules that I'm
> >>> concerned about working around in upstream drivers.
> >>>
> >>> * for reference, I mean https://patchwork.ozlabs.org/patch/725249/
> >> Yea. Why does GCE do it btw?
> > There are a few reasons for the limit, some historical, some current.
> >
> > Historically we did this because of a kernel limit on the number of
> > TAP queues (in Montreal I thought this limit was 32). To my chagrin,
> > the limit upstream at the time we did it was actually eight. We had
> > increased the limit from eight to 32 internally, and it appears in
> > upstream it has subsequently increased upstream to 256. We no longer
> > use TAP for networking, so that constraint no longer applies for us,
> > but when looking at removing/raising the limit we discovered no
> > workloads that clearly benefited from lifting it, and it also placed
> > more pressure on our virtual networking stack particularly on the Tx
> > side. We left it as-is.
> >
> > In terms of current reasons there are really two. One is memory usage.
> > As you know, virtio-net uses rx/tx pairs, so there's an expectation
> > that the guest will have an Rx queue for every Tx queue. We run our
> > individual virtqueues fairly deep (4096 entries) to give guests a wide
> > time window for re-posting Rx buffers and avoiding starvation on
> > packet delivery. Filling an Rx vring with max-sized mergeable buffers
> > (4096 bytes) is 16MB of GFP_ATOMIC allocations. At 32 queues this can
> > be up to 512MB of memory posted for network buffers. Scaling this to
> > the largest VM GCE offers today (160 VCPUs -- n1-ultramem-160) keeping
> > all of the Rx rings full would (in the large average Rx packet size
> > case) consume up to 2.5 GB(!) of guest RAM. Now, those VMs have 3.8T
> > of RAM available, but I don't believe we've observed a situation where
> > they would have benefited from having 2.5 gigs of buffers posted for
> > incoming network traffic :)
>
> We can work to have async txq and rxq instead of paris if there's a
> strong requirement.
>
> >
> > The second reason is interrupt related -- as I mentioned above, we
> > have found no workloads that clearly benefit from so many queues, but
> > we have found workloads that degrade. In particular workloads that do
> > a lot of small packet processing but which aren't extremely latency
> > sensitive can achieve higher PPS by taking fewer interrupt across
> > fewer VCPUs due to better batching (this also incurs higher latency,
> > but at the limit the "busy" cores end up suppressing most interrupts
> > and spending most of their cycles farming out work). Memcache is a
> > good example here, particularly if the latency targets for request
> > completion are in the ~milliseconds range (rather than the
> > microseconds we typically strive for with TCP_RR-style workloads).
> >
> > All of that said, we haven't been forthcoming with data (and
> > unfortunately I don't have it handy in a useful form, otherwise I'd
> > simply post it here), so I understand the hesitation to simply run
> > with napi_tx across the board. As Willem said, this patch seemed like
> > the least disruptive way to allow us to continue down the road of
> > "universal" NAPI Tx and to hopefully get data across enough workloads
> > (with VMs small, large, and absurdly large :) to present a compelling
> > argument in one direction or another. As far as I know there aren't
> > currently any NAPI related ethtool commands (based on a quick perusal
> > of ethtool.h)
>
> As I suggest before, maybe we can (ab)use tx-frames-irq.

I forgot to respond to this originally, but I agree.

How about something like the snippet below. It would be simpler to
reason about if only allow switching while the device is down, but
napi does not strictly require that.

+static int virtnet_set_coalesce(struct net_device *dev,
+                               struct ethtool_coalesce *ec)
+{
+       const u32 tx_coalesce_napi_mask = (1 << 16);
+       const struct ethtool_coalesce ec_default = {
+               .cmd = ETHTOOL_SCOALESCE,
+               .rx_max_coalesced_frames = 1,
+               .tx_max_coalesced_frames = 1,
+       };
+       struct virtnet_info *vi = netdev_priv(dev);
+       int napi_weight = 0;
+       bool running;
+       int i;
+
+       if (ec->tx_max_coalesced_frames & tx_coalesce_napi_mask) {
+               ec->tx_max_coalesced_frames &= ~tx_coalesce_napi_mask;
+               napi_weight = NAPI_POLL_WEIGHT;
+       }
+
+       /* disallow changes to fields not explicitly tested above */
+       if (memcmp(ec, &ec_default, sizeof(ec_default)))
+               return -EINVAL;
+
+       if (napi_weight ^ vi->sq[0].napi.weight) {
+               running = netif_running(vi->dev);
+
+               for (i = 0; i < vi->max_queue_pairs; i++) {
+                       vi->sq[i].napi.weight = napi_weight;
+
+                       if (!running)
+                               continue;
+
+                       if (napi_weight)
+                               virtnet_napi_tx_enable(vi, vi->sq[i].vq,
+                                                      &vi->sq[i].napi);
+                       else
+                               napi_disable(&vi->sq[i].napi);
+               }
+       }
+
+       return 0;
+}
+
+static int virtnet_get_coalesce(struct net_device *dev,
+                               struct ethtool_coalesce *ec)
+{
+       const u32 tx_coalesce_napi_mask = (1 << 16);
+       const struct ethtool_coalesce ec_default = {
+               .cmd = ETHTOOL_GCOALESCE,
+               .rx_max_coalesced_frames = 1,
+               .tx_max_coalesced_frames = 1,
+       };
+       struct virtnet_info *vi = netdev_priv(dev);
+
+       memcpy(ec, &ec_default, sizeof(ec_default));
+
+       if (vi->sq[0].napi.weight)
+               ec->tx_max_coalesced_frames |= tx_coalesce_napi_mask;
+
+       return 0;
+}

^ permalink raw reply

* [PATCH net-next 5/5] rtnetlink: move type calculation out of loop
From: Christian Brauner @ 2018-08-28 23:18 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
	fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
	Christian Brauner
In-Reply-To: <20180828231859.29758-1-christian@brauner.io>

I don't see how the type - which is one of
RTM_{GETADDR,GETROUTE,GETNETCONF} - can change. So do the message type
calculation once before entering the for loop.

Signed-off-by: Christian Brauner <christian@brauner.io>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c6c6f93cd195..a644d392918b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3215,13 +3215,13 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int idx;
 	int s_idx = cb->family;
+	int type = cb->nlh->nlmsg_type - RTM_BASE;
 
 	if (s_idx == 0)
 		s_idx = 1;
 
 	for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
 		struct rtnl_link **tab;
-		int type = cb->nlh->nlmsg_type-RTM_BASE;
 		struct rtnl_link *link;
 		rtnl_dumpit_func dumpit;
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 4/5] ipv6: enable IFA_IF_NETNSID for RTM_GETADDR
From: Christian Brauner @ 2018-08-28 23:18 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
	fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
	Christian Brauner
In-Reply-To: <20180828231859.29758-1-christian@brauner.io>

- Backwards Compatibility:
  If userspace wants to determine whether ipv6 RTM_GETADDR requests support
  the new IFA_IF_NETNSID property they should verify that the reply after
  sending a request includes the IFA_IF_NETNSID property. If it does not
  userspace should assume that IFA_IF_NETNSID is not supported for ipv6
  RTM_GETADDR requests on this kernel.
- From what I gather from current userspace tools that make use of
  RTM_GETADDR requests some of them pass down struct ifinfomsg when they
  should actually pass down struct ifaddrmsg. To not break existing
  tools that pass down the wrong struct we will do the same as for
  RTM_GETLINK | NLM_F_DUMP requests and not error out when the
  nlmsg_parse() fails.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian@brauner.io>
---
 net/ipv6/addrconf.c | 70 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f66a1cae3366..f48f3479b341 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4493,6 +4493,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
 	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
 	[IFA_FLAGS]		= { .len = sizeof(u32) },
 	[IFA_RT_PRIORITY]	= { .len = sizeof(u32) },
+	[IFA_IF_NETNSID]	= { .type = NLA_S32 },
 };
 
 static int
@@ -4796,7 +4797,8 @@ static inline int inet6_ifaddr_msgsize(void)
 }
 
 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
-			     u32 portid, u32 seq, int event, unsigned int flags)
+			     u32 portid, u32 seq, int event, unsigned int flags,
+			     int netnsid)
 {
 	struct nlmsghdr  *nlh;
 	u32 preferred, valid;
@@ -4808,6 +4810,9 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
 	put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope),
 		      ifa->idev->dev->ifindex);
 
+	if (netnsid >= 0 && nla_put_s32(skb, IFA_IF_NETNSID, netnsid))
+		goto error;
+
 	if (!((ifa->flags&IFA_F_PERMANENT) &&
 	      (ifa->prefered_lft == INFINITY_LIFE_TIME))) {
 		preferred = ifa->prefered_lft;
@@ -4857,7 +4862,8 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
 }
 
 static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
-				u32 portid, u32 seq, int event, u16 flags)
+			       u32 portid, u32 seq, int event, u16 flags,
+			       int netnsid)
 {
 	struct nlmsghdr  *nlh;
 	u8 scope = RT_SCOPE_UNIVERSE;
@@ -4870,6 +4876,9 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (netnsid >= 0 && nla_put_s32(skb, IFA_IF_NETNSID, netnsid))
+		return -EMSGSIZE;
+
 	put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
 	if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
 	    put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
@@ -4883,7 +4892,8 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
 }
 
 static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
-				u32 portid, u32 seq, int event, unsigned int flags)
+			       u32 portid, u32 seq, int event,
+			       unsigned int flags, int netnsid)
 {
 	struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
 	int ifindex = dev ? dev->ifindex : 1;
@@ -4897,6 +4907,9 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (netnsid >= 0 && nla_put_s32(skb, IFA_IF_NETNSID, netnsid))
+		return -EMSGSIZE;
+
 	put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
 	if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 ||
 	    put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp,
@@ -4918,7 +4931,7 @@ enum addr_type_t {
 /* called with rcu_read_lock() */
 static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 			  struct netlink_callback *cb, enum addr_type_t type,
-			  int s_ip_idx, int *p_ip_idx)
+			  int s_ip_idx, int *p_ip_idx, int netnsid)
 {
 	struct ifmcaddr6 *ifmca;
 	struct ifacaddr6 *ifaca;
@@ -4938,7 +4951,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 						NETLINK_CB(cb->skb).portid,
 						cb->nlh->nlmsg_seq,
 						RTM_NEWADDR,
-						NLM_F_MULTI);
+						NLM_F_MULTI, netnsid);
 			if (err < 0)
 				break;
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -4955,7 +4968,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 						  NETLINK_CB(cb->skb).portid,
 						  cb->nlh->nlmsg_seq,
 						  RTM_GETMULTICAST,
-						  NLM_F_MULTI);
+						  NLM_F_MULTI, netnsid);
 			if (err < 0)
 				break;
 		}
@@ -4970,7 +4983,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 						  NETLINK_CB(cb->skb).portid,
 						  cb->nlh->nlmsg_seq,
 						  RTM_GETANYCAST,
-						  NLM_F_MULTI);
+						  NLM_F_MULTI, netnsid);
 			if (err < 0)
 				break;
 		}
@@ -4987,6 +5000,9 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 			   enum addr_type_t type)
 {
 	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX+1];
+	struct net *tgt_net = net;
+	int netnsid = -1;
 	int h, s_h;
 	int idx, ip_idx;
 	int s_idx, s_ip_idx;
@@ -4998,11 +5014,22 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 	s_idx = idx = cb->args[1];
 	s_ip_idx = ip_idx = cb->args[2];
 
+	if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX,
+			ifa_ipv6_policy, NULL) >= 0) {
+		if (tb[IFA_IF_NETNSID]) {
+			netnsid = nla_get_s32(tb[IFA_IF_NETNSID]);
+
+			tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
+			if (IS_ERR(tgt_net))
+				return PTR_ERR(tgt_net);
+		}
+	}
+
 	rcu_read_lock();
-	cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq;
+	cb->seq = atomic_read(&tgt_net->ipv6.dev_addr_genid) ^ tgt_net->dev_base_seq;
 	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
 		idx = 0;
-		head = &net->dev_index_head[h];
+		head = &tgt_net->dev_index_head[h];
 		hlist_for_each_entry_rcu(dev, head, index_hlist) {
 			if (idx < s_idx)
 				goto cont;
@@ -5014,7 +5041,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 				goto cont;
 
 			if (in6_dump_addrs(idev, skb, cb, type,
-					   s_ip_idx, &ip_idx) < 0)
+					   s_ip_idx, &ip_idx, netnsid) < 0)
 				goto done;
 cont:
 			idx++;
@@ -5025,6 +5052,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 	cb->args[0] = h;
 	cb->args[1] = idx;
 	cb->args[2] = ip_idx;
+	if (netnsid >= 0)
+		put_net(tgt_net);
 
 	return skb->len;
 }
@@ -5055,12 +5084,14 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(in_skb->sk);
+	struct net *tgt_net = net;
 	struct ifaddrmsg *ifm;
 	struct nlattr *tb[IFA_MAX+1];
 	struct in6_addr *addr = NULL, *peer;
 	struct net_device *dev = NULL;
 	struct inet6_ifaddr *ifa;
 	struct sk_buff *skb;
+	int netnsid = -1;
 	int err;
 
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
@@ -5068,15 +5099,24 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	if (tb[IFA_IF_NETNSID]) {
+		netnsid = nla_get_s32(tb[IFA_IF_NETNSID]);
+
+		tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk,
+						  netnsid);
+		if (IS_ERR(tgt_net))
+			return PTR_ERR(tgt_net);
+	}
+
 	addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
 	if (!addr)
 		return -EINVAL;
 
 	ifm = nlmsg_data(nlh);
 	if (ifm->ifa_index)
-		dev = dev_get_by_index(net, ifm->ifa_index);
+		dev = dev_get_by_index(tgt_net, ifm->ifa_index);
 
-	ifa = ipv6_get_ifaddr(net, addr, dev, 1);
+	ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1);
 	if (!ifa) {
 		err = -EADDRNOTAVAIL;
 		goto errout;
@@ -5089,14 +5129,14 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	}
 
 	err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid,
-				nlh->nlmsg_seq, RTM_NEWADDR, 0);
+				nlh->nlmsg_seq, RTM_NEWADDR, 0, netnsid);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(skb);
 		goto errout_ifa;
 	}
-	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+	err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid);
 errout_ifa:
 	in6_ifa_put(ifa);
 errout:
@@ -5115,7 +5155,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 	if (!skb)
 		goto errout;
 
-	err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0);
+	err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0, -1);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
 		WARN_ON(err == -EMSGSIZE);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 3/5] ipv4: enable IFA_IF_NETNSID for RTM_GETADDR
From: Christian Brauner @ 2018-08-28 23:18 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
	fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
	Christian Brauner
In-Reply-To: <20180828231859.29758-1-christian@brauner.io>

- Backwards Compatibility:
  If userspace wants to determine whether ipv4 RTM_GETADDR requests support
  the new IFA_IF_NETNSID property they should verify that the reply after
  sending a request includes the IFA_IF_NETNSID property. If it does not
  userspace should assume that IFA_IF_NETNSID is not supported for ipv4
  RTM_GETADDR requests on this kernel.
- From what I gather from current userspace tools that make use of
  RTM_GETADDR requests some of them pass down struct ifinfomsg when they
  should actually pass down struct ifaddrmsg. To not break existing
  tools that pass down the wrong struct we will do the same as for
  RTM_GETLINK | NLM_F_DUMP requests and not error out when the
  nlmsg_parse() fails.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian@brauner.io>
---
 net/ipv4/devinet.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab1a77a..7d980c2279f5 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -100,6 +100,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
 	[IFA_FLAGS]		= { .type = NLA_U32 },
 	[IFA_RT_PRIORITY]	= { .type = NLA_U32 },
+	[IFA_IF_NETNSID]	= { .type = NLA_S32 },
 };
 
 #define IN4_ADDR_HSIZE_SHIFT	8
@@ -1584,7 +1585,8 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
 }
 
 static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
-			    u32 portid, u32 seq, int event, unsigned int flags)
+			    u32 portid, u32 seq, int event, unsigned int flags,
+			    int netnsid)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr  *nlh;
@@ -1601,6 +1603,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 	ifm->ifa_scope = ifa->ifa_scope;
 	ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
 
+	if (netnsid >= 0 && nla_put_s32(skb, IFA_IF_NETNSID, netnsid))
+		goto nla_put_failure;
+
 	if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
 		preferred = ifa->ifa_preferred_lft;
 		valid = ifa->ifa_valid_lft;
@@ -1648,6 +1653,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX+1];
+	struct net *tgt_net = net;
+	int netnsid = -1;
 	int h, s_h;
 	int idx, s_idx;
 	int ip_idx, s_ip_idx;
@@ -1660,12 +1668,23 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 	s_idx = idx = cb->args[1];
 	s_ip_idx = ip_idx = cb->args[2];
 
+	if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX,
+			ifa_ipv4_policy, NULL) >= 0) {
+		if (tb[IFA_IF_NETNSID]) {
+			netnsid = nla_get_s32(tb[IFA_IF_NETNSID]);
+
+			tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
+			if (IS_ERR(tgt_net))
+				return PTR_ERR(tgt_net);
+		}
+	}
+
 	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
 		idx = 0;
-		head = &net->dev_index_head[h];
+		head = &tgt_net->dev_index_head[h];
 		rcu_read_lock();
-		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
-			  net->dev_base_seq;
+		cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^
+			  tgt_net->dev_base_seq;
 		hlist_for_each_entry_rcu(dev, head, index_hlist) {
 			if (idx < s_idx)
 				goto cont;
@@ -1680,9 +1699,10 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				if (ip_idx < s_ip_idx)
 					continue;
 				if (inet_fill_ifaddr(skb, ifa,
-					     NETLINK_CB(cb->skb).portid,
-					     cb->nlh->nlmsg_seq,
-					     RTM_NEWADDR, NLM_F_MULTI) < 0) {
+						     NETLINK_CB(cb->skb).portid,
+						     cb->nlh->nlmsg_seq,
+						     RTM_NEWADDR, NLM_F_MULTI,
+						     netnsid) < 0) {
 					rcu_read_unlock();
 					goto done;
 				}
@@ -1698,6 +1718,8 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 	cb->args[0] = h;
 	cb->args[1] = idx;
 	cb->args[2] = ip_idx;
+	if (netnsid >= 0)
+		put_net(tgt_net);
 
 	return skb->len;
 }
@@ -1715,7 +1737,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	if (!skb)
 		goto errout;
 
-	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
+	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0, -1);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 2/5] if_addr: add IFA_IF_NETNSID
From: Christian Brauner @ 2018-08-28 23:18 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
	fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
	Christian Brauner
In-Reply-To: <20180828231859.29758-1-christian@brauner.io>

This adds a new IFA_IF_NETNSID property to be used by address families such
as PF_INET and PF_INET6.
The IFA_IF_NETNSID property can be used to send a network namespace
identifier as part of a request. If a IFA_IF_NETNSID property is identified
it will be used to retrieve the target network namespace in which the
request is to be made.

Signed-off-by: Christian Brauner <christian@brauner.io>
---
 include/uapi/linux/if_addr.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index ebaf5701c9db..0e0cd588cac0 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -34,6 +34,7 @@ enum {
 	IFA_MULTICAST,
 	IFA_FLAGS,
 	IFA_RT_PRIORITY,  /* u32, priority/metric for prefix route */
+	IFA_IF_NETNSID,
 	__IFA_MAX,
 };
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 1/5] rtnetlink: add rtnl_get_net_ns_capable()
From: Christian Brauner @ 2018-08-28 23:18 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
	fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
	Christian Brauner
In-Reply-To: <20180828231859.29758-1-christian@brauner.io>

get_target_net() will be used in follow-up patches in ipv{4,6} codepaths to
retrieve network namespaces based on network namespace identifiers. So
remove the static declaration and export in the rtnetlink header. Also,
rename it to rtnl_get_net_ns_capable() to make it obvious what this
function is doing.

Signed-off-by: Christian Brauner <christian@brauner.io>
---
 include/net/rtnetlink.h |  1 +
 net/core/rtnetlink.c    | 13 +++++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 0bbaa5488423..cf26e5aacac4 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -165,6 +165,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);
 
 int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
 			struct netlink_ext_ack *exterr);
+struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid);
 
 #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e3f743c141b3..c6c6f93cd195 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1795,7 +1795,12 @@ static bool link_dump_filtered(struct net_device *dev,
 	return false;
 }
 
-static struct net *get_target_net(struct sock *sk, int netnsid)
+/**
+ * rtnl_get_net_ns_capable - Get a network namespace based on a netnsid.
+ *
+ * Returns the network namespace on success or a error pointer on failure.
+ */
+struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid)
 {
 	struct net *net;
 
@@ -1847,7 +1852,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			ifla_policy, NULL) >= 0) {
 		if (tb[IFLA_IF_NETNSID]) {
 			netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-			tgt_net = get_target_net(skb->sk, netnsid);
+			tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
 			if (IS_ERR(tgt_net)) {
 				tgt_net = net;
 				netnsid = -1;
@@ -2715,7 +2720,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+		tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
 		if (IS_ERR(tgt_net))
 			return PTR_ERR(tgt_net);
 	}
@@ -3125,7 +3130,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+		tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
 		if (IS_ERR(tgt_net))
 			return PTR_ERR(tgt_net);
 	}
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 0/5] rtnetlink: add IFA_IF_NETNSID for RTM_GETADDR
From: Christian Brauner @ 2018-08-28 23:18 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
	fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
	Christian Brauner

From: Christian Brauner <christian.brauner@ubuntu.com>

Hey,

A while back we introduced and enabled IFLA_IF_NETNSID in
RTM_{DEL,GET,NEW}LINK requests (cf. [1], [2], [3], [4], [5]). This has led
to signficant performance increases since it allows userspace to avoid
taking the hit of a setns(netns_fd, CLONE_NEWNET), then getting the
interfaces from the netns associated with the netns_fd. Especially when a
lot of network namespaces are in use, using setns() becomes increasingly
problematic when performance matters.
Usually, RTML_GETLINK requests are followed by RTM_GETADDR requests (cf.
getifaddrs() style functions and friends). But currently, RTM_GETADDR
requests do not support a similar property like IFLA_IF_NETNSID for
RTM_*LINK requests.
This is problematic since userspace can retrieve interfaces from another
network namespace by sending a IFLA_IF_NETNSID property along but
RTM_GETLINK request but is still forced to use the legacy setns() style of
retrieving interfaces in RTM_GETADDR requests.

The goal of this series is to make it possible to perform RTM_GETADDR
requests on different network namespaces. To this end a new IFA_IF_NETNSID
property for RTM_*ADDR requests is introduced. It can be used to send a
network namespace identifier along in RTM_*ADDR requests.  The network
namespace identifier will be used to retrieve the target network namespace
in which the request is supposed to be fulfilled.  This aligns the behavior
of RTM_*ADDR requests with the behavior of RTM_*LINK requests.

Security:
- The caller must have assigned a valid network namespace identifier for
  the target network namespace.
- The caller must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Thanks!
Christian

[1]: commit 7973bfd8758d ("rtnetlink: remove check for IFLA_IF_NETNSID")
[2]: commit 5bb8ed075428 ("rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK")
[3]: commit b61ad68a9fe8 ("rtnetlink: enable IFLA_IF_NETNSID for RTM_DELLINK")
[4]: commit c310bfcb6e1b ("rtnetlink: enable IFLA_IF_NETNSID for RTM_SETLINK")
[5]: commit 7c4f63ba8243 ("rtnetlink: enable IFLA_IF_NETNSID in do_setlink()")

Christian Brauner (5):
  rtnetlink: add rtnl_get_net_ns_capable()
  if_addr: add IFA_IF_NETNSID
  ipv4: enable IFA_IF_NETNSID for RTM_GETADDR
  ipv6: enable IFA_IF_NETNSID for RTM_GETADDR
  rtnetlink: move type calculation out of loop

 include/net/rtnetlink.h      |  1 +
 include/uapi/linux/if_addr.h |  1 +
 net/core/rtnetlink.c         | 15 +++++---
 net/ipv4/devinet.c           | 38 +++++++++++++++-----
 net/ipv6/addrconf.c          | 70 ++++++++++++++++++++++++++++--------
 5 files changed, 97 insertions(+), 28 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [net-next 10/15] ice: Add support for Tx hang, Tx timeout and malicious driver detection
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Sudheer Mogilappagari, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Sudheer Mogilappagari <sudheer.mogilappagari@intel.com>

When a malicious operation is detected, the firmware triggers an
interrupt, which is then picked up by the service task (specifically by
ice_handle_mdd_event). A reset is scheduled if required.

Tx hang detection works in a similar way, except the logic here monitors
the VSI's Tx queues and tries to revive them if stalled. If the hang is
not resolved, the kernel eventually calls ndo_tx_timeout, which is
handled by ice_tx_timeout.

Signed-off-by: Sudheer Mogilappagari <sudheer.mogilappagari@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h          |   4 +
 .../net/ethernet/intel/ice/ice_hw_autogen.h   |  39 +++
 drivers/net/ethernet/intel/ice/ice_main.c     | 286 ++++++++++++++++++
 drivers/net/ethernet/intel/ice/ice_txrx.c     |   1 +
 drivers/net/ethernet/intel/ice/ice_txrx.h     |   1 +
 5 files changed, 331 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index e17030db0bee..6f44a850c4b2 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -134,6 +134,7 @@ enum ice_state {
 	__ICE_SUSPENDED,		/* set on module remove path */
 	__ICE_RESET_FAILED,		/* set by reset/rebuild */
 	__ICE_ADMINQ_EVENT_PENDING,
+	__ICE_MDD_EVENT_PENDING,
 	__ICE_FLTR_OVERFLOW_PROMISC,
 	__ICE_CFG_BUSY,
 	__ICE_SERVICE_SCHED,
@@ -272,6 +273,9 @@ struct ice_pf {
 	struct ice_hw_port_stats stats_prev;
 	struct ice_hw hw;
 	u8 stat_prev_loaded;	/* has previous stats been loaded */
+	u32 tx_timeout_count;
+	unsigned long tx_timeout_last_recovery;
+	u32 tx_timeout_recovery_level;
 	char int_name[ICE_INT_NAME_STR_LEN];
 };
 
diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
index 067ca26a1d94..88f11498804b 100644
--- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
+++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
@@ -123,6 +123,45 @@
 #define QRX_CTRL_QENA_STAT_M			BIT(2)
 #define QRX_ITR(_QRX)				(0x00292000 + ((_QRX) * 4))
 #define QRX_TAIL(_QRX)				(0x00290000 + ((_QRX) * 4))
+#define QRX_TAIL_MAX_INDEX			2047
+#define QRX_TAIL_TAIL_S				0
+#define QRX_TAIL_TAIL_M				ICE_M(0x1FFF, 0)
+#define GL_MDET_RX				0x00294C00
+#define GL_MDET_RX_QNUM_S			0
+#define GL_MDET_RX_QNUM_M			ICE_M(0x7FFF, 0)
+#define GL_MDET_RX_VF_NUM_S			15
+#define GL_MDET_RX_VF_NUM_M			ICE_M(0xFF, 15)
+#define GL_MDET_RX_PF_NUM_S			23
+#define GL_MDET_RX_PF_NUM_M			ICE_M(0x7, 23)
+#define GL_MDET_RX_MAL_TYPE_S			26
+#define GL_MDET_RX_MAL_TYPE_M			ICE_M(0x1F, 26)
+#define GL_MDET_RX_VALID_M			BIT(31)
+#define GL_MDET_TX_PQM				0x002D2E00
+#define GL_MDET_TX_PQM_PF_NUM_S			0
+#define GL_MDET_TX_PQM_PF_NUM_M			ICE_M(0x7, 0)
+#define GL_MDET_TX_PQM_VF_NUM_S			4
+#define GL_MDET_TX_PQM_VF_NUM_M			ICE_M(0xFF, 4)
+#define GL_MDET_TX_PQM_QNUM_S			12
+#define GL_MDET_TX_PQM_QNUM_M			ICE_M(0x3FFF, 12)
+#define GL_MDET_TX_PQM_MAL_TYPE_S		26
+#define GL_MDET_TX_PQM_MAL_TYPE_M		ICE_M(0x1F, 26)
+#define GL_MDET_TX_PQM_VALID_M			BIT(31)
+#define GL_MDET_TX_TCLAN			0x000FC068
+#define GL_MDET_TX_TCLAN_QNUM_S			0
+#define GL_MDET_TX_TCLAN_QNUM_M			ICE_M(0x7FFF, 0)
+#define GL_MDET_TX_TCLAN_VF_NUM_S		15
+#define GL_MDET_TX_TCLAN_VF_NUM_M		ICE_M(0xFF, 15)
+#define GL_MDET_TX_TCLAN_PF_NUM_S		23
+#define GL_MDET_TX_TCLAN_PF_NUM_M		ICE_M(0x7, 23)
+#define GL_MDET_TX_TCLAN_MAL_TYPE_S		26
+#define GL_MDET_TX_TCLAN_MAL_TYPE_M		ICE_M(0x1F, 26)
+#define GL_MDET_TX_TCLAN_VALID_M		BIT(31)
+#define PF_MDET_RX				0x00294280
+#define PF_MDET_RX_VALID_M			BIT(0)
+#define PF_MDET_TX_PQM				0x002D2C80
+#define PF_MDET_TX_PQM_VALID_M			BIT(0)
+#define PF_MDET_TX_TCLAN			0x000FC000
+#define PF_MDET_TX_TCLAN_VALID_M		BIT(0)
 #define GLNVM_FLA				0x000B6108
 #define GLNVM_FLA_LOCKED_M			BIT(6)
 #define GLNVM_GENS				0x000B6100
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 1ef63bf98cd8..fccecb6fa618 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -36,6 +36,81 @@ static void ice_vsi_release_all(struct ice_pf *pf);
 static void ice_update_vsi_stats(struct ice_vsi *vsi);
 static void ice_update_pf_stats(struct ice_pf *pf);
 
+/**
+ * ice_get_tx_pending - returns number of Tx descriptors not processed
+ * @ring: the ring of descriptors
+ */
+static u32 ice_get_tx_pending(struct ice_ring *ring)
+{
+	u32 head, tail;
+
+	head = ring->next_to_clean;
+	tail = readl(ring->tail);
+
+	if (head != tail)
+		return (head < tail) ?
+			tail - head : (tail + ring->count - head);
+	return 0;
+}
+
+/**
+ * ice_check_for_hang_subtask - check for and recover hung queues
+ * @pf: pointer to PF struct
+ */
+static void ice_check_for_hang_subtask(struct ice_pf *pf)
+{
+	struct ice_vsi *vsi = NULL;
+	unsigned int i;
+	u32 v, v_idx;
+	int packets;
+
+	ice_for_each_vsi(pf, v)
+		if (pf->vsi[v] && pf->vsi[v]->type == ICE_VSI_PF) {
+			vsi = pf->vsi[v];
+			break;
+		}
+
+	if (!vsi || test_bit(__ICE_DOWN, vsi->state))
+		return;
+
+	if (!(vsi->netdev && netif_carrier_ok(vsi->netdev)))
+		return;
+
+	for (i = 0; i < vsi->num_txq; i++) {
+		struct ice_ring *tx_ring = vsi->tx_rings[i];
+
+		if (tx_ring && tx_ring->desc) {
+			int itr = ICE_ITR_NONE;
+
+			/* If packet counter has not changed the queue is
+			 * likely stalled, so force an interrupt for this
+			 * queue.
+			 *
+			 * prev_pkt would be negative if there was no
+			 * pending work.
+			 */
+			packets = tx_ring->stats.pkts & INT_MAX;
+			if (tx_ring->tx_stats.prev_pkt == packets) {
+				/* Trigger sw interrupt to revive the queue */
+				v_idx = tx_ring->q_vector->v_idx;
+				wr32(&vsi->back->hw,
+				     GLINT_DYN_CTL(vsi->base_vector + v_idx),
+				     (itr << GLINT_DYN_CTL_ITR_INDX_S) |
+				     GLINT_DYN_CTL_SWINT_TRIG_M |
+				     GLINT_DYN_CTL_INTENA_MSK_M);
+				continue;
+			}
+
+			/* Memory barrier between read of packet count and call
+			 * to ice_get_tx_pending()
+			 */
+			smp_rmb();
+			tx_ring->tx_stats.prev_pkt =
+			    ice_get_tx_pending(tx_ring) ? packets : -1;
+		}
+	}
+}
+
 /**
  * ice_get_free_slot - get the next non-NULL location index in array
  * @array: array to search
@@ -1003,6 +1078,114 @@ static void ice_service_timer(struct timer_list *t)
 	ice_service_task_schedule(pf);
 }
 
+/**
+ * ice_handle_mdd_event - handle malicious driver detect event
+ * @pf: pointer to the PF structure
+ *
+ * Called from service task. OICR interrupt handler indicates MDD event
+ */
+static void ice_handle_mdd_event(struct ice_pf *pf)
+{
+	struct ice_hw *hw = &pf->hw;
+	bool mdd_detected = false;
+	u32 reg;
+
+	if (!test_bit(__ICE_MDD_EVENT_PENDING, pf->state))
+		return;
+
+	/* find what triggered the MDD event */
+	reg = rd32(hw, GL_MDET_TX_PQM);
+	if (reg & GL_MDET_TX_PQM_VALID_M) {
+		u8 pf_num = (reg & GL_MDET_TX_PQM_PF_NUM_M) >>
+				GL_MDET_TX_PQM_PF_NUM_S;
+		u16 vf_num = (reg & GL_MDET_TX_PQM_VF_NUM_M) >>
+				GL_MDET_TX_PQM_VF_NUM_S;
+		u8 event = (reg & GL_MDET_TX_PQM_MAL_TYPE_M) >>
+				GL_MDET_TX_PQM_MAL_TYPE_S;
+		u16 queue = ((reg & GL_MDET_TX_PQM_QNUM_M) >>
+				GL_MDET_TX_PQM_QNUM_S);
+
+		if (netif_msg_tx_err(pf))
+			dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
+				 event, queue, pf_num, vf_num);
+		wr32(hw, GL_MDET_TX_PQM, 0xffffffff);
+		mdd_detected = true;
+	}
+
+	reg = rd32(hw, GL_MDET_TX_TCLAN);
+	if (reg & GL_MDET_TX_TCLAN_VALID_M) {
+		u8 pf_num = (reg & GL_MDET_TX_TCLAN_PF_NUM_M) >>
+				GL_MDET_TX_TCLAN_PF_NUM_S;
+		u16 vf_num = (reg & GL_MDET_TX_TCLAN_VF_NUM_M) >>
+				GL_MDET_TX_TCLAN_VF_NUM_S;
+		u8 event = (reg & GL_MDET_TX_TCLAN_MAL_TYPE_M) >>
+				GL_MDET_TX_TCLAN_MAL_TYPE_S;
+		u16 queue = ((reg & GL_MDET_TX_TCLAN_QNUM_M) >>
+				GL_MDET_TX_TCLAN_QNUM_S);
+
+		if (netif_msg_rx_err(pf))
+			dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
+				 event, queue, pf_num, vf_num);
+		wr32(hw, GL_MDET_TX_TCLAN, 0xffffffff);
+		mdd_detected = true;
+	}
+
+	reg = rd32(hw, GL_MDET_RX);
+	if (reg & GL_MDET_RX_VALID_M) {
+		u8 pf_num = (reg & GL_MDET_RX_PF_NUM_M) >>
+				GL_MDET_RX_PF_NUM_S;
+		u16 vf_num = (reg & GL_MDET_RX_VF_NUM_M) >>
+				GL_MDET_RX_VF_NUM_S;
+		u8 event = (reg & GL_MDET_RX_MAL_TYPE_M) >>
+				GL_MDET_RX_MAL_TYPE_S;
+		u16 queue = ((reg & GL_MDET_RX_QNUM_M) >>
+				GL_MDET_RX_QNUM_S);
+
+		if (netif_msg_rx_err(pf))
+			dev_info(&pf->pdev->dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n",
+				 event, queue, pf_num, vf_num);
+		wr32(hw, GL_MDET_RX, 0xffffffff);
+		mdd_detected = true;
+	}
+
+	if (mdd_detected) {
+		bool pf_mdd_detected = false;
+
+		reg = rd32(hw, PF_MDET_TX_PQM);
+		if (reg & PF_MDET_TX_PQM_VALID_M) {
+			wr32(hw, PF_MDET_TX_PQM, 0xFFFF);
+			dev_info(&pf->pdev->dev, "TX driver issue detected, PF reset issued\n");
+			pf_mdd_detected = true;
+		}
+
+		reg = rd32(hw, PF_MDET_TX_TCLAN);
+		if (reg & PF_MDET_TX_TCLAN_VALID_M) {
+			wr32(hw, PF_MDET_TX_TCLAN, 0xFFFF);
+			dev_info(&pf->pdev->dev, "TX driver issue detected, PF reset issued\n");
+			pf_mdd_detected = true;
+		}
+
+		reg = rd32(hw, PF_MDET_RX);
+		if (reg & PF_MDET_RX_VALID_M) {
+			wr32(hw, PF_MDET_RX, 0xFFFF);
+			dev_info(&pf->pdev->dev, "RX driver issue detected, PF reset issued\n");
+			pf_mdd_detected = true;
+		}
+		/* Queue belongs to the PF initiate a reset */
+		if (pf_mdd_detected) {
+			set_bit(__ICE_NEEDS_RESTART, pf->state);
+			ice_service_task_schedule(pf);
+		}
+	}
+
+	/* re-enable MDD interrupt cause */
+	clear_bit(__ICE_MDD_EVENT_PENDING, pf->state);
+	reg = rd32(hw, PFINT_OICR_ENA);
+	reg |= PFINT_OICR_MAL_DETECT_M;
+	wr32(hw, PFINT_OICR_ENA, reg);
+	ice_flush(hw);
+}
+
 /**
  * ice_service_task - manage and run subtasks
  * @work: pointer to work_struct contained by the PF struct
@@ -1025,7 +1208,9 @@ static void ice_service_task(struct work_struct *work)
 		return;
 	}
 
+	ice_check_for_hang_subtask(pf);
 	ice_sync_fltr_subtask(pf);
+	ice_handle_mdd_event(pf);
 	ice_watchdog_subtask(pf);
 	ice_clean_adminq_subtask(pf);
 
@@ -1037,6 +1222,7 @@ static void ice_service_task(struct work_struct *work)
 	 * schedule the service task now.
 	 */
 	if (time_after(jiffies, (start_time + pf->serv_tmr_period)) ||
+	    test_bit(__ICE_MDD_EVENT_PENDING, pf->state) ||
 	    test_bit(__ICE_ADMINQ_EVENT_PENDING, pf->state))
 		mod_timer(&pf->serv_tmr, jiffies);
 }
@@ -1747,8 +1933,14 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 	oicr = rd32(hw, PFINT_OICR);
 	ena_mask = rd32(hw, PFINT_OICR_ENA);
 
+	if (oicr & PFINT_OICR_MAL_DETECT_M) {
+		ena_mask &= ~PFINT_OICR_MAL_DETECT_M;
+		set_bit(__ICE_MDD_EVENT_PENDING, pf->state);
+	}
+
 	if (oicr & PFINT_OICR_GRST_M) {
 		u32 reset;
+
 		/* we have a reset warning */
 		ena_mask &= ~PFINT_OICR_GRST_M;
 		reset = (rd32(hw, GLGEN_RSTAT) & GLGEN_RSTAT_RESET_TYPE_M) >>
@@ -5503,6 +5695,99 @@ int ice_get_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size)
 	return 0;
 }
 
+/**
+ * ice_tx_timeout - Respond to a Tx Hang
+ * @netdev: network interface device structure
+ */
+static void ice_tx_timeout(struct net_device *netdev)
+{
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ice_ring *tx_ring = NULL;
+	struct ice_vsi *vsi = np->vsi;
+	struct ice_pf *pf = vsi->back;
+	u32 head, val = 0, i;
+	int hung_queue = -1;
+
+	pf->tx_timeout_count++;
+
+	/* find the stopped queue the same way the stack does */
+	for (i = 0; i < netdev->num_tx_queues; i++) {
+		struct netdev_queue *q;
+		unsigned long trans_start;
+
+		q = netdev_get_tx_queue(netdev, i);
+		trans_start = q->trans_start;
+		if (netif_xmit_stopped(q) &&
+		    time_after(jiffies,
+			       (trans_start + netdev->watchdog_timeo))) {
+			hung_queue = i;
+			break;
+		}
+	}
+
+	if (i == netdev->num_tx_queues) {
+		netdev_info(netdev, "tx_timeout: no netdev hung queue found\n");
+	} else {
+		/* now that we have an index, find the tx_ring struct */
+		for (i = 0; i < vsi->num_txq; i++) {
+			if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) {
+				if (hung_queue ==
+				    vsi->tx_rings[i]->q_index) {
+					tx_ring = vsi->tx_rings[i];
+					break;
+				}
+			}
+		}
+	}
+
+	/* Reset recovery level if enough time has elapsed after last timeout.
+	 * Also ensure no new reset action happens before next timeout period.
+	 */
+	if (time_after(jiffies, (pf->tx_timeout_last_recovery + HZ * 20)))
+		pf->tx_timeout_recovery_level = 1;
+	else if (time_before(jiffies, (pf->tx_timeout_last_recovery +
+				       netdev->watchdog_timeo)))
+		return;
+
+	if (tx_ring) {
+		head = tx_ring->next_to_clean;
+		/* Read interrupt register */
+		if (test_bit(ICE_FLAG_MSIX_ENA, pf->flags))
+			val = rd32(&pf->hw,
+				   GLINT_DYN_CTL(tx_ring->q_vector->v_idx +
+						tx_ring->vsi->base_vector - 1));
+
+		netdev_info(netdev, "tx_timeout: VSI_num: %d, Q %d, NTC: 0x%x, HWB: 0x%x, NTU: 0x%x, TAIL: 0x%x, INT: 0x%x\n",
+			    vsi->vsi_num, hung_queue, tx_ring->next_to_clean,
+			    head, tx_ring->next_to_use,
+			    readl(tx_ring->tail), val);
+	}
+
+	pf->tx_timeout_last_recovery = jiffies;
+	netdev_info(netdev, "tx_timeout recovery level %d, hung_queue %d\n",
+		    pf->tx_timeout_recovery_level, hung_queue);
+
+	switch (pf->tx_timeout_recovery_level) {
+	case 1:
+		set_bit(__ICE_PFR_REQ, pf->state);
+		break;
+	case 2:
+		set_bit(__ICE_CORER_REQ, pf->state);
+		break;
+	case 3:
+		set_bit(__ICE_GLOBR_REQ, pf->state);
+		break;
+	default:
+		netdev_err(netdev, "tx_timeout recovery unsuccessful, device is in unrecoverable state.\n");
+		set_bit(__ICE_DOWN, pf->state);
+		set_bit(__ICE_NEEDS_RESTART, vsi->state);
+		break;
+	}
+
+	ice_service_task_schedule(pf);
+	pf->tx_timeout_recovery_level++;
+}
+
 /**
  * ice_open - Called when a network interface becomes active
  * @netdev: network interface device structure
@@ -5624,4 +5909,5 @@ static const struct net_device_ops ice_netdev_ops = {
 	.ndo_set_features = ice_set_features,
 	.ndo_fdb_add = ice_fdb_add,
 	.ndo_fdb_del = ice_fdb_del,
+	.ndo_tx_timeout = ice_tx_timeout,
 };
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 6481e3d86374..5dae968d853e 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -251,6 +251,7 @@ int ice_setup_tx_ring(struct ice_ring *tx_ring)
 
 	tx_ring->next_to_use = 0;
 	tx_ring->next_to_clean = 0;
+	tx_ring->tx_stats.prev_pkt = -1;
 	return 0;
 
 err:
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index 31bc998fe200..839fd9ff6043 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -71,6 +71,7 @@ struct ice_txq_stats {
 	u64 restart_q;
 	u64 tx_busy;
 	u64 tx_linearize;
+	int prev_pkt; /* negative if no pending Tx descriptors */
 };
 
 struct ice_rxq_stats {
-- 
2.17.1

^ permalink raw reply related

* [net-next 14/15] ice: Introduce SERVICE_DIS flag and service routine functions
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Akeem G Abodunrin, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>

This patch introduces SERVICE_DIS flag to use for stopping service task.
This flag will be checked before scheduling new tasks. Also add new
functions ice_service_task_stop to stop service task.

Signed-off-by: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h      |  1 +
 drivers/net/ethernet/intel/ice/ice_main.c | 34 ++++++++++++++++++-----
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 6f44a850c4b2..9cf233d085d8 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -138,6 +138,7 @@ enum ice_state {
 	__ICE_FLTR_OVERFLOW_PROMISC,
 	__ICE_CFG_BUSY,
 	__ICE_SERVICE_SCHED,
+	__ICE_SERVICE_DIS,
 	__ICE_STATE_NBITS		/* must be last */
 };
 
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 46d8e2275647..b1c4dfbdeeb3 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -1107,7 +1107,7 @@ static void ice_clean_adminq_subtask(struct ice_pf *pf)
  */
 static void ice_service_task_schedule(struct ice_pf *pf)
 {
-	if (!test_bit(__ICE_DOWN, pf->state) &&
+	if (!test_bit(__ICE_SERVICE_DIS, pf->state) &&
 	    !test_and_set_bit(__ICE_SERVICE_SCHED, pf->state) &&
 	    !test_bit(__ICE_NEEDS_RESTART, pf->state))
 		queue_work(ice_wq, &pf->serv_task);
@@ -1126,6 +1126,22 @@ static void ice_service_task_complete(struct ice_pf *pf)
 	clear_bit(__ICE_SERVICE_SCHED, pf->state);
 }
 
+/**
+ * ice_service_task_stop - stop service task and cancel works
+ * @pf: board private structure
+ */
+static void ice_service_task_stop(struct ice_pf *pf)
+{
+	set_bit(__ICE_SERVICE_DIS, pf->state);
+
+	if (pf->serv_tmr.function)
+		del_timer_sync(&pf->serv_tmr);
+	if (pf->serv_task.func)
+		cancel_work_sync(&pf->serv_task);
+
+	clear_bit(__ICE_SERVICE_SCHED, pf->state);
+}
+
 /**
  * ice_service_timer - timer callback to schedule service task
  * @t: pointer to timer_list
@@ -3389,10 +3405,7 @@ static void ice_determine_q_usage(struct ice_pf *pf)
  */
 static void ice_deinit_pf(struct ice_pf *pf)
 {
-	if (pf->serv_tmr.function)
-		del_timer_sync(&pf->serv_tmr);
-	if (pf->serv_task.func)
-		cancel_work_sync(&pf->serv_task);
+	ice_service_task_stop(pf);
 	mutex_destroy(&pf->sw_mutex);
 	mutex_destroy(&pf->avail_q_mutex);
 }
@@ -3599,6 +3612,8 @@ static int ice_probe(struct pci_dev *pdev,
 	pf->pdev = pdev;
 	pci_set_drvdata(pdev, pf);
 	set_bit(__ICE_DOWN, pf->state);
+	/* Disable service task until DOWN bit is cleared */
+	set_bit(__ICE_SERVICE_DIS, pf->state);
 
 	hw = &pf->hw;
 	hw->hw_addr = pcim_iomap_table(pdev)[ICE_BAR0];
@@ -3656,6 +3671,9 @@ static int ice_probe(struct pci_dev *pdev,
 		goto err_init_interrupt_unroll;
 	}
 
+	/* Driver is mostly up */
+	clear_bit(__ICE_DOWN, pf->state);
+
 	/* In case of MSIX we are going to setup the misc vector right here
 	 * to handle admin queue events etc. In case of legacy and MSI
 	 * the misc functionality and queue processing is combined in
@@ -3695,8 +3713,7 @@ static int ice_probe(struct pci_dev *pdev,
 		goto err_alloc_sw_unroll;
 	}
 
-	/* Driver is mostly up */
-	clear_bit(__ICE_DOWN, pf->state);
+	clear_bit(__ICE_SERVICE_DIS, pf->state);
 
 	/* since everything is good, start the service timer */
 	mod_timer(&pf->serv_tmr, round_jiffies(jiffies + pf->serv_tmr_period));
@@ -3710,6 +3727,7 @@ static int ice_probe(struct pci_dev *pdev,
 	return 0;
 
 err_alloc_sw_unroll:
+	set_bit(__ICE_SERVICE_DIS, pf->state);
 	set_bit(__ICE_DOWN, pf->state);
 	devm_kfree(&pf->pdev->dev, pf->first_sw);
 err_msix_misc_unroll:
@@ -3737,6 +3755,7 @@ static void ice_remove(struct pci_dev *pdev)
 		return;
 
 	set_bit(__ICE_DOWN, pf->state);
+	ice_service_task_stop(pf);
 
 	ice_vsi_release_all(pf);
 	ice_free_irq_msix_misc(pf);
@@ -5996,6 +6015,7 @@ static void ice_tx_timeout(struct net_device *netdev)
 		netdev_err(netdev, "tx_timeout recovery unsuccessful, device is in unrecoverable state.\n");
 		set_bit(__ICE_DOWN, pf->state);
 		set_bit(__ICE_NEEDS_RESTART, vsi->state);
+		set_bit(__ICE_SERVICE_DIS, pf->state);
 		break;
 	}
 
-- 
2.17.1

^ permalink raw reply related

* [net-next 15/15] ice: Fix and update driver version string
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Anirudh Venkataramanan, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

Remove the "ice" prefix for the driver version string and bump version
to 0.7.1-k.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index b1c4dfbdeeb3..1b49a605d094 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -7,7 +7,7 @@
 
 #include "ice.h"
 
-#define DRV_VERSION	"ice-0.7.0-k"
+#define DRV_VERSION	"0.7.1-k"
 #define DRV_SUMMARY	"Intel(R) Ethernet Connection E800 Series Linux Driver"
 const char ice_drv_ver[] = DRV_VERSION;
 static const char ice_driver_string[] = DRV_SUMMARY;
-- 
2.17.1

^ permalink raw reply related

* [net-next 13/15] ice: Enable VSI Rx/Tx pruning only when VLAN 0 is active
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Brett Creeley, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Brett Creeley <brett.creeley@intel.com>

VLAN pruning is not valid when VLAN 0 is not active. If VLAN
pruning is enabled and VLAN 0 is not active (8021q driver not loaded)
then normal, non-VLAN, traffic will not pass.

TX/RX VLAN pruning is enabled when the VLAN 0 is added to the
active_vlan bitmap and it is disabled when VLAN 0 is removed from the
active_vlan bitmap.

So, only enable VLAN pruning when VLAN 0 is active. Setting RX VLAN
pruning causes the switch to drop received VLAN packets when there
are no matching VLAN ids in the associated VSI's switch filters. Setting
TX pruning makes it so the switch will not send out any packets with
VLAN tags that don't match the associated VSI's switch filters.

With this patch, if the VF or PF tries to send a VLAN tagged packet with
a VLAN tag that it does not have a pruning rule for it will trigger an
MDD event. For example, if PF0 has VLAN10 and VLAN11 interfaces and
scapy is used to send a packet with VLAN8 then the MDD is triggered.

Also make ice_vsi_kill_vlan return a value which the caller can check
before updating VLAN related data structures (counts, pruning bits, etc.).

Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c | 94 ++++++++++++++++++++---
 1 file changed, 85 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index f04e124bca8c..46d8e2275647 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -349,6 +349,63 @@ static bool ice_vsi_fltr_changed(struct ice_vsi *vsi)
 	       test_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags);
 }
 
+/**
+ * ice_cfg_vlan_pruning - enable or disable VLAN pruning on the VSI
+ * @vsi: VSI to enable or disable VLAN pruning on
+ * @ena: set to true to enable VLAN pruning and false to disable it
+ *
+ * returns 0 if VSI is updated, negative otherwise
+ */
+static int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena)
+{
+	struct ice_vsi_ctx *ctxt;
+	struct device *dev;
+	int status;
+
+	if (!vsi)
+		return -EINVAL;
+
+	dev = &vsi->back->pdev->dev;
+	ctxt = devm_kzalloc(dev, sizeof(*ctxt), GFP_KERNEL);
+	if (!ctxt)
+		return -ENOMEM;
+
+	ctxt->info = vsi->info;
+
+	if (ena) {
+		ctxt->info.sec_flags |=
+			ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA <<
+			ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S;
+		ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+	} else {
+		ctxt->info.sec_flags &=
+			~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA <<
+			  ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S);
+		ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA;
+	}
+
+	ctxt->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID |
+						ICE_AQ_VSI_PROP_SW_VALID);
+	ctxt->vsi_num = vsi->vsi_num;
+	status = ice_aq_update_vsi(&vsi->back->hw, ctxt, NULL);
+	if (status) {
+		netdev_err(vsi->netdev, "%sabling VLAN pruning on VSI %d failed, err = %d, aq_err = %d\n",
+			   ena ? "Ena" : "Dis", vsi->vsi_num, status,
+			   vsi->back->hw.adminq.sq_last_status);
+		goto err_out;
+	}
+
+	vsi->info.sec_flags = ctxt->info.sec_flags;
+	vsi->info.sw_flags2 = ctxt->info.sw_flags2;
+
+	devm_kfree(dev, ctxt);
+	return 0;
+
+err_out:
+	devm_kfree(dev, ctxt);
+	return -EIO;
+}
+
 /**
  * ice_vsi_sync_fltr - Update the VSI filter list to the HW
  * @vsi: ptr to the VSI
@@ -3126,7 +3183,7 @@ static int ice_vlan_rx_add_vid(struct net_device *netdev,
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
 	struct ice_vsi *vsi = np->vsi;
-	int ret = 0;
+	int ret;
 
 	if (vid >= VLAN_N_VID) {
 		netdev_err(netdev, "VLAN id requested %d is out of range %d\n",
@@ -3137,6 +3194,13 @@ static int ice_vlan_rx_add_vid(struct net_device *netdev,
 	if (vsi->info.pvid)
 		return -EINVAL;
 
+	/* Enable VLAN pruning when VLAN 0 is added */
+	if (unlikely(!vid)) {
+		ret = ice_cfg_vlan_pruning(vsi, true);
+		if (ret)
+			return ret;
+	}
+
 	/* Add all VLAN ids including 0 to the switch filter. VLAN id 0 is
 	 * needed to continue allowing all untagged packets since VLAN prune
 	 * list is applied to all packets by the switch
@@ -3153,16 +3217,19 @@ static int ice_vlan_rx_add_vid(struct net_device *netdev,
  * ice_vsi_kill_vlan - Remove VSI membership for a given VLAN
  * @vsi: the VSI being configured
  * @vid: VLAN id to be removed
+ *
+ * Returns 0 on success and negative on failure
  */
-static void ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid)
+static int ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid)
 {
 	struct ice_fltr_list_entry *list;
 	struct ice_pf *pf = vsi->back;
 	LIST_HEAD(tmp_add_list);
+	int status = 0;
 
 	list = devm_kzalloc(&pf->pdev->dev, sizeof(*list), GFP_KERNEL);
 	if (!list)
-		return;
+		return -ENOMEM;
 
 	list->fltr_info.lkup_type = ICE_SW_LKUP_VLAN;
 	list->fltr_info.fwd_id.vsi_id = vsi->vsi_num;
@@ -3174,11 +3241,14 @@ static void ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid)
 	INIT_LIST_HEAD(&list->list_entry);
 	list_add(&list->list_entry, &tmp_add_list);
 
-	if (ice_remove_vlan(&pf->hw, &tmp_add_list))
+	if (ice_remove_vlan(&pf->hw, &tmp_add_list)) {
 		dev_err(&pf->pdev->dev, "Error removing VLAN %d on vsi %i\n",
 			vid, vsi->vsi_num);
+		status = -EIO;
+	}
 
 	ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list);
+	return status;
 }
 
 /**
@@ -3194,19 +3264,25 @@ static int ice_vlan_rx_kill_vid(struct net_device *netdev,
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
 	struct ice_vsi *vsi = np->vsi;
+	int status;
 
 	if (vsi->info.pvid)
 		return -EINVAL;
 
-	/* return code is ignored as there is nothing a user
-	 * can do about failure to remove and a log message was
-	 * already printed from the other function
+	/* Make sure ice_vsi_kill_vlan is successful before updating VLAN
+	 * information
 	 */
-	ice_vsi_kill_vlan(vsi, vid);
+	status = ice_vsi_kill_vlan(vsi, vid);
+	if (status)
+		return status;
 
 	clear_bit(vid, vsi->active_vlans);
 
-	return 0;
+	/* Disable VLAN pruning when VLAN 0 is removed */
+	if (unlikely(!vid))
+		status = ice_cfg_vlan_pruning(vsi, false);
+
+	return status;
 }
 
 /**
-- 
2.17.1

^ permalink raw reply related

* [net-next 12/15] ice: Enable firmware logging during device initialization.
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Hieu Tran, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Hieu Tran <hieu.t.tran@intel.com>

To enable FW logging, the "cq_en" and "uart_en" enable bits of the
"fw_log" element in struct ice_hw need to set accordingly based on
some user-provided parameters during driver loading. To select which
FW log events to be emitted, the "cfg" elements of corresponding FW
modules in the "evnts" array member of "fw_log" need to be configured.

Signed-off-by: Hieu Tran <hieu.t.tran@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |  83 ++++++++
 drivers/net/ethernet/intel/ice/ice_common.c   | 182 +++++++++++++++++-
 drivers/net/ethernet/intel/ice/ice_common.h   |   1 +
 drivers/net/ethernet/intel/ice/ice_main.c     |   3 +
 drivers/net/ethernet/intel/ice/ice_type.h     |  19 ++
 5 files changed, 286 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 3dadb2b01b5c..f8dfd675486c 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1206,6 +1206,84 @@ struct ice_aqc_dis_txq {
 	struct ice_aqc_dis_txq_item qgrps[1];
 };
 
+/* Configure Firmware Logging Command (indirect 0xFF09)
+ * Logging Information Read Response (indirect 0xFF10)
+ * Note: The 0xFF10 command has no input parameters.
+ */
+struct ice_aqc_fw_logging {
+	u8 log_ctrl;
+#define ICE_AQC_FW_LOG_AQ_EN		BIT(0)
+#define ICE_AQC_FW_LOG_UART_EN		BIT(1)
+	u8 rsvd0;
+	u8 log_ctrl_valid; /* Not used by 0xFF10 Response */
+#define ICE_AQC_FW_LOG_AQ_VALID		BIT(0)
+#define ICE_AQC_FW_LOG_UART_VALID	BIT(1)
+	u8 rsvd1[5];
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+enum ice_aqc_fw_logging_mod {
+	ICE_AQC_FW_LOG_ID_GENERAL = 0,
+	ICE_AQC_FW_LOG_ID_CTRL,
+	ICE_AQC_FW_LOG_ID_LINK,
+	ICE_AQC_FW_LOG_ID_LINK_TOPO,
+	ICE_AQC_FW_LOG_ID_DNL,
+	ICE_AQC_FW_LOG_ID_I2C,
+	ICE_AQC_FW_LOG_ID_SDP,
+	ICE_AQC_FW_LOG_ID_MDIO,
+	ICE_AQC_FW_LOG_ID_ADMINQ,
+	ICE_AQC_FW_LOG_ID_HDMA,
+	ICE_AQC_FW_LOG_ID_LLDP,
+	ICE_AQC_FW_LOG_ID_DCBX,
+	ICE_AQC_FW_LOG_ID_DCB,
+	ICE_AQC_FW_LOG_ID_NETPROXY,
+	ICE_AQC_FW_LOG_ID_NVM,
+	ICE_AQC_FW_LOG_ID_AUTH,
+	ICE_AQC_FW_LOG_ID_VPD,
+	ICE_AQC_FW_LOG_ID_IOSF,
+	ICE_AQC_FW_LOG_ID_PARSER,
+	ICE_AQC_FW_LOG_ID_SW,
+	ICE_AQC_FW_LOG_ID_SCHEDULER,
+	ICE_AQC_FW_LOG_ID_TXQ,
+	ICE_AQC_FW_LOG_ID_RSVD,
+	ICE_AQC_FW_LOG_ID_POST,
+	ICE_AQC_FW_LOG_ID_WATCHDOG,
+	ICE_AQC_FW_LOG_ID_TASK_DISPATCH,
+	ICE_AQC_FW_LOG_ID_MNG,
+	ICE_AQC_FW_LOG_ID_MAX,
+};
+
+/* This is the buffer for both of the logging commands.
+ * The entry array size depends on the datalen parameter in the descriptor.
+ * There will be a total of datalen / 2 entries.
+ */
+struct ice_aqc_fw_logging_data {
+	__le16 entry[1];
+#define ICE_AQC_FW_LOG_ID_S		0
+#define ICE_AQC_FW_LOG_ID_M		(0xFFF << ICE_AQC_FW_LOG_ID_S)
+
+#define ICE_AQC_FW_LOG_CONF_SUCCESS	0	/* Used by response */
+#define ICE_AQC_FW_LOG_CONF_BAD_INDX	BIT(12)	/* Used by response */
+
+#define ICE_AQC_FW_LOG_EN_S		12
+#define ICE_AQC_FW_LOG_EN_M		(0xF << ICE_AQC_FW_LOG_EN_S)
+#define ICE_AQC_FW_LOG_INFO_EN		BIT(12)	/* Used by command */
+#define ICE_AQC_FW_LOG_INIT_EN		BIT(13)	/* Used by command */
+#define ICE_AQC_FW_LOG_FLOW_EN		BIT(14)	/* Used by command */
+#define ICE_AQC_FW_LOG_ERR_EN		BIT(15)	/* Used by command */
+};
+
+/* Get/Clear FW Log (indirect 0xFF11) */
+struct ice_aqc_get_clear_fw_log {
+	u8 flags;
+#define ICE_AQC_FW_LOG_CLEAR		BIT(0)
+#define ICE_AQC_FW_LOG_MORE_DATA_AVAIL	BIT(1)
+	u8 rsvd1[7];
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
 /**
  * struct ice_aq_desc - Admin Queue (AQ) descriptor
  * @flags: ICE_AQ_FLAG_* flags
@@ -1256,6 +1334,8 @@ struct ice_aq_desc {
 		struct ice_aqc_dis_txqs dis_txqs;
 		struct ice_aqc_add_get_update_free_vsi vsi_cmd;
 		struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res;
+		struct ice_aqc_fw_logging fw_logging;
+		struct ice_aqc_get_clear_fw_log get_clear_fw_log;
 		struct ice_aqc_alloc_free_res_cmd sw_res_ctrl;
 		struct ice_aqc_set_event_mask set_event_mask;
 		struct ice_aqc_get_link_status get_link_status;
@@ -1353,6 +1433,9 @@ enum ice_adminq_opc {
 	/* TX queue handling commands/events */
 	ice_aqc_opc_add_txqs				= 0x0C30,
 	ice_aqc_opc_dis_txqs				= 0x0C31,
+
+	/* debug commands */
+	ice_aqc_opc_fw_logging				= 0xFF09,
 };
 
 #endif /* _ICE_ADMINQ_CMD_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 52c2bf4f108e..0847dbf9d42f 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -427,6 +427,176 @@ static void ice_cleanup_fltr_mgmt_struct(struct ice_hw *hw)
 	devm_kfree(ice_hw_to_dev(hw), sw);
 }
 
+#define ICE_FW_LOG_DESC_SIZE(n)	(sizeof(struct ice_aqc_fw_logging_data) + \
+	(((n) - 1) * sizeof(((struct ice_aqc_fw_logging_data *)0)->entry)))
+#define ICE_FW_LOG_DESC_SIZE_MAX	\
+	ICE_FW_LOG_DESC_SIZE(ICE_AQC_FW_LOG_ID_MAX)
+
+/**
+ * ice_cfg_fw_log - configure FW logging
+ * @hw: pointer to the hw struct
+ * @enable: enable certain FW logging events if true, disable all if false
+ *
+ * This function enables/disables the FW logging via Rx CQ events and a UART
+ * port based on predetermined configurations. FW logging via the Rx CQ can be
+ * enabled/disabled for individual PF's. However, FW logging via the UART can
+ * only be enabled/disabled for all PFs on the same device.
+ *
+ * To enable overall FW logging, the "cq_en" and "uart_en" enable bits in
+ * hw->fw_log need to be set accordingly, e.g. based on user-provided input,
+ * before initializing the device.
+ *
+ * When re/configuring FW logging, callers need to update the "cfg" elements of
+ * the hw->fw_log.evnts array with the desired logging event configurations for
+ * modules of interest. When disabling FW logging completely, the callers can
+ * just pass false in the "enable" parameter. On completion, the function will
+ * update the "cur" element of the hw->fw_log.evnts array with the resulting
+ * logging event configurations of the modules that are being re/configured. FW
+ * logging modules that are not part of a reconfiguration operation retain their
+ * previous states.
+ *
+ * Before resetting the device, it is recommended that the driver disables FW
+ * logging before shutting down the control queue. When disabling FW logging
+ * ("enable" = false), the latest configurations of FW logging events stored in
+ * hw->fw_log.evnts[] are not overridden to allow them to be reconfigured after
+ * a device reset.
+ *
+ * When enabling FW logging to emit log messages via the Rx CQ during the
+ * device's initialization phase, a mechanism alternative to interrupt handlers
+ * needs to be used to extract FW log messages from the Rx CQ periodically and
+ * to prevent the Rx CQ from being full and stalling other types of control
+ * messages from FW to SW. Interrupts are typically disabled during the device's
+ * initialization phase.
+ */
+static enum ice_status ice_cfg_fw_log(struct ice_hw *hw, bool enable)
+{
+	struct ice_aqc_fw_logging_data *data = NULL;
+	struct ice_aqc_fw_logging *cmd;
+	enum ice_status status = 0;
+	u16 i, chgs = 0, len = 0;
+	struct ice_aq_desc desc;
+	u8 actv_evnts = 0;
+	void *buf = NULL;
+
+	if (!hw->fw_log.cq_en && !hw->fw_log.uart_en)
+		return 0;
+
+	/* Disable FW logging only when the control queue is still responsive */
+	if (!enable &&
+	    (!hw->fw_log.actv_evnts || !ice_check_sq_alive(hw, &hw->adminq)))
+		return 0;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logging);
+	cmd = &desc.params.fw_logging;
+
+	/* Indicate which controls are valid */
+	if (hw->fw_log.cq_en)
+		cmd->log_ctrl_valid |= ICE_AQC_FW_LOG_AQ_VALID;
+
+	if (hw->fw_log.uart_en)
+		cmd->log_ctrl_valid |= ICE_AQC_FW_LOG_UART_VALID;
+
+	if (enable) {
+		/* Fill in an array of entries with FW logging modules and
+		 * logging events being reconfigured.
+		 */
+		for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) {
+			u16 val;
+
+			/* Keep track of enabled event types */
+			actv_evnts |= hw->fw_log.evnts[i].cfg;
+
+			if (hw->fw_log.evnts[i].cfg == hw->fw_log.evnts[i].cur)
+				continue;
+
+			if (!data) {
+				data = devm_kzalloc(ice_hw_to_dev(hw),
+						    ICE_FW_LOG_DESC_SIZE_MAX,
+						    GFP_KERNEL);
+				if (!data)
+					return ICE_ERR_NO_MEMORY;
+			}
+
+			val = i << ICE_AQC_FW_LOG_ID_S;
+			val |= hw->fw_log.evnts[i].cfg << ICE_AQC_FW_LOG_EN_S;
+			data->entry[chgs++] = cpu_to_le16(val);
+		}
+
+		/* Only enable FW logging if at least one module is specified.
+		 * If FW logging is currently enabled but all modules are not
+		 * enabled to emit log messages, disable FW logging altogether.
+		 */
+		if (actv_evnts) {
+			/* Leave if there is effectively no change */
+			if (!chgs)
+				goto out;
+
+			if (hw->fw_log.cq_en)
+				cmd->log_ctrl |= ICE_AQC_FW_LOG_AQ_EN;
+
+			if (hw->fw_log.uart_en)
+				cmd->log_ctrl |= ICE_AQC_FW_LOG_UART_EN;
+
+			buf = data;
+			len = ICE_FW_LOG_DESC_SIZE(chgs);
+			desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+		}
+	}
+
+	status = ice_aq_send_cmd(hw, &desc, buf, len, NULL);
+	if (!status) {
+		/* Update the current configuration to reflect events enabled.
+		 * hw->fw_log.cq_en and hw->fw_log.uart_en indicate if the FW
+		 * logging mode is enabled for the device. They do not reflect
+		 * actual modules being enabled to emit log messages. So, their
+		 * values remain unchanged even when all modules are disabled.
+		 */
+		u16 cnt = enable ? chgs : (u16)ICE_AQC_FW_LOG_ID_MAX;
+
+		hw->fw_log.actv_evnts = actv_evnts;
+		for (i = 0; i < cnt; i++) {
+			u16 v, m;
+
+			if (!enable) {
+				/* When disabling all FW logging events as part
+				 * of device's de-initialization, the original
+				 * configurations are retained, and can be used
+				 * to reconfigure FW logging later if the device
+				 * is re-initialized.
+				 */
+				hw->fw_log.evnts[i].cur = 0;
+				continue;
+			}
+
+			v = le16_to_cpu(data->entry[i]);
+			m = (v & ICE_AQC_FW_LOG_ID_M) >> ICE_AQC_FW_LOG_ID_S;
+			hw->fw_log.evnts[m].cur = hw->fw_log.evnts[m].cfg;
+		}
+	}
+
+out:
+	if (data)
+		devm_kfree(ice_hw_to_dev(hw), data);
+
+	return status;
+}
+
+/**
+ * ice_output_fw_log
+ * @hw: pointer to the hw struct
+ * @desc: pointer to the AQ message descriptor
+ * @buf: pointer to the buffer accompanying the AQ message
+ *
+ * Formats a FW Log message and outputs it via the standard driver logs.
+ */
+void ice_output_fw_log(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf)
+{
+	ice_debug(hw, ICE_DBG_AQ_MSG, "[ FW Log Msg Start ]\n");
+	ice_debug_array(hw, ICE_DBG_AQ_MSG, 16, 1, (u8 *)buf,
+			le16_to_cpu(desc->datalen));
+	ice_debug(hw, ICE_DBG_AQ_MSG, "[ FW Log Msg End ]\n");
+}
+
 /**
  * ice_init_hw - main hardware initialization routine
  * @hw: pointer to the hardware structure
@@ -461,6 +631,11 @@ enum ice_status ice_init_hw(struct ice_hw *hw)
 	if (status)
 		goto err_unroll_cqinit;
 
+	/* Enable FW logging. Not fatal if this fails. */
+	status = ice_cfg_fw_log(hw, true);
+	if (status)
+		ice_debug(hw, ICE_DBG_INIT, "Failed to enable FW logging.\n");
+
 	status = ice_clear_pf_cfg(hw);
 	if (status)
 		goto err_unroll_cqinit;
@@ -574,15 +749,18 @@ enum ice_status ice_init_hw(struct ice_hw *hw)
  */
 void ice_deinit_hw(struct ice_hw *hw)
 {
+	ice_cleanup_fltr_mgmt_struct(hw);
+
 	ice_sched_cleanup_all(hw);
-	ice_shutdown_all_ctrlq(hw);
 
 	if (hw->port_info) {
 		devm_kfree(ice_hw_to_dev(hw), hw->port_info);
 		hw->port_info = NULL;
 	}
 
-	ice_cleanup_fltr_mgmt_struct(hw);
+	/* Attempt to disable FW logging before shutting down control queues */
+	ice_cfg_fw_log(hw, false);
+	ice_shutdown_all_ctrlq(hw);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index 409b9957f407..aac2d6cadaaf 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -95,4 +95,5 @@ enum ice_status
 ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_id, u8 tc, u8 num_qgrps,
 		struct ice_aqc_add_tx_qgrp *buf, u16 buf_size,
 		struct ice_sq_cd *cd);
+void ice_output_fw_log(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf);
 #endif /* _ICE_COMMON_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index cbeae1355593..f04e124bca8c 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -984,6 +984,9 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type)
 				dev_err(&pf->pdev->dev,
 					"Could not handle link event\n");
 			break;
+		case ice_aqc_opc_fw_logging:
+			ice_output_fw_log(hw, &event.desc, event.msg_buf);
+			break;
 		default:
 			dev_dbg(&pf->pdev->dev,
 				"%s Receive Queue unknown event 0x%04x ignored\n",
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index 8b7efa87ce56..e681804be4d4 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -258,6 +258,24 @@ struct ice_switch_info {
 	struct ice_sw_recipe *recp_list;
 };
 
+/* FW logging configuration */
+struct ice_fw_log_evnt {
+	u8 cfg : 4;	/* New event enables to configure */
+	u8 cur : 4;	/* Current/active event enables */
+};
+
+struct ice_fw_log_cfg {
+	u8 cq_en : 1;    /* FW logging is enabled via the control queue */
+	u8 uart_en : 1;  /* FW logging is enabled via UART for all PFs */
+	u8 actv_evnts;   /* Cumulation of currently enabled log events */
+
+#define ICE_FW_LOG_EVNT_INFO	(ICE_AQC_FW_LOG_INFO_EN >> ICE_AQC_FW_LOG_EN_S)
+#define ICE_FW_LOG_EVNT_INIT	(ICE_AQC_FW_LOG_INIT_EN >> ICE_AQC_FW_LOG_EN_S)
+#define ICE_FW_LOG_EVNT_FLOW	(ICE_AQC_FW_LOG_FLOW_EN >> ICE_AQC_FW_LOG_EN_S)
+#define ICE_FW_LOG_EVNT_ERR	(ICE_AQC_FW_LOG_ERR_EN >> ICE_AQC_FW_LOG_EN_S)
+	struct ice_fw_log_evnt evnts[ICE_AQC_FW_LOG_ID_MAX];
+};
+
 /* Port hardware description */
 struct ice_hw {
 	u8 __iomem *hw_addr;
@@ -307,6 +325,7 @@ struct ice_hw {
 	u8 fw_patch;		/* firmware patch version */
 	u32 fw_build;		/* firmware build number */
 
+	struct ice_fw_log_cfg fw_log;
 	/* minimum allowed value for different speeds */
 #define ICE_ITR_GRAN_MIN_200	1
 #define ICE_ITR_GRAN_MIN_100	1
-- 
2.17.1

^ permalink raw reply related

* [net-next 07/15] ice: Refactor VSI allocation, deletion and rebuild flow
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Anirudh Venkataramanan, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

This patch refactors aspects of the VSI allocation, deletion and rebuild
flow. Some of the more noteworthy changes are described below.

1) On reset, all switch filters applied in the hardware are lost. In
   the rebuild flow, only MAC and broadcast filters are being restored.
   Instead, use a new function ice_replay_all_fltr to restore all the
   filters that were previously added. To do this, remove calls to
   ice_remove_vsi_fltr to prevent cleaning out the internal bookkeeping
   structures that ice_replay_all_fltr uses to replay filters.

2) Introduce a new state bit __ICE_PREPARED_FOR_RESET to distinguish the
   PF that requested the reset (and consequently prepared for it) from
   the rest of the PFs. These other PFs will prepare for reset only
   when they receive an interrupt from the firmware.

3) Use new functions ice_add_vsi and ice_free_vsi to create and destroy
   VSIs respectively. These functions accept a handle to uniquely
   identify a VSI. This same handle is required to rebuild the VSI post
   reset. To prevent confusion, the existing ice_vsi_add was renamed to
   ice_vsi_init.

4) Enhance ice_vsi_setup for the upcoming SR-IOV changes and expose a
   new wrapper function ice_pf_vsi_setup to create PF VSIs. Rework the
   error handling path in ice_setup_pf_sw.

5) Introduce a new function ice_vsi_release_all to release all PF VSIs.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h          |   2 +
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |   1 +
 drivers/net/ethernet/intel/ice/ice_common.c   |   2 +
 drivers/net/ethernet/intel/ice/ice_main.c     | 371 +++++++++++-------
 drivers/net/ethernet/intel/ice/ice_switch.c   | 353 +++++++++++++++--
 drivers/net/ethernet/intel/ice/ice_switch.h   |  14 +-
 drivers/net/ethernet/intel/ice/ice_type.h     |   8 +-
 7 files changed, 580 insertions(+), 171 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 868f4a1d0f72..e17030db0bee 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -62,6 +62,7 @@ extern const char ice_drv_ver[];
 #define ICE_RES_VALID_BIT	0x8000
 #define ICE_RES_MISC_VEC_ID	(ICE_RES_VALID_BIT - 1)
 #define ICE_INVAL_Q_INDEX	0xffff
+#define ICE_INVAL_VFID		256
 
 #define ICE_VSIQF_HKEY_ARRAY_SIZE	((VSIQF_HKEY_MAX_INDEX + 1) *	4)
 
@@ -122,6 +123,7 @@ struct ice_sw {
 enum ice_state {
 	__ICE_DOWN,
 	__ICE_NEEDS_RESTART,
+	__ICE_PREPARED_FOR_RESET,	/* set by driver when prepared */
 	__ICE_RESET_RECOVERY_PENDING,	/* set by driver when reset starts */
 	__ICE_PFR_REQ,			/* set by driver and peers */
 	__ICE_CORER_REQ,		/* set by driver and peers */
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 87b304db9cad..55e8275ce2ee 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1253,6 +1253,7 @@ struct ice_aq_desc {
 		struct ice_aqc_add_txqs add_txqs;
 		struct ice_aqc_dis_txqs dis_txqs;
 		struct ice_aqc_add_get_update_free_vsi vsi_cmd;
+		struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res;
 		struct ice_aqc_alloc_free_res_cmd sw_res_ctrl;
 		struct ice_aqc_set_event_mask set_event_mask;
 		struct ice_aqc_get_link_status get_link_status;
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 4c6b1038dc5f..b2bb42def038 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -711,6 +711,8 @@ enum ice_status ice_reset(struct ice_hw *hw, enum ice_reset_req req)
 		ice_debug(hw, ICE_DBG_INIT, "GlobalR requested\n");
 		val = GLGEN_RTRIG_GLOBR_M;
 		break;
+	default:
+		return ICE_ERR_PARAM;
 	}
 
 	val |= rd32(hw, GLGEN_RTRIG);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 014a2f3ea76c..1ef63bf98cd8 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -32,6 +32,7 @@ static const struct net_device_ops ice_netdev_ops;
 static void ice_pf_dis_all_vsi(struct ice_pf *pf);
 static void ice_rebuild(struct ice_pf *pf);
 static int ice_vsi_release(struct ice_vsi *vsi);
+static void ice_vsi_release_all(struct ice_pf *pf);
 static void ice_update_vsi_stats(struct ice_vsi *vsi);
 static void ice_update_pf_stats(struct ice_pf *pf);
 
@@ -456,23 +457,13 @@ static void
 ice_prepare_for_reset(struct ice_pf *pf)
 {
 	struct ice_hw *hw = &pf->hw;
-	u32 v;
-
-	ice_for_each_vsi(pf, v)
-		if (pf->vsi[v])
-			ice_remove_vsi_fltr(hw, pf->vsi[v]->vsi_num);
-
-	dev_dbg(&pf->pdev->dev, "Tearing down internal switch for reset\n");
 
 	/* disable the VSIs and their queues that are not already DOWN */
-	/* pf_dis_all_vsi modifies netdev structures -rtnl_lock needed */
 	ice_pf_dis_all_vsi(pf);
 
-	ice_for_each_vsi(pf, v)
-		if (pf->vsi[v])
-			pf->vsi[v]->vsi_num = 0;
-
 	ice_shutdown_all_ctrlq(hw);
+
+	set_bit(__ICE_PREPARED_FOR_RESET, pf->state);
 }
 
 /**
@@ -490,26 +481,32 @@ static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
 	WARN_ON(in_interrupt());
 
 	/* PFR is a bit of a special case because it doesn't result in an OICR
-	 * interrupt. So for PFR, we prepare for reset, issue the reset and
-	 * rebuild sequentially.
+	 * interrupt. Set pending bit here which otherwise gets set in the
+	 * OICR handler.
 	 */
-	if (reset_type == ICE_RESET_PFR) {
+	if (reset_type == ICE_RESET_PFR)
 		set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
-		ice_prepare_for_reset(pf);
-	}
+
+	ice_prepare_for_reset(pf);
 
 	/* trigger the reset */
 	if (ice_reset(hw, reset_type)) {
 		dev_err(dev, "reset %d failed\n", reset_type);
 		set_bit(__ICE_RESET_FAILED, pf->state);
 		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+		clear_bit(__ICE_PREPARED_FOR_RESET, pf->state);
 		return;
 	}
 
+	/* PFR is a bit of a special case because it doesn't result in an OICR
+	 * interrupt. So for PFR, rebuild after the reset and clear the reset-
+	 * associated state bits.
+	 */
 	if (reset_type == ICE_RESET_PFR) {
 		pf->pfr_count++;
 		ice_rebuild(pf);
 		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+		clear_bit(__ICE_PREPARED_FOR_RESET, pf->state);
 	}
 }
 
@@ -519,20 +516,23 @@ static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
  */
 static void ice_reset_subtask(struct ice_pf *pf)
 {
-	enum ice_reset_req reset_type;
-
-	rtnl_lock();
+	enum ice_reset_req reset_type = ICE_RESET_INVAL;
 
 	/* When a CORER/GLOBR/EMPR is about to happen, the hardware triggers an
-	 * OICR interrupt. The OICR handler (ice_misc_intr) determines what
-	 * type of reset happened and sets __ICE_RESET_RECOVERY_PENDING bit in
-	 * pf->state. So if reset/recovery is pending (as indicated by this bit)
-	 * we do a rebuild and return.
+	 * OICR interrupt. The OICR handler (ice_misc_intr) determines what type
+	 * of reset is pending and sets bits in pf->state indicating the reset
+	 * type and __ICE_RESET_RECOVERY_PENDING.  So, if the latter bit is set
+	 * prepare for pending reset if not already (for PF software-initiated
+	 * global resets the software should already be prepared for it as
+	 * indicated by __ICE_PREPARED_FOR_RESET; for global resets initiated
+	 * by firmware or software on other PFs, that bit is not set so prepare
+	 * for the reset now), poll for reset done, rebuild and return.
 	 */
 	if (ice_is_reset_recovery_pending(pf->state)) {
 		clear_bit(__ICE_GLOBR_RECV, pf->state);
 		clear_bit(__ICE_CORER_RECV, pf->state);
-		ice_prepare_for_reset(pf);
+		if (!test_bit(__ICE_PREPARED_FOR_RESET, pf->state))
+			ice_prepare_for_reset(pf);
 
 		/* make sure we are ready to rebuild */
 		if (ice_check_reset(&pf->hw)) {
@@ -541,29 +541,32 @@ static void ice_reset_subtask(struct ice_pf *pf)
 			/* done with reset. start rebuild */
 			pf->hw.reset_ongoing = false;
 			ice_rebuild(pf);
+			/* clear bit to resume normal operations, but
+			 * ICE_NEEDS_RESTART bit is set incase rebuild failed
+			 */
+			clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+			clear_bit(__ICE_PREPARED_FOR_RESET, pf->state);
 		}
-		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
-		goto unlock;
+
+		return;
 	}
 
 	/* No pending resets to finish processing. Check for new resets */
+	if (test_and_clear_bit(__ICE_PFR_REQ, pf->state))
+		reset_type = ICE_RESET_PFR;
+	if (test_and_clear_bit(__ICE_CORER_REQ, pf->state))
+		reset_type = ICE_RESET_CORER;
 	if (test_and_clear_bit(__ICE_GLOBR_REQ, pf->state))
 		reset_type = ICE_RESET_GLOBR;
-	else if (test_and_clear_bit(__ICE_CORER_REQ, pf->state))
-		reset_type = ICE_RESET_CORER;
-	else if (test_and_clear_bit(__ICE_PFR_REQ, pf->state))
-		reset_type = ICE_RESET_PFR;
-	else
-		goto unlock;
+	/* If no valid reset type requested just return */
+	if (reset_type == ICE_RESET_INVAL)
+		return;
 
-	/* reset if not already down or resetting */
+	/* reset if not already down or busy */
 	if (!test_bit(__ICE_DOWN, pf->state) &&
 	    !test_bit(__ICE_CFG_BUSY, pf->state)) {
 		ice_do_reset(pf, reset_type);
 	}
-
-unlock:
-	rtnl_unlock();
 }
 
 /**
@@ -970,7 +973,8 @@ static void ice_clean_adminq_subtask(struct ice_pf *pf)
 static void ice_service_task_schedule(struct ice_pf *pf)
 {
 	if (!test_bit(__ICE_DOWN, pf->state) &&
-	    !test_and_set_bit(__ICE_SERVICE_SCHED, pf->state))
+	    !test_and_set_bit(__ICE_SERVICE_SCHED, pf->state) &&
+	    !test_bit(__ICE_NEEDS_RESTART, pf->state))
 		queue_work(ice_wq, &pf->serv_task);
 }
 
@@ -1013,9 +1017,10 @@ static void ice_service_task(struct work_struct *work)
 	/* process reset requests first */
 	ice_reset_subtask(pf);
 
-	/* bail if a reset/recovery cycle is pending */
+	/* bail if a reset/recovery cycle is pending or rebuild failed */
 	if (ice_is_reset_recovery_pending(pf->state) ||
-	    test_bit(__ICE_SUSPENDED, pf->state)) {
+	    test_bit(__ICE_SUSPENDED, pf->state) ||
+	    test_bit(__ICE_NEEDS_RESTART, pf->state)) {
 		ice_service_task_complete(pf);
 		return;
 	}
@@ -1160,7 +1165,7 @@ static void ice_vsi_delete(struct ice_vsi *vsi)
 
 	memcpy(&ctxt.info, &vsi->info, sizeof(struct ice_aqc_vsi_props));
 
-	status = ice_aq_free_vsi(&pf->hw, &ctxt, false, NULL);
+	status = ice_free_vsi(&pf->hw, vsi->idx, &ctxt, false, NULL);
 	if (status)
 		dev_err(&pf->pdev->dev, "Failed to delete VSI %i in FW\n",
 			vsi->vsi_num);
@@ -1423,13 +1428,13 @@ static void ice_set_rss_vsi_ctx(struct ice_vsi_ctx *ctxt, struct ice_vsi *vsi)
 }
 
 /**
- * ice_vsi_add - Create a new VSI or fetch preallocated VSI
+ * ice_vsi_init - Create and initialize a VSI
  * @vsi: the VSI being configured
  *
  * This initializes a VSI context depending on the VSI type to be added and
  * passes it down to the add_vsi aq command to create a new VSI.
  */
-static int ice_vsi_add(struct ice_vsi *vsi)
+static int ice_vsi_init(struct ice_vsi *vsi)
 {
 	struct ice_vsi_ctx ctxt = { 0 };
 	struct ice_pf *pf = vsi->back;
@@ -1456,13 +1461,17 @@ static int ice_vsi_add(struct ice_vsi *vsi)
 	ctxt.info.sw_id = vsi->port_info->sw_id;
 	ice_vsi_setup_q_map(vsi, &ctxt);
 
-	ret = ice_aq_add_vsi(hw, &ctxt, NULL);
+	ret = ice_add_vsi(hw, vsi->idx, &ctxt, NULL);
 	if (ret) {
-		dev_err(&vsi->back->pdev->dev,
-			"Add VSI AQ call failed, err %d\n", ret);
+		dev_err(&pf->pdev->dev,
+			"Add VSI failed, err %d\n", ret);
 		return -EIO;
 	}
+
+	/* keep context for update VSI operations */
 	vsi->info = ctxt.info;
+
+	/* record VSI number returned */
 	vsi->vsi_num = ctxt.vsi_num;
 
 	return ret;
@@ -2652,14 +2661,12 @@ static int ice_vsi_cfg_rss(struct ice_vsi *vsi)
 }
 
 /**
- * ice_vsi_reinit_setup - return resource and reallocate resource for a VSI
- * @vsi: pointer to the ice_vsi
- *
- * This reallocates the VSIs queue resources
+ * ice_vsi_rebuild - Rebuild VSI after reset
+ * @vsi: vsi to be rebuild
  *
  * Returns 0 on success and negative value on failure
  */
-static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
+static int ice_vsi_rebuild(struct ice_vsi *vsi)
 {
 	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
 	int ret, i;
@@ -2675,7 +2682,7 @@ static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
 	ice_vsi_set_num_qs(vsi);
 
 	/* Initialize VSI struct elements and create VSI in FW */
-	ret = ice_vsi_add(vsi);
+	ret = ice_vsi_init(vsi);
 	if (ret < 0)
 		goto err_vsi;
 
@@ -2685,19 +2692,7 @@ static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
 
 	switch (vsi->type) {
 	case ICE_VSI_PF:
-		if (!vsi->netdev) {
-			ret = ice_cfg_netdev(vsi);
-			if (ret)
-				goto err_rings;
-
-			ret = register_netdev(vsi->netdev);
-			if (ret)
-				goto err_rings;
-
-			netif_carrier_off(vsi->netdev);
-			netif_tx_stop_all_queues(vsi->netdev);
-		}
-
+		/* fall through */
 		ret = ice_vsi_alloc_q_vectors(vsi);
 		if (ret)
 			goto err_rings;
@@ -2749,21 +2744,23 @@ static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
 /**
  * ice_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
- * @type: VSI type
  * @pi: pointer to the port_info instance
+ * @type: VSI type
+ * @vf_id: defines VF id to which this VSI connects. This field is meant to be
+ *         used only for ICE_VSI_VF VSI type. For other VSI types, should
+ *         fill-in ICE_INVAL_VFID as input.
  *
  * This allocates the sw VSI structure and its queue resources.
  *
- * Returns pointer to the successfully allocated and configure VSI sw struct on
- * success, otherwise returns NULL on failure.
+ * Returns pointer to the successfully allocated and configured VSI sw struct on
+ * success, NULL on failure.
  */
 static struct ice_vsi *
-ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
-	      struct ice_port_info *pi)
+ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
+	      enum ice_vsi_type type, u16 __always_unused vf_id)
 {
 	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
 	struct device *dev = &pf->pdev->dev;
-	struct ice_vsi_ctx ctxt = { 0 };
 	struct ice_vsi *vsi;
 	int ret, i;
 
@@ -2786,12 +2783,10 @@ ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
 	ice_vsi_set_rss_params(vsi);
 
 	/* create the VSI */
-	ret = ice_vsi_add(vsi);
+	ret = ice_vsi_init(vsi);
 	if (ret)
 		goto err_vsi;
 
-	ctxt.vsi_num = vsi->vsi_num;
-
 	switch (vsi->type) {
 	case ICE_VSI_PF:
 		ret = ice_cfg_netdev(vsi);
@@ -2860,10 +2855,7 @@ ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
 		vsi->netdev = NULL;
 	}
 err_cfg_netdev:
-	ret = ice_aq_free_vsi(&pf->hw, &ctxt, false, NULL);
-	if (ret)
-		dev_err(&vsi->back->pdev->dev,
-			"Free VSI AQ call failed, err %d\n", ret);
+	ice_vsi_delete(vsi);
 err_vsi:
 	ice_vsi_put_qs(vsi);
 err_get_qs:
@@ -2874,6 +2866,20 @@ ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
 	return NULL;
 }
 
+/**
+ * ice_pf_vsi_setup - Set up a PF VSI
+ * @pf: board private structure
+ * @pi: pointer to the port_info instance
+ *
+ * Returns pointer to the successfully allocated VSI sw struct on success,
+ * otherwise returns NULL on failure.
+ */
+static struct ice_vsi *
+ice_pf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
+{
+	return ice_vsi_setup(pf, pi, ICE_VSI_PF, ICE_INVAL_VFID);
+}
+
 /**
  * ice_vsi_add_vlan - Add vsi membership for given vlan
  * @vsi: the vsi being configured
@@ -3021,50 +3027,48 @@ static int ice_setup_pf_sw(struct ice_pf *pf)
 	struct ice_vsi *vsi;
 	int status = 0;
 
-	if (!ice_is_reset_recovery_pending(pf->state)) {
-		vsi = ice_vsi_setup(pf, ICE_VSI_PF, pf->hw.port_info);
-		if (!vsi) {
-			status = -ENOMEM;
-			goto error_exit;
-		}
-	} else {
-		vsi = pf->vsi[0];
-		status = ice_vsi_reinit_setup(vsi);
-		if (status < 0)
-			return -EIO;
+	if (ice_is_reset_recovery_pending(pf->state))
+		return -EBUSY;
+
+	vsi = ice_pf_vsi_setup(pf, pf->hw.port_info);
+	if (!vsi) {
+		status = -ENOMEM;
+		goto unroll_vsi_setup;
 	}
 
-	/* tmp_add_list contains a list of MAC addresses for which MAC
-	 * filters need to be programmed. Add the VSI's unicast MAC to
-	 * this list
+	/* To add a MAC filter, first add the MAC to a list and then
+	 * pass the list to ice_add_mac.
 	 */
+
+	 /* Add a unicast MAC filter so the VSI can get its packets */
 	status = ice_add_mac_to_list(vsi, &tmp_add_list,
 				     vsi->port_info->mac.perm_addr);
 	if (status)
-		goto error_exit;
+		goto unroll_vsi_setup;
 
 	/* VSI needs to receive broadcast traffic, so add the broadcast
-	 * MAC address to the list.
+	 * MAC address to the list as well.
 	 */
 	eth_broadcast_addr(broadcast);
 	status = ice_add_mac_to_list(vsi, &tmp_add_list, broadcast);
 	if (status)
-		goto error_exit;
+		goto free_mac_list;
 
 	/* program MAC filters for entries in tmp_add_list */
 	status = ice_add_mac(&pf->hw, &tmp_add_list);
 	if (status) {
 		dev_err(&pf->pdev->dev, "Could not add MAC filters\n");
 		status = -ENOMEM;
-		goto error_exit;
+		goto free_mac_list;
 	}
 
 	ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list);
 	return status;
 
-error_exit:
+free_mac_list:
 	ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list);
 
+unroll_vsi_setup:
 	if (vsi) {
 		ice_vsi_free_q_vectors(vsi);
 		if (vsi->netdev && vsi->netdev->reg_state == NETREG_REGISTERED)
@@ -3453,24 +3457,13 @@ static int ice_probe(struct pci_dev *pdev,
 static void ice_remove(struct pci_dev *pdev)
 {
 	struct ice_pf *pf = pci_get_drvdata(pdev);
-	int i = 0;
-	int err;
 
 	if (!pf)
 		return;
 
 	set_bit(__ICE_DOWN, pf->state);
 
-	for (i = 0; i < pf->num_alloc_vsi; i++) {
-		if (!pf->vsi[i])
-			continue;
-
-		err = ice_vsi_release(pf->vsi[i]);
-		if (err)
-			dev_dbg(&pf->pdev->dev, "Failed to release VSI index %d (err %d)\n",
-				i, err);
-	}
-
+	ice_vsi_release_all(pf);
 	ice_free_irq_msix_misc(pf);
 	ice_clear_interrupt_scheme(pf);
 	ice_deinit_pf(pf);
@@ -3517,7 +3510,7 @@ static int __init ice_module_init(void)
 	pr_info("%s - version %s\n", ice_driver_string, ice_drv_ver);
 	pr_info("%s\n", ice_copyright);
 
-	ice_wq = alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, KBUILD_MODNAME);
+	ice_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, KBUILD_MODNAME);
 	if (!ice_wq) {
 		pr_err("Failed to create workqueue\n");
 		return -ENOMEM;
@@ -5104,8 +5097,14 @@ static int ice_vsi_release(struct ice_vsi *vsi)
 	if (!vsi->back)
 		return -ENODEV;
 	pf = vsi->back;
-
-	if (vsi->netdev) {
+	/* do not unregister and free netdevs while driver is in the reset
+	 * recovery pending state. Since reset/rebuild happens through PF
+	 * service task workqueue, its not a good idea to unregister netdev
+	 * that is associated to the PF that is running the work queue items
+	 * currently. This is done to avoid check_flush_dependency() warning
+	 * on this wq
+	 */
+	if (vsi->netdev && !ice_is_reset_recovery_pending(pf->state)) {
 		unregister_netdev(vsi->netdev);
 		free_netdev(vsi->netdev);
 		vsi->netdev = NULL;
@@ -5131,11 +5130,39 @@ static int ice_vsi_release(struct ice_vsi *vsi)
 	pf->q_left_tx += vsi->alloc_txq;
 	pf->q_left_rx += vsi->alloc_rxq;
 
-	ice_vsi_clear(vsi);
+	/* retain SW VSI data structure since it is needed to unregister and
+	 * free VSI netdev when PF is not in reset recovery pending state,\
+	 * for ex: during rmmod.
+	 */
+	if (!ice_is_reset_recovery_pending(pf->state))
+		ice_vsi_clear(vsi);
 
 	return 0;
 }
 
+/**
+ * ice_vsi_release_all - Delete all VSIs
+ * @pf: PF from which all VSIs are being removed
+ */
+static void ice_vsi_release_all(struct ice_pf *pf)
+{
+	int err, i;
+
+	if (!pf->vsi)
+		return;
+
+	for (i = 0; i < pf->num_alloc_vsi; i++) {
+		if (!pf->vsi[i])
+			continue;
+
+		err = ice_vsi_release(pf->vsi[i]);
+		if (err)
+			dev_dbg(&pf->pdev->dev,
+				"Failed to release pf->vsi[%d], err %d, vsi_num = %d\n",
+				i, err, pf->vsi[i]->vsi_num);
+	}
+}
+
 /**
  * ice_dis_vsi - pause a VSI
  * @vsi: the VSI being paused
@@ -5148,27 +5175,31 @@ static void ice_dis_vsi(struct ice_vsi *vsi)
 	set_bit(__ICE_NEEDS_RESTART, vsi->state);
 
 	if (vsi->netdev && netif_running(vsi->netdev) &&
-	    vsi->type == ICE_VSI_PF)
+	    vsi->type == ICE_VSI_PF) {
+		rtnl_lock();
 		vsi->netdev->netdev_ops->ndo_stop(vsi->netdev);
-
-	ice_vsi_close(vsi);
+		rtnl_unlock();
+	} else {
+		ice_vsi_close(vsi);
+	}
 }
 
 /**
  * ice_ena_vsi - resume a VSI
  * @vsi: the VSI being resume
  */
-static void ice_ena_vsi(struct ice_vsi *vsi)
+static int ice_ena_vsi(struct ice_vsi *vsi)
 {
-	if (!test_and_clear_bit(__ICE_NEEDS_RESTART, vsi->state))
-		return;
+	int err = 0;
+
+	if (test_and_clear_bit(__ICE_NEEDS_RESTART, vsi->state))
+		if (vsi->netdev && netif_running(vsi->netdev)) {
+			rtnl_lock();
+			err = vsi->netdev->netdev_ops->ndo_open(vsi->netdev);
+			rtnl_unlock();
+		}
 
-	if (vsi->netdev && netif_running(vsi->netdev))
-		vsi->netdev->netdev_ops->ndo_open(vsi->netdev);
-	else if (ice_vsi_open(vsi))
-		/* this clears the DOWN bit */
-		dev_dbg(&vsi->back->pdev->dev, "Failed open VSI 0x%04X on switch 0x%04X\n",
-			vsi->vsi_num, vsi->vsw->sw_id);
+	return err;
 }
 
 /**
@@ -5188,13 +5219,47 @@ static void ice_pf_dis_all_vsi(struct ice_pf *pf)
  * ice_pf_ena_all_vsi - Resume all VSIs on a PF
  * @pf: the PF
  */
-static void ice_pf_ena_all_vsi(struct ice_pf *pf)
+static int ice_pf_ena_all_vsi(struct ice_pf *pf)
 {
 	int v;
 
 	ice_for_each_vsi(pf, v)
 		if (pf->vsi[v])
-			ice_ena_vsi(pf->vsi[v]);
+			if (ice_ena_vsi(pf->vsi[v]))
+				return -EIO;
+
+	return 0;
+}
+
+/**
+ * ice_vsi_rebuild_all - rebuild all VSIs in pf
+ * @pf: the PF
+ */
+static int ice_vsi_rebuild_all(struct ice_pf *pf)
+{
+	int i;
+
+	/* loop through pf->vsi array and reinit the VSI if found */
+	for (i = 0; i < pf->num_alloc_vsi; i++) {
+		int err;
+
+		if (!pf->vsi[i])
+			continue;
+
+		err = ice_vsi_rebuild(pf->vsi[i]);
+		if (err) {
+			dev_err(&pf->pdev->dev,
+				"VSI at index %d rebuild failed\n",
+				pf->vsi[i]->idx);
+			return err;
+		}
+
+		dev_info(&pf->pdev->dev,
+			 "VSI at index %d rebuilt. vsi_num = 0x%x\n",
+			 pf->vsi[i]->idx, pf->vsi[i]->vsi_num);
+	}
+
+	return 0;
 }
 
 /**
@@ -5216,13 +5281,13 @@ static void ice_rebuild(struct ice_pf *pf)
 	ret = ice_init_all_ctrlq(hw);
 	if (ret) {
 		dev_err(dev, "control queues init failed %d\n", ret);
-		goto fail_reset;
+		goto err_init_ctrlq;
 	}
 
 	ret = ice_clear_pf_cfg(hw);
 	if (ret) {
 		dev_err(dev, "clear PF configuration failed %d\n", ret);
-		goto fail_reset;
+		goto err_init_ctrlq;
 	}
 
 	ice_clear_pxe_mode(hw);
@@ -5230,14 +5295,24 @@ static void ice_rebuild(struct ice_pf *pf)
 	ret = ice_get_caps(hw);
 	if (ret) {
 		dev_err(dev, "ice_get_caps failed %d\n", ret);
-		goto fail_reset;
+		goto err_init_ctrlq;
 	}
 
-	/* basic nic switch setup */
-	err = ice_setup_pf_sw(pf);
+	err = ice_sched_init_port(hw->port_info);
+	if (err)
+		goto err_sched_init_port;
+
+	err = ice_vsi_rebuild_all(pf);
 	if (err) {
-		dev_err(dev, "ice_setup_pf_sw failed\n");
-		goto fail_reset;
+		dev_err(dev, "ice_vsi_rebuild_all failed\n");
+		goto err_vsi_rebuild;
+	}
+
+	ret = ice_replay_all_fltr(&pf->hw);
+	if (ret) {
+		dev_err(&pf->pdev->dev,
+			"error replaying switch filter rules\n");
+		goto err_vsi_rebuild;
 	}
 
 	/* start misc vector */
@@ -5245,20 +5320,35 @@ static void ice_rebuild(struct ice_pf *pf)
 		err = ice_req_irq_msix_misc(pf);
 		if (err) {
 			dev_err(dev, "misc vector setup failed: %d\n", err);
-			goto fail_reset;
+			goto err_vsi_rebuild;
 		}
 	}
 
 	/* restart the VSIs that were rebuilt and running before the reset */
-	ice_pf_ena_all_vsi(pf);
+	err = ice_pf_ena_all_vsi(pf);
+	if (err) {
+		dev_err(&pf->pdev->dev, "error enabling VSIs\n");
+		/* no need to disable VSIs in tear down path in ice_rebuild()
+		 * since its already taken care in ice_vsi_open()
+		 */
+		goto err_vsi_rebuild;
+	}
 
+	/* if we get here, reset flow is successful */
+	clear_bit(__ICE_RESET_FAILED, pf->state);
 	return;
 
-fail_reset:
+err_vsi_rebuild:
+	ice_vsi_release_all(pf);
+err_sched_init_port:
+	ice_sched_cleanup_all(hw);
+err_init_ctrlq:
 	ice_shutdown_all_ctrlq(hw);
 	set_bit(__ICE_RESET_FAILED, pf->state);
 clear_recovery:
-	set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+	/* set this bit in PF state to control service task scheduling */
+	set_bit(__ICE_NEEDS_RESTART, pf->state);
+	dev_err(dev, "Rebuild failed, unload and reload driver\n");
 }
 
 /**
@@ -5431,6 +5521,11 @@ static int ice_open(struct net_device *netdev)
 	struct ice_vsi *vsi = np->vsi;
 	int err;
 
+	if (test_bit(__ICE_NEEDS_RESTART, vsi->back->state)) {
+		netdev_err(netdev, "driver needs to be unloaded and reloaded\n");
+		return -EIO;
+	}
+
 	netif_carrier_off(netdev);
 
 	err = ice_vsi_open(vsi);
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index 2693bebef977..ac66c96a123d 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -169,17 +169,17 @@ ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp *buf,
  *
  * Add a VSI context to the hardware (0x0210)
  */
-enum ice_status
+static enum ice_status
 ice_aq_add_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 	       struct ice_sq_cd *cd)
 {
 	struct ice_aqc_add_update_free_vsi_resp *res;
 	struct ice_aqc_add_get_update_free_vsi *cmd;
-	enum ice_status status;
 	struct ice_aq_desc desc;
+	enum ice_status status;
 
 	cmd = &desc.params.vsi_cmd;
-	res = (struct ice_aqc_add_update_free_vsi_resp *)&desc.params.raw;
+	res = &desc.params.add_update_free_vsi_res;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_vsi);
 
@@ -203,6 +203,42 @@ ice_aq_add_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 	return status;
 }
 
+/**
+ * ice_aq_free_vsi
+ * @hw: pointer to the hw struct
+ * @vsi_ctx: pointer to a VSI context struct
+ * @keep_vsi_alloc: keep VSI allocation as part of this PF's resources
+ * @cd: pointer to command details structure or NULL
+ *
+ * Free VSI context info from hardware (0x0213)
+ */
+static enum ice_status
+ice_aq_free_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
+		bool keep_vsi_alloc, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_add_update_free_vsi_resp *resp;
+	struct ice_aqc_add_get_update_free_vsi *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+
+	cmd = &desc.params.vsi_cmd;
+	resp = &desc.params.add_update_free_vsi_res;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_free_vsi);
+
+	cmd->vsi_num = cpu_to_le16(vsi_ctx->vsi_num | ICE_AQ_VSI_IS_VALID);
+	if (keep_vsi_alloc)
+		cmd->cmd_flags = cpu_to_le16(ICE_AQ_VSI_KEEP_ALLOC);
+
+	status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+	if (!status) {
+		vsi_ctx->vsis_allocd = le16_to_cpu(resp->vsi_used);
+		vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free);
+	}
+
+	return status;
+}
+
 /**
  * ice_aq_update_vsi
  * @hw: pointer to the hw struct
@@ -221,7 +257,7 @@ ice_aq_update_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 	enum ice_status status;
 
 	cmd = &desc.params.vsi_cmd;
-	resp = (struct ice_aqc_add_update_free_vsi_resp *)&desc.params.raw;
+	resp = &desc.params.add_update_free_vsi_res;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_update_vsi);
 
@@ -241,38 +277,202 @@ ice_aq_update_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 }
 
 /**
- * ice_aq_free_vsi
+ * ice_update_fltr_vsi_map - update given filter VSI map
+ * @list_head: list for which filters needs to be updated
+ * @list_lock: filter lock which needs to be updated
+ * @old_vsi_num: old VSI HW id
+ * @new_vsi_num: new VSI HW id
+ *
+ * update the VSI map for a given filter list
+ */
+static void
+ice_update_fltr_vsi_map(struct list_head *list_head,
+			struct mutex *list_lock, u16 old_vsi_num,
+			u16 new_vsi_num)
+{
+	struct ice_fltr_mgmt_list_entry *itr;
+
+	mutex_lock(list_lock);
+	if (list_empty(list_head))
+		goto exit_update_map;
+
+	list_for_each_entry(itr, list_head, list_entry) {
+		if (itr->vsi_list_info &&
+		    test_bit(old_vsi_num, itr->vsi_list_info->vsi_map)) {
+			clear_bit(old_vsi_num, itr->vsi_list_info->vsi_map);
+			set_bit(new_vsi_num, itr->vsi_list_info->vsi_map);
+		} else if (itr->fltr_info.fltr_act == ICE_FWD_TO_VSI &&
+			   itr->fltr_info.fwd_id.vsi_id == old_vsi_num) {
+			itr->fltr_info.fwd_id.vsi_id = new_vsi_num;
+			itr->fltr_info.src = new_vsi_num;
+		}
+	}
+exit_update_map:
+	mutex_unlock(list_lock);
+}
+
+/**
+ * ice_update_all_fltr_vsi_map - update all filters VSI map
+ * @hw: pointer to the hardware structure
+ * @old_vsi_num: old VSI HW id
+ * @new_vsi_num: new VSI HW id
+ *
+ * update all filters VSI map
+ */
+static void
+ice_update_all_fltr_vsi_map(struct ice_hw *hw, u16 old_vsi_num, u16 new_vsi_num)
+{
+	struct ice_switch_info *sw = hw->switch_info;
+	u8 i;
+
+	for (i = 0; i < ICE_SW_LKUP_LAST; i++) {
+		struct list_head *head = &sw->recp_list[i].filt_rules;
+		struct mutex *lock; /* Lock to protect filter rule list */
+
+		lock = &sw->recp_list[i].filt_rule_lock;
+		ice_update_fltr_vsi_map(head, lock, old_vsi_num,
+					new_vsi_num);
+	}
+}
+
+/**
+ * ice_is_vsi_valid - check whether the VSI is valid or not
+ * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ *
+ * check whether the VSI is valid or not
+ */
+static bool ice_is_vsi_valid(struct ice_hw *hw, u16 vsi_handle)
+{
+	return vsi_handle < ICE_MAX_VSI && hw->vsi_ctx[vsi_handle];
+}
+
+/**
+ * ice_get_hw_vsi_num - return the hw VSI number
  * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ *
+ * return the hw VSI number
+ * Caution: call this function only if VSI is valid (ice_is_vsi_valid)
+ */
+static u16 ice_get_hw_vsi_num(struct ice_hw *hw, u16 vsi_handle)
+{
+	return hw->vsi_ctx[vsi_handle]->vsi_num;
+}
+
+/**
+ * ice_get_vsi_ctx - return the VSI context entry for a given VSI handle
+ * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ *
+ * return the VSI context entry for a given VSI handle
+ */
+static struct ice_vsi_ctx *ice_get_vsi_ctx(struct ice_hw *hw, u16 vsi_handle)
+{
+	return (vsi_handle >= ICE_MAX_VSI) ? NULL : hw->vsi_ctx[vsi_handle];
+}
+
+/**
+ * ice_save_vsi_ctx - save the VSI context for a given VSI handle
+ * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ * @vsi: VSI context pointer
+ *
+ * save the VSI context entry for a given VSI handle
+ */
+static void ice_save_vsi_ctx(struct ice_hw *hw, u16 vsi_handle,
+			     struct ice_vsi_ctx *vsi)
+{
+	hw->vsi_ctx[vsi_handle] = vsi;
+}
+
+/**
+ * ice_clear_vsi_ctx - clear the VSI context entry
+ * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ *
+ * clear the VSI context entry
+ */
+static void ice_clear_vsi_ctx(struct ice_hw *hw, u16 vsi_handle)
+{
+	struct ice_vsi_ctx *vsi;
+
+	vsi = ice_get_vsi_ctx(hw, vsi_handle);
+	if (vsi) {
+		devm_kfree(ice_hw_to_dev(hw), vsi);
+		hw->vsi_ctx[vsi_handle] = NULL;
+	}
+}
+
+/**
+ * ice_add_vsi - add VSI context to the hardware and VSI handle list
+ * @hw: pointer to the hw struct
+ * @vsi_handle: unique VSI handle provided by drivers
  * @vsi_ctx: pointer to a VSI context struct
- * @keep_vsi_alloc: keep VSI allocation as part of this PF's resources
  * @cd: pointer to command details structure or NULL
  *
- * Get VSI context info from hardware (0x0213)
+ * Add a VSI context to the hardware also add it into the VSI handle list.
+ * If this function gets called after reset for existing VSIs then update
+ * with the new HW VSI number in the corresponding VSI handle list entry.
  */
 enum ice_status
-ice_aq_free_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
-		bool keep_vsi_alloc, struct ice_sq_cd *cd)
+ice_add_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
+	    struct ice_sq_cd *cd)
 {
-	struct ice_aqc_add_update_free_vsi_resp *resp;
-	struct ice_aqc_add_get_update_free_vsi *cmd;
-	struct ice_aq_desc desc;
+	struct ice_vsi_ctx *tmp_vsi_ctx;
 	enum ice_status status;
 
-	cmd = &desc.params.vsi_cmd;
-	resp = (struct ice_aqc_add_update_free_vsi_resp *)&desc.params.raw;
-
-	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_free_vsi);
+	if (vsi_handle >= ICE_MAX_VSI)
+		return ICE_ERR_PARAM;
+	status = ice_aq_add_vsi(hw, vsi_ctx, cd);
+	if (status)
+		return status;
+	tmp_vsi_ctx = ice_get_vsi_ctx(hw, vsi_handle);
+	if (!tmp_vsi_ctx) {
+		/* Create a new vsi context */
+		tmp_vsi_ctx = devm_kzalloc(ice_hw_to_dev(hw),
+					   sizeof(*tmp_vsi_ctx), GFP_KERNEL);
+		if (!tmp_vsi_ctx) {
+			ice_aq_free_vsi(hw, vsi_ctx, false, cd);
+			return ICE_ERR_NO_MEMORY;
+		}
+		*tmp_vsi_ctx = *vsi_ctx;
+		ice_save_vsi_ctx(hw, vsi_handle, tmp_vsi_ctx);
+	} else {
+		/* update with new HW VSI num */
+		if (tmp_vsi_ctx->vsi_num != vsi_ctx->vsi_num) {
+			/* update all filter lists with new HW VSI num */
+			ice_update_all_fltr_vsi_map(hw, tmp_vsi_ctx->vsi_num,
+						    vsi_ctx->vsi_num);
+			tmp_vsi_ctx->vsi_num = vsi_ctx->vsi_num;
+		}
+	}
 
-	cmd->vsi_num = cpu_to_le16(vsi_ctx->vsi_num | ICE_AQ_VSI_IS_VALID);
-	if (keep_vsi_alloc)
-		cmd->cmd_flags = cpu_to_le16(ICE_AQ_VSI_KEEP_ALLOC);
+	return status;
+}
 
-	status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
-	if (!status) {
-		vsi_ctx->vsis_allocd = le16_to_cpu(resp->vsi_used);
-		vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free);
-	}
+/**
+ * ice_free_vsi- free VSI context from hardware and VSI handle list
+ * @hw: pointer to the hw struct
+ * @vsi_handle: unique VSI handle
+ * @vsi_ctx: pointer to a VSI context struct
+ * @keep_vsi_alloc: keep VSI allocation as part of this PF's resources
+ * @cd: pointer to command details structure or NULL
+ *
+ * Free VSI context info from hardware as well as from VSI handle list
+ */
+enum ice_status
+ice_free_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
+	     bool keep_vsi_alloc, struct ice_sq_cd *cd)
+{
+	enum ice_status status;
 
+	if (!ice_is_vsi_valid(hw, vsi_handle))
+		return ICE_ERR_PARAM;
+	vsi_ctx->vsi_num = ice_get_hw_vsi_num(hw, vsi_handle);
+	status = ice_aq_free_vsi(hw, vsi_ctx, keep_vsi_alloc, cd);
+	if (!status)
+		ice_clear_vsi_ctx(hw, vsi_handle);
 	return status;
 }
 
@@ -1516,6 +1716,25 @@ ice_add_vlan(struct ice_hw *hw, struct list_head *v_list)
 	return 0;
 }
 
+/**
+ * ice_rem_sw_rule_info
+ * @hw: pointer to the hardware structure
+ * @rule_head: pointer to the switch list structure that we want to delete
+ */
+static void
+ice_rem_sw_rule_info(struct ice_hw *hw, struct list_head *rule_head)
+{
+	if (!list_empty(rule_head)) {
+		struct ice_fltr_mgmt_list_entry *entry;
+		struct ice_fltr_mgmt_list_entry *tmp;
+
+		list_for_each_entry_safe(entry, tmp, rule_head, list_entry) {
+			list_del(&entry->list_entry);
+			devm_kfree(ice_hw_to_dev(hw), entry);
+		}
+	}
+}
+
 /**
  * ice_cfg_dflt_vsi - change state of VSI to set/clear default
  * @hw: pointer to the hardware structure
@@ -1822,3 +2041,89 @@ void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_id)
 	ice_remove_vsi_lkup_fltr(hw, vsi_id, ICE_SW_LKUP_ETHERTYPE_MAC);
 	ice_remove_vsi_lkup_fltr(hw, vsi_id, ICE_SW_LKUP_PROMISC_VLAN);
 }
+
+/**
+ * ice_replay_fltr - Replay all the filters stored by a specific list head
+ * @hw: pointer to the hardware structure
+ * @list_head: list for which filters needs to be replayed
+ * @recp_id: Recipe id for which rules need to be replayed
+ */
+static enum ice_status
+ice_replay_fltr(struct ice_hw *hw, u8 recp_id, struct list_head *list_head)
+{
+	struct ice_fltr_mgmt_list_entry *itr;
+	struct list_head l_head;
+	enum ice_status status = 0;
+
+	if (list_empty(list_head))
+		return status;
+
+	/* Move entries from the given list_head to a temporary l_head so that
+	 * they can be replayed. Otherwise when trying to re-add the same
+	 * filter, the function will return already exists
+	 */
+	list_replace_init(list_head, &l_head);
+
+	/* Mark the given list_head empty by reinitializing it so filters
+	 * could be added again by *handler
+	 */
+	list_for_each_entry(itr, &l_head, list_entry) {
+		struct ice_fltr_list_entry f_entry;
+
+		f_entry.fltr_info = itr->fltr_info;
+		if (itr->vsi_count < 2 && recp_id != ICE_SW_LKUP_VLAN) {
+			status = ice_add_rule_internal(hw, recp_id, &f_entry);
+			if (status)
+				goto end;
+			continue;
+		}
+
+		/* Add a filter per vsi separately */
+		while (1) {
+			u16 vsi;
+
+			vsi = find_first_bit(itr->vsi_list_info->vsi_map,
+					     ICE_MAX_VSI);
+			if (vsi == ICE_MAX_VSI)
+				break;
+
+			clear_bit(vsi, itr->vsi_list_info->vsi_map);
+			f_entry.fltr_info.fwd_id.vsi_id = vsi;
+			f_entry.fltr_info.fltr_act = ICE_FWD_TO_VSI;
+			if (recp_id == ICE_SW_LKUP_VLAN)
+				status = ice_add_vlan_internal(hw, &f_entry);
+			else
+				status = ice_add_rule_internal(hw, recp_id,
+							       &f_entry);
+			if (status)
+				goto end;
+		}
+	}
+end:
+	/* Clear the filter management list */
+	ice_rem_sw_rule_info(hw, &l_head);
+	return status;
+}
+
+/**
+ * ice_replay_all_fltr - replay all filters stored in bookkeeping lists
+ * @hw: pointer to the hardware structure
+ *
+ * NOTE: This function does not clean up partially added filters on error.
+ * It is up to caller of the function to issue a reset or fail early.
+ */
+enum ice_status ice_replay_all_fltr(struct ice_hw *hw)
+{
+	struct ice_switch_info *sw = hw->switch_info;
+	enum ice_status status = 0;
+	u8 i;
+
+	for (i = 0; i < ICE_SW_LKUP_LAST; i++) {
+		struct list_head *head = &sw->recp_list[i].filt_rules;
+
+		status = ice_replay_fltr(hw, i, head);
+		if (status)
+			return status;
+	}
+	return status;
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h
index 09a3b6d6541a..11a481644dac 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.h
+++ b/drivers/net/ethernet/intel/ice/ice_switch.h
@@ -158,15 +158,14 @@ struct ice_fltr_mgmt_list_entry {
 
 /* VSI related commands */
 enum ice_status
-ice_aq_add_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
-	       struct ice_sq_cd *cd);
-enum ice_status
 ice_aq_update_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 		  struct ice_sq_cd *cd);
 enum ice_status
-ice_aq_free_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
-		bool keep_vsi_alloc, struct ice_sq_cd *cd);
-
+ice_add_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
+	    struct ice_sq_cd *cd);
+enum ice_status
+ice_free_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
+	     bool keep_vsi_alloc, struct ice_sq_cd *cd);
 enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw);
 
 /* Switch/bridge related commands */
@@ -177,6 +176,9 @@ enum ice_status ice_add_vlan(struct ice_hw *hw, struct list_head *m_list);
 enum ice_status ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list);
 enum ice_status
 ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_id, bool set, u8 direction);
+
+enum ice_status ice_replay_all_fltr(struct ice_hw *hw);
+
 enum ice_status ice_init_def_sw_recp(struct ice_hw *hw);
 
 #endif /* _ICE_SWITCH_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index 473d82fdb570..8b7efa87ce56 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -149,9 +149,10 @@ struct ice_mac_info {
 
 /* Various RESET request, These are not tied with HW reset types */
 enum ice_reset_req {
-	ICE_RESET_PFR	= 0,
-	ICE_RESET_CORER	= 1,
-	ICE_RESET_GLOBR	= 2,
+	ICE_RESET_INVAL	= 0,
+	ICE_RESET_PFR	= 1,
+	ICE_RESET_CORER	= 2,
+	ICE_RESET_GLOBR	= 3,
 };
 
 /* Bus parameters */
@@ -283,6 +284,7 @@ struct ice_hw {
 	u8 sw_entry_point_layer;
 	u16 max_children[ICE_AQC_TOPO_MAX_LEVEL_NUM];
 
+	struct ice_vsi_ctx *vsi_ctx[ICE_MAX_VSI];
 	u8 evb_veb;		/* true for VEB, false for VEPA */
 	u8 reset_ongoing;	/* true if hw is in reset, false otherwise */
 	struct ice_bus_info bus;
-- 
2.17.1

^ permalink raw reply related

* [net-next 11/15] ice: Implement ice_bridge_getlink and ice_bridge_setlink
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Md Fahad Iqbal Polash, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com>

ice_bridge_getlink returns the current bridge mode using
ndo_dflt_bridge_getlink and the mode parameter available in
first_switch->bridge_mode.

ice_bridge_setlink is invoked when the bridge mode needs to
changed. The value to be changed to is available as a netlink
message which is parsed in this function. If the mode has to
be changed, switch_flags is set appropriately (set ALLOW_LB
for VEB mode and clear it for VEPA mode) and ice_aq_update_vsi
is called. Also change the unicast switch filter rules.

Signed-off-by: Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c   | 140 +++++++++++++++++++-
 drivers/net/ethernet/intel/ice/ice_switch.c |  41 ++++++
 drivers/net/ethernet/intel/ice/ice_switch.h |   1 +
 3 files changed, 181 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index fccecb6fa618..cbeae1355593 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -3599,7 +3599,11 @@ static int ice_probe(struct pci_dev *pdev,
 		goto err_msix_misc_unroll;
 	}
 
-	pf->first_sw->bridge_mode = BRIDGE_MODE_VEB;
+	if (hw->evb_veb)
+		pf->first_sw->bridge_mode = BRIDGE_MODE_VEB;
+	else
+		pf->first_sw->bridge_mode = BRIDGE_MODE_VEPA;
+
 	pf->first_sw->pf = pf;
 
 	/* record the sw_id available for later use */
@@ -5695,6 +5699,138 @@ int ice_get_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size)
 	return 0;
 }
 
+/**
+ * ice_bridge_getlink - Get the hardware bridge mode
+ * @skb: skb buff
+ * @pid: process id
+ * @seq: RTNL message seq
+ * @dev: the netdev being configured
+ * @filter_mask: filter mask passed in
+ * @nlflags: netlink flags passed in
+ *
+ * Return the bridge mode (VEB/VEPA)
+ */
+static int
+ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+		   struct net_device *dev, u32 filter_mask, int nlflags)
+{
+	struct ice_netdev_priv *np = netdev_priv(dev);
+	struct ice_vsi *vsi = np->vsi;
+	struct ice_pf *pf = vsi->back;
+	u16 bmode;
+
+	bmode = pf->first_sw->bridge_mode;
+
+	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0, nlflags,
+				       filter_mask, NULL);
+}
+
+/**
+ * ice_vsi_update_bridge_mode - Update VSI for switching bridge mode (VEB/VEPA)
+ * @vsi: Pointer to VSI structure
+ * @bmode: Hardware bridge mode (VEB/VEPA)
+ *
+ * Returns 0 on success, negative on failure
+ */
+static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode)
+{
+	struct device *dev = &vsi->back->pdev->dev;
+	struct ice_aqc_vsi_props *vsi_props;
+	struct ice_hw *hw = &vsi->back->hw;
+	struct ice_vsi_ctx ctxt = { 0 };
+	enum ice_status status;
+
+	vsi_props = &vsi->info;
+	ctxt.info = vsi->info;
+
+	if (bmode == BRIDGE_MODE_VEB)
+		/* change from VEPA to VEB mode */
+		ctxt.info.sw_flags |= ICE_AQ_VSI_SW_FLAG_ALLOW_LB;
+	else
+		/* change from VEB to VEPA mode */
+		ctxt.info.sw_flags &= ~ICE_AQ_VSI_SW_FLAG_ALLOW_LB;
+	ctxt.vsi_num = vsi->vsi_num;
+	ctxt.info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID);
+	status = ice_aq_update_vsi(hw, &ctxt, NULL);
+	if (status) {
+		dev_err(dev, "update VSI for bridge mode failed, bmode = %d err %d aq_err %d\n",
+			bmode, status, hw->adminq.sq_last_status);
+		return -EIO;
+	}
+	/* Update sw flags for book keeping */
+	vsi_props->sw_flags = ctxt.info.sw_flags;
+
+	return 0;
+}
+
+/**
+ * ice_bridge_setlink - Set the hardware bridge mode
+ * @dev: the netdev being configured
+ * @nlh: RTNL message
+ * @flags: bridge setlink flags
+ *
+ * Sets the bridge mode (VEB/VEPA) of the switch to which the netdev (VSI) is
+ * hooked up to. Iterates through the PF VSI list and sets the loopback mode (if
+ * not already set for all VSIs connected to this switch. And also update the
+ * unicast switch filter rules for the corresponding switch of the netdev.
+ */
+static int
+ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
+		   u16 __always_unused flags)
+{
+	struct ice_netdev_priv *np = netdev_priv(dev);
+	struct ice_pf *pf = np->vsi->back;
+	struct nlattr *attr, *br_spec;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	struct ice_sw *pf_sw;
+	int rem, v, err = 0;
+
+	pf_sw = pf->first_sw;
+	/* find the attribute in the netlink message */
+	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+
+	nla_for_each_nested(attr, br_spec, rem) {
+		__u16 mode;
+
+		if (nla_type(attr) != IFLA_BRIDGE_MODE)
+			continue;
+		mode = nla_get_u16(attr);
+		if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB)
+			return -EINVAL;
+		/* Continue  if bridge mode is not being flipped */
+		if (mode == pf_sw->bridge_mode)
+			continue;
+		/* Iterates through the PF VSI list and update the loopback
+		 * mode of the VSI
+		 */
+		ice_for_each_vsi(pf, v) {
+			if (!pf->vsi[v])
+				continue;
+			err = ice_vsi_update_bridge_mode(pf->vsi[v], mode);
+			if (err)
+				return err;
+		}
+
+		hw->evb_veb = (mode == BRIDGE_MODE_VEB);
+		/* Update the unicast switch filter rules for the corresponding
+		 * switch of the netdev
+		 */
+		status = ice_update_sw_rule_bridge_mode(hw);
+		if (status) {
+			netdev_err(dev, "update SW_RULE for bridge mode failed,  = %d err %d aq_err %d\n",
+				   mode, status, hw->adminq.sq_last_status);
+			/* revert hw->evb_veb */
+			hw->evb_veb = (pf_sw->bridge_mode == BRIDGE_MODE_VEB);
+			return -EIO;
+		}
+
+		pf_sw->bridge_mode = mode;
+	}
+
+	return 0;
+}
+
 /**
  * ice_tx_timeout - Respond to a Tx Hang
  * @netdev: network interface device structure
@@ -5907,6 +6043,8 @@ static const struct net_device_ops ice_netdev_ops = {
 	.ndo_vlan_rx_add_vid = ice_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = ice_vlan_rx_kill_vid,
 	.ndo_set_features = ice_set_features,
+	.ndo_bridge_getlink = ice_bridge_getlink,
+	.ndo_bridge_setlink = ice_bridge_setlink,
 	.ndo_fdb_add = ice_fdb_add,
 	.ndo_fdb_del = ice_fdb_del,
 	.ndo_tx_timeout = ice_tx_timeout,
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index ac66c96a123d..65b4e1cca6be 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -1130,6 +1130,47 @@ ice_update_pkt_fwd_rule(struct ice_hw *hw, struct ice_fltr_info *f_info)
 	return status;
 }
 
+/**
+ * ice_update_sw_rule_bridge_mode
+ * @hw: pointer to the hw struct
+ *
+ * Updates unicast switch filter rules based on VEB/VEPA mode
+ */
+enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw)
+{
+	struct ice_switch_info *sw = hw->switch_info;
+	struct ice_fltr_mgmt_list_entry *fm_entry;
+	enum ice_status status = 0;
+	struct list_head *rule_head;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
+
+	rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock;
+	rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules;
+
+	mutex_lock(rule_lock);
+	list_for_each_entry(fm_entry, rule_head, list_entry) {
+		struct ice_fltr_info *fi = &fm_entry->fltr_info;
+		u8 *addr = fi->l_data.mac.mac_addr;
+
+		/* Update unicast Tx rules to reflect the selected
+		 * VEB/VEPA mode
+		 */
+		if ((fi->flag & ICE_FLTR_TX) && is_unicast_ether_addr(addr) &&
+		    (fi->fltr_act == ICE_FWD_TO_VSI ||
+		     fi->fltr_act == ICE_FWD_TO_VSI_LIST ||
+		     fi->fltr_act == ICE_FWD_TO_Q ||
+		     fi->fltr_act == ICE_FWD_TO_QGRP)) {
+			status = ice_update_pkt_fwd_rule(hw, fi);
+			if (status)
+				break;
+		}
+	}
+
+	mutex_unlock(rule_lock);
+
+	return status;
+}
+
 /**
  * ice_add_update_vsi_list
  * @hw: pointer to the hardware structure
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h
index 11a481644dac..646389ca1238 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.h
+++ b/drivers/net/ethernet/intel/ice/ice_switch.h
@@ -169,6 +169,7 @@ ice_free_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
 enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw);
 
 /* Switch/bridge related commands */
+enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw);
 enum ice_status ice_add_mac(struct ice_hw *hw, struct list_head *m_lst);
 enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_lst);
 void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_id);
-- 
2.17.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox