Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 0/6] qed/qede: VF tunnelling support
From: Manish Chopra @ 2017-04-24 17:00 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuval.Mintz

Hi David,

With this series VFs can run vxlan/geneve/gre tunnels over it.
Please consider applying this series to "net-next"

Thanks,
Manish

Manish Chopra (6):
  qed: refactor tunnelling - API/Structs
  qed/qede: Enable tunnel offloads based on hw configuration
  qede: Disable tunnel offloads for non offloaded UDP ports
  qede: Configure UDP ports in local context.
  qed/qede: Add UDP ports in bulletin board
  qed - VF tunnelling support [VXLAN/GENEVE/GRE]

 drivers/net/ethernet/qlogic/qed/qed.h             |  31 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |  17 +-
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h     |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  37 ++-
 drivers/net/ethernet/qlogic/qed/qed_main.c        |  47 ++-
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 330 +++++++++++-----------
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       | 240 ++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h       |   9 +
 drivers/net/ethernet/qlogic/qed/qed_vf.c          | 165 +++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h          |  58 +++-
 drivers/net/ethernet/qlogic/qede/qede.h           |   3 +-
 drivers/net/ethernet/qlogic/qede/qede_filter.c    |  84 +++++-
 drivers/net/ethernet/qlogic/qede/qede_fp.c        |  23 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c      |  56 ++--
 include/linux/qed/qed_eth_if.h                    |   1 +
 include/linux/qed/qed_if.h                        |   5 +
 17 files changed, 857 insertions(+), 255 deletions(-)

-- 
2.7.2

^ permalink raw reply

* [PATCH net-next 1/6] qed: refactor tunnelling - API/Structs
From: Manish Chopra @ 2017-04-24 17:00 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuval.Mintz
In-Reply-To: <20170424170049.16027-1-manish.chopra@cavium.com>

This patch changes the tunnel APIs to use per tunnel
info instead of using bitmasks for all tunnels and also
uses single struct to hold the data to prepare multiple
variant of tunnel configuration ramrods to be sent to the hardware.

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed.h             |  30 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h     |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  15 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c        |  24 +-
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 327 +++++++++++-----------
 7 files changed, 207 insertions(+), 197 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index c539ba1..8b7b1dd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -149,9 +149,35 @@ enum qed_tunn_clss {
 	QED_TUNN_CLSS_MAC_VNI,
 	QED_TUNN_CLSS_INNER_MAC_VLAN,
 	QED_TUNN_CLSS_INNER_MAC_VNI,
+	QED_TUNN_CLSS_MAC_VLAN_DUAL_STAGE,
 	MAX_QED_TUNN_CLSS,
 };
 
+struct qed_tunn_update_type {
+	bool b_update_mode;
+	bool b_mode_enabled;
+	enum qed_tunn_clss tun_cls;
+};
+
+struct qed_tunn_update_udp_port {
+	bool b_update_port;
+	u16 port;
+};
+
+struct qed_tunnel_info {
+	struct qed_tunn_update_type vxlan;
+	struct qed_tunn_update_type l2_geneve;
+	struct qed_tunn_update_type ip_geneve;
+	struct qed_tunn_update_type l2_gre;
+	struct qed_tunn_update_type ip_gre;
+
+	struct qed_tunn_update_udp_port vxlan_port;
+	struct qed_tunn_update_udp_port geneve_port;
+
+	bool b_update_rx_cls;
+	bool b_update_tx_cls;
+};
+
 struct qed_tunn_start_params {
 	unsigned long	tunn_mode;
 	u16		vxlan_udp_port;
@@ -648,9 +674,7 @@ struct qed_dev {
 	/* SRIOV */
 	struct qed_hw_sriov_info *p_iov_info;
 #define IS_QED_SRIOV(cdev)              (!!(cdev)->p_iov_info)
-
-	unsigned long			tunn_mode;
-
+	struct qed_tunnel_info		tunnel;
 	bool				b_is_vf;
 	u32				drv_type;
 	struct qed_eth_stats		*reset_stats;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 6d24308..13817cc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1453,7 +1453,7 @@ static int qed_hw_init_port(struct qed_hwfn *p_hwfn,
 
 static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 			  struct qed_ptt *p_ptt,
-			  struct qed_tunn_start_params *p_tunn,
+			  struct qed_tunnel_info *p_tunn,
 			  int hw_mode,
 			  bool b_hw_start,
 			  enum qed_int_mode int_mode,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index 341636d..cefe3ee 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -113,7 +113,7 @@ struct qed_drv_load_params {
 
 struct qed_hw_init_params {
 	/* Tunneling parameters */
-	struct qed_tunn_start_params *p_tunn;
+	struct qed_tunnel_info *p_tunn;
 
 	bool b_hw_start;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index eb5e280..03216ba 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2285,21 +2285,21 @@ static int qed_stop_txq(struct qed_dev *cdev, u8 rss_id, void *handle)
 static int qed_tunn_configure(struct qed_dev *cdev,
 			      struct qed_tunn_params *tunn_params)
 {
-	struct qed_tunn_update_params tunn_info;
+	struct qed_tunnel_info tunn_info;
 	int i, rc;
 
 	if (IS_VF(cdev))
 		return 0;
 
 	memset(&tunn_info, 0, sizeof(tunn_info));
-	if (tunn_params->update_vxlan_port == 1) {
-		tunn_info.update_vxlan_udp_port = 1;
-		tunn_info.vxlan_udp_port = tunn_params->vxlan_port;
+	if (tunn_params->update_vxlan_port) {
+		tunn_info.vxlan_port.b_update_port = true;
+		tunn_info.vxlan_port.port = tunn_params->vxlan_port;
 	}
 
-	if (tunn_params->update_geneve_port == 1) {
-		tunn_info.update_geneve_udp_port = 1;
-		tunn_info.geneve_udp_port = tunn_params->geneve_port;
+	if (tunn_params->update_geneve_port) {
+		tunn_info.geneve_port.b_update_port = true;
+		tunn_info.geneve_port.port = tunn_params->geneve_port;
 	}
 
 	for_each_hwfn(cdev, i) {
@@ -2307,7 +2307,6 @@ static int qed_tunn_configure(struct qed_dev *cdev,
 
 		rc = qed_sp_pf_update_tunn_cfg(hwfn, &tunn_info,
 					       QED_SPQ_MODE_EBLOCK, NULL);
-
 		if (rc)
 			return rc;
 	}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index da562cf..a622d75 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -909,8 +909,8 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 {
 	struct qed_drv_load_params drv_load_params;
 	struct qed_hw_init_params hw_init_params;
-	struct qed_tunn_start_params tunn_info;
 	struct qed_mcp_drv_version drv_version;
+	struct qed_tunnel_info tunn_info;
 	const u8 *data = NULL;
 	struct qed_hwfn *hwfn;
 	struct qed_ptt *p_ptt;
@@ -974,19 +974,19 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 		qed_dbg_pf_init(cdev);
 	}
 
-	memset(&tunn_info, 0, sizeof(tunn_info));
-	tunn_info.tunn_mode |=  1 << QED_MODE_VXLAN_TUNN |
-				1 << QED_MODE_L2GRE_TUNN |
-				1 << QED_MODE_IPGRE_TUNN |
-				1 << QED_MODE_L2GENEVE_TUNN |
-				1 << QED_MODE_IPGENEVE_TUNN;
-
-	tunn_info.tunn_clss_vxlan = QED_TUNN_CLSS_MAC_VLAN;
-	tunn_info.tunn_clss_l2gre = QED_TUNN_CLSS_MAC_VLAN;
-	tunn_info.tunn_clss_ipgre = QED_TUNN_CLSS_MAC_VLAN;
-
 	/* Start the slowpath */
 	memset(&hw_init_params, 0, sizeof(hw_init_params));
+	memset(&tunn_info, 0, sizeof(tunn_info));
+	tunn_info.vxlan.b_mode_enabled = true;
+	tunn_info.l2_gre.b_mode_enabled = true;
+	tunn_info.ip_gre.b_mode_enabled = true;
+	tunn_info.l2_geneve.b_mode_enabled = true;
+	tunn_info.ip_geneve.b_mode_enabled = true;
+	tunn_info.vxlan.tun_cls = QED_TUNN_CLSS_MAC_VLAN;
+	tunn_info.l2_gre.tun_cls = QED_TUNN_CLSS_MAC_VLAN;
+	tunn_info.ip_gre.tun_cls = QED_TUNN_CLSS_MAC_VLAN;
+	tunn_info.l2_geneve.tun_cls = QED_TUNN_CLSS_MAC_VLAN;
+	tunn_info.ip_geneve.tun_cls = QED_TUNN_CLSS_MAC_VLAN;
 	hw_init_params.p_tunn = &tunn_info;
 	hw_init_params.b_hw_start = true;
 	hw_init_params.int_mode = cdev->int_params.out.int_mode;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 583c8d3..3357bbe 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -409,7 +409,7 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
  */
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
-		    struct qed_tunn_start_params *p_tunn,
+		    struct qed_tunnel_info *p_tunn,
 		    enum qed_mf_mode mode, bool allow_npar_tx_switch);
 
 /**
@@ -442,7 +442,7 @@ int qed_sp_pf_update(struct qed_hwfn *p_hwfn);
 int qed_sp_pf_stop(struct qed_hwfn *p_hwfn);
 
 int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
-			      struct qed_tunn_update_params *p_tunn,
+			      struct qed_tunnel_info *p_tunn,
 			      enum spq_mode comp_mode,
 			      struct qed_spq_comp_cb *p_comp_data);
 /**
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 6fb80f9..96c6fda 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -111,7 +111,7 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
-static enum tunnel_clss qed_tunn_get_clss_type(u8 type)
+static enum tunnel_clss qed_tunn_clss_to_fw_clss(u8 type)
 {
 	switch (type) {
 	case QED_TUNN_CLSS_MAC_VLAN:
@@ -122,206 +122,201 @@ static enum tunnel_clss qed_tunn_get_clss_type(u8 type)
 		return TUNNEL_CLSS_INNER_MAC_VLAN;
 	case QED_TUNN_CLSS_INNER_MAC_VNI:
 		return TUNNEL_CLSS_INNER_MAC_VNI;
+	case QED_TUNN_CLSS_MAC_VLAN_DUAL_STAGE:
+		return TUNNEL_CLSS_MAC_VLAN_DUAL_STAGE;
 	default:
 		return TUNNEL_CLSS_MAC_VLAN;
 	}
 }
 
 static void
-qed_tunn_set_pf_fix_tunn_mode(struct qed_hwfn *p_hwfn,
-			      struct qed_tunn_update_params *p_src,
-			      struct pf_update_tunnel_config *p_tunn_cfg)
+qed_set_pf_update_tunn_mode(struct qed_tunnel_info *p_tun,
+			    struct qed_tunnel_info *p_src, bool b_pf_start)
 {
-	unsigned long cached_tunn_mode = p_hwfn->cdev->tunn_mode;
-	unsigned long update_mask = p_src->tunn_mode_update_mask;
-	unsigned long tunn_mode = p_src->tunn_mode;
-	unsigned long new_tunn_mode = 0;
-
-	if (test_bit(QED_MODE_L2GRE_TUNN, &update_mask)) {
-		if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
-			__set_bit(QED_MODE_L2GRE_TUNN, &new_tunn_mode);
-	} else {
-		if (test_bit(QED_MODE_L2GRE_TUNN, &cached_tunn_mode))
-			__set_bit(QED_MODE_L2GRE_TUNN, &new_tunn_mode);
-	}
-
-	if (test_bit(QED_MODE_IPGRE_TUNN, &update_mask)) {
-		if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
-			__set_bit(QED_MODE_IPGRE_TUNN, &new_tunn_mode);
-	} else {
-		if (test_bit(QED_MODE_IPGRE_TUNN, &cached_tunn_mode))
-			__set_bit(QED_MODE_IPGRE_TUNN, &new_tunn_mode);
-	}
+	if (p_src->vxlan.b_update_mode || b_pf_start)
+		p_tun->vxlan.b_mode_enabled = p_src->vxlan.b_mode_enabled;
 
-	if (test_bit(QED_MODE_VXLAN_TUNN, &update_mask)) {
-		if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
-			__set_bit(QED_MODE_VXLAN_TUNN, &new_tunn_mode);
-	} else {
-		if (test_bit(QED_MODE_VXLAN_TUNN, &cached_tunn_mode))
-			__set_bit(QED_MODE_VXLAN_TUNN, &new_tunn_mode);
-	}
-
-	if (p_src->update_geneve_udp_port) {
-		p_tunn_cfg->set_geneve_udp_port_flg = 1;
-		p_tunn_cfg->geneve_udp_port =
-				cpu_to_le16(p_src->geneve_udp_port);
-	}
+	if (p_src->l2_gre.b_update_mode || b_pf_start)
+		p_tun->l2_gre.b_mode_enabled = p_src->l2_gre.b_mode_enabled;
 
-	if (test_bit(QED_MODE_L2GENEVE_TUNN, &update_mask)) {
-		if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
-			__set_bit(QED_MODE_L2GENEVE_TUNN, &new_tunn_mode);
-	} else {
-		if (test_bit(QED_MODE_L2GENEVE_TUNN, &cached_tunn_mode))
-			__set_bit(QED_MODE_L2GENEVE_TUNN, &new_tunn_mode);
-	}
+	if (p_src->ip_gre.b_update_mode || b_pf_start)
+		p_tun->ip_gre.b_mode_enabled = p_src->ip_gre.b_mode_enabled;
 
-	if (test_bit(QED_MODE_IPGENEVE_TUNN, &update_mask)) {
-		if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
-			__set_bit(QED_MODE_IPGENEVE_TUNN, &new_tunn_mode);
-	} else {
-		if (test_bit(QED_MODE_IPGENEVE_TUNN, &cached_tunn_mode))
-			__set_bit(QED_MODE_IPGENEVE_TUNN, &new_tunn_mode);
-	}
+	if (p_src->l2_geneve.b_update_mode || b_pf_start)
+		p_tun->l2_geneve.b_mode_enabled =
+		    p_src->l2_geneve.b_mode_enabled;
 
-	p_src->tunn_mode = new_tunn_mode;
+	if (p_src->ip_geneve.b_update_mode || b_pf_start)
+		p_tun->ip_geneve.b_mode_enabled =
+		    p_src->ip_geneve.b_mode_enabled;
 }
 
-static void
-qed_tunn_set_pf_update_params(struct qed_hwfn *p_hwfn,
-			      struct qed_tunn_update_params *p_src,
-			      struct pf_update_tunnel_config *p_tunn_cfg)
+static void qed_set_tunn_cls_info(struct qed_tunnel_info *p_tun,
+				  struct qed_tunnel_info *p_src)
 {
-	unsigned long tunn_mode = p_src->tunn_mode;
 	enum tunnel_clss type;
 
-	qed_tunn_set_pf_fix_tunn_mode(p_hwfn, p_src, p_tunn_cfg);
-	p_tunn_cfg->update_rx_pf_clss = p_src->update_rx_pf_clss;
-	p_tunn_cfg->update_tx_pf_clss = p_src->update_tx_pf_clss;
-
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_vxlan);
-	p_tunn_cfg->tunnel_clss_vxlan  = type;
-
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2gre);
-	p_tunn_cfg->tunnel_clss_l2gre = type;
+	p_tun->b_update_rx_cls = p_src->b_update_rx_cls;
+	p_tun->b_update_tx_cls = p_src->b_update_tx_cls;
+
+	type = qed_tunn_clss_to_fw_clss(p_src->vxlan.tun_cls);
+	p_tun->vxlan.tun_cls = type;
+	type = qed_tunn_clss_to_fw_clss(p_src->l2_gre.tun_cls);
+	p_tun->l2_gre.tun_cls = type;
+	type = qed_tunn_clss_to_fw_clss(p_src->ip_gre.tun_cls);
+	p_tun->ip_gre.tun_cls = type;
+	type = qed_tunn_clss_to_fw_clss(p_src->l2_geneve.tun_cls);
+	p_tun->l2_geneve.tun_cls = type;
+	type = qed_tunn_clss_to_fw_clss(p_src->ip_geneve.tun_cls);
+	p_tun->ip_geneve.tun_cls = type;
+}
 
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgre);
-	p_tunn_cfg->tunnel_clss_ipgre = type;
+static void qed_set_tunn_ports(struct qed_tunnel_info *p_tun,
+			       struct qed_tunnel_info *p_src)
+{
+	p_tun->geneve_port.b_update_port = p_src->geneve_port.b_update_port;
+	p_tun->vxlan_port.b_update_port = p_src->vxlan_port.b_update_port;
 
-	if (p_src->update_vxlan_udp_port) {
-		p_tunn_cfg->set_vxlan_udp_port_flg = 1;
-		p_tunn_cfg->vxlan_udp_port = cpu_to_le16(p_src->vxlan_udp_port);
-	}
+	if (p_src->geneve_port.b_update_port)
+		p_tun->geneve_port.port = p_src->geneve_port.port;
 
-	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_l2gre = 1;
+	if (p_src->vxlan_port.b_update_port)
+		p_tun->vxlan_port.port = p_src->vxlan_port.port;
+}
 
-	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_ipgre = 1;
+static void
+__qed_set_ramrod_tunnel_param(u8 *p_tunn_cls, u8 *p_enable_tx_clas,
+			      struct qed_tunn_update_type *tun_type)
+{
+	*p_tunn_cls = tun_type->tun_cls;
 
-	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_vxlan = 1;
+	if (tun_type->b_mode_enabled)
+		*p_enable_tx_clas = 1;
+}
 
-	if (p_src->update_geneve_udp_port) {
-		p_tunn_cfg->set_geneve_udp_port_flg = 1;
-		p_tunn_cfg->geneve_udp_port =
-				cpu_to_le16(p_src->geneve_udp_port);
+static void
+qed_set_ramrod_tunnel_param(u8 *p_tunn_cls, u8 *p_enable_tx_clas,
+			    struct qed_tunn_update_type *tun_type,
+			    u8 *p_update_port, __le16 *p_port,
+			    struct qed_tunn_update_udp_port *p_udp_port)
+{
+	__qed_set_ramrod_tunnel_param(p_tunn_cls, p_enable_tx_clas, tun_type);
+	if (p_udp_port->b_update_port) {
+		*p_update_port = 1;
+		*p_port = cpu_to_le16(p_udp_port->port);
 	}
+}
 
-	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_l2geneve = 1;
-
-	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_ipgeneve = 1;
-
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2geneve);
-	p_tunn_cfg->tunnel_clss_l2geneve = type;
-
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgeneve);
-	p_tunn_cfg->tunnel_clss_ipgeneve = type;
+static void
+qed_tunn_set_pf_update_params(struct qed_hwfn *p_hwfn,
+			      struct qed_tunnel_info *p_src,
+			      struct pf_update_tunnel_config *p_tunn_cfg)
+{
+	struct qed_tunnel_info *p_tun = &p_hwfn->cdev->tunnel;
+
+	qed_set_pf_update_tunn_mode(p_tun, p_src, false);
+	qed_set_tunn_cls_info(p_tun, p_src);
+	qed_set_tunn_ports(p_tun, p_src);
+
+	qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_vxlan,
+				    &p_tunn_cfg->tx_enable_vxlan,
+				    &p_tun->vxlan,
+				    &p_tunn_cfg->set_vxlan_udp_port_flg,
+				    &p_tunn_cfg->vxlan_udp_port,
+				    &p_tun->vxlan_port);
+
+	qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_l2geneve,
+				    &p_tunn_cfg->tx_enable_l2geneve,
+				    &p_tun->l2_geneve,
+				    &p_tunn_cfg->set_geneve_udp_port_flg,
+				    &p_tunn_cfg->geneve_udp_port,
+				    &p_tun->geneve_port);
+
+	__qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_ipgeneve,
+				      &p_tunn_cfg->tx_enable_ipgeneve,
+				      &p_tun->ip_geneve);
+
+	__qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_l2gre,
+				      &p_tunn_cfg->tx_enable_l2gre,
+				      &p_tun->l2_gre);
+
+	__qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_ipgre,
+				      &p_tunn_cfg->tx_enable_ipgre,
+				      &p_tun->ip_gre);
+
+	p_tunn_cfg->update_rx_pf_clss = p_tun->b_update_rx_cls;
+	p_tunn_cfg->update_tx_pf_clss = p_tun->b_update_tx_cls;
 }
 
 static void qed_set_hw_tunn_mode(struct qed_hwfn *p_hwfn,
 				 struct qed_ptt *p_ptt,
-				 unsigned long tunn_mode)
+				 struct qed_tunnel_info *p_tun)
 {
-	u8 l2gre_enable = 0, ipgre_enable = 0, vxlan_enable = 0;
-	u8 l2geneve_enable = 0, ipgeneve_enable = 0;
-
-	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
-		l2gre_enable = 1;
-
-	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
-		ipgre_enable = 1;
-
-	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
-		vxlan_enable = 1;
+	qed_set_gre_enable(p_hwfn, p_ptt, p_tun->l2_gre.b_mode_enabled,
+			   p_tun->ip_gre.b_mode_enabled);
+	qed_set_vxlan_enable(p_hwfn, p_ptt, p_tun->vxlan.b_mode_enabled);
 
-	qed_set_gre_enable(p_hwfn, p_ptt, l2gre_enable, ipgre_enable);
-	qed_set_vxlan_enable(p_hwfn, p_ptt, vxlan_enable);
+	qed_set_geneve_enable(p_hwfn, p_ptt, p_tun->l2_geneve.b_mode_enabled,
+			      p_tun->ip_geneve.b_mode_enabled);
+}
 
-	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
-		l2geneve_enable = 1;
+static void qed_set_hw_tunn_mode_port(struct qed_hwfn *p_hwfn,
+				      struct qed_tunnel_info *p_tunn)
+{
+	if (p_tunn->vxlan_port.b_update_port)
+		qed_set_vxlan_dest_port(p_hwfn, p_hwfn->p_main_ptt,
+					p_tunn->vxlan_port.port);
 
-	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
-		ipgeneve_enable = 1;
+	if (p_tunn->geneve_port.b_update_port)
+		qed_set_geneve_dest_port(p_hwfn, p_hwfn->p_main_ptt,
+					 p_tunn->geneve_port.port);
 
-	qed_set_geneve_enable(p_hwfn, p_ptt, l2geneve_enable,
-			      ipgeneve_enable);
+	qed_set_hw_tunn_mode(p_hwfn, p_hwfn->p_main_ptt, p_tunn);
 }
 
 static void
 qed_tunn_set_pf_start_params(struct qed_hwfn *p_hwfn,
-			     struct qed_tunn_start_params *p_src,
+			     struct qed_tunnel_info *p_src,
 			     struct pf_start_tunnel_config *p_tunn_cfg)
 {
-	unsigned long tunn_mode;
-	enum tunnel_clss type;
+	struct qed_tunnel_info *p_tun = &p_hwfn->cdev->tunnel;
 
 	if (!p_src)
 		return;
 
-	tunn_mode = p_src->tunn_mode;
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_vxlan);
-	p_tunn_cfg->tunnel_clss_vxlan = type;
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2gre);
-	p_tunn_cfg->tunnel_clss_l2gre = type;
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgre);
-	p_tunn_cfg->tunnel_clss_ipgre = type;
-
-	if (p_src->update_vxlan_udp_port) {
-		p_tunn_cfg->set_vxlan_udp_port_flg = 1;
-		p_tunn_cfg->vxlan_udp_port = cpu_to_le16(p_src->vxlan_udp_port);
-	}
-
-	if (test_bit(QED_MODE_L2GRE_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_l2gre = 1;
-
-	if (test_bit(QED_MODE_IPGRE_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_ipgre = 1;
-
-	if (test_bit(QED_MODE_VXLAN_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_vxlan = 1;
-
-	if (p_src->update_geneve_udp_port) {
-		p_tunn_cfg->set_geneve_udp_port_flg = 1;
-		p_tunn_cfg->geneve_udp_port =
-				cpu_to_le16(p_src->geneve_udp_port);
-	}
-
-	if (test_bit(QED_MODE_L2GENEVE_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_l2geneve = 1;
-
-	if (test_bit(QED_MODE_IPGENEVE_TUNN, &tunn_mode))
-		p_tunn_cfg->tx_enable_ipgeneve = 1;
-
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_l2geneve);
-	p_tunn_cfg->tunnel_clss_l2geneve = type;
-	type = qed_tunn_get_clss_type(p_src->tunn_clss_ipgeneve);
-	p_tunn_cfg->tunnel_clss_ipgeneve = type;
+	qed_set_pf_update_tunn_mode(p_tun, p_src, true);
+	qed_set_tunn_cls_info(p_tun, p_src);
+	qed_set_tunn_ports(p_tun, p_src);
+
+	qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_vxlan,
+				    &p_tunn_cfg->tx_enable_vxlan,
+				    &p_tun->vxlan,
+				    &p_tunn_cfg->set_vxlan_udp_port_flg,
+				    &p_tunn_cfg->vxlan_udp_port,
+				    &p_tun->vxlan_port);
+
+	qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_l2geneve,
+				    &p_tunn_cfg->tx_enable_l2geneve,
+				    &p_tun->l2_geneve,
+				    &p_tunn_cfg->set_geneve_udp_port_flg,
+				    &p_tunn_cfg->geneve_udp_port,
+				    &p_tun->geneve_port);
+
+	__qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_ipgeneve,
+				      &p_tunn_cfg->tx_enable_ipgeneve,
+				      &p_tun->ip_geneve);
+
+	__qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_l2gre,
+				      &p_tunn_cfg->tx_enable_l2gre,
+				      &p_tun->l2_gre);
+
+	__qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_ipgre,
+				      &p_tunn_cfg->tx_enable_ipgre,
+				      &p_tun->ip_gre);
 }
 
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
-		    struct qed_tunn_start_params *p_tunn,
+		    struct qed_tunnel_info *p_tunn,
 		    enum qed_mf_mode mode, bool allow_npar_tx_switch)
 {
 	struct pf_start_ramrod_data *p_ramrod = NULL;
@@ -416,11 +411,8 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 	rc = qed_spq_post(p_hwfn, p_ent, NULL);
 
-	if (p_tunn) {
-		qed_set_hw_tunn_mode(p_hwfn, p_hwfn->p_main_ptt,
-				     p_tunn->tunn_mode);
-		p_hwfn->cdev->tunn_mode = p_tunn->tunn_mode;
-	}
+	if (p_tunn)
+		qed_set_hw_tunn_mode_port(p_hwfn, &p_hwfn->cdev->tunnel);
 
 	return rc;
 }
@@ -451,7 +443,7 @@ int qed_sp_pf_update(struct qed_hwfn *p_hwfn)
 
 /* Set pf update ramrod command params */
 int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
-			      struct qed_tunn_update_params *p_tunn,
+			      struct qed_tunnel_info *p_tunn,
 			      enum spq_mode comp_mode,
 			      struct qed_spq_comp_cb *p_comp_data)
 {
@@ -459,6 +451,9 @@ int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 	struct qed_sp_init_data init_data;
 	int rc = -EINVAL;
 
+	if (!p_tunn)
+		return -EINVAL;
+
 	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
 	init_data.cid = qed_spq_get_cid(p_hwfn);
@@ -479,15 +474,7 @@ int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 	if (rc)
 		return rc;
 
-	if (p_tunn->update_vxlan_udp_port)
-		qed_set_vxlan_dest_port(p_hwfn, p_hwfn->p_main_ptt,
-					p_tunn->vxlan_udp_port);
-	if (p_tunn->update_geneve_udp_port)
-		qed_set_geneve_dest_port(p_hwfn, p_hwfn->p_main_ptt,
-					 p_tunn->geneve_udp_port);
-
-	qed_set_hw_tunn_mode(p_hwfn, p_hwfn->p_main_ptt, p_tunn->tunn_mode);
-	p_hwfn->cdev->tunn_mode = p_tunn->tunn_mode;
+	qed_set_hw_tunn_mode_port(p_hwfn, &p_hwfn->cdev->tunnel);
 
 	return rc;
 }
-- 
2.7.2

^ permalink raw reply related

* [PATCH net-next 2/6] qed/qede: Enable tunnel offloads based on hw configuration
From: Manish Chopra @ 2017-04-24 17:00 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuval.Mintz
In-Reply-To: <20170424170049.16027-1-manish.chopra@cavium.com>

This patch enables tunnel feature offloads based on hw configuration
at initialization time instead of enabling them always.

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c     | 15 +++++++++++
 drivers/net/ethernet/qlogic/qede/qede_filter.c |  6 +++++
 drivers/net/ethernet/qlogic/qede/qede_main.c   | 36 ++++++++++++++++++--------
 include/linux/qed/qed_if.h                     |  5 ++++
 4 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index a622d75..e650281 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -230,10 +230,25 @@ static int qed_init_pci(struct qed_dev *cdev, struct pci_dev *pdev)
 int qed_fill_dev_info(struct qed_dev *cdev,
 		      struct qed_dev_info *dev_info)
 {
+	struct qed_tunnel_info *tun = &cdev->tunnel;
 	struct qed_ptt  *ptt;
 
 	memset(dev_info, 0, sizeof(struct qed_dev_info));
 
+	if (tun->vxlan.tun_cls == QED_TUNN_CLSS_MAC_VLAN &&
+	    tun->vxlan.b_mode_enabled)
+		dev_info->vxlan_enable = true;
+
+	if (tun->l2_gre.b_mode_enabled && tun->ip_gre.b_mode_enabled &&
+	    tun->l2_gre.tun_cls == QED_TUNN_CLSS_MAC_VLAN &&
+	    tun->ip_gre.tun_cls == QED_TUNN_CLSS_MAC_VLAN)
+		dev_info->gre_enable = true;
+
+	if (tun->l2_geneve.b_mode_enabled && tun->ip_geneve.b_mode_enabled &&
+	    tun->l2_geneve.tun_cls == QED_TUNN_CLSS_MAC_VLAN &&
+	    tun->ip_geneve.tun_cls == QED_TUNN_CLSS_MAC_VLAN)
+		dev_info->geneve_enable = true;
+
 	dev_info->num_hwfns = cdev->num_hwfns;
 	dev_info->pci_mem_start = cdev->pci_params.mem_start;
 	dev_info->pci_mem_end = cdev->pci_params.mem_end;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 34473fb..23e0c16 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -887,6 +887,9 @@ void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti)
 
 	switch (ti->type) {
 	case UDP_TUNNEL_TYPE_VXLAN:
+		if (!edev->dev_info.common.vxlan_enable)
+			return;
+
 		if (edev->vxlan_dst_port)
 			return;
 
@@ -898,6 +901,9 @@ void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti)
 		set_bit(QEDE_SP_VXLAN_PORT_CONFIG, &edev->sp_flags);
 		break;
 	case UDP_TUNNEL_TYPE_GENEVE:
+		if (!edev->dev_info.common.geneve_enable)
+			return;
+
 		if (edev->geneve_dst_port)
 			return;
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 02b305c..42f043b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -609,6 +609,7 @@ static void qede_init_ndev(struct qede_dev *edev)
 {
 	struct net_device *ndev = edev->ndev;
 	struct pci_dev *pdev = edev->pdev;
+	bool udp_tunnel_enable = false;
 	netdev_features_t hw_features;
 
 	pci_set_drvdata(pdev, ndev);
@@ -631,20 +632,33 @@ static void qede_init_ndev(struct qede_dev *edev)
 		      NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
 		      NETIF_F_TSO | NETIF_F_TSO6;
 
-	/* Encap features*/
-	hw_features |= NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL |
-		       NETIF_F_TSO_ECN | NETIF_F_GSO_UDP_TUNNEL_CSUM |
-		       NETIF_F_GSO_GRE_CSUM;
-
 	if (!IS_VF(edev) && edev->dev_info.common.num_hwfns == 1)
 		hw_features |= NETIF_F_NTUPLE;
 
-	ndev->hw_enc_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
-				NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO_ECN |
-				NETIF_F_TSO6 | NETIF_F_GSO_GRE |
-				NETIF_F_GSO_UDP_TUNNEL | NETIF_F_RXCSUM |
-				NETIF_F_GSO_UDP_TUNNEL_CSUM |
-				NETIF_F_GSO_GRE_CSUM;
+	if (edev->dev_info.common.vxlan_enable ||
+	    edev->dev_info.common.geneve_enable)
+		udp_tunnel_enable = true;
+
+	if (udp_tunnel_enable || edev->dev_info.common.gre_enable) {
+		hw_features |= NETIF_F_TSO_ECN;
+		ndev->hw_enc_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+					NETIF_F_SG | NETIF_F_TSO |
+					NETIF_F_TSO_ECN | NETIF_F_TSO6 |
+					NETIF_F_RXCSUM;
+	}
+
+	if (udp_tunnel_enable) {
+		hw_features |= (NETIF_F_GSO_UDP_TUNNEL |
+				NETIF_F_GSO_UDP_TUNNEL_CSUM);
+		ndev->hw_enc_features |= (NETIF_F_GSO_UDP_TUNNEL |
+					  NETIF_F_GSO_UDP_TUNNEL_CSUM);
+	}
+
+	if (edev->dev_info.common.gre_enable) {
+		hw_features |= (NETIF_F_GSO_GRE | NETIF_F_GSO_GRE_CSUM);
+		ndev->hw_enc_features |= (NETIF_F_GSO_GRE |
+					  NETIF_F_GSO_GRE_CSUM);
+	}
 
 	ndev->vlan_features = hw_features | NETIF_F_RXHASH | NETIF_F_RXCSUM |
 			      NETIF_F_HIGHDMA;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 9f966be895..5544d7b 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -338,6 +338,11 @@ struct qed_dev_info {
 	bool wol_support;
 
 	enum qed_dev_type dev_type;
+
+	/* Output parameters for qede */
+	bool		vxlan_enable;
+	bool		gre_enable;
+	bool		geneve_enable;
 };
 
 enum qed_sb_type {
-- 
2.7.2

^ permalink raw reply related

* [PATCH net-next 3/6] qede: Disable tunnel offloads for non offloaded UDP ports
From: Manish Chopra @ 2017-04-24 17:00 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuval.Mintz
In-Reply-To: <20170424170049.16027-1-manish.chopra@cavium.com>

This patch disables tunnel offloads via ndo_features_check()
if given UDP port is not offloaded to hardware. This in turn
allows to run multiple tunnel interfaces using different UDP ports.

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede_fp.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 961b1d3..7b6f41d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1697,13 +1697,24 @@ netdev_features_t qede_features_check(struct sk_buff *skb,
 		}
 
 		/* Disable offloads for geneve tunnels, as HW can't parse
-		 * the geneve header which has option length greater than 32B.
+		 * the geneve header which has option length greater than 32b
+		 * and disable offloads for the ports which are not offloaded.
 		 */
-		if ((l4_proto == IPPROTO_UDP) &&
-		    ((skb_inner_mac_header(skb) -
-		      skb_transport_header(skb)) > QEDE_MAX_TUN_HDR_LEN))
-			return features & ~(NETIF_F_CSUM_MASK |
-					    NETIF_F_GSO_MASK);
+		if (l4_proto == IPPROTO_UDP) {
+			struct qede_dev *edev = netdev_priv(dev);
+			u16 hdrlen, vxln_port, gnv_port;
+
+			hdrlen = QEDE_MAX_TUN_HDR_LEN;
+			vxln_port = edev->vxlan_dst_port;
+			gnv_port = edev->geneve_dst_port;
+
+			if ((skb_inner_mac_header(skb) -
+			     skb_transport_header(skb)) > hdrlen ||
+			     (ntohs(udp_hdr(skb)->dest) != vxln_port &&
+			      ntohs(udp_hdr(skb)->dest) != gnv_port))
+				return features & ~(NETIF_F_CSUM_MASK |
+						    NETIF_F_GSO_MASK);
+		}
 	}
 
 	return features;
-- 
2.7.2

^ permalink raw reply related

* [PATCH net-next 6/6] qed - VF tunnelling support [VXLAN/GENEVE/GRE]
From: Manish Chopra @ 2017-04-24 17:00 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuval.Mintz
In-Reply-To: <20170424170049.16027-1-manish.chopra@cavium.com>

This patch adds hardware channel APIs support between
VF and PF for tunnelling configuration for the VFs.
According to that configuration VFs can run VXLAN/GENEVE/GRE
tunnels over it with tunnel features offloaded.

Using these APIs VF can also request for UDP ports configuration
to the PF, although PF and it's child VFs share the same port.

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed.h             |   1 +
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |  15 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |   3 -
 drivers/net/ethernet/qlogic/qed/qed_main.c        |   8 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |   3 +
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       | 217 ++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.c          | 149 +++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h          |  54 ++++++
 8 files changed, 446 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 8b7b1dd..8a8f139 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -718,6 +718,7 @@ struct qed_dev {
 	u32 rdma_max_sge;
 	u32 rdma_max_inline;
 	u32 rdma_max_srq_sge;
+	u16 tunn_feature_mask;
 };
 
 #define NUM_OF_VFS(dev)         (QED_IS_BB(dev) ? MAX_NUM_VFS_BB \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 13817cc..f168b71 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1594,6 +1594,19 @@ qed_fill_load_req_params(struct qed_load_req_params *p_load_req,
 	p_load_req->override_force_load = p_drv_load->override_force_load;
 }
 
+static int qed_vf_start(struct qed_hwfn *p_hwfn,
+			struct qed_hw_init_params *p_params)
+{
+	if (p_params->p_tunn) {
+		qed_vf_set_vf_start_tunn_update_param(p_params->p_tunn);
+		qed_vf_pf_tunnel_param_update(p_hwfn, p_params->p_tunn);
+	}
+
+	p_hwfn->b_int_enabled = 1;
+
+	return 0;
+}
+
 int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
 {
 	struct qed_load_req_params load_req_params;
@@ -1623,7 +1636,7 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
 		}
 
 		if (IS_VF(cdev)) {
-			p_hwfn->b_int_enabled = 1;
+			qed_vf_start(p_hwfn, p_params);
 			continue;
 		}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index b8bd119..746fed4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2288,9 +2288,6 @@ static int qed_tunn_configure(struct qed_dev *cdev,
 	struct qed_tunnel_info tunn_info;
 	int i, rc;
 
-	if (IS_VF(cdev))
-		return 0;
-
 	memset(&tunn_info, 0, sizeof(tunn_info));
 	if (tunn_params->update_vxlan_port) {
 		tunn_info.vxlan_port.b_update_port = true;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index e650281..a919260 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1022,6 +1022,14 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	DP_INFO(cdev,
 		"HW initialization and function start completed successfully\n");
 
+	if (IS_PF(cdev)) {
+		cdev->tunn_feature_mask = (BIT(QED_MODE_VXLAN_TUNN) |
+					   BIT(QED_MODE_L2GENEVE_TUNN) |
+					   BIT(QED_MODE_IPGENEVE_TUNN) |
+					   BIT(QED_MODE_L2GRE_TUNN) |
+					   BIT(QED_MODE_IPGRE_TUNN));
+	}
+
 	/* Allocate LL2 interface if needed */
 	if (QED_LEADING_HWFN(cdev)->using_ll2) {
 		rc = qed_ll2_alloc_if(cdev);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 96c6fda..bc3694e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -451,6 +451,9 @@ int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 	struct qed_sp_init_data init_data;
 	int rc = -EINVAL;
 
+	if (IS_VF(p_hwfn->cdev))
+		return qed_vf_pf_tunnel_param_update(p_hwfn, p_tunn);
+
 	if (!p_tunn)
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 85672f6..d5df29f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -2019,6 +2019,220 @@ static void qed_iov_vf_mbx_start_rxq(struct qed_hwfn *p_hwfn,
 	qed_iov_vf_mbx_start_rxq_resp(p_hwfn, p_ptt, vf, status, b_legacy_vf);
 }
 
+static void
+qed_iov_pf_update_tun_response(struct pfvf_update_tunn_param_tlv *p_resp,
+			       struct qed_tunnel_info *p_tun,
+			       u16 tunn_feature_mask)
+{
+	p_resp->tunn_feature_mask = tunn_feature_mask;
+	p_resp->vxlan_mode = p_tun->vxlan.b_mode_enabled;
+	p_resp->l2geneve_mode = p_tun->l2_geneve.b_mode_enabled;
+	p_resp->ipgeneve_mode = p_tun->ip_geneve.b_mode_enabled;
+	p_resp->l2gre_mode = p_tun->l2_gre.b_mode_enabled;
+	p_resp->ipgre_mode = p_tun->l2_gre.b_mode_enabled;
+	p_resp->vxlan_clss = p_tun->vxlan.tun_cls;
+	p_resp->l2gre_clss = p_tun->l2_gre.tun_cls;
+	p_resp->ipgre_clss = p_tun->ip_gre.tun_cls;
+	p_resp->l2geneve_clss = p_tun->l2_geneve.tun_cls;
+	p_resp->ipgeneve_clss = p_tun->ip_geneve.tun_cls;
+	p_resp->geneve_udp_port = p_tun->geneve_port.port;
+	p_resp->vxlan_udp_port = p_tun->vxlan_port.port;
+}
+
+static void
+__qed_iov_pf_update_tun_param(struct vfpf_update_tunn_param_tlv *p_req,
+			      struct qed_tunn_update_type *p_tun,
+			      enum qed_tunn_mode mask, u8 tun_cls)
+{
+	if (p_req->tun_mode_update_mask & BIT(mask)) {
+		p_tun->b_update_mode = true;
+
+		if (p_req->tunn_mode & BIT(mask))
+			p_tun->b_mode_enabled = true;
+	}
+
+	p_tun->tun_cls = tun_cls;
+}
+
+static void
+qed_iov_pf_update_tun_param(struct vfpf_update_tunn_param_tlv *p_req,
+			    struct qed_tunn_update_type *p_tun,
+			    struct qed_tunn_update_udp_port *p_port,
+			    enum qed_tunn_mode mask,
+			    u8 tun_cls, u8 update_port, u16 port)
+{
+	if (update_port) {
+		p_port->b_update_port = true;
+		p_port->port = port;
+	}
+
+	__qed_iov_pf_update_tun_param(p_req, p_tun, mask, tun_cls);
+}
+
+static bool
+qed_iov_pf_validate_tunn_param(struct vfpf_update_tunn_param_tlv *p_req)
+{
+	bool b_update_requested = false;
+
+	if (p_req->tun_mode_update_mask || p_req->update_tun_cls ||
+	    p_req->update_geneve_port || p_req->update_vxlan_port)
+		b_update_requested = true;
+
+	return b_update_requested;
+}
+
+static void qed_pf_validate_tunn_mode(struct qed_tunn_update_type *tun, int *rc)
+{
+	if (tun->b_update_mode && !tun->b_mode_enabled) {
+		tun->b_update_mode = false;
+		*rc = -EINVAL;
+	}
+}
+
+static int
+qed_pf_validate_modify_tunn_config(struct qed_hwfn *p_hwfn,
+				   u16 *tun_features, bool *update,
+				   struct qed_tunnel_info *tun_src)
+{
+	struct qed_eth_cb_ops *ops = p_hwfn->cdev->protocol_ops.eth;
+	struct qed_tunnel_info *tun = &p_hwfn->cdev->tunnel;
+	u16 bultn_vxlan_port, bultn_geneve_port;
+	void *cookie = p_hwfn->cdev->ops_cookie;
+	int i, rc = 0;
+
+	*tun_features = p_hwfn->cdev->tunn_feature_mask;
+	bultn_vxlan_port = tun->vxlan_port.port;
+	bultn_geneve_port = tun->geneve_port.port;
+	qed_pf_validate_tunn_mode(&tun_src->vxlan, &rc);
+	qed_pf_validate_tunn_mode(&tun_src->l2_geneve, &rc);
+	qed_pf_validate_tunn_mode(&tun_src->ip_geneve, &rc);
+	qed_pf_validate_tunn_mode(&tun_src->l2_gre, &rc);
+	qed_pf_validate_tunn_mode(&tun_src->ip_gre, &rc);
+
+	if ((tun_src->b_update_rx_cls || tun_src->b_update_tx_cls) &&
+	    (tun_src->vxlan.tun_cls != QED_TUNN_CLSS_MAC_VLAN ||
+	     tun_src->l2_geneve.tun_cls != QED_TUNN_CLSS_MAC_VLAN ||
+	     tun_src->ip_geneve.tun_cls != QED_TUNN_CLSS_MAC_VLAN ||
+	     tun_src->l2_gre.tun_cls != QED_TUNN_CLSS_MAC_VLAN ||
+	     tun_src->ip_gre.tun_cls != QED_TUNN_CLSS_MAC_VLAN)) {
+		tun_src->b_update_rx_cls = false;
+		tun_src->b_update_tx_cls = false;
+		rc = -EINVAL;
+	}
+
+	if (tun_src->vxlan_port.b_update_port) {
+		if (tun_src->vxlan_port.port == tun->vxlan_port.port) {
+			tun_src->vxlan_port.b_update_port = false;
+		} else {
+			*update = true;
+			bultn_vxlan_port = tun_src->vxlan_port.port;
+		}
+	}
+
+	if (tun_src->geneve_port.b_update_port) {
+		if (tun_src->geneve_port.port == tun->geneve_port.port) {
+			tun_src->geneve_port.b_update_port = false;
+		} else {
+			*update = true;
+			bultn_geneve_port = tun_src->geneve_port.port;
+		}
+	}
+
+	qed_for_each_vf(p_hwfn, i) {
+		qed_iov_bulletin_set_udp_ports(p_hwfn, i, bultn_vxlan_port,
+					       bultn_geneve_port);
+	}
+
+	qed_schedule_iov(p_hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
+	ops->ports_update(cookie, bultn_vxlan_port, bultn_geneve_port);
+
+	return rc;
+}
+
+static void qed_iov_vf_mbx_update_tunn_param(struct qed_hwfn *p_hwfn,
+					     struct qed_ptt *p_ptt,
+					     struct qed_vf_info *p_vf)
+{
+	struct qed_tunnel_info *p_tun = &p_hwfn->cdev->tunnel;
+	struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx;
+	struct pfvf_update_tunn_param_tlv *p_resp;
+	struct vfpf_update_tunn_param_tlv *p_req;
+	u8 status = PFVF_STATUS_SUCCESS;
+	bool b_update_required = false;
+	struct qed_tunnel_info tunn;
+	u16 tunn_feature_mask = 0;
+	int i, rc = 0;
+
+	mbx->offset = (u8 *)mbx->reply_virt;
+
+	memset(&tunn, 0, sizeof(tunn));
+	p_req = &mbx->req_virt->tunn_param_update;
+
+	if (!qed_iov_pf_validate_tunn_param(p_req)) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "No tunnel update requested by VF\n");
+		status = PFVF_STATUS_FAILURE;
+		goto send_resp;
+	}
+
+	tunn.b_update_rx_cls = p_req->update_tun_cls;
+	tunn.b_update_tx_cls = p_req->update_tun_cls;
+
+	qed_iov_pf_update_tun_param(p_req, &tunn.vxlan, &tunn.vxlan_port,
+				    QED_MODE_VXLAN_TUNN, p_req->vxlan_clss,
+				    p_req->update_vxlan_port,
+				    p_req->vxlan_port);
+	qed_iov_pf_update_tun_param(p_req, &tunn.l2_geneve, &tunn.geneve_port,
+				    QED_MODE_L2GENEVE_TUNN,
+				    p_req->l2geneve_clss,
+				    p_req->update_geneve_port,
+				    p_req->geneve_port);
+	__qed_iov_pf_update_tun_param(p_req, &tunn.ip_geneve,
+				      QED_MODE_IPGENEVE_TUNN,
+				      p_req->ipgeneve_clss);
+	__qed_iov_pf_update_tun_param(p_req, &tunn.l2_gre,
+				      QED_MODE_L2GRE_TUNN, p_req->l2gre_clss);
+	__qed_iov_pf_update_tun_param(p_req, &tunn.ip_gre,
+				      QED_MODE_IPGRE_TUNN, p_req->ipgre_clss);
+
+	/* If PF modifies VF's req then it should
+	 * still return an error in case of partial configuration
+	 * or modified configuration as opposed to requested one.
+	 */
+	rc = qed_pf_validate_modify_tunn_config(p_hwfn, &tunn_feature_mask,
+						&b_update_required, &tunn);
+
+	if (rc)
+		status = PFVF_STATUS_FAILURE;
+
+	/* If QED client is willing to update anything ? */
+	if (b_update_required) {
+		u16 geneve_port;
+
+		rc = qed_sp_pf_update_tunn_cfg(p_hwfn, &tunn,
+					       QED_SPQ_MODE_EBLOCK, NULL);
+		if (rc)
+			status = PFVF_STATUS_FAILURE;
+
+		geneve_port = p_tun->geneve_port.port;
+		qed_for_each_vf(p_hwfn, i) {
+			qed_iov_bulletin_set_udp_ports(p_hwfn, i,
+						       p_tun->vxlan_port.port,
+						       geneve_port);
+		}
+	}
+
+send_resp:
+	p_resp = qed_add_tlv(p_hwfn, &mbx->offset,
+			     CHANNEL_TLV_UPDATE_TUNN_PARAM, sizeof(*p_resp));
+
+	qed_iov_pf_update_tun_response(p_resp, p_tun, tunn_feature_mask);
+	qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END,
+		    sizeof(struct channel_list_end_tlv));
+
+	qed_iov_send_response(p_hwfn, p_ptt, p_vf, sizeof(*p_resp), status);
+}
+
 static void qed_iov_vf_mbx_start_txq_resp(struct qed_hwfn *p_hwfn,
 					  struct qed_ptt *p_ptt,
 					  struct qed_vf_info *p_vf, u8 status)
@@ -3275,6 +3489,9 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 		case CHANNEL_TLV_RELEASE:
 			qed_iov_vf_mbx_release(p_hwfn, p_ptt, p_vf);
 			break;
+		case CHANNEL_TLV_UPDATE_TUNN_PARAM:
+			qed_iov_vf_mbx_update_tunn_param(p_hwfn, p_ptt, p_vf);
+			break;
 		}
 	} else if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) {
 		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index 01cbbe6..c4c4a40 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -418,6 +418,155 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
 #define MSTORM_QZONE_START(dev)   (TSTORM_QZONE_START +	\
 				   (TSTORM_QZONE_SIZE * NUM_OF_L2_QUEUES(dev)))
 
+static void
+__qed_vf_prep_tunn_req_tlv(struct vfpf_update_tunn_param_tlv *p_req,
+			   struct qed_tunn_update_type *p_src,
+			   enum qed_tunn_clss mask, u8 *p_cls)
+{
+	if (p_src->b_update_mode) {
+		p_req->tun_mode_update_mask |= BIT(mask);
+
+		if (p_src->b_mode_enabled)
+			p_req->tunn_mode |= BIT(mask);
+	}
+
+	*p_cls = p_src->tun_cls;
+}
+
+static void
+qed_vf_prep_tunn_req_tlv(struct vfpf_update_tunn_param_tlv *p_req,
+			 struct qed_tunn_update_type *p_src,
+			 enum qed_tunn_clss mask,
+			 u8 *p_cls, struct qed_tunn_update_udp_port *p_port,
+			 u8 *p_update_port, u16 *p_udp_port)
+{
+	if (p_port->b_update_port) {
+		*p_update_port = 1;
+		*p_udp_port = p_port->port;
+	}
+
+	__qed_vf_prep_tunn_req_tlv(p_req, p_src, mask, p_cls);
+}
+
+void qed_vf_set_vf_start_tunn_update_param(struct qed_tunnel_info *p_tun)
+{
+	if (p_tun->vxlan.b_mode_enabled)
+		p_tun->vxlan.b_update_mode = true;
+	if (p_tun->l2_geneve.b_mode_enabled)
+		p_tun->l2_geneve.b_update_mode = true;
+	if (p_tun->ip_geneve.b_mode_enabled)
+		p_tun->ip_geneve.b_update_mode = true;
+	if (p_tun->l2_gre.b_mode_enabled)
+		p_tun->l2_gre.b_update_mode = true;
+	if (p_tun->ip_gre.b_mode_enabled)
+		p_tun->ip_gre.b_update_mode = true;
+
+	p_tun->b_update_rx_cls = true;
+	p_tun->b_update_tx_cls = true;
+}
+
+static void
+__qed_vf_update_tunn_param(struct qed_tunn_update_type *p_tun,
+			   u16 feature_mask, u8 tunn_mode,
+			   u8 tunn_cls, enum qed_tunn_mode val)
+{
+	if (feature_mask & BIT(val)) {
+		p_tun->b_mode_enabled = tunn_mode;
+		p_tun->tun_cls = tunn_cls;
+	} else {
+		p_tun->b_mode_enabled = false;
+	}
+}
+
+static void qed_vf_update_tunn_param(struct qed_hwfn *p_hwfn,
+				     struct qed_tunnel_info *p_tun,
+				     struct pfvf_update_tunn_param_tlv *p_resp)
+{
+	/* Update mode and classes provided by PF */
+	u16 feat_mask = p_resp->tunn_feature_mask;
+
+	__qed_vf_update_tunn_param(&p_tun->vxlan, feat_mask,
+				   p_resp->vxlan_mode, p_resp->vxlan_clss,
+				   QED_MODE_VXLAN_TUNN);
+	__qed_vf_update_tunn_param(&p_tun->l2_geneve, feat_mask,
+				   p_resp->l2geneve_mode,
+				   p_resp->l2geneve_clss,
+				   QED_MODE_L2GENEVE_TUNN);
+	__qed_vf_update_tunn_param(&p_tun->ip_geneve, feat_mask,
+				   p_resp->ipgeneve_mode,
+				   p_resp->ipgeneve_clss,
+				   QED_MODE_IPGENEVE_TUNN);
+	__qed_vf_update_tunn_param(&p_tun->l2_gre, feat_mask,
+				   p_resp->l2gre_mode, p_resp->l2gre_clss,
+				   QED_MODE_L2GRE_TUNN);
+	__qed_vf_update_tunn_param(&p_tun->ip_gre, feat_mask,
+				   p_resp->ipgre_mode, p_resp->ipgre_clss,
+				   QED_MODE_IPGRE_TUNN);
+	p_tun->geneve_port.port = p_resp->geneve_udp_port;
+	p_tun->vxlan_port.port = p_resp->vxlan_udp_port;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+		   "tunn mode: vxlan=0x%x, l2geneve=0x%x, ipgeneve=0x%x, l2gre=0x%x, ipgre=0x%x",
+		   p_tun->vxlan.b_mode_enabled, p_tun->l2_geneve.b_mode_enabled,
+		   p_tun->ip_geneve.b_mode_enabled,
+		   p_tun->l2_gre.b_mode_enabled, p_tun->ip_gre.b_mode_enabled);
+}
+
+int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn,
+				  struct qed_tunnel_info *p_src)
+{
+	struct qed_tunnel_info *p_tun = &p_hwfn->cdev->tunnel;
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct pfvf_update_tunn_param_tlv *p_resp;
+	struct vfpf_update_tunn_param_tlv *p_req;
+	int rc;
+
+	p_req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_UPDATE_TUNN_PARAM,
+			       sizeof(*p_req));
+
+	if (p_src->b_update_rx_cls && p_src->b_update_tx_cls)
+		p_req->update_tun_cls = 1;
+
+	qed_vf_prep_tunn_req_tlv(p_req, &p_src->vxlan, QED_MODE_VXLAN_TUNN,
+				 &p_req->vxlan_clss, &p_src->vxlan_port,
+				 &p_req->update_vxlan_port,
+				 &p_req->vxlan_port);
+	qed_vf_prep_tunn_req_tlv(p_req, &p_src->l2_geneve,
+				 QED_MODE_L2GENEVE_TUNN,
+				 &p_req->l2geneve_clss, &p_src->geneve_port,
+				 &p_req->update_geneve_port,
+				 &p_req->geneve_port);
+	__qed_vf_prep_tunn_req_tlv(p_req, &p_src->ip_geneve,
+				   QED_MODE_IPGENEVE_TUNN,
+				   &p_req->ipgeneve_clss);
+	__qed_vf_prep_tunn_req_tlv(p_req, &p_src->l2_gre,
+				   QED_MODE_L2GRE_TUNN, &p_req->l2gre_clss);
+	__qed_vf_prep_tunn_req_tlv(p_req, &p_src->ip_gre,
+				   QED_MODE_IPGRE_TUNN, &p_req->ipgre_clss);
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset,
+		    CHANNEL_TLV_LIST_END,
+		    sizeof(struct channel_list_end_tlv));
+
+	p_resp = &p_iov->pf2vf_reply->tunn_param_resp;
+	rc = qed_send_msg2pf(p_hwfn, &p_resp->hdr.status, sizeof(*p_resp));
+
+	if (rc)
+		goto exit;
+
+	if (p_resp->hdr.status != PFVF_STATUS_SUCCESS) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Failed to update tunnel parameters\n");
+		rc = -EINVAL;
+	}
+
+	qed_vf_update_tunn_param(p_hwfn, p_tun, p_resp);
+exit:
+	qed_vf_pf_req_end(p_hwfn, rc);
+	return rc;
+}
+
 int
 qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
 		    struct qed_queue_cid *p_cid,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 6cca145..34ac70b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -429,6 +429,43 @@ struct vfpf_ucast_filter_tlv {
 	u16 padding[3];
 };
 
+/* tunnel update param tlv */
+struct vfpf_update_tunn_param_tlv {
+	struct vfpf_first_tlv first_tlv;
+
+	u8 tun_mode_update_mask;
+	u8 tunn_mode;
+	u8 update_tun_cls;
+	u8 vxlan_clss;
+	u8 l2gre_clss;
+	u8 ipgre_clss;
+	u8 l2geneve_clss;
+	u8 ipgeneve_clss;
+	u8 update_geneve_port;
+	u8 update_vxlan_port;
+	u16 geneve_port;
+	u16 vxlan_port;
+	u8 padding[2];
+};
+
+struct pfvf_update_tunn_param_tlv {
+	struct pfvf_tlv hdr;
+
+	u16 tunn_feature_mask;
+	u8 vxlan_mode;
+	u8 l2geneve_mode;
+	u8 ipgeneve_mode;
+	u8 l2gre_mode;
+	u8 ipgre_mode;
+	u8 vxlan_clss;
+	u8 l2gre_clss;
+	u8 ipgre_clss;
+	u8 l2geneve_clss;
+	u8 ipgeneve_clss;
+	u16 vxlan_udp_port;
+	u16 geneve_udp_port;
+};
+
 struct tlv_buffer_size {
 	u8 tlv_buffer[TLV_BUFFER_SIZE];
 };
@@ -444,6 +481,7 @@ union vfpf_tlvs {
 	struct vfpf_vport_start_tlv start_vport;
 	struct vfpf_vport_update_tlv vport_update;
 	struct vfpf_ucast_filter_tlv ucast_filter;
+	struct vfpf_update_tunn_param_tlv tunn_param_update;
 	struct channel_list_end_tlv list_end;
 	struct tlv_buffer_size tlv_buf_size;
 };
@@ -453,6 +491,7 @@ union pfvf_tlvs {
 	struct pfvf_acquire_resp_tlv acquire_resp;
 	struct tlv_buffer_size tlv_buf_size;
 	struct pfvf_start_queue_resp_tlv queue_start;
+	struct pfvf_update_tunn_param_tlv tunn_param_resp;
 };
 
 enum qed_bulletin_bit {
@@ -557,6 +596,7 @@ enum {
 	CHANNEL_TLV_VPORT_UPDATE_RSS,
 	CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN,
 	CHANNEL_TLV_VPORT_UPDATE_SGE_TPA,
+	CHANNEL_TLV_UPDATE_TUNN_PARAM,
 	CHANNEL_TLV_MAX,
 
 	/* Required for iterating over vport-update tlvs.
@@ -874,6 +914,9 @@ void __qed_vf_get_link_caps(struct qed_hwfn *p_hwfn,
 			    struct qed_bulletin_content *p_bulletin);
 
 void qed_iov_vf_task(struct work_struct *work);
+void qed_vf_set_vf_start_tunn_update_param(struct qed_tunnel_info *p_tun);
+int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn,
+				  struct qed_tunnel_info *p_tunn);
 #else
 static inline void qed_vf_get_link_params(struct qed_hwfn *p_hwfn,
 					  struct qed_mcp_link_params *params)
@@ -1035,6 +1078,17 @@ __qed_vf_get_link_caps(struct qed_hwfn *p_hwfn,
 static inline void qed_iov_vf_task(struct work_struct *work)
 {
 }
+
+static inline void
+qed_vf_set_vf_start_tunn_update_param(struct qed_tunnel_info *p_tun)
+{
+}
+
+static inline int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn,
+						struct qed_tunnel_info *p_tunn)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
-- 
2.7.2

^ permalink raw reply related

* [PATCH net-next 5/6] qed/qede: Add UDP ports in bulletin board
From: Manish Chopra @ 2017-04-24 17:00 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuval.Mintz
In-Reply-To: <20170424170049.16027-1-manish.chopra@cavium.com>

This patch adds support for UDP ports in bulletin board
to notify UDP ports change to the VFs

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c       | 19 +++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.c    | 23 +++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h    |  9 +++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.c       | 16 ++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h       |  4 +++-
 drivers/net/ethernet/qlogic/qede/qede.h        |  1 +
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 11 +++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c   |  1 +
 include/linux/qed/qed_eth_if.h                 |  1 +
 9 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 03216ba..b8bd119 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2304,11 +2304,30 @@ static int qed_tunn_configure(struct qed_dev *cdev,
 
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_tunnel_info *tun;
+
+		tun = &hwfn->cdev->tunnel;
 
 		rc = qed_sp_pf_update_tunn_cfg(hwfn, &tunn_info,
 					       QED_SPQ_MODE_EBLOCK, NULL);
 		if (rc)
 			return rc;
+
+		if (IS_PF_SRIOV(hwfn)) {
+			u16 vxlan_port, geneve_port;
+			int j;
+
+			vxlan_port = tun->vxlan_port.port;
+			geneve_port = tun->geneve_port.port;
+
+			qed_for_each_vf(hwfn, j) {
+				qed_iov_bulletin_set_udp_ports(hwfn, j,
+							       vxlan_port,
+							       geneve_port);
+			}
+
+			qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
+		}
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 92a3ee1..85672f6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -3511,6 +3511,29 @@ static void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
 	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
 }
 
+void qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn,
+				    int vfid, u16 vxlan_port, u16 geneve_port)
+{
+	struct qed_vf_info *vf_info;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true);
+	if (!vf_info) {
+		DP_NOTICE(p_hwfn->cdev,
+			  "Can not set udp ports, invalid vfid [%d]\n", vfid);
+		return;
+	}
+
+	if (vf_info->b_malicious) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Can not set udp ports to malicious VF [%d]\n",
+			   vfid);
+		return;
+	}
+
+	vf_info->bulletin.p_virt->vxlan_udp_port = vxlan_port;
+	vf_info->bulletin.p_virt->geneve_udp_port = geneve_port;
+}
+
 static bool qed_iov_vf_has_vport_instance(struct qed_hwfn *p_hwfn, int vfid)
 {
 	struct qed_vf_info *p_vf_info;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 8e96b1d..81a497c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -270,6 +270,9 @@ enum qed_iov_wq_flag {
  */
 u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id);
 
+void qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn,
+				    int vfid, u16 vxlan_port, u16 geneve_port);
+
 /**
  * @brief Read sriov related information and allocated resources
  *  reads from configuraiton space, shmem, etc.
@@ -378,6 +381,12 @@ static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn,
 	return MAX_NUM_VFS;
 }
 
+static inline void
+qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn, int vfid,
+			       u16 vxlan_port, u16 geneve_port)
+{
+}
+
 static inline int qed_iov_hw_info(struct qed_hwfn *p_hwfn)
 {
 	return 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index 7987865..01cbbe6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -1251,6 +1251,18 @@ static bool qed_vf_bulletin_get_forced_mac(struct qed_hwfn *hwfn,
 	return true;
 }
 
+static void
+qed_vf_bulletin_get_udp_ports(struct qed_hwfn *p_hwfn,
+			      u16 *p_vxlan_port, u16 *p_geneve_port)
+{
+	struct qed_bulletin_content *p_bulletin;
+
+	p_bulletin = &p_hwfn->vf_iov_info->bulletin_shadow;
+
+	*p_vxlan_port = p_bulletin->vxlan_udp_port;
+	*p_geneve_port = p_bulletin->geneve_udp_port;
+}
+
 void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
 			   u16 *fw_major, u16 *fw_minor,
 			   u16 *fw_rev, u16 *fw_eng)
@@ -1270,12 +1282,16 @@ static void qed_handle_bulletin_change(struct qed_hwfn *hwfn)
 	struct qed_eth_cb_ops *ops = hwfn->cdev->protocol_ops.eth;
 	u8 mac[ETH_ALEN], is_mac_exist, is_mac_forced;
 	void *cookie = hwfn->cdev->ops_cookie;
+	u16 vxlan_port, geneve_port;
 
+	qed_vf_bulletin_get_udp_ports(hwfn, &vxlan_port, &geneve_port);
 	is_mac_exist = qed_vf_bulletin_get_forced_mac(hwfn, mac,
 						      &is_mac_forced);
 	if (is_mac_exist && cookie)
 		ops->force_mac(cookie, mac, !!is_mac_forced);
 
+	ops->ports_update(cookie, vxlan_port, geneve_port);
+
 	/* Always update link configuration according to bulletin */
 	qed_link_update(hwfn);
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 105c0ed..6cca145 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -513,7 +513,9 @@ struct qed_bulletin_content {
 	u8 partner_rx_flow_ctrl_en;
 	u8 partner_adv_pause;
 	u8 sfp_tx_fault;
-	u8 padding4[6];
+	u16 vxlan_udp_port;
+	u16 geneve_udp_port;
+	u8 padding4[2];
 
 	u32 speed;
 	u32 partner_adv_speed;
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 26699a7..766a79d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -480,6 +480,7 @@ irqreturn_t qede_msix_fp_int(int irq, void *fp_cookie);
 
 /* Filtering function definitions */
 void qede_force_mac(void *dev, u8 *mac, bool forced);
+void qede_udp_ports_update(void *dev, u16 vxlan_port, u16 geneve_port);
 int qede_set_mac_addr(struct net_device *ndev, void *p);
 
 int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 4fa2c88..eb56520 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -480,6 +480,17 @@ int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
 }
 #endif
 
+void qede_udp_ports_update(void *dev, u16 vxlan_port, u16 geneve_port)
+{
+	struct qede_dev *edev = dev;
+
+	if (edev->vxlan_dst_port != vxlan_port)
+		edev->vxlan_dst_port = 0;
+
+	if (edev->geneve_dst_port != geneve_port)
+		edev->geneve_dst_port = 0;
+}
+
 void qede_force_mac(void *dev, u8 *mac, bool forced)
 {
 	struct qede_dev *edev = dev;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index f57c823..292e2dc 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -231,6 +231,7 @@ static struct qed_eth_cb_ops qede_ll_ops = {
 		.link_update = qede_link_update,
 	},
 	.force_mac = qede_force_mac,
+	.ports_update = qede_udp_ports_update,
 };
 
 static int qede_netdev_event(struct notifier_block *this, unsigned long event,
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 1eba803c..15fa7c6 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -158,6 +158,7 @@ struct qed_tunn_params {
 struct qed_eth_cb_ops {
 	struct qed_common_cb_ops common;
 	void (*force_mac) (void *dev, u8 *mac, bool forced);
+	void (*ports_update)(void *dev, u16 vxlan_port, u16 geneve_port);
 };
 
 #define QED_MAX_PHC_DRIFT_PPB   291666666
-- 
2.7.2

^ permalink raw reply related

* [PATCH net-next 4/6] qede: Configure UDP ports in local context.
From: Manish Chopra @ 2017-04-24 17:00 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuval.Mintz
In-Reply-To: <20170424170049.16027-1-manish.chopra@cavium.com>

This patch configures UDP ports locally instead of
configuring them in deferred context which would be
helpful in synchronizing UDP ports configuration for VFs
which will be enabled in further patches.

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h        |  2 -
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 67 ++++++++++++++++++++------
 drivers/net/ethernet/qlogic/qede/qede_main.c   | 19 --------
 3 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 7e18ae6..26699a7 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -442,8 +442,6 @@ struct qede_fastpath {
 #define QEDE_TUNN_CSUM_UNNECESSARY	BIT(2)
 
 #define QEDE_SP_RX_MODE			1
-#define QEDE_SP_VXLAN_PORT_CONFIG	2
-#define QEDE_SP_GENEVE_PORT_CONFIG	3
 
 #ifdef CONFIG_RFS_ACCEL
 int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 23e0c16..4fa2c88 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -883,7 +883,11 @@ int qede_set_features(struct net_device *dev, netdev_features_t features)
 void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti)
 {
 	struct qede_dev *edev = netdev_priv(dev);
+	struct qed_tunn_params tunn_params;
 	u16 t_port = ntohs(ti->port);
+	int rc;
+
+	memset(&tunn_params, 0, sizeof(tunn_params));
 
 	switch (ti->type) {
 	case UDP_TUNNEL_TYPE_VXLAN:
@@ -893,12 +897,22 @@ void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti)
 		if (edev->vxlan_dst_port)
 			return;
 
-		edev->vxlan_dst_port = t_port;
+		tunn_params.update_vxlan_port = 1;
+		tunn_params.vxlan_port = t_port;
 
-		DP_VERBOSE(edev, QED_MSG_DEBUG, "Added vxlan port=%d\n",
-			   t_port);
+		__qede_lock(edev);
+		rc = edev->ops->tunn_config(edev->cdev, &tunn_params);
+		__qede_unlock(edev);
+
+		if (!rc) {
+			edev->vxlan_dst_port = t_port;
+			DP_VERBOSE(edev, QED_MSG_DEBUG, "Added vxlan port=%d\n",
+				   t_port);
+		} else {
+			DP_NOTICE(edev, "Failed to add vxlan UDP port=%d\n",
+				  t_port);
+		}
 
-		set_bit(QEDE_SP_VXLAN_PORT_CONFIG, &edev->sp_flags);
 		break;
 	case UDP_TUNNEL_TYPE_GENEVE:
 		if (!edev->dev_info.common.geneve_enable)
@@ -907,51 +921,74 @@ void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti)
 		if (edev->geneve_dst_port)
 			return;
 
-		edev->geneve_dst_port = t_port;
+		tunn_params.update_geneve_port = 1;
+		tunn_params.geneve_port = t_port;
+
+		__qede_lock(edev);
+		rc = edev->ops->tunn_config(edev->cdev, &tunn_params);
+		__qede_unlock(edev);
+
+		if (!rc) {
+			edev->geneve_dst_port = t_port;
+			DP_VERBOSE(edev, QED_MSG_DEBUG,
+				   "Added geneve port=%d\n", t_port);
+		} else {
+			DP_NOTICE(edev, "Failed to add geneve UDP port=%d\n",
+				  t_port);
+		}
 
-		DP_VERBOSE(edev, QED_MSG_DEBUG, "Added geneve port=%d\n",
-			   t_port);
-		set_bit(QEDE_SP_GENEVE_PORT_CONFIG, &edev->sp_flags);
 		break;
 	default:
 		return;
 	}
-
-	schedule_delayed_work(&edev->sp_task, 0);
 }
 
-void qede_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *ti)
+void qede_udp_tunnel_del(struct net_device *dev,
+			 struct udp_tunnel_info *ti)
 {
 	struct qede_dev *edev = netdev_priv(dev);
+	struct qed_tunn_params tunn_params;
 	u16 t_port = ntohs(ti->port);
 
+	memset(&tunn_params, 0, sizeof(tunn_params));
+
 	switch (ti->type) {
 	case UDP_TUNNEL_TYPE_VXLAN:
 		if (t_port != edev->vxlan_dst_port)
 			return;
 
+		tunn_params.update_vxlan_port = 1;
+		tunn_params.vxlan_port = 0;
+
+		__qede_lock(edev);
+		edev->ops->tunn_config(edev->cdev, &tunn_params);
+		__qede_unlock(edev);
+
 		edev->vxlan_dst_port = 0;
 
 		DP_VERBOSE(edev, QED_MSG_DEBUG, "Deleted vxlan port=%d\n",
 			   t_port);
 
-		set_bit(QEDE_SP_VXLAN_PORT_CONFIG, &edev->sp_flags);
 		break;
 	case UDP_TUNNEL_TYPE_GENEVE:
 		if (t_port != edev->geneve_dst_port)
 			return;
 
+		tunn_params.update_geneve_port = 1;
+		tunn_params.geneve_port = 0;
+
+		__qede_lock(edev);
+		edev->ops->tunn_config(edev->cdev, &tunn_params);
+		__qede_unlock(edev);
+
 		edev->geneve_dst_port = 0;
 
 		DP_VERBOSE(edev, QED_MSG_DEBUG, "Deleted geneve port=%d\n",
 			   t_port);
-		set_bit(QEDE_SP_GENEVE_PORT_CONFIG, &edev->sp_flags);
 		break;
 	default:
 		return;
 	}
-
-	schedule_delayed_work(&edev->sp_task, 0);
 }
 
 static void qede_xdp_reload_func(struct qede_dev *edev,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 42f043b..f57c823 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -796,7 +796,6 @@ static void qede_sp_task(struct work_struct *work)
 {
 	struct qede_dev *edev = container_of(work, struct qede_dev,
 					     sp_task.work);
-	struct qed_dev *cdev = edev->cdev;
 
 	__qede_lock(edev);
 
@@ -804,24 +803,6 @@ static void qede_sp_task(struct work_struct *work)
 		if (edev->state == QEDE_STATE_OPEN)
 			qede_config_rx_mode(edev->ndev);
 
-	if (test_and_clear_bit(QEDE_SP_VXLAN_PORT_CONFIG, &edev->sp_flags)) {
-		struct qed_tunn_params tunn_params;
-
-		memset(&tunn_params, 0, sizeof(tunn_params));
-		tunn_params.update_vxlan_port = 1;
-		tunn_params.vxlan_port = edev->vxlan_dst_port;
-		qed_ops->tunn_config(cdev, &tunn_params);
-	}
-
-	if (test_and_clear_bit(QEDE_SP_GENEVE_PORT_CONFIG, &edev->sp_flags)) {
-		struct qed_tunn_params tunn_params;
-
-		memset(&tunn_params, 0, sizeof(tunn_params));
-		tunn_params.update_geneve_port = 1;
-		tunn_params.geneve_port = edev->geneve_dst_port;
-		qed_ops->tunn_config(cdev, &tunn_params);
-	}
-
 #ifdef CONFIG_RFS_ACCEL
 	if (test_and_clear_bit(QEDE_SP_ARFS_CONFIG, &edev->sp_flags)) {
 		if (edev->state == QEDE_STATE_OPEN)
-- 
2.7.2

^ permalink raw reply related

* Re: [PATCH net] ipv6: move stub initialization after ipv6 setup completion
From: Cong Wang @ 2017-04-24 17:02 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Linux Kernel Network Developers, David S. Miller,
	Hannes Frederic Sowa
In-Reply-To: <fd7c85f650661466a782a71485f227f0b84454a7.1493035843.git.pabeni@redhat.com>

On Mon, Apr 24, 2017 at 5:18 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> The ipv6 stub pointer is currently initialized before the ipv6
> routing subsystem: a 3rd party can access and use such stub
> before the routing data is ready.
> Moreover, such pointer is not cleared in case of initialization
> error, possibly leading to dangling pointers usage.
>
> This change addresses the above moving the stub initialization
> at the end of ipv6 init code.
>
> Fixes: 5f81bd2e5d80 ("ipv6: export a stub for IPv6 symbols used by vxlan")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>

Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

^ permalink raw reply

* Re: [PATCH RFC (resend) net-next 0/6] virtio-net: Add support for virtio-net header extensions
From: Michael S. Tsirkin @ 2017-04-24 17:04 UTC (permalink / raw)
  To: Vlad Yasevich
  Cc: Jason Wang, Vladislav Yasevich, netdev, virtio-dev,
	maxime.coquelin, virtualization
In-Reply-To: <f52fa31e-5ca2-34dc-d48b-417a34061359@redhat.com>

On Thu, Apr 20, 2017 at 11:34:57AM -0400, Vlad Yasevich wrote:
> > - For 1.1, do we really want something like vnet header? AFAIK, it was not used by modern
> > NICs, is this better to pack all meta-data into descriptor itself? This may need a some
> > changes in tun/macvtap, but looks more PCIE friendly.
> 
> That would really be ideal and I've looked at this.

We already have at least 16 unused bits in the used ring
(head is 16 bit we are using 32 for it).

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next v2 2/5] virtio-net: transmit napi
From: Willem de Bruijn @ 2017-04-24 17:05 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, Network Development, virtualization, David Miller,
	Willem de Bruijn
In-Reply-To: <20170424194015-mutt-send-email-mst@kernel.org>

On Mon, Apr 24, 2017 at 12:40 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Fri, Apr 21, 2017 at 10:50:12AM -0400, Willem de Bruijn wrote:
>> >>> Maybe I was wrong, but according to Michael's comment it looks like he
>> >>> want
>> >>> check affinity_hint_set just for speculative tx polling on rx napi
>> >>> instead
>> >>> of disabling it at all.
>> >>>
>> >>> And I'm not convinced this is really needed, driver only provide affinity
>> >>> hint instead of affinity, so it's not guaranteed that tx and rx interrupt
>> >>> are in the same vcpus.
>> >>
>> >> You're right. I made the restriction broader than the request, to really
>> >> err
>> >> on the side of caution for the initial merge of napi tx. And enabling
>> >> the optimization is always a win over keeping it off, even without irq
>> >> affinity.
>> >>
>> >> The cycle cost is significant without affinity regardless of whether the
>> >> optimization is used.
>> >
>> >
>> > Yes, I noticed this in the past too.
>> >
>> >> Though this is not limited to napi-tx, it is more
>> >> pronounced in that mode than without napi.
>> >>
>> >> 1x TCP_RR for affinity configuration {process, rx_irq, tx_irq}:
>> >>
>> >> upstream:
>> >>
>> >> 1,1,1: 28985 Mbps, 278 Gcyc
>> >> 1,0,2: 30067 Mbps, 402 Gcyc
>> >>
>> >> napi tx:
>> >>
>> >> 1,1,1: 34492 Mbps, 269 Gcyc
>> >> 1,0,2: 36527 Mbps, 537 Gcyc (!)
>> >> 1,0,1: 36269 Mbps, 394 Gcyc
>> >> 1,0,0: 34674 Mbps, 402 Gcyc
>> >>
>> >> This is a particularly strong example. It is also representative
>> >> of most RR tests. It is less pronounced in other streaming tests.
>> >> 10x TCP_RR, for instance:
>> >>
>> >> upstream:
>> >>
>> >> 1,1,1: 42267 Mbps, 301 Gcyc
>> >> 1,0,2: 40663 Mbps, 445 Gcyc
>> >>
>> >> napi tx:
>> >>
>> >> 1,1,1: 42420 Mbps, 303 Gcyc
>> >> 1,0,2:  42267 Mbps, 431 Gcyc
>> >>
>> >> These numbers were obtained with the virtqueue_enable_cb_delayed
>> >> optimization after xmit_skb, btw. It turns out that moving that before
>> >> increases 1x TCP_RR further to ~39 Gbps, at the cost of reducing
>> >> 100x TCP_RR a bit.
>> >
>> >
>> > I see, so I think we can leave the affinity hint optimization/check for
>> > future investigation:
>> >
>> > - to avoid endless optimization (e.g we may want to share a single
>> > vector/napi for tx/rx queue pairs in the future) for this series.
>> > - tx napi is disabled by default which means we can do optimization on top.
>>
>> Okay. I'll drop the vi->affinity_hint_set from the patch set for now.
>
> I kind of like it, let's be conservative. But I'd prefer a comment
> near it explaining why it's there.

I don't feel strongly. Was minutes away from sending a v3 with this
code reverted, but I'll reinstate it and add a comment. Other planned
changes based on Jason's feedback to v2:

  v2 -> v3:
    - convert __netif_tx_trylock to __netif_tx_lock on tx napi poll
          ensure that the handler always cleans, to avoid deadlock
    - unconditionally clean in start_xmit
          avoid adding an unnecessary "if (use_napi)" branch
    - remove virtqueue_disable_cb in patch 5/5
          a noop in the common event_idx based loop

^ permalink raw reply

* Re: [PATCH net-next v2 2/5] virtio-net: transmit napi
From: Michael S. Tsirkin @ 2017-04-24 17:14 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Willem de Bruijn, Network Development, David Miller,
	virtualization
In-Reply-To: <CAF=yD-Lv8BptfhV+Many0iaG1Dz+LmkT2VVv5Kgo=TU31PJPHQ@mail.gmail.com>

On Mon, Apr 24, 2017 at 01:05:45PM -0400, Willem de Bruijn wrote:
> On Mon, Apr 24, 2017 at 12:40 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Fri, Apr 21, 2017 at 10:50:12AM -0400, Willem de Bruijn wrote:
> >> >>> Maybe I was wrong, but according to Michael's comment it looks like he
> >> >>> want
> >> >>> check affinity_hint_set just for speculative tx polling on rx napi
> >> >>> instead
> >> >>> of disabling it at all.
> >> >>>
> >> >>> And I'm not convinced this is really needed, driver only provide affinity
> >> >>> hint instead of affinity, so it's not guaranteed that tx and rx interrupt
> >> >>> are in the same vcpus.
> >> >>
> >> >> You're right. I made the restriction broader than the request, to really
> >> >> err
> >> >> on the side of caution for the initial merge of napi tx. And enabling
> >> >> the optimization is always a win over keeping it off, even without irq
> >> >> affinity.
> >> >>
> >> >> The cycle cost is significant without affinity regardless of whether the
> >> >> optimization is used.
> >> >
> >> >
> >> > Yes, I noticed this in the past too.
> >> >
> >> >> Though this is not limited to napi-tx, it is more
> >> >> pronounced in that mode than without napi.
> >> >>
> >> >> 1x TCP_RR for affinity configuration {process, rx_irq, tx_irq}:
> >> >>
> >> >> upstream:
> >> >>
> >> >> 1,1,1: 28985 Mbps, 278 Gcyc
> >> >> 1,0,2: 30067 Mbps, 402 Gcyc
> >> >>
> >> >> napi tx:
> >> >>
> >> >> 1,1,1: 34492 Mbps, 269 Gcyc
> >> >> 1,0,2: 36527 Mbps, 537 Gcyc (!)
> >> >> 1,0,1: 36269 Mbps, 394 Gcyc
> >> >> 1,0,0: 34674 Mbps, 402 Gcyc
> >> >>
> >> >> This is a particularly strong example. It is also representative
> >> >> of most RR tests. It is less pronounced in other streaming tests.
> >> >> 10x TCP_RR, for instance:
> >> >>
> >> >> upstream:
> >> >>
> >> >> 1,1,1: 42267 Mbps, 301 Gcyc
> >> >> 1,0,2: 40663 Mbps, 445 Gcyc
> >> >>
> >> >> napi tx:
> >> >>
> >> >> 1,1,1: 42420 Mbps, 303 Gcyc
> >> >> 1,0,2:  42267 Mbps, 431 Gcyc
> >> >>
> >> >> These numbers were obtained with the virtqueue_enable_cb_delayed
> >> >> optimization after xmit_skb, btw. It turns out that moving that before
> >> >> increases 1x TCP_RR further to ~39 Gbps, at the cost of reducing
> >> >> 100x TCP_RR a bit.
> >> >
> >> >
> >> > I see, so I think we can leave the affinity hint optimization/check for
> >> > future investigation:
> >> >
> >> > - to avoid endless optimization (e.g we may want to share a single
> >> > vector/napi for tx/rx queue pairs in the future) for this series.
> >> > - tx napi is disabled by default which means we can do optimization on top.
> >>
> >> Okay. I'll drop the vi->affinity_hint_set from the patch set for now.
> >
> > I kind of like it, let's be conservative. But I'd prefer a comment
> > near it explaining why it's there.
> 
> I don't feel strongly. Was minutes away from sending a v3 with this
> code reverted, but I'll reinstate it and add a comment. Other planned
> changes based on Jason's feedback to v2:
> 
>   v2 -> v3:
>     - convert __netif_tx_trylock to __netif_tx_lock on tx napi poll
>           ensure that the handler always cleans, to avoid deadlock
>     - unconditionally clean in start_xmit
>           avoid adding an unnecessary "if (use_napi)" branch
>     - remove virtqueue_disable_cb in patch 5/5
>           a noop in the common event_idx based loop

Makes sense, thanks!

-- 
MST

^ permalink raw reply

* Re: cpsw regression in mainline with "cpsw/netcp: cpts depends on posix_timers"
From: Arnd Bergmann @ 2017-04-24 17:35 UTC (permalink / raw)
  To: Tony Lindgren; +Cc: Grygorii Strashko, Networking, linux-omap
In-Reply-To: <20170424165104.GC3780@atomide.com>

On Mon, Apr 24, 2017 at 6:51 PM, Tony Lindgren <tony@atomide.com> wrote:
> Hi,
>
> Looks like commit 07fef3623407 ("cpsw/netcp: cpts depends on posix_timers")
> in mainline started triggering the following oops at least on j5eco-evm.
>
> Adding CONFIG_PTP_1588_CLOCK to .config solves it, but the oops hints
> something is wrong with the dependencies.. CONFIG_TI_CPTS defaults to N
> and not selecting it causes the oops.
>
> Any ideas what's needed to properly fix this?

Does your configuration have POSIX_TIMERS enabled? If not, then CPTS
is now also disabled. There are two issues here that we need to solve:

a) find out why POSIX_TIMERS got disabled, and make sure it's always
    turned on for normal users

b) find out what's wrong with the driver when CPTS is disabled. This would
    be an existing problem that you just never saw because CPTS was
    always enabled so far.

        Arnd

^ permalink raw reply

* qed*: debug infrastructures
From: Elior, Ariel @ 2017-04-24 17:38 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, Mintz, Yuval, Tayar, Tomer, Dupuis, Chad

Hi Dave,

According to the recent messages on the list indicating debugfs is not the way
to go, I am looking for some guidance on what is. dpipe approach was
mentioned as favorable, but I wanted to make sure molding our debug features to
this infrastructure will result in something acceptable. A few points:

1.
One of our HW debug features is a signal recording feature. HW is configured to
output specific signals, which are then continuously dumped into a cyclic
buffer on host. There are ~8000 signals, which can be logically divided to ~100
groups. I believe this can be modeled in dpipe (or similar tool) as a set of
~100 tables with ~80 entries each, and user would be able to see them all and
choose what they like. The output data itself is binary, and meaningless to
"the user". The amount of data is basically as large a contiguous buffer as
driver can allocate, i.e. usually 4MB. When user selects the signals, and sets
meta data regarding to mode of operations, some device configuration will have
to take place. Does that sound reasonable?
This debug feature already exists out of tree for bnx2x and qed* drivers and is
*very* effective in field deployments. I would very much like to see this as an
in-tree feature via some infrastructure or another.

2.
Sometimes we want to debug the probe or removal flow of the driver. ethtool has
the disadvantage of only being available once network device is available. If a
bug stops the load flow before the ethtool debug paths are available, there is
no way to collect a dump. Similarly, removal flows which hit a bug but do remove
the network device, can't be debugged from ethtool. Does dpipe suffer from the
same problem? qed* (like mlx*) has a common-functionality module. This allows
creating debugfs nodes even before the network drivers are probed, providing a
solution for this (debug nodes are also available after network driver removal).
If dpipe does hold an answer here (e.g. provide preconfiguration which would
kick in when network device registers) then we might want to port all of our
register dump logic over there for this advantage. Does that sound reasonable?

3.
Yuval mentioned this, but I wanted to reiterate that the same is necessary for
our storage drivers (qedi/qedf). debugfs does have the advantage of being non
sub-system specific. Is there perhaps another non subsystem specific debug
infrastructure which *is* acceptable to the networking subsystem? My guess is
that the storage drivers will turn to debugfs (in their own subsystem).

Thanks,
Ariel

^ permalink raw reply

* Re: cpsw regression in mainline with "cpsw/netcp: cpts depends on posix_timers"
From: Tony Lindgren @ 2017-04-24 17:44 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: Grygorii Strashko, Networking, linux-omap
In-Reply-To: <CAK8P3a3kvMubq5Mtu3XKnityAgRv5R35dMtTX=BO8Qv_iX9=VQ@mail.gmail.com>

* Arnd Bergmann <arnd@arndb.de> [170424 10:38]:
> On Mon, Apr 24, 2017 at 6:51 PM, Tony Lindgren <tony@atomide.com> wrote:
> > Hi,
> >
> > Looks like commit 07fef3623407 ("cpsw/netcp: cpts depends on posix_timers")
> > in mainline started triggering the following oops at least on j5eco-evm.
> >
> > Adding CONFIG_PTP_1588_CLOCK to .config solves it, but the oops hints
> > something is wrong with the dependencies.. CONFIG_TI_CPTS defaults to N
> > and not selecting it causes the oops.
> >
> > Any ideas what's needed to properly fix this?
> 
> Does your configuration have POSIX_TIMERS enabled? If not, then CPTS
> is now also disabled. There are two issues here that we need to solve:
> 
> a) find out why POSIX_TIMERS got disabled, and make sure it's always
>     turned on for normal users

$ make omap2plus_defconfig
...
CONFIG_POSIX_TIMERS=y
# CONFIG_PTP_1588_CLOCK is not set

So CONFIG_TI_CPTS is unselectable.

> b) find out what's wrong with the driver when CPTS is disabled. This would
>     be an existing problem that you just never saw because CPTS was
>     always enabled so far.

See a) above, yes seems there are two problems here.

Regards,

Tony

^ permalink raw reply

* Re: [PATCH] macsec: avoid heap overflow in skb_to_sgvec
From: David Miller @ 2017-04-24 17:47 UTC (permalink / raw)
  To: Jason; +Cc: netdev, linux-kernel, stable, security
In-Reply-To: <20170421211448.16995-1-Jason@zx2c4.com>

From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 21 Apr 2017 23:14:48 +0200

> While this may appear as a humdrum one line change, it's actually quite
> important. An sk_buff stores data in three places:
> 
> 1. A linear chunk of allocated memory in skb->data. This is the easiest
>    one to work with, but it precludes using scatterdata since the memory
>    must be linear.
> 2. The array skb_shinfo(skb)->frags, which is of maximum length
>    MAX_SKB_FRAGS. This is nice for scattergather, since these fragments
>    can point to different pages.
> 3. skb_shinfo(skb)->frag_list, which is a pointer to another sk_buff,
>    which in turn can have data in either (1) or (2).
> 
> The first two are rather easy to deal with, since they're of a fixed
> maximum length, while the third one is not, since there can be
> potentially limitless chains of fragments. Fortunately dealing with
> frag_list is opt-in for drivers, so drivers don't actually have to deal
> with this mess. For whatever reason, macsec decided it wanted pain, and
> so it explicitly specified NETIF_F_FRAGLIST.
> 
> Because dealing with (1), (2), and (3) is insane, most users of sk_buff
> doing any sort of crypto or paging operation calls a convenient function
> called skb_to_sgvec (which happens to be recursive if (3) is in use!).
> This takes a sk_buff as input, and writes into its output pointer an
> array of scattergather list items. Sometimes people like to declare a
> fixed size scattergather list on the stack; othertimes people like to
> allocate a fixed size scattergather list on the heap. However, if you're
> doing it in a fixed-size fashion, you really shouldn't be using
> NETIF_F_FRAGLIST too (unless you're also ensuring the sk_buff and its
> frag_list children arent't shared and then you check the number of
> fragments in total required.)
> 
> Macsec specifically does this:
> 
>         size += sizeof(struct scatterlist) * (MAX_SKB_FRAGS + 1);
>         tmp = kmalloc(size, GFP_ATOMIC);
>         *sg = (struct scatterlist *)(tmp + sg_offset);
> 	...
>         sg_init_table(sg, MAX_SKB_FRAGS + 1);
>         skb_to_sgvec(skb, sg, 0, skb->len);
> 
> Specifying MAX_SKB_FRAGS + 1 is the right answer usually, but not if you're
> using NETIF_F_FRAGLIST, in which case the call to skb_to_sgvec will
> overflow the heap, and disaster ensues.
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

This is definitely the simplest fix for now, applied, thanks.

^ permalink raw reply

* Re: [PATCH net] udp: disable inner UDP checksum offloads in IPsec case
From: David Miller @ 2017-04-24 17:49 UTC (permalink / raw)
  To: aatteka; +Cc: netdev
In-Reply-To: <1492813385-31736-1-git-send-email-aatteka@ovn.org>

From: Ansis Atteka <aatteka@ovn.org>
Date: Fri, 21 Apr 2017 15:23:05 -0700

> Otherwise, UDP checksum offloads could corrupt ESP packets by attempting
> to calculate UDP checksum when this inner UDP packet is already protected
> by IPsec.
> 
> One way to reproduce this bug is to have a VM with virtio_net driver (UFO
> set to ON in the guest VM); and then encapsulate all guest's Ethernet
> frames in Geneve; and then further encrypt Geneve with IPsec.  In this
> case following symptoms are observed:
> 1. If using ixgbe NIC, then it will complain with following error message:
>    ixgbe 0000:01:00.1: partial checksum but l4 proto=32!
> 2. Receiving IPsec stack will drop all the corrupted ESP packets and
>    increase XfrmInStateProtoError counter in /proc/net/xfrm_stat.
> 3. iperf UDP test from the VM with packet sizes above MTU will not work at
>    all.
> 4. iperf TCP test from the VM will get ridiculously low performance because.
> 
> Signed-off-by: Ansis Atteka <aatteka@ovn.org>
> Co-authored-by: Steffen Klassert <steffen.klassert@secunet.com>

Applied, thanks.

^ permalink raw reply

* [PATCH net-next v3 0/5] virtio-net tx napi
From: Willem de Bruijn @ 2017-04-24 17:49 UTC (permalink / raw)
  To: netdev; +Cc: mst, jasowang, virtualization, davem, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Add napi for virtio-net transmit completion processing.

Changes:
  v2 -> v3:
    - convert __netif_tx_trylock to __netif_tx_lock on tx napi poll
          ensure that the handler always cleans, to avoid deadlock
    - unconditionally clean in start_xmit
          avoid adding an unnecessary "if (use_napi)" branch
    - remove virtqueue_disable_cb in patch 5/5
          a noop in the common event_idx based loop
    - document affinity_hint_set constraint

  v1 -> v2:
    - disable by default
    - disable unless affinity_hint_set
          because cache misses add up to a third higher cycle cost,
	  e.g., in TCP_RR tests. This is not limited to the patch
	  that enables tx completion cleaning in rx napi.
    - use trylock to avoid contention between tx and rx napi
    - keep interrupts masked during xmit_more (new patch 5/5)
          this improves cycles especially for multi UDP_STREAM, which
	  does not benefit from cleaning tx completions on rx napi.
    - move free_old_xmit_skbs (new patch 3/5)
          to avoid forward declaration

    not changed:
    - deduplicate virnet_poll_tx and virtnet_poll_txclean
          they look similar, but have differ too much to make it
	  worthwhile.
    - delay netif_wake_subqueue for more than 2 + MAX_SKB_FRAGS
          evaluated, but made no difference
    - patch 1/5

  RFC -> v1:
    - dropped vhost interrupt moderation patch:
          not needed and likely expensive at light load
    - remove tx napi weight
        - always clean all tx completions
        - use boolean to toggle tx-napi, instead
    - only clean tx in rx if tx-napi is enabled
        - then clean tx before rx
    - fix: add missing braces in virtnet_freeze_down
    - testing: add 4KB TCP_RR + UDP test results

Based on previous patchsets by Jason Wang:

  [RFC V7 PATCH 0/7] enable tx interrupts for virtio-net
  http://lkml.iu.edu/hypermail/linux/kernel/1505.3/00245.html

Before commit b0c39dbdc204 ("virtio_net: don't free buffers in xmit
ring") the virtio-net driver would free transmitted packets on
transmission of new packets in ndo_start_xmit and, to catch the edge
case when no new packet is sent, also in a timer at 10HZ.

A timer can cause long stalls. VIRTIO_F_NOTIFY_ON_EMPTY avoids stalls
due to low free descriptor count. It does not address a stalls due to
low socket SO_SNDBUF. Increasing timer frequency decreases that stall
time, but increases interrupt rate and, thus, cycle count.

Currently, with no timer, packets are freed only at ndo_start_xmit.
Latency of consume_skb is now unbounded. To avoid a deadlock if a sock
reaches SO_SNDBUF, packets are orphaned on tx. This breaks TCP small
queues.

Reenable TCP small queues by removing the orphan. Instead of using a
timer, convert the driver to regular tx napi. This does not have the
unresolved stall issue and does not have any frequency to tune.

By keeping interrupts enabled by default, napi increases tx
interrupt rate. VIRTIO_F_EVENT_IDX avoids sending an interrupt if
one is already unacknowledged, so makes this more feasible today.
Combine that with an optimization that brings interrupt rate
back in line with the existing version for most workloads:

Tx completion cleaning on rx interrupts elides most explicit tx
interrupts by relying on the fact that many rx interrupts fire.

Tested by running {1, 10, 100} {TCP, UDP} STREAM, RR, 4K_RR benchmarks
from a guest to a server on the host, on an x86_64 Haswell. The guest
runs 4 vCPUs pinned to 4 cores. vhost and the test server are
pinned to a core each.

All results are the median of 5 runs, with variance well < 10%.
Used neper (github.com/google/neper) as test process.

Napi increases single stream throughput, but increases cycle cost.
The optimizations bring this down. The previous patchset saw a
regression with UDP_STREAM, which does not benefit from cleaning tx
interrupts in rx napi. This regression is now gone for 10x, 100x.
Remaining difference is higher 1x TCP_STREAM, lower 1x UDP_STREAM.

The latest results are with process, rx napi and tx napi affine to
the same core. All numbers are lower than the previous patchset.

             upstream     napi
TCP_STREAM:
1x:
  Mbps          27816    39805
  Gcycles         274      285

10x:
  Mbps          42947    42531
  Gcycles         300      296

100x:
  Mbps          31830    28042
  Gcycles         279      269

TCP_RR Latency (us):
1x:
  p50              21       21
  p99              27       27
  Gcycles         180      167

10x:
  p50              40       39
  p99              52       52
  Gcycles         214      211

100x:
  p50             281      241
  p99             411      337
  Gcycles         218      226

TCP_RR 4K:
1x:
  p50              28       29
  p99              34       36
  Gcycles         177      167

10x:
  p50              70       71
  p99              85      134
  Gcycles         213      214

100x:
  p50             442      611
  p99             802      785
  Gcycles         237      216

UDP_STREAM:
1x:
  Mbps          29468    26800
  Gcycles         284      293

10x:
  Mbps          29891    29978
  Gcycles         285      312

100x:
  Mbps          30269    30304
  Gcycles         318      316

UDP_RR:
1x:
  p50              19       19
  p99              23       23
  Gcycles         180      173

10x:
  p50              35       40
  p99              54       64
  Gcycles         245      237

100x:
  p50             234      286
  p99             484      473
  Gcycles         224      214

Note that GSO is enabled, so 4K RR still translates to one packet
per request.

Lower throughput at 100x vs 10x can be (at least in part)
explained by looking at bytes per packet sent (nstat). It likely
also explains the lower throughput of 1x for some variants.

upstream:

 N=1   bytes/pkt=16581
 N=10  bytes/pkt=61513
 N=100 bytes/pkt=51558

at_rx:

 N=1   bytes/pkt=65204
 N=10  bytes/pkt=65148
 N=100 bytes/pkt=56840

Willem de Bruijn (5):
  virtio-net: napi helper functions
  virtio-net: transmit napi
  virtio-net: move free_old_xmit_skbs
  virtio-net: clean tx descriptors from rx napi
  virtio-net: keep tx interrupts disabled unless kick

 drivers/net/virtio_net.c | 193 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 132 insertions(+), 61 deletions(-)

-- 
2.12.2.816.g2cccc81164-goog

^ permalink raw reply

* [PATCH net-next v3 1/5] virtio-net: napi helper functions
From: Willem de Bruijn @ 2017-04-24 17:49 UTC (permalink / raw)
  To: netdev; +Cc: mst, jasowang, virtualization, davem, Willem de Bruijn
In-Reply-To: <20170424174930.82623-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Prepare virtio-net for tx napi by converting existing napi code to
use helper functions. This also deduplicates some logic.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 65 ++++++++++++++++++++++++++----------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 666ada6130ab..b9c1df29892c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -239,6 +239,26 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 	return p;
 }
 
+static void virtqueue_napi_schedule(struct napi_struct *napi,
+				    struct virtqueue *vq)
+{
+	if (napi_schedule_prep(napi)) {
+		virtqueue_disable_cb(vq);
+		__napi_schedule(napi);
+	}
+}
+
+static void virtqueue_napi_complete(struct napi_struct *napi,
+				    struct virtqueue *vq, int processed)
+{
+	int opaque;
+
+	opaque = virtqueue_enable_cb_prepare(vq);
+	if (napi_complete_done(napi, processed) &&
+	    unlikely(virtqueue_poll(vq, opaque)))
+		virtqueue_napi_schedule(napi, vq);
+}
+
 static void skb_xmit_done(struct virtqueue *vq)
 {
 	struct virtnet_info *vi = vq->vdev->priv;
@@ -936,27 +956,20 @@ static void skb_recv_done(struct virtqueue *rvq)
 	struct virtnet_info *vi = rvq->vdev->priv;
 	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
 
-	/* Schedule NAPI, Suppress further interrupts if successful. */
-	if (napi_schedule_prep(&rq->napi)) {
-		virtqueue_disable_cb(rvq);
-		__napi_schedule(&rq->napi);
-	}
+	virtqueue_napi_schedule(&rq->napi, rvq);
 }
 
-static void virtnet_napi_enable(struct receive_queue *rq)
+static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
 {
-	napi_enable(&rq->napi);
+	napi_enable(napi);
 
 	/* If all buffers were filled by other side before we napi_enabled, we
-	 * won't get another interrupt, so process any outstanding packets
-	 * now.  virtnet_poll wants re-enable the queue, so we disable here.
-	 * We synchronize against interrupts via NAPI_STATE_SCHED */
-	if (napi_schedule_prep(&rq->napi)) {
-		virtqueue_disable_cb(rq->vq);
-		local_bh_disable();
-		__napi_schedule(&rq->napi);
-		local_bh_enable();
-	}
+	 * won't get another interrupt, so process any outstanding packets now.
+	 * Call local_bh_enable after to trigger softIRQ processing.
+	 */
+	local_bh_disable();
+	virtqueue_napi_schedule(napi, vq);
+	local_bh_enable();
 }
 
 static void refill_work(struct work_struct *work)
@@ -971,7 +984,7 @@ static void refill_work(struct work_struct *work)
 
 		napi_disable(&rq->napi);
 		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
-		virtnet_napi_enable(rq);
+		virtnet_napi_enable(rq->vq, &rq->napi);
 
 		/* In theory, this can happen: if we don't get any buffers in
 		 * we will *never* try to fill again.
@@ -1011,21 +1024,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
 {
 	struct receive_queue *rq =
 		container_of(napi, struct receive_queue, napi);
-	unsigned int r, received;
+	unsigned int received;
 
 	received = virtnet_receive(rq, budget);
 
 	/* Out of packets? */
-	if (received < budget) {
-		r = virtqueue_enable_cb_prepare(rq->vq);
-		if (napi_complete_done(napi, received)) {
-			if (unlikely(virtqueue_poll(rq->vq, r)) &&
-			    napi_schedule_prep(napi)) {
-				virtqueue_disable_cb(rq->vq);
-				__napi_schedule(napi);
-			}
-		}
-	}
+	if (received < budget)
+		virtqueue_napi_complete(napi, rq->vq, received);
 
 	return received;
 }
@@ -1040,7 +1045,7 @@ static int virtnet_open(struct net_device *dev)
 			/* Make sure we have some buffers: if oom use wq. */
 			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
 				schedule_delayed_work(&vi->refill, 0);
-		virtnet_napi_enable(&vi->rq[i]);
+		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
 	}
 
 	return 0;
@@ -1747,7 +1752,7 @@ static int virtnet_restore_up(struct virtio_device *vdev)
 				schedule_delayed_work(&vi->refill, 0);
 
 		for (i = 0; i < vi->max_queue_pairs; i++)
-			virtnet_napi_enable(&vi->rq[i]);
+			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
 	}
 
 	netif_device_attach(vi->dev);
-- 
2.12.2.816.g2cccc81164-goog

^ permalink raw reply related

* [PATCH net-next v3 2/5] virtio-net: transmit napi
From: Willem de Bruijn @ 2017-04-24 17:49 UTC (permalink / raw)
  To: netdev; +Cc: mst, jasowang, virtualization, davem, Willem de Bruijn
In-Reply-To: <20170424174930.82623-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Convert virtio-net to a standard napi tx completion path. This enables
better TCP pacing using TCP small queues and increases single stream
throughput.

The virtio-net driver currently cleans tx descriptors on transmission
of new packets in ndo_start_xmit. Latency depends on new traffic, so
is unbounded. To avoid deadlock when a socket reaches its snd limit,
packets are orphaned on tranmission. This breaks socket backpressure,
including TSQ.

Napi increases the number of interrupts generated compared to the
current model, which keeps interrupts disabled as long as the ring
has enough free descriptors. Keep tx napi optional and disabled for
now. Follow-on patches will reduce the interrupt cost.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 76 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 67 insertions(+), 9 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b9c1df29892c..356d18481ee4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -33,9 +33,10 @@
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
 
-static bool csum = true, gso = true;
+static bool csum = true, gso = true, napi_tx;
 module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
+module_param(napi_tx, bool, 0644);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
@@ -86,6 +87,8 @@ struct send_queue {
 
 	/* Name of the send queue: output.$index */
 	char name[40];
+
+	struct napi_struct napi;
 };
 
 /* Internal representation of a receive virtqueue */
@@ -262,12 +265,16 @@ static void virtqueue_napi_complete(struct napi_struct *napi,
 static void skb_xmit_done(struct virtqueue *vq)
 {
 	struct virtnet_info *vi = vq->vdev->priv;
+	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
 
 	/* Suppress further interrupts. */
 	virtqueue_disable_cb(vq);
 
-	/* We were probably waiting for more output buffers. */
-	netif_wake_subqueue(vi->dev, vq2txq(vq));
+	if (napi->weight)
+		virtqueue_napi_schedule(napi, vq);
+	else
+		/* We were probably waiting for more output buffers. */
+		netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
 static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
@@ -972,6 +979,24 @@ static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
 	local_bh_enable();
 }
 
+static void virtnet_napi_tx_enable(struct virtnet_info *vi,
+				   struct virtqueue *vq,
+				   struct napi_struct *napi)
+{
+	if (!napi->weight)
+		return;
+
+	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
+	 * enable the feature if this is likely affine with the transmit path.
+	 */
+	if (!vi->affinity_hint_set) {
+		napi->weight = 0;
+		return;
+	}
+
+	return virtnet_napi_enable(vq, napi);
+}
+
 static void refill_work(struct work_struct *work)
 {
 	struct virtnet_info *vi =
@@ -1046,6 +1071,7 @@ static int virtnet_open(struct net_device *dev)
 			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
 				schedule_delayed_work(&vi->refill, 0);
 		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
+		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
 	}
 
 	return 0;
@@ -1081,6 +1107,24 @@ static void free_old_xmit_skbs(struct send_queue *sq)
 	u64_stats_update_end(&stats->tx_syncp);
 }
 
+static int virtnet_poll_tx(struct napi_struct *napi, int budget)
+{
+	struct send_queue *sq = container_of(napi, struct send_queue, napi);
+	struct virtnet_info *vi = sq->vq->vdev->priv;
+	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));
+
+	__netif_tx_lock(txq, raw_smp_processor_id());
+	free_old_xmit_skbs(sq);
+	__netif_tx_unlock(txq);
+
+	virtqueue_napi_complete(napi, sq->vq, 0);
+
+	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
+		netif_tx_wake_queue(txq);
+
+	return 0;
+}
+
 static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
@@ -1130,6 +1174,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	int err;
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
 	bool kick = !skb->xmit_more;
+	bool use_napi = sq->napi.weight;
 
 	/* Free up any pending old buffers before queueing new ones. */
 	free_old_xmit_skbs(sq);
@@ -1152,8 +1197,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	/* Don't wait up for transmitted skbs to be freed. */
-	skb_orphan(skb);
-	nf_reset(skb);
+	if (!use_napi) {
+		skb_orphan(skb);
+		nf_reset(skb);
+	}
 
 	/* If running out of space, stop queue to avoid getting packets that we
 	 * are then unable to transmit.
@@ -1167,7 +1214,8 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 */
 	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
 		netif_stop_subqueue(dev, qnum);
-		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
+		if (!use_napi &&
+		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
 			free_old_xmit_skbs(sq);
 			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
@@ -1371,8 +1419,10 @@ static int virtnet_close(struct net_device *dev)
 	/* Make sure refill_work doesn't re-enable napi! */
 	cancel_delayed_work_sync(&vi->refill);
 
-	for (i = 0; i < vi->max_queue_pairs; i++)
+	for (i = 0; i < vi->max_queue_pairs; i++) {
 		napi_disable(&vi->rq[i].napi);
+		napi_disable(&vi->sq[i].napi);
+	}
 
 	return 0;
 }
@@ -1727,8 +1777,10 @@ static void virtnet_freeze_down(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 
 	if (netif_running(vi->dev)) {
-		for (i = 0; i < vi->max_queue_pairs; i++)
+		for (i = 0; i < vi->max_queue_pairs; i++) {
 			napi_disable(&vi->rq[i].napi);
+			napi_disable(&vi->sq[i].napi);
+		}
 	}
 }
 
@@ -1751,8 +1803,11 @@ static int virtnet_restore_up(struct virtio_device *vdev)
 			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
 				schedule_delayed_work(&vi->refill, 0);
 
-		for (i = 0; i < vi->max_queue_pairs; i++)
+		for (i = 0; i < vi->max_queue_pairs; i++) {
 			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
+			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
+					       &vi->sq[i].napi);
+		}
 	}
 
 	netif_device_attach(vi->dev);
@@ -1957,6 +2012,7 @@ static void virtnet_free_queues(struct virtnet_info *vi)
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		napi_hash_del(&vi->rq[i].napi);
 		netif_napi_del(&vi->rq[i].napi);
+		netif_napi_del(&vi->sq[i].napi);
 	}
 
 	/* We called napi_hash_del() before netif_napi_del(),
@@ -2142,6 +2198,8 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 		vi->rq[i].pages = NULL;
 		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
 			       napi_weight);
+		netif_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
+			       napi_tx ? napi_weight : 0);
 
 		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
 		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
-- 
2.12.2.816.g2cccc81164-goog

^ permalink raw reply related

* [PATCH net-next v3 3/5] virtio-net: move free_old_xmit_skbs
From: Willem de Bruijn @ 2017-04-24 17:49 UTC (permalink / raw)
  To: netdev; +Cc: mst, jasowang, virtualization, davem, Willem de Bruijn
In-Reply-To: <20170424174930.82623-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

An upcoming patch will call free_old_xmit_skbs indirectly from
virtnet_poll. Move the function above this to avoid having to
introduce a forward declaration.

This is a pure move: no code changes.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 drivers/net/virtio_net.c | 60 ++++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 356d18481ee4..4ec79e5d7a86 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1045,6 +1045,36 @@ static int virtnet_receive(struct receive_queue *rq, int budget)
 	return received;
 }
 
+static void free_old_xmit_skbs(struct send_queue *sq)
+{
+	struct sk_buff *skb;
+	unsigned int len;
+	struct virtnet_info *vi = sq->vq->vdev->priv;
+	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
+	unsigned int packets = 0;
+	unsigned int bytes = 0;
+
+	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
+		pr_debug("Sent skb %p\n", skb);
+
+		bytes += skb->len;
+		packets++;
+
+		dev_kfree_skb_any(skb);
+	}
+
+	/* Avoid overhead when no packets have been processed
+	 * happens when called speculatively from start_xmit.
+	 */
+	if (!packets)
+		return;
+
+	u64_stats_update_begin(&stats->tx_syncp);
+	stats->tx_bytes += bytes;
+	stats->tx_packets += packets;
+	u64_stats_update_end(&stats->tx_syncp);
+}
+
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
 	struct receive_queue *rq =
@@ -1077,36 +1107,6 @@ static int virtnet_open(struct net_device *dev)
 	return 0;
 }
 
-static void free_old_xmit_skbs(struct send_queue *sq)
-{
-	struct sk_buff *skb;
-	unsigned int len;
-	struct virtnet_info *vi = sq->vq->vdev->priv;
-	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
-	unsigned int packets = 0;
-	unsigned int bytes = 0;
-
-	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		pr_debug("Sent skb %p\n", skb);
-
-		bytes += skb->len;
-		packets++;
-
-		dev_kfree_skb_any(skb);
-	}
-
-	/* Avoid overhead when no packets have been processed
-	 * happens when called speculatively from start_xmit.
-	 */
-	if (!packets)
-		return;
-
-	u64_stats_update_begin(&stats->tx_syncp);
-	stats->tx_bytes += bytes;
-	stats->tx_packets += packets;
-	u64_stats_update_end(&stats->tx_syncp);
-}
-
 static int virtnet_poll_tx(struct napi_struct *napi, int budget)
 {
 	struct send_queue *sq = container_of(napi, struct send_queue, napi);
-- 
2.12.2.816.g2cccc81164-goog

^ permalink raw reply related

* [PATCH net-next v3 4/5] virtio-net: clean tx descriptors from rx napi
From: Willem de Bruijn @ 2017-04-24 17:49 UTC (permalink / raw)
  To: netdev; +Cc: mst, jasowang, virtualization, davem, Willem de Bruijn
In-Reply-To: <20170424174930.82623-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Amortize the cost of virtual interrupts by doing both rx and tx work
on reception of a receive interrupt if tx napi is enabled. With
VIRTIO_F_EVENT_IDX, this suppresses most explicit tx completion
interrupts for bidirectional workloads.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 drivers/net/virtio_net.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4ec79e5d7a86..9dd978f34c1f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1075,12 +1075,33 @@ static void free_old_xmit_skbs(struct send_queue *sq)
 	u64_stats_update_end(&stats->tx_syncp);
 }
 
+static void virtnet_poll_cleantx(struct receive_queue *rq)
+{
+	struct virtnet_info *vi = rq->vq->vdev->priv;
+	unsigned int index = vq2rxq(rq->vq);
+	struct send_queue *sq = &vi->sq[index];
+	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
+
+	if (!sq->napi.weight)
+		return;
+
+	if (__netif_tx_trylock(txq)) {
+		free_old_xmit_skbs(sq);
+		__netif_tx_unlock(txq);
+	}
+
+	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
+		netif_tx_wake_queue(txq);
+}
+
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
 	struct receive_queue *rq =
 		container_of(napi, struct receive_queue, napi);
 	unsigned int received;
 
+	virtnet_poll_cleantx(rq);
+
 	received = virtnet_receive(rq, budget);
 
 	/* Out of packets? */
-- 
2.12.2.816.g2cccc81164-goog

^ permalink raw reply related

* [PATCH net-next v3 5/5] virtio-net: keep tx interrupts disabled unless kick
From: Willem de Bruijn @ 2017-04-24 17:49 UTC (permalink / raw)
  To: netdev; +Cc: mst, jasowang, virtualization, davem, Willem de Bruijn
In-Reply-To: <20170424174930.82623-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Tx napi mode increases the rate of transmit interrupts. Suppress some
by masking interrupts while more packets are expected. The interrupts
will be reenabled before the last packet is sent.

This optimization reduces the througput drop with tx napi for
unidirectional flows such as UDP_STREAM that do not benefit from
cleaning tx completions in the the receive napi handler.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 drivers/net/virtio_net.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9dd978f34c1f..003143835766 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1200,6 +1200,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Free up any pending old buffers before queueing new ones. */
 	free_old_xmit_skbs(sq);
 
+	if (use_napi && kick)
+		virtqueue_enable_cb_delayed(sq->vq);
+
 	/* timestamp packet in software */
 	skb_tx_timestamp(skb);
 
-- 
2.12.2.816.g2cccc81164-goog

^ permalink raw reply related

* PATCH drivers:net:cris/eth_v10: alternate string char arrary
From: Karim Eshapa @ 2017-04-24 17:49 UTC (permalink / raw)
  To: davem
  Cc: felipe.balbi, paul.gortmaker, mugunthanvnm, jarod, fw, netdev,
	linux-kernel, Karim Eshapa

static char pointer creates two variables in final assembly.
static string and pointer to it according to
Jeff Garzik janitors TODO.

Signed-off-by: Karim Eshapa <karim.eshapa@gmail.com>
---
 drivers/net/cris/eth_v10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c
index 91c876a..370765f 100644
--- a/drivers/net/cris/eth_v10.c
+++ b/drivers/net/cris/eth_v10.c
@@ -44,7 +44,7 @@
  * io regions, irqs and dma channels
  */
 
-static const char* cardname = "ETRAX 100LX built-in ethernet controller";
+static const char cardname[] = "ETRAX 100LX built-in ethernet controller";
 
 /* A default ethernet address. Highlevel SW will set the real one later */
 
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v3 07/29] x86: bpf_jit, use ENTRY+ENDPROC
From: Jiri Slaby @ 2017-04-24 17:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, David Miller, mingo, tglx, hpa, x86, jpoimboe,
	linux-kernel, netdev, daniel, edumazet
In-Reply-To: <20170424164717.GA80404@ast-mbp.thefacebook.com>

On 04/24/2017, 06:47 PM, Alexei Starovoitov wrote:
> On Mon, Apr 24, 2017 at 06:02:51PM +0200, Jiri Slaby wrote:
>> On 04/24/2017, 05:55 PM, Ingo Molnar wrote:
>>> * Jiri Slaby <jslaby@suse.cz> wrote:
>>>
>>>> On 04/24/2017, 05:08 PM, David Miller wrote:
>>>>> If you align the entry points, then the code sequence as a whole is
>>>>> are no longer densely packed.
>>>>
>>>> Sure.
>>>>
>>>>> Or do I misunderstand how your macros work?
>>>>
>>>> Perhaps. So the suggested macros for the code are:
>>>> #define BPF_FUNC_START_LOCAL(name) \
>>>> 		SYM_START(name, SYM_V_LOCAL, SYM_A_NONE)
>>>> #define BPF_FUNC_START(name) \
>>>> 		SYM_START(name, SYM_V_GLOBAL, SYM_A_NONE)
>>>>
>>>> and they differ from the standard ones:
>>>> #define SYM_FUNC_START_LOCAL(name)                      \
>>>>         SYM_START(name, SYM_V_LOCAL, SYM_A_ALIGN)
>>>> #define SYM_FUNC_START(name)                            \
>>>>         SYM_START(name, SYM_V_GLOBAL, SYM_A_ALIGN)
>>>>
>>>>
>>>> The difference is SYM_A_NONE vs. SYM_A_ALIGN, which means:
>>>> #define SYM_A_ALIGN                             ALIGN
>>>> #define SYM_A_NONE                              /* nothing */
>>>>
>>>> Does it look OK now?
>>>
>>> No, the patch changes alignment which is undesirable, it needs to preserve the 
>>> existing (non-)alignment of the symbols!
>>
>> OK, so I am not expressing myself explicitly enough, it seems.
>>
>> So, correct, the patch v3 adds alignments. I suggested in the discussion
>> the macros above. They do not add alignments. If everybody is OK with
>> that, v4 of the patch won't add alignments. OK?
> 
> can we go back to what problem this patch set is trying to solve?
> Sounds like you want to add _function_ start/end marks to aid debugging?
> Debugging with what? What tool will recognize this stuff?

objtool will generate DWARF debuginfo between every ENTRY and ENDPROC
(dubbed differently at the end of the series). DWARF is understood by
everything, including the kernel proper (we have DWARF unwinder in
SUSE's kernels available for decades).

> Take a look at what your patch does:
> +ENTRY(sk_load_word)
>         test    %esi,%esi
>         js      bpf_slow_path_word_neg
> +ENDPROC(sk_load_word)
> 
> Does above two assembler instructions look like a function?

Yes, you are right, the code is complete mess.

For example what's the point of making the sk_load_word_positive_offset
label a global, callable function? Note that this is exactly the reason
why this particular two hunks look weird to you even though the
annotations only mechanically paraphrase what is in the current code.

> or this:
> +ENTRY(sk_load_byte_positive_offset)
>         cmp     %esi,%r9d   /* if (offset >= hlen) goto bpf_slow_path_byte */
>         jle     bpf_slow_path_byte
>         movzbl  (SKBDATA,%rsi),%eax
>         ret
> +ENDPROC(sk_load_byte_positive_offset)
> 
> This assembler code doesn't represent functions. There is no prologue/epilogue
> and no stack frame. JITed code uses 'call' insn to jump into them, but they're
> not your typical C functions.

I am not looking for C functions everywhere in assembly, actually. (In
the ideal assembly, I would and in most cases I really am.) But you are
right the annotations of the current code aren't right. It results in my
annotations being wrong too -- based on invalid assumptions.

> Take a look at bpf_slow_path_common() macro that creates the frame before
> calling into C code with 'call skb_copy_bits;'
> 
> I still think that this code should be left alone.
> Even macro names you're proposing:
>  #define BPF_FUNC_START_LOCAL
> don't sound right. These are not functions.

Well, what is the reason to call them FUNC in the current code then?

thanks,
-- 
js
suse labs

^ permalink raw reply

* Re: [PATCH net-next v2 2/5] virtio-net: transmit napi
From: Willem de Bruijn @ 2017-04-24 17:51 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, Network Development, virtualization, David Miller,
	Willem de Bruijn
In-Reply-To: <20170424201346-mutt-send-email-mst@kernel.org>

On Mon, Apr 24, 2017 at 1:14 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Apr 24, 2017 at 01:05:45PM -0400, Willem de Bruijn wrote:
>> On Mon, Apr 24, 2017 at 12:40 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > On Fri, Apr 21, 2017 at 10:50:12AM -0400, Willem de Bruijn wrote:
>> >> >>> Maybe I was wrong, but according to Michael's comment it looks like he
>> >> >>> want
>> >> >>> check affinity_hint_set just for speculative tx polling on rx napi
>> >> >>> instead
>> >> >>> of disabling it at all.
>> >> >>>
>> >> >>> And I'm not convinced this is really needed, driver only provide affinity
>> >> >>> hint instead of affinity, so it's not guaranteed that tx and rx interrupt
>> >> >>> are in the same vcpus.
>> >> >>
>> >> >> You're right. I made the restriction broader than the request, to really
>> >> >> err
>> >> >> on the side of caution for the initial merge of napi tx. And enabling
>> >> >> the optimization is always a win over keeping it off, even without irq
>> >> >> affinity.
>> >> >>
>> >> >> The cycle cost is significant without affinity regardless of whether the
>> >> >> optimization is used.
>> >> >
>> >> >
>> >> > Yes, I noticed this in the past too.
>> >> >
>> >> >> Though this is not limited to napi-tx, it is more
>> >> >> pronounced in that mode than without napi.
>> >> >>
>> >> >> 1x TCP_RR for affinity configuration {process, rx_irq, tx_irq}:
>> >> >>
>> >> >> upstream:
>> >> >>
>> >> >> 1,1,1: 28985 Mbps, 278 Gcyc
>> >> >> 1,0,2: 30067 Mbps, 402 Gcyc
>> >> >>
>> >> >> napi tx:
>> >> >>
>> >> >> 1,1,1: 34492 Mbps, 269 Gcyc
>> >> >> 1,0,2: 36527 Mbps, 537 Gcyc (!)
>> >> >> 1,0,1: 36269 Mbps, 394 Gcyc
>> >> >> 1,0,0: 34674 Mbps, 402 Gcyc
>> >> >>
>> >> >> This is a particularly strong example. It is also representative
>> >> >> of most RR tests. It is less pronounced in other streaming tests.
>> >> >> 10x TCP_RR, for instance:
>> >> >>
>> >> >> upstream:
>> >> >>
>> >> >> 1,1,1: 42267 Mbps, 301 Gcyc
>> >> >> 1,0,2: 40663 Mbps, 445 Gcyc
>> >> >>
>> >> >> napi tx:
>> >> >>
>> >> >> 1,1,1: 42420 Mbps, 303 Gcyc
>> >> >> 1,0,2:  42267 Mbps, 431 Gcyc
>> >> >>
>> >> >> These numbers were obtained with the virtqueue_enable_cb_delayed
>> >> >> optimization after xmit_skb, btw. It turns out that moving that before
>> >> >> increases 1x TCP_RR further to ~39 Gbps, at the cost of reducing
>> >> >> 100x TCP_RR a bit.
>> >> >
>> >> >
>> >> > I see, so I think we can leave the affinity hint optimization/check for
>> >> > future investigation:
>> >> >
>> >> > - to avoid endless optimization (e.g we may want to share a single
>> >> > vector/napi for tx/rx queue pairs in the future) for this series.
>> >> > - tx napi is disabled by default which means we can do optimization on top.
>> >>
>> >> Okay. I'll drop the vi->affinity_hint_set from the patch set for now.
>> >
>> > I kind of like it, let's be conservative. But I'd prefer a comment
>> > near it explaining why it's there.
>>
>> I don't feel strongly. Was minutes away from sending a v3 with this
>> code reverted, but I'll reinstate it and add a comment. Other planned
>> changes based on Jason's feedback to v2:
>>
>>   v2 -> v3:
>>     - convert __netif_tx_trylock to __netif_tx_lock on tx napi poll
>>           ensure that the handler always cleans, to avoid deadlock
>>     - unconditionally clean in start_xmit
>>           avoid adding an unnecessary "if (use_napi)" branch
>>     - remove virtqueue_disable_cb in patch 5/5
>>           a noop in the common event_idx based loop
>
> Makes sense, thanks!

Great. Sent that, thanks.

The actual diff to v2 is quite small:

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b107ae011632..003143835766 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -986,6 +986,9 @@ static void virtnet_napi_tx_enable(struct virtnet_info *vi,
        if (!napi->weight)
                return;

+       /* Tx napi touches cachelines on the cpu handling tx interrupts. Only
+        * enable the feature if this is likely affine with the transmit path.
+        */
        if (!vi->affinity_hint_set) {
                napi->weight = 0;
                return;
@@ -1131,10 +1134,9 @@ static int virtnet_poll_tx(struct napi_struct
*napi, int budget)
        struct virtnet_info *vi = sq->vq->vdev->priv;
        struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));

-       if (__netif_tx_trylock(txq)) {
-               free_old_xmit_skbs(sq);
-               __netif_tx_unlock(txq);
-       }
+       __netif_tx_lock(txq, raw_smp_processor_id());
+       free_old_xmit_skbs(sq);
+       __netif_tx_unlock(txq);

        virtqueue_napi_complete(napi, sq->vq, 0);

@@ -1196,14 +1198,10 @@ static netdev_tx_t start_xmit(struct sk_buff
*skb, struct net_device *dev)
        bool use_napi = sq->napi.weight;

        /* Free up any pending old buffers before queueing new ones. */
-       if (use_napi) {
-               if (kick)
-                       virtqueue_enable_cb_delayed(sq->vq);
-               else
-                       virtqueue_disable_cb(sq->vq);
-       } else {
-               free_old_xmit_skbs(sq);
-       }
+       free_old_xmit_skbs(sq);
+
+       if (use_napi && kick)
+               virtqueue_enable_cb_delayed(sq->vq);

(gmail will munge the identation, sorry)

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox