Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC PATCH 3/7] Enable pause frame support
From: Satha Koteswara Rao @ 2016-12-21  8:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: rvatsavayi, rric, david.daney, netdev, satha.rao, derek.chickles,
	sgoutham, davem, linux-arm-kernel, philip.romanov
In-Reply-To: <1482310011-1862-1-git-send-email-satha.rao@caviumnetworks.com>

---
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 25 +++++++++++++++++++++++
 drivers/net/ethernet/cavium/thunder/thunder_bgx.h |  7 +++++++
 2 files changed, 32 insertions(+)

diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 050e21f..92d7e04 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -121,6 +121,31 @@ static int bgx_poll_reg(struct bgx *bgx, u8 lmac, u64 reg, u64 mask, bool zero)
 	return 1;
 }
 
+/* Configure an SMU LMAC for pause frame generation.
+ *
+ * @node:    node number, used with @bgx_idx to index bgx_vnic[]
+ * @bgx_idx: BGX index within the node
+ * @lmac:    LMAC index within the BGX
+ *
+ * Sets L2P_BP_CONV in BGX()_SMU()_TX_CTL and clears the transmit enables in
+ * BGX()_SMU()_HG2_CONTROL (HG2TX_EN) and BGX()_SMU()_CBFC_CTL (TX_EN).
+ * Each register is written only when its current value actually differs.
+ * NOTE(review): presumably the two cleared enables select flow-control
+ * schemes that conflict with plain pause frames - confirm against the HRM.
+ */
+void enable_pause_frames(int node, int bgx_idx, int lmac)
+{
+	struct bgx *bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx];
+	u64 reg_value;
+
+	/* Guard against an unpopulated bgx_vnic[] slot before touching HW */
+	if (!bgx)
+		return;
+
+	/* Set BGX()_SMU()_TX_CTL[L2P_BP_CONV] */
+	reg_value = bgx_reg_read(bgx, lmac, BGX_SMUX_TX_CTL);
+	if (!(reg_value & L2P_BP_CONV))
+		bgx_reg_write(bgx, lmac, BGX_SMUX_TX_CTL,
+			      reg_value | L2P_BP_CONV);
+
+	/* Clear BGX()_SMU()_HG2_CONTROL[HG2TX_EN] if it is set */
+	reg_value = bgx_reg_read(bgx, lmac, BGX_SMUX_HG2_CTL);
+	if (reg_value & SMUX_HG2_CTL_HG2TX_EN)
+		bgx_reg_write(bgx, lmac, BGX_SMUX_HG2_CTL,
+			      reg_value & ~SMUX_HG2_CTL_HG2TX_EN);
+
+	/* Clear BGX()_SMU()_CBFC_CTL[TX_EN] if it is set */
+	reg_value = bgx_reg_read(bgx, lmac, BGX_SMUX_CBFC_CTL);
+	if (reg_value & CBFC_CTL_TX_EN)
+		bgx_reg_write(bgx, lmac, BGX_SMUX_CBFC_CTL,
+			      reg_value & ~CBFC_CTL_TX_EN);
+}
+EXPORT_SYMBOL(enable_pause_frames);
+
 /* Return number of BGX present in HW */
 unsigned bgx_get_map(int node)
 {
diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h
index 01cc7c8..5b57bd1 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h
@@ -131,6 +131,11 @@
 #define BGX_SMUX_TX_CTL			0x20178
 #define  SMU_TX_CTL_DIC_EN			BIT_ULL(0)
 #define  SMU_TX_CTL_UNI_EN			BIT_ULL(1)
+#define  L2P_BP_CONV				BIT_ULL(7)
+/* Registers below are cleared of their TX enable by enable_pause_frames() */
+#define BGX_SMUX_CBFC_CTL		0x20218
+#define  CBFC_CTL_TX_EN				BIT_ULL(1)
+#define BGX_SMUX_HG2_CTL		0x20210
+#define  SMUX_HG2_CTL_HG2TX_EN			BIT_ULL(18)
 #define  SMU_TX_CTL_LNK_STATUS			(3ull << 4)
 #define BGX_SMUX_TX_THRESH		0x20180
 #define BGX_SMUX_CTL			0x20200
@@ -212,6 +217,8 @@ void bgx_lmac_internal_loopback(int node, int bgx_idx,
 
 u64 bgx_get_rx_stats(int node, int bgx_idx, int lmac, int idx);
 u64 bgx_get_tx_stats(int node, int bgx_idx, int lmac, int idx);
+void enable_pause_frames(int node, int bgx_idx, int lmac);
+
 #define BGX_RX_STATS_COUNT 11
 #define BGX_TX_STATS_COUNT 18
 
-- 
1.8.3.1

^ permalink raw reply related

* [RFC PATCH 2/7] VF driver changes to enable hooks to get kernel notifications
From: Satha Koteswara Rao @ 2016-12-21  8:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: rvatsavayi, rric, david.daney, netdev, satha.rao, derek.chickles,
	sgoutham, davem, linux-arm-kernel, philip.romanov
In-Reply-To: <1482310011-1862-1-git-send-email-satha.rao@caviumnetworks.com>

---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 579 ++++++++++++++++++++++-
 1 file changed, 565 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 8a37012..8f00bc7 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -52,6 +52,11 @@
 MODULE_VERSION(DRV_VERSION);
 MODULE_DEVICE_TABLE(pci, nicvf_id_table);
 
+/* Latched from nic_cfg.veb_enabled in the PF's NIC_MBOX_MSG_READY reply.
+ * Once set it is never cleared, and it is shared by every VF this module
+ * drives.
+ */
+static int veb_enabled;
+
+/* Selects how UC/MC list changes are reported to the PF once a shadow list
+ * exists: 0 sends one add/delete message per changed address, non-zero
+ * resends every address in the netdev list after any change.
+ */
+int uc_mc_list;
+module_param(uc_mc_list, int, 0644);
+MODULE_PARM_DESC(uc_mc_list,
+		 "UC/MC address updates to PF (0=per-address add/delete, 1=resend full list)");
+
+
 static int debug = 0x00;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "Debug message level bitmap");
@@ -61,6 +66,132 @@
 MODULE_PARM_DESC(cpi_alg,
 		 "PFC algorithm (0=none, 1=VLAN, 2=VLAN16, 3=IP Diffserv)");
 
+/* Put a shadow address list into its empty state: zero count, empty head */
+void nicvf_shadow_list_init(struct netdev_hw_addr_list *list)
+{
+	list->count = 0;
+	INIT_LIST_HEAD(&list->list);
+}
+
+/* Store @sync in the synced field of every entry on the list */
+void nicvf_shadow_list_setsync(struct netdev_hw_addr_list *list, int sync)
+{
+	struct netdev_hw_addr *entry;
+
+	/* Nothing is unlinked here, so the plain iterator is sufficient */
+	list_for_each_entry(entry, &list->list, list)
+		entry->synced = sync;
+}
+
+/* Unlink and free every entry, then reset the entry count to zero */
+void nicvf_shadow_list_flush(struct netdev_hw_addr_list *list)
+{
+	struct netdev_hw_addr *entry;
+
+	while (!list_empty(&list->list)) {
+		entry = list_first_entry(&list->list, struct netdev_hw_addr,
+					 list);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	list->count = 0;
+}
+
+/* Return the entry count recorded in the list header (list->count) */
+int nicvf_shadow_list_count(struct netdev_hw_addr_list *list)
+{
+	return list->count;
+}
+
+/* Return 1 when the recorded entry count is zero, 0 otherwise */
+int nicvf_shadow_list_empty(struct netdev_hw_addr_list *list)
+{
+	return !list->count;
+}
+
+/* Append a copy of @addr to the tail of the shadow list.
+ *
+ * The new entry starts with synced == 0. The allocation uses GFP_ATOMIC.
+ * No duplicate check is made; callers look the address up first.
+ *
+ * Returns 0 on success, -ENOMEM if the entry cannot be allocated.
+ */
+int nicvf_shadow_list_add(struct netdev_hw_addr_list *list, unsigned char *addr)
+{
+	struct netdev_hw_addr *ha;
+
+	/* Zeroed allocation: synced starts at 0 and the netdev_hw_addr
+	 * fields this code never sets do not hold stale heap contents.
+	 */
+	ha = kzalloc(sizeof(*ha), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+
+	ether_addr_copy(ha->addr, addr);
+	list_add_tail(&ha->list, &list->list);
+	list->count++;
+	return 0;
+}
+
+/* Remove one specific entry: unlink @ha from @list, free it and decrement
+ * the entry count. @ha must not be used after this returns.
+ */
+void nicvf_shadow_list_del_ha(struct netdev_hw_addr_list *list,
+			      struct netdev_hw_addr *ha)
+{
+	list->count--;
+	list_del(&ha->list);
+	kfree(ha);
+}
+
+/* Remove every entry whose address equals @addr.
+ *
+ * Returns 0 if at least one entry was removed, -ENOENT if none matched.
+ */
+int nicvf_shadow_list_del(struct netdev_hw_addr_list *list, unsigned char *addr)
+{
+	struct netdev_hw_addr *ha, *tmp;
+	int ret = -ENOENT;
+
+	/* _safe iterator: matching entries are freed while walking */
+	list_for_each_entry_safe(ha, tmp, &list->list, list) {
+		if (!ether_addr_equal(ha->addr, addr))
+			continue;
+		nicvf_shadow_list_del_ha(list, ha);
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/* Drop the shadow entries whose synced field is still 1.
+ *
+ * For each such entry, unless the uc_mc_list module parameter is set, a
+ * NIC_MBOX_MSG_UC_MC delete (is_add = 0, is_flush = 0) carrying the entry's
+ * address and @addr_type is sent to the PF first; a failed send is only
+ * logged. The entry is then unlinked and freed.
+ *
+ * Returns 1 if at least one entry was removed, 0 otherwise.
+ */
+int nicvf_shadow_list_delsync(struct netdev_hw_addr_list *list,
+			      struct nicvf *nic, int addr_type)
+{
+	struct netdev_hw_addr *entry, *next;
+	union nic_mbx mbx = {};
+	int removed = 0;
+
+	list_for_each_entry_safe(entry, next, &list->list, list) {
+		if (entry->synced != 1)
+			continue;
+
+		if (!uc_mc_list) {
+			mbx.msg.msg = NIC_MBOX_MSG_UC_MC;
+			mbx.uc_mc_cfg.vf_id = nic->vf_id;
+			mbx.uc_mc_cfg.addr_type = addr_type;
+			mbx.uc_mc_cfg.is_flush = 0;
+			mbx.uc_mc_cfg.is_add = 0;
+			ether_addr_copy(mbx.uc_mc_cfg.mac_addr, entry->addr);
+			if (nicvf_send_msg_to_pf(nic, &mbx))
+				netdev_err(nic->netdev,
+					   "PF not respond to MSG_UC_MC\n");
+		}
+		removed = 1;
+		nicvf_shadow_list_del_ha(list, entry);
+	}
+	return removed;
+}
+
+/* Look @addr up in the shadow list.
+ *
+ * On the first match the entry's synced field is cleared and 0 is returned.
+ * If no entry matches, -ENOENT is returned and nothing is modified.
+ */
+int nicvf_shadow_list_find(struct netdev_hw_addr_list *list,
+			   unsigned char *addr)
+{
+	struct netdev_hw_addr *entry;
+
+	list_for_each_entry(entry, &list->list, list) {
+		if (!ether_addr_equal(entry->addr, addr))
+			continue;
+		entry->synced = 0;
+		return 0;
+	}
+	return -ENOENT;
+}
+
 static inline u8 nicvf_netdev_qidx(struct nicvf *nic, u8 qidx)
 {
 	if (nic->sqs_mode)
@@ -113,22 +244,198 @@ static void nicvf_write_to_mbx(struct nicvf *nic, union nic_mbx *mbx)
 	nicvf_reg_write(nic, NIC_VF_PF_MAILBOX_0_1 + 8, msg[1]);
 }
 
+/* Tell whether the sender has to wait for the PF's reply to @mbx.
+ *
+ * No wait is needed for NIC_MBOX_MSG_PROMISC, nor for any message while
+ * nic->wait_for_ack is false.
+ */
+bool pf_ack_required(struct nicvf *nic, union nic_mbx *mbx)
+{
+	return nic->wait_for_ack && mbx->msg.msg != NIC_MBOX_MSG_PROMISC;
+}
+
+/* Build a NIC_MBOX_MSG_UC_MC request and send it to the PF.
+ *
+ * @vf:        VF id placed in the message
+ * @flush:     non-zero sends a flush (is_flush = 1, is_add = 0); zero sends
+ *             an add (is_flush = 0, is_add = 1)
+ * @addr_type: copied to uc_mc_cfg.addr_type (callers in this file pass 0
+ *             for unicast and 1 for multicast)
+ * @mac:       address to copy into the message, or NULL to leave it zeroed
+ *
+ * Only a timeout (-EBUSY) is logged here; no error is returned.
+ */
+void submit_uc_mc_mbox_msg(struct nicvf *nic, int vf, int flush, int addr_type,
+			   u8 *mac)
+{
+	union nic_mbx mbx = {};
+
+	mbx.msg.msg = NIC_MBOX_MSG_UC_MC;
+	mbx.uc_mc_cfg.vf_id = vf;
+	mbx.uc_mc_cfg.addr_type = addr_type;
+	mbx.uc_mc_cfg.is_flush = flush;
+	mbx.uc_mc_cfg.is_add = !flush;
+	if (mac)
+		ether_addr_copy(mbx.uc_mc_cfg.mac_addr, mac);
+
+	/* Name the operation that actually timed out, not always "flush" */
+	if (nicvf_send_msg_to_pf(nic, &mbx) == -EBUSY)
+		netdev_err(nic->netdev,
+			   "PF didn't respond to MSG_UC_MC %s\n",
+			   flush ? "flush" : "add");
+}
+
+/* Bring one shadow address list in line with the matching netdev list and
+ * tell the PF what changed.
+ *
+ * @dev_list:  the netdev's current list (&netdev->uc or &netdev->mc)
+ * @shadow:    driver copy of the addresses already reported to the PF
+ * @addr_type: value for uc_mc_cfg.addr_type (0 = unicast, 1 = multicast)
+ *
+ * NOTE(review): @dev_list is walked without netif_addr_lock held, and the
+ * mailbox calls made during the walk sleep, so the lock cannot simply be
+ * wrapped around it. Confirm the list cannot change under this work item.
+ * NOTE(review): a failed nicvf_shadow_list_add() (-ENOMEM) is ignored, as
+ * it was before this helper was split out.
+ */
+static void nicvf_sync_addr_list(struct nicvf *nic,
+				 struct netdev_hw_addr_list *dev_list,
+				 struct netdev_hw_addr_list *shadow,
+				 int addr_type)
+{
+	struct netdev_hw_addr *ha;
+	int added = 0;
+	int removed;
+
+	if (netdev_hw_addr_list_empty(dev_list)) {
+		/* Netdev list is empty: drop the shadow copy and ask the PF
+		 * to flush, but only if something had been reported before.
+		 */
+		if (!nicvf_shadow_list_empty(shadow)) {
+			nicvf_shadow_list_flush(shadow);
+			submit_uc_mc_mbox_msg(nic, nic->vf_id, 1, addr_type,
+					      NULL);
+		}
+		return;
+	}
+
+	if (nicvf_shadow_list_empty(shadow)) {
+		/* Nothing reported yet: mirror and announce every address */
+		netdev_hw_addr_list_for_each(ha, dev_list) {
+			nicvf_shadow_list_add(shadow, ha->addr);
+			submit_uc_mc_mbox_msg(nic, nic->vf_id, 0, addr_type,
+					      ha->addr);
+		}
+		return;
+	}
+
+	/* Mark every shadow entry; nicvf_shadow_list_find() clears the mark
+	 * on entries that are still present in the netdev list.
+	 */
+	nicvf_shadow_list_setsync(shadow, 1);
+
+	/* Add the addresses present in the netdev list but not yet shadowed */
+	netdev_hw_addr_list_for_each(ha, dev_list) {
+		if (!nicvf_shadow_list_find(shadow, ha->addr))
+			continue;
+		added = 1;
+		nicvf_shadow_list_add(shadow, ha->addr);
+		if (!uc_mc_list)
+			submit_uc_mc_mbox_msg(nic, nic->vf_id, 0, addr_type,
+					      ha->addr);
+	}
+
+	/* Remove the shadow entries that are still marked, i.e. no longer in
+	 * the netdev list.
+	 */
+	removed = nicvf_shadow_list_delsync(shadow, nic, addr_type);
+
+	/* In full-list mode, resend every address once anything changed */
+	if (uc_mc_list && (added || removed)) {
+		netdev_hw_addr_list_for_each(ha, dev_list)
+			submit_uc_mc_mbox_msg(nic, nic->vf_id, 0, addr_type,
+					      ha->addr);
+	}
+}
+
+/* Delayed-work handler for nic->dwork.
+ *
+ * If nic->send_op_link_status is set, it only reports the current link
+ * state to the PF (NIC_MBOX_MSG_OP_UP or NIC_MBOX_MSG_OP_DOWN), clears the
+ * flag and returns. Otherwise it synchronises the unicast and then the
+ * multicast address list with the PF.
+ */
+void send_uc_mc_msg(struct work_struct *work)
+{
+	struct nicvf *nic = container_of(work, struct nicvf, dwork.work);
+	struct net_device *netdev = nic->netdev;
+	union nic_mbx mbx = {};
+
+	if (nic->send_op_link_status) {
+		mbx.msg.msg = nic->link_up ? NIC_MBOX_MSG_OP_UP :
+						NIC_MBOX_MSG_OP_DOWN;
+		if (nicvf_send_msg_to_pf(nic, &mbx)) {
+			netdev_err(nic->netdev,
+				   "PF not respond to msg %d\n", mbx.msg.msg);
+		}
+		nic->send_op_link_status = false;
+		return;
+	}
+
+	nicvf_sync_addr_list(nic, &netdev->uc, &nic->uc_shadow, 0);
+	nicvf_sync_addr_list(nic, &netdev->mc, &nic->mc_shadow, 1);
+}
+
 int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx)
 {
 	int timeout = NIC_MBOX_MSG_TIMEOUT;
 	int sleep = 10;
 
+	if (nic->pf_ack_waiting) {
+		timeout += 20;
+		while (nic->pf_ack_waiting) {
+			msleep(sleep);
+			if (!timeout)
+				break;
+			timeout -= sleep;
+		}
+		timeout = NIC_MBOX_MSG_TIMEOUT;
+	}
 	nic->pf_acked = false;
 	nic->pf_nacked = false;
+	nic->pf_ack_waiting = true;
 
 	nicvf_write_to_mbx(nic, mbx);
 
+	if (!pf_ack_required(nic, mbx)) {
+		nic->pf_ack_waiting = false;
+		nic->pf_acked = true;
+		nic->pf_nacked = true;
+		return 0;
+	}
 	/* Wait for previous message to be acked, timeout 2sec */
 	while (!nic->pf_acked) {
 		if (nic->pf_nacked) {
-			netdev_err(nic->netdev,
-				   "PF NACK to mbox msg 0x%02x from VF%d\n",
-				   (mbx->msg.msg & 0xFF), nic->vf_id);
+			if (mbx->msg.msg != NIC_MBOX_MSG_READY)
+				netdev_info(nic->netdev,
+					    "PF NACK to mbox msg 0x%02x from VF%d\n",
+					    (mbx->msg.msg & 0xFF), nic->vf_id);
+			nic->pf_ack_waiting = false;
 			return -EINVAL;
 		}
 		msleep(sleep);
@@ -139,9 +446,11 @@ int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx)
 			netdev_err(nic->netdev,
 				   "PF didn't ACK to mbox msg 0x%02x from VF%d\n",
 				   (mbx->msg.msg & 0xFF), nic->vf_id);
+			nic->pf_ack_waiting = false;
 			return -EBUSY;
 		}
 	}
+	nic->pf_ack_waiting = false;
 	return 0;
 }
 
@@ -151,9 +460,14 @@ int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx)
 static int nicvf_check_pf_ready(struct nicvf *nic)
 {
 	union nic_mbx mbx = {};
+	int ret = 0;
 
 	mbx.msg.msg = NIC_MBOX_MSG_READY;
-	if (nicvf_send_msg_to_pf(nic, &mbx)) {
+	ret = nicvf_send_msg_to_pf(nic, &mbx);
+	if (ret == -EINVAL) {
+		/* VF disabled through module parameter */
+		return 0;
+	} else if (ret) {
 		netdev_err(nic->netdev,
 			   "PF didn't respond to READY msg\n");
 		return 0;
@@ -193,12 +507,22 @@ static void  nicvf_handle_mbx_intr(struct nicvf *nic)
 		nic->vf_id = mbx.nic_cfg.vf_id & 0x7F;
 		nic->tns_mode = mbx.nic_cfg.tns_mode & 0x7F;
 		nic->node = mbx.nic_cfg.node_id;
+		nic->true_vf = mbx.nic_cfg.is_pf;
+		if (!veb_enabled)
+			veb_enabled = mbx.nic_cfg.veb_enabled;
+		if (veb_enabled)
+			snprintf(nic->phys_port_name, IFNAMSIZ, "%d %d %d %d",
+				 nic->node, mbx.nic_cfg.bgx_id,
+				 mbx.nic_cfg.lmac, mbx.nic_cfg.chan);
 		if (!nic->set_mac_pending)
 			ether_addr_copy(nic->netdev->dev_addr,
 					mbx.nic_cfg.mac_addr);
 		nic->sqs_mode = mbx.nic_cfg.sqs_mode;
 		nic->loopback_supported = mbx.nic_cfg.loopback_supported;
-		nic->link_up = false;
+		if (veb_enabled)
+			nic->link_up = mbx.nic_cfg.pf_up;
+		else
+			nic->link_up = false;
 		nic->duplex = 0;
 		nic->speed = 0;
 		break;
@@ -208,6 +532,12 @@ static void  nicvf_handle_mbx_intr(struct nicvf *nic)
 	case NIC_MBOX_MSG_NACK:
 		nic->pf_nacked = true;
 		break;
+	case NIC_MBOX_MSG_ADMIN_VLAN:
+		if (mbx.vlan_cfg.vlan_add && nic->admin_vlan_id == -1)
+			nic->admin_vlan_id = mbx.vlan_cfg.vlan_id;
+		else if (!mbx.vlan_cfg.vlan_add)
+			nic->admin_vlan_id = -1;
+		break;
 	case NIC_MBOX_MSG_RSS_SIZE:
 		nic->rss_info.rss_size = mbx.rss_size.ind_tbl_size;
 		nic->pf_acked = true;
@@ -216,16 +546,21 @@ static void  nicvf_handle_mbx_intr(struct nicvf *nic)
 		nicvf_read_bgx_stats(nic, &mbx.bgx_stats);
 		nic->pf_acked = true;
 		break;
-	case NIC_MBOX_MSG_BGX_LINK_CHANGE:
+	case NIC_MBOX_MSG_CFG_DONE:
 		nic->pf_acked = true;
 		nic->link_up = mbx.link_status.link_up;
 		nic->duplex = mbx.link_status.duplex;
 		nic->speed = mbx.link_status.speed;
+		break;
+	case NIC_MBOX_MSG_BGX_LINK_CHANGE:
+		nic->link_up = mbx.link_status.link_up;
+		nic->duplex = mbx.link_status.duplex;
+		nic->speed = mbx.link_status.speed;
 		if (nic->link_up) {
 			netdev_info(nic->netdev, "%s: Link is Up %d Mbps %s\n",
 				    nic->netdev->name, nic->speed,
 				    nic->duplex == DUPLEX_FULL ?
-				"Full duplex" : "Half duplex");
+				    "Full duplex" : "Half duplex");
 			netif_carrier_on(nic->netdev);
 			netif_tx_start_all_queues(nic->netdev);
 		} else {
@@ -563,6 +898,14 @@ static inline void nicvf_set_rxhash(struct net_device *netdev,
 	skb_set_hash(skb, hash, hash_type);
 }
 
+/* Returns false only when VEB mode is enabled and @vid equals the low
+ * 12 bits of nic->admin_vlan_id; returns true in every other case.
+ */
+static inline bool is_vf_vlan(struct nicvf *nic, u16 vid)
+{
+	return !veb_enabled || (nic->admin_vlan_id & 0xFFF) != vid;
+}
+
 static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 				  struct napi_struct *napi,
 				  struct cqe_rx_t *cqe_rx)
@@ -617,7 +960,8 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 	skb->protocol = eth_type_trans(skb, netdev);
 
 	/* Check for stripped VLAN */
-	if (cqe_rx->vlan_found && cqe_rx->vlan_stripped)
+	if (cqe_rx->vlan_found && cqe_rx->vlan_stripped &&
+	    is_vf_vlan(nic, (ntohs(cqe_rx->vlan_tci) & 0xFFF)))
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
 				       ntohs((__force __be16)cqe_rx->vlan_tci));
 
@@ -1151,6 +1495,8 @@ int nicvf_stop(struct net_device *netdev)
 	/* disable mailbox interrupt */
 	nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0);
 
+	//MBOX interrupts disabled, don't expect any ACK's from PF
+	nic->wait_for_ack = false;
 	nicvf_unregister_interrupts(nic);
 
 	nicvf_free_cq_poll(nic);
@@ -1182,6 +1528,8 @@ int nicvf_open(struct net_device *netdev)
 
 	netif_carrier_off(netdev);
 
+	//MBOX interrupts enabled, so wait for ACK from PF
+	nic->wait_for_ack = true;
 	err = nicvf_register_misc_interrupt(nic);
 	if (err)
 		return err;
@@ -1202,7 +1550,8 @@ int nicvf_open(struct net_device *netdev)
 	}
 
 	/* Check if we got MAC address from PF or else generate a radom MAC */
-	if (!nic->sqs_mode && is_zero_ether_addr(netdev->dev_addr)) {
+	if ((veb_enabled || !nic->sqs_mode) &&
+	    is_zero_ether_addr(netdev->dev_addr)) {
 		eth_hw_addr_random(netdev);
 		nicvf_hw_set_mac_addr(nic, netdev);
 	}
@@ -1268,7 +1617,17 @@ int nicvf_open(struct net_device *netdev)
 
 	/* Send VF config done msg to PF */
 	mbx.msg.msg = NIC_MBOX_MSG_CFG_DONE;
-	nicvf_write_to_mbx(nic, &mbx);
+	if (veb_enabled)
+		nicvf_send_msg_to_pf(nic, &mbx);
+	else
+		nicvf_write_to_mbx(nic, &mbx);
+
+	if (veb_enabled && nic->link_up) {
+		nic->send_op_link_status = true;
+		queue_delayed_work(nic->uc_mc_msg, &nic->dwork, 0);
+		netif_carrier_on(netdev);
+		netif_tx_start_all_queues(netdev);
+	}
 
 	return 0;
 cleanup:
@@ -1299,6 +1658,8 @@ static int nicvf_change_mtu(struct net_device *netdev, int new_mtu)
 		return -EINVAL;
 
 	netdev->mtu = new_mtu;
+	if (!nic->link_up)
+		return 0;
 
 	if (!netif_running(netdev))
 		return 0;
@@ -1508,6 +1869,142 @@ static int nicvf_set_features(struct net_device *netdev,
 	return 0;
 }
 
+/* ndo_vlan_rx_add_vid: ask the PF to add VLAN @vid for this VF.
+ *
+ * Returns 0 without doing anything unless VEB mode is enabled. The request
+ * is refused with -EPERM while an admin VLAN is recorded for the VF
+ * (nic->admin_vlan_id != -1). Otherwise the result of
+ * nicvf_send_msg_to_pf() is returned: 0, -EINVAL when the PF NACKs, or
+ * -EBUSY when the PF does not answer in time.
+ */
+static int nicvf_vlan_rx_add_vid(struct net_device *netdev,
+				 __always_unused __be16 proto, u16 vid)
+{
+	struct nicvf *nic = netdev_priv(netdev);
+	union nic_mbx mbx = {};
+	int ret;
+
+	if (!veb_enabled)
+		return 0;
+
+	if (nic->admin_vlan_id != -1) {
+		netdev_err(nic->netdev,
+			   "VF %d could not add VLAN %d\n", nic->vf_id, vid);
+		/* Same value as the former bare -1, now a named errno */
+		return -EPERM;
+	}
+
+	mbx.msg.msg = NIC_MBOX_MSG_VLAN;
+	mbx.vlan_cfg.vf_id = nic->vf_id;
+	mbx.vlan_cfg.vlan_id = vid;
+	mbx.vlan_cfg.vlan_add = 1;
+	ret = nicvf_send_msg_to_pf(nic, &mbx);
+	if (ret == -EINVAL) {
+		netdev_err(nic->netdev, "VF %d could not add VLAN %d\n",
+			   nic->vf_id, vid);
+	} else if (ret == -EBUSY) {
+		netdev_err(nic->netdev,
+			   "PF didn't respond to VLAN msg VLAN ID: %d VF: %d\n",
+			   vid, nic->vf_id);
+	}
+	return ret;
+}
+
+/* ndo_vlan_rx_kill_vid: ask the PF to remove VLAN @vid for this VF.
+ *
+ * Returns 0 without doing anything unless VEB mode is enabled. Otherwise
+ * the result of nicvf_send_msg_to_pf() is returned: 0, -EINVAL when the PF
+ * NACKs, or -EBUSY when the PF does not answer in time.
+ */
+static int nicvf_vlan_rx_kill_vid(struct net_device *netdev,
+				  __always_unused __be16 proto, u16 vid)
+{
+	struct nicvf *nic = netdev_priv(netdev);
+	union nic_mbx mbx = {};
+	int ret;
+
+	if (!veb_enabled)
+		return 0;
+
+	mbx.msg.msg = NIC_MBOX_MSG_VLAN;
+	mbx.vlan_cfg.vf_id = nic->vf_id;
+	mbx.vlan_cfg.vlan_id = vid;
+	mbx.vlan_cfg.vlan_add = 0;
+
+	/* Hand the real error code back instead of collapsing it to -1 */
+	ret = nicvf_send_msg_to_pf(nic, &mbx);
+	if (ret)
+		netdev_err(nic->netdev,
+			   "PF didn't respond to VLAN msg VLAN ID: %d VF: %d\n",
+			   vid, nic->vf_id);
+	return ret;
+}
+
+/* ndo_set_rx_mode: in VEB mode, queue nic->dwork with no delay on the
+ * nic->uc_mc_msg workqueue; the address-list work is done by the handler.
+ * Does nothing when VEB mode is off.
+ */
+void nicvf_set_rx_mode(struct net_device *netdev)
+{
+	struct nicvf *nic = netdev_priv(netdev);
+
+	if (veb_enabled)
+		queue_delayed_work(nic->uc_mc_msg, &nic->dwork, 0);
+}
+
+/* ndo_change_rx_flags: send the interface's promiscuous setting to the PF.
+ *
+ * Does nothing unless VEB mode is enabled. @flags is not examined; the
+ * setting is taken from netdev->flags & IFF_PROMISC. A failed send is only
+ * logged.
+ */
+void nicvf_change_rx_flags(struct net_device *netdev, int flags)
+{
+	struct nicvf *nic = netdev_priv(netdev);
+	union nic_mbx mbx = {};
+
+	if (veb_enabled) {
+		mbx.msg.msg = NIC_MBOX_MSG_PROMISC;
+		mbx.promisc_cfg.vf_id = nic->vf_id;
+		mbx.promisc_cfg.on = netdev->flags & IFF_PROMISC;
+		if (nicvf_send_msg_to_pf(nic, &mbx))
+			netdev_err(nic->netdev,
+				   "PF didn't respond to PROMISC Mode\n");
+	}
+}
+
+/* ndo_set_vf_vlan: ask the PF to set or clear the admin VLAN of VF @vf.
+ *
+ * A request with @vlan and @qos both zero is sent as a delete; anything
+ * else is sent as an add. Only @vlan travels in the mailbox message.
+ *
+ * Returns 0 without doing anything unless VEB mode is enabled,
+ * -EPROTONOSUPPORT for a protocol other than 802.1Q, -EINVAL for an
+ * out-of-range VLAN id or priority, and otherwise the result of
+ * nicvf_send_msg_to_pf() (0, -EINVAL on NACK, -EBUSY on timeout).
+ */
+int nicvf_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos,
+		      __be16 vlan_proto)
+{
+	struct nicvf *nic = netdev_priv(netdev);
+	int is_add = (vlan | qos);
+	union nic_mbx mbx = {};
+	int ret;
+
+	if (!veb_enabled)
+		return 0;
+
+	/* The message has no protocol field, so anything but 802.1Q would
+	 * silently be programmed as 802.1Q; reject it instead.
+	 */
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
+	/* VLAN ids are 12 bits wide and priorities 3 bits wide */
+	if (vlan >= VLAN_N_VID || qos > 7)
+		return -EINVAL;
+
+	mbx.msg.msg = NIC_MBOX_MSG_ADMIN_VLAN;
+	mbx.vlan_cfg.vf_id   = vf;
+	mbx.vlan_cfg.vlan_add = is_add;
+	mbx.vlan_cfg.vlan_id = vlan;
+
+	ret = nicvf_send_msg_to_pf(nic, &mbx);
+	if (ret == -EINVAL) {
+		netdev_err(nic->netdev, "ADMIN VLAN %s failed For Vf %d\n",
+			   is_add ? "Add" : "Delete", vf);
+	} else if (ret == -EBUSY) {
+		netdev_err(nic->netdev,
+			   "PF didn't respond to ADMIN VLAN UPDATE msg\n");
+	}
+	return ret;
+}
+
+/* ndo_get_phys_port_name: copy nic->phys_port_name into @name.
+ *
+ * Returns 0 on success, -EINVAL when the name (with its terminator) does
+ * not fit in @len bytes.
+ */
+static int nicvf_get_phys_port_name(struct net_device *netdev, char *name,
+				    size_t len)
+{
+	struct nicvf *nic = netdev_priv(netdev);
+	int written = snprintf(name, len, "%s", nic->phys_port_name);
+
+	return written >= len ? -EINVAL : 0;
+}
+
+/* ndo_get_phys_port_id: report the interface's MAC address as its physical
+ * port id.
+ *
+ * Returns -EOPNOTSUPP in VEB mode when nic->true_vf is not set, else 0.
+ */
+static int nicvf_get_phys_port_id(struct net_device *netdev,
+				  struct netdev_phys_item_id *ppid)
+{
+	struct nicvf *nic = netdev_priv(netdev);
+
+	if (veb_enabled && !nic->true_vf)
+		return -EOPNOTSUPP;
+
+	/* netdev->dev_addr is a pointer, so sizeof() on it yields the pointer
+	 * size rather than the address length; use addr_len instead.
+	 */
+	ppid->id_len = min_t(int, netdev->addr_len, sizeof(ppid->id));
+	memcpy(ppid->id, netdev->dev_addr, ppid->id_len);
+
+	return 0;
+}
+
 static const struct net_device_ops nicvf_netdev_ops = {
 	.ndo_open		= nicvf_open,
 	.ndo_stop		= nicvf_stop,
@@ -1518,6 +2015,13 @@ static int nicvf_set_features(struct net_device *netdev,
 	.ndo_tx_timeout         = nicvf_tx_timeout,
 	.ndo_fix_features       = nicvf_fix_features,
 	.ndo_set_features       = nicvf_set_features,
+	.ndo_vlan_rx_add_vid    = nicvf_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid   = nicvf_vlan_rx_kill_vid,
+	.ndo_set_rx_mode        = nicvf_set_rx_mode,
+	.ndo_change_rx_flags    = nicvf_change_rx_flags,
+	.ndo_set_vf_vlan        = nicvf_set_vf_vlan,
+	.ndo_get_phys_port_name = nicvf_get_phys_port_name,
+	.ndo_get_phys_port_id   = nicvf_get_phys_port_id,
 };
 
 static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -1576,6 +2080,7 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	nic->pdev = pdev;
 	nic->pnicvf = nic;
 	nic->max_queues = qcount;
+	nic->pf_ack_waiting = false;
 
 	/* MAP VF's configuration registers */
 	nic->reg_base = pcim_iomap(pdev, PCI_CFG_REG_BAR_NUM, 0);
@@ -1595,6 +2100,8 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto err_free_netdev;
 
+	//MBOX interrupts enabled, so wait for ACK from PF
+	nic->wait_for_ack = true;
 	/* Check if PF is alive and get MAC address for this VF */
 	err = nicvf_register_misc_interrupt(nic);
 	if (err)
@@ -1619,12 +2126,13 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	netdev->hw_features = (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_SG |
 			       NETIF_F_TSO | NETIF_F_GRO |
-			       NETIF_F_HW_VLAN_CTAG_RX);
-
-	netdev->hw_features |= NETIF_F_RXHASH;
+			       NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_RXHASH);
 
 	netdev->features |= netdev->hw_features;
-	netdev->hw_features |= NETIF_F_LOOPBACK;
+	if (veb_enabled)
+		netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+	else
+		netdev->hw_features |= NETIF_F_LOOPBACK;
 
 	netdev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
 
@@ -1642,6 +2150,37 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	nic->msg_enable = debug;
 
 	nicvf_set_ethtool_ops(netdev);
+	if (veb_enabled) {
+		int bgx, lmac, chan, node, ret;
+
+		ret = sscanf(nic->phys_port_name, "%d %d %d %d", &node, &bgx,
+			     &lmac, &chan);
+		if (nic->true_vf) {
+			dev_info(dev,
+				 "interface %s enabled with node %d VF %d channel %d directly attached to physical port n%d-bgx-%d-%d\n",
+				 netdev->name, node, nic->vf_id, chan, node,
+				 bgx, lmac);
+		} else {
+			dev_info(dev,
+				 "interface %s enabled with node %d VF %d channel %d attached to physical port n%d-bgx-%d-%d\n",
+				 netdev->name, node, nic->vf_id, chan, node,
+				 bgx, lmac);
+		}
+		snprintf(nic->phys_port_name, IFNAMSIZ, "n%d-bgx-%d-%d",
+			 node, bgx, lmac);
+		nicvf_shadow_list_init(&nic->uc_shadow);
+		nicvf_shadow_list_init(&nic->mc_shadow);
+
+		nic->admin_vlan_id = -1;
+		nic->send_op_link_status = false;
+		nic->uc_mc_msg = alloc_workqueue("uc_mc_msg", WQ_UNBOUND |
+						 WQ_MEM_RECLAIM, 1);
+		if (!nic->uc_mc_msg)
+			return -ENOMEM;
+		INIT_DELAYED_WORK(&nic->dwork, send_uc_mc_msg);
+	} else {
+		strlcpy(nic->phys_port_name, netdev->name, IFNAMSIZ);
+	}
 
 	return 0;
 
@@ -1669,6 +2208,12 @@ static void nicvf_remove(struct pci_dev *pdev)
 		return;
 
 	nic = netdev_priv(netdev);
+	if (veb_enabled) {
+		if (nicvf_shadow_list_count(&nic->uc_shadow))
+			nicvf_shadow_list_flush(&nic->uc_shadow);
+		if (nicvf_shadow_list_count(&nic->mc_shadow))
+			nicvf_shadow_list_flush(&nic->mc_shadow);
+	}
 	pnetdev = nic->pnicvf->netdev;
 
 	/* Check if this Qset is assigned to different VF.
@@ -1678,6 +2223,12 @@ static void nicvf_remove(struct pci_dev *pdev)
 		unregister_netdev(pnetdev);
 	nicvf_unregister_interrupts(nic);
 	pci_set_drvdata(pdev, NULL);
+	if (veb_enabled) {
+		if (nic->uc_mc_msg) {
+			cancel_delayed_work_sync(&nic->dwork);
+			destroy_workqueue(nic->uc_mc_msg);
+		}
+	}
 	if (nic->drv_stats)
 		free_percpu(nic->drv_stats);
 	free_netdev(netdev);
-- 
1.8.3.1

^ permalink raw reply related

* [RFC PATCH 1/7] PF driver modified to enable HW filter support; changes work in backward-compatibility mode. Enable required things in Makefile. Enable LZ4 dependency inside config file
From: Satha Koteswara Rao @ 2016-12-21  8:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: rvatsavayi, rric, david.daney, netdev, satha.rao, derek.chickles,
	sgoutham, davem, linux-arm-kernel, philip.romanov
In-Reply-To: <1482310011-1862-1-git-send-email-satha.rao@caviumnetworks.com>

---
 drivers/net/ethernet/cavium/Kconfig            |   1 +
 drivers/net/ethernet/cavium/thunder/Makefile   |   2 +-
 drivers/net/ethernet/cavium/thunder/nic.h      | 203 ++++---
 drivers/net/ethernet/cavium/thunder/nic_main.c | 735 ++++++++++++++++++++++---
 4 files changed, 804 insertions(+), 137 deletions(-)

diff --git a/drivers/net/ethernet/cavium/Kconfig b/drivers/net/ethernet/cavium/Kconfig
index 92f411c..e4855a0 100644
--- a/drivers/net/ethernet/cavium/Kconfig
+++ b/drivers/net/ethernet/cavium/Kconfig
@@ -17,6 +17,7 @@ config THUNDER_NIC_PF
 	tristate "Thunder Physical function driver"
 	depends on 64BIT
 	select THUNDER_NIC_BGX
+	select CRYPTO_LZ4
 	---help---
 	  This driver supports Thunder's NIC physical function.
 	  The NIC provides the controller and DMA engines to
diff --git a/drivers/net/ethernet/cavium/thunder/Makefile b/drivers/net/ethernet/cavium/thunder/Makefile
index 6b4d4ad..30e4417 100644
--- a/drivers/net/ethernet/cavium/thunder/Makefile
+++ b/drivers/net/ethernet/cavium/thunder/Makefile
@@ -7,6 +7,6 @@ obj-$(CONFIG_THUNDER_NIC_BGX) += thunder_bgx.o
 obj-$(CONFIG_THUNDER_NIC_PF) += nicpf.o
 obj-$(CONFIG_THUNDER_NIC_VF) += nicvf.o
 
-nicpf-y := nic_main.o
+nicpf-y := pf_vf.o pf_reg.o pf_filter.o tbl_access.o nic_main.o
 nicvf-y := nicvf_main.o nicvf_queues.o
 nicvf-y += nicvf_ethtool.o
diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 86bd93c..17a29e7 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -54,6 +54,7 @@
 
 /* Max when CPI_ALG is IP diffserv */
 #define	NIC_MAX_CPI_PER_LMAC		64
+#define NIC_TNS_CPI_PER_LMAC		16
 
 /* NIC VF Interrupts */
 #define	NICVF_INTR_CQ			0
@@ -111,6 +112,7 @@
  * 1 tick per 0.025usec
  */
 #define NICPF_CLK_PER_INT_TICK		1
+#define NICPF_TNS_CLK_PER_INT_TICK	2
 
 /* Time to wait before we decide that a SQ is stuck.
  *
@@ -129,6 +131,7 @@ struct nicvf_cq_poll {
 
 #define NIC_MAX_RSS_HASH_BITS		8
 #define NIC_MAX_RSS_IDR_TBL_SIZE	(1 << NIC_MAX_RSS_HASH_BITS)
+#define NIC_TNS_RSS_IDR_TBL_SIZE	5
 #define RSS_HASH_KEY_SIZE		5 /* 320 bit key */
 
 struct nicvf_rss_info {
@@ -255,74 +258,6 @@ struct nicvf_drv_stats {
 	struct u64_stats_sync   syncp;
 };
 
-struct nicvf {
-	struct nicvf		*pnicvf;
-	struct net_device	*netdev;
-	struct pci_dev		*pdev;
-	void __iomem		*reg_base;
-#define	MAX_QUEUES_PER_QSET			8
-	struct queue_set	*qs;
-	struct nicvf_cq_poll	*napi[8];
-	u8			vf_id;
-	u8			sqs_id;
-	bool                    sqs_mode;
-	bool			hw_tso;
-	bool			t88;
-
-	/* Receive buffer alloc */
-	u32			rb_page_offset;
-	u16			rb_pageref;
-	bool			rb_alloc_fail;
-	bool			rb_work_scheduled;
-	struct page		*rb_page;
-	struct delayed_work	rbdr_work;
-	struct tasklet_struct	rbdr_task;
-
-	/* Secondary Qset */
-	u8			sqs_count;
-#define	MAX_SQS_PER_VF_SINGLE_NODE		5
-#define	MAX_SQS_PER_VF				11
-	struct nicvf		*snicvf[MAX_SQS_PER_VF];
-
-	/* Queue count */
-	u8			rx_queues;
-	u8			tx_queues;
-	u8			max_queues;
-
-	u8			node;
-	u8			cpi_alg;
-	bool			link_up;
-	u8			duplex;
-	u32			speed;
-	bool			tns_mode;
-	bool			loopback_supported;
-	struct nicvf_rss_info	rss_info;
-	struct tasklet_struct	qs_err_task;
-	struct work_struct	reset_task;
-
-	/* Interrupt coalescing settings */
-	u32			cq_coalesce_usecs;
-	u32			msg_enable;
-
-	/* Stats */
-	struct nicvf_hw_stats   hw_stats;
-	struct nicvf_drv_stats  __percpu *drv_stats;
-	struct bgx_stats	bgx_stats;
-
-	/* MSI-X  */
-	bool			msix_enabled;
-	u8			num_vec;
-	struct msix_entry	msix_entries[NIC_VF_MSIX_VECTORS];
-	char			irq_name[NIC_VF_MSIX_VECTORS][IFNAMSIZ + 15];
-	bool			irq_allocated[NIC_VF_MSIX_VECTORS];
-	cpumask_var_t		affinity_mask[NIC_VF_MSIX_VECTORS];
-
-	/* VF <-> PF mailbox communication */
-	bool			pf_acked;
-	bool			pf_nacked;
-	bool			set_mac_pending;
-} ____cacheline_aligned_in_smp;
-
 /* PF <--> VF Mailbox communication
  * Eight 64bit registers are shared between PF and VF.
  * Separate set for each VF.
@@ -357,6 +292,18 @@ struct nicvf {
 #define	NIC_MBOX_MSG_SNICVF_PTR		0x15	/* Send sqet nicvf ptr to PVF */
 #define	NIC_MBOX_MSG_LOOPBACK		0x16	/* Set interface in loopback */
 #define	NIC_MBOX_MSG_RESET_STAT_COUNTER 0x17	/* Reset statistics counters */
+/* VF -> PF: a unicast/multicast address was added, deleted or flushed */
+#define NIC_MBOX_MSG_UC_MC              0x18
+/* VF -> PF: promiscuous mode was switched on or off */
+#define NIC_MBOX_MSG_PROMISC		0x19
+/* Admin VLAN of a VF: requested by a VF and also delivered to the VF.
+ * NOTE(review): IDs jump from 0x19 to 0x20 (0x1A-0x1F unused) - confirm
+ * this is intentional.
+ */
+#define NIC_MBOX_MSG_ADMIN_VLAN         0x20
+/* VF -> PF: a VLAN was added to or deleted from the VF */
+#define NIC_MBOX_MSG_VLAN               0x21
+/* VF -> PF: the VF's carrier and TX queues are turned on */
+#define NIC_MBOX_MSG_OP_UP		0x22
+/* VF -> PF: the VF's carrier and TX queues are turned off */
+#define NIC_MBOX_MSG_OP_DOWN		0x23
 #define	NIC_MBOX_MSG_CFG_DONE		0xF0	/* VF configuration done */
 #define	NIC_MBOX_MSG_SHUTDOWN		0xF1	/* VF is being shutdown */
 
@@ -367,6 +314,29 @@ struct nic_cfg_msg {
 	u8    tns_mode:1;
 	u8    sqs_mode:1;
 	u8    loopback_supported:1;
+	u8    pf_up:1;
+	bool  is_pf;
+	bool  veb_enabled;
+	bool  bgx_id;
+	u8    lmac;
+	u8    chan;
+	u8    mac_addr[ETH_ALEN];
+};
+
+/* VLAN request payload, carried by NIC_MBOX_MSG_VLAN and
+ * NIC_MBOX_MSG_ADMIN_VLAN
+ */
+struct vlan_msg {
+	u8 msg;
+	u8 vf_id;
+	bool vlan_add:1;	/* 1 = add the VLAN, 0 = remove it */
+	u16 vlan_id;
+};
+
+struct uc_mc_msg {	/* payload of NIC_MBOX_MSG_UC_MC */
+	u8 msg;
+	u8 vf_id;
+	uint64_t addr_type:1;	/* VF code passes 0 = unicast, 1 = multicast */
+	uint64_t is_flush:1;	/* 1 = flush request (VF leaves mac_addr zeroed) */
+	uint64_t is_add:1;	/* 1 = add mac_addr; delete and flush send 0 */
 	u8    mac_addr[ETH_ALEN];
 };
 
@@ -446,6 +416,7 @@ struct bgx_stats_msg {
 /* Physical interface link status */
 struct bgx_link_status {
 	u8    msg;
+	u8    lmac;
 	u8    link_up;
 	u8    duplex;
 	u32   speed;
@@ -498,9 +469,18 @@ struct reset_stat_cfg {
 	u16   sq_stat_mask;
 };
 
+/* Mailbox payload for promiscuous-mode requests; carried as union nic_mbx
+ * member promisc_cfg
+ */
+struct promisc_info {
+	u8    msg;	/* Mailbox message ID (NIC_MBOX_MSG_*) */
+	u8    vf_id;
+	/* NOTE(review): presumably true turns promiscuous mode on — confirm */
+	bool  on;
+};
+
 /* 128 bit shared memory between PF and each VF */
 union nic_mbx {
 	struct { u8 msg; }	msg;
+	struct promisc_info     promisc_cfg;
+	struct vlan_msg		vlan_cfg;
+	struct uc_mc_msg	uc_mc_cfg;
 	struct nic_cfg_msg	nic_cfg;
 	struct qs_cfg_msg	qs;
 	struct rq_cfg_msg	rq;
@@ -518,6 +498,93 @@ struct reset_stat_cfg {
 	struct reset_stat_cfg	reset_stat;
 };
 
+struct nicvf {
+	struct nicvf		*pnicvf;
+	struct net_device	*netdev;
+	struct pci_dev		*pdev;
+	void __iomem		*reg_base;
+#define	MAX_QUEUES_PER_QSET			8
+	struct queue_set	*qs;
+	struct nicvf_cq_poll	*napi[8];
+	u8			vf_id;
+	u8			sqs_id;
+	bool                    sqs_mode;
+	bool			hw_tso;
+	bool			t88;
+
+	/* Receive buffer alloc */
+	u32			rb_page_offset;
+	u16			rb_pageref;
+	bool			rb_alloc_fail;
+	bool			rb_work_scheduled;
+	struct page		*rb_page;
+	struct delayed_work	rbdr_work;
+	struct tasklet_struct	rbdr_task;
+
+	/* Secondary Qset */
+	u8			sqs_count;
+#define	MAX_SQS_PER_VF_SINGLE_NODE		5
+#define	MAX_SQS_PER_VF				11
+	struct nicvf		*snicvf[MAX_SQS_PER_VF];
+
+	/* Queue count */
+	u8			rx_queues;
+	u8			tx_queues;
+	u8			max_queues;
+
+	u8			node;
+	u8			cpi_alg;
+	u16			mtu;
+	bool			link_up;
+	u8			duplex;
+	u32			speed;
+	bool			tns_mode;
+	bool			loopback_supported;
+	/* In VEB mode, a true_vf is directly attached to the physical port;
+	 * it acts as the PF of its VF group (the set of VFs attached to the
+	 * same physical port).
+	 */
+	bool			true_vf;
+	struct nicvf_rss_info	rss_info;
+	struct tasklet_struct	qs_err_task;
+	struct work_struct	reset_task;
+
+	/* Interrupt coalescing settings */
+	u32			cq_coalesce_usecs;
+	u32			msg_enable;
+
+	/* Stats */
+	struct nicvf_hw_stats   hw_stats;
+	struct nicvf_drv_stats  __percpu *drv_stats;
+	struct bgx_stats	bgx_stats;
+
+	/* MSI-X  */
+	bool			msix_enabled;
+	u8			num_vec;
+	struct msix_entry	msix_entries[NIC_VF_MSIX_VECTORS];
+	char			irq_name[NIC_VF_MSIX_VECTORS][IFNAMSIZ + 15];
+	bool			irq_allocated[NIC_VF_MSIX_VECTORS];
+	cpumask_var_t		affinity_mask[NIC_VF_MSIX_VECTORS];
+
+	char			phys_port_name[IFNAMSIZ + 15];
+	/* VF <-> PF mailbox communication */
+	bool			pf_acked;
+	bool			pf_nacked;
+	bool			set_mac_pending;
+	struct netdev_hw_addr_list uc_shadow;
+	struct netdev_hw_addr_list mc_shadow;
+
+	/* work queue for handling UC MC mailbox messages */
+	bool			send_op_link_status;
+	struct delayed_work	dwork;
+	struct workqueue_struct *uc_mc_msg;
+
+	/* Admin vlan id */
+	int			admin_vlan_id;
+	bool			pf_ack_waiting;
+	bool			wait_for_ack;
+} ____cacheline_aligned_in_smp;
+
 #define NIC_NODE_ID_MASK	0x03
 #define NIC_NODE_ID_SHIFT	44
 
diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 6677b96..42299320 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -17,6 +17,7 @@
 #include "nic.h"
 #include "q_struct.h"
 #include "thunder_bgx.h"
+#include "pf_globals.h"
 
 #define DRV_NAME	"thunder-nic"
 #define DRV_VERSION	"1.0"
@@ -70,7 +71,52 @@ struct nicpf {
 	struct msix_entry	*msix_entries;
 	bool			irq_allocated[NIC_PF_MSIX_VECTORS];
 	char			irq_name[NIC_PF_MSIX_VECTORS][20];
-};
+	bool			vf_op_enabled[MAX_NUM_VFS_SUPPORTED];
+	bool			admin_vlan[MAX_NUM_VFS_SUPPORTED];
+	u8			vnic_intf_map[MAX_NUM_VFS_SUPPORTED];
+	u64			mac[MAX_NUM_VFS_SUPPORTED];
+	struct delayed_work	notification_dwork;
+	struct workqueue_struct *notification_msg;
+
+#define MAX_VF_MBOX_MESSAGE	(2 * MAX_NUM_VFS_SUPPORTED)
+	union nic_mbx		vf_mbx_msg[MAX_VF_MBOX_MESSAGE];
+	bool			valid_vf_mbx_msg[MAX_VF_MBOX_MESSAGE];
+	/* Protect different notification messages */
+	spinlock_t		vf_mbx_msg_lock;
+} ____cacheline_aligned_in_smp;
+
+static unsigned int num_vfs;
+module_param(num_vfs, uint, 0644);
+MODULE_PARM_DESC(num_vfs, "Non zero positive value, specifies number of VF's per physical port");
+
+static u8 link_lmac[MAX_NUMNODES][TNS_MAX_LMAC];
+static int pf_speed[MAX_NUMNODES][TNS_MAX_LMAC];
+static int pf_duplex[MAX_NUMNODES][TNS_MAX_LMAC];
+
+static void nic_send_msg_to_vf(struct nicpf *nic, int vf, union nic_mbx *mbx);
+static int veb_enabled;
+
+/* Fan a BGX link-change message out to the VF group of one physical port.
+ * For every enabled VF in the group returned by get_vf_group() the message
+ * is written to the VF mailbox (only when mbx_lock[] is clear for that VF)
+ * and a per-VF OP_UP/OP_DOWN notification is passed to
+ * pf_notify_msg_handler().
+ * @nic: PF device
+ * @arg: pointer to the union nic_mbx holding link_status
+ */
+void send_link_change_to_vf(struct nicpf *nic, void *arg)
+{
+	union nic_mbx *link_msg = arg;
+	int first_vf, last_vf;
+	int vf;
+
+	get_vf_group(nic->node, link_msg->link_status.lmac,
+		     &first_vf, &last_vf);
+
+	for (vf = first_vf; vf <= last_vf; vf++) {
+		union nic_mbx note = {};
+
+		if (!nic->vf_enabled[vf])
+			continue;
+
+		if (!nic->mbx_lock[vf])
+			nic_send_msg_to_vf(nic, vf, link_msg);
+
+		note.mac.vf_id = vf;
+		if (link_msg->link_status.link_up)
+			note.msg.msg = NIC_MBOX_MSG_OP_UP;
+		else
+			note.msg.msg = NIC_MBOX_MSG_OP_DOWN;
+		pf_notify_msg_handler(nic->node, (void *)(&note));
+	}
+}
 
 /* Supported devices */
 static const struct pci_device_id nic_id_table[] = {
@@ -135,6 +181,53 @@ static u64 nic_get_mbx_addr(int vf)
 	return NIC_PF_VF_0_127_MAILBOX_0_1 + (vf << NIC_VF_NUM_SHIFT);
 }
 
+/* Program the RBDR backpressure (RBDR_BP) and CQ backpressure (CQ_BP)
+ * fields of one receive queue with RBDR_CQ_BP; the register is read first
+ * so its other fields are written back unchanged.
+ * @nic: PF device whose register space is accessed
+ * @vf: vf to which bp needs to be set (only the low 7 bits are used)
+ * @rcv_id: receive queue of the vf (only the low 3 bits are used)
+ */
+void set_rbdr_cq_bp(struct nicpf *nic, u8 vf, u8 rcv_id)
+{
+	u64 qs_off = (vf & 127) * 0x200000ull;
+	u64 rq_off = (rcv_id & 7) * 0x40000ull;
+	u64 reg = NIC_PF_QSX_RQX_BP_CFG + (qs_off + rq_off);
+	union nic_pf_qsx_rqx_bp_cfg cfg;
+
+	cfg.u = nic_reg_read(nic, reg);
+	cfg.s.rbdr_bp = RBDR_CQ_BP;
+	cfg.s.cq_bp = RBDR_CQ_BP;
+	nic_reg_write(nic, reg, cfg.u);
+}
+
+/* Set backpressure configuration on the NIC TNS receive interface
+ * @nic: PF device whose register space is accessed
+ * @intf: NIC interface (bit 0 selects the per-interface register)
+ * @tns_mode: non-zero if the NIC is in TNS mode, zero for BY-PASS mode
+ */
+void set_bp_id(struct nicpf *nic, u8 intf, u8 tns_mode)
+{
+	union nic_pf_intfx_bp_cfg bp_conf;
+	/* Must be wider than u8: the interface 1 stride is 0x100, which a
+	 * u8 would truncate to 0 and so alias interface 0's register.
+	 */
+	u64 offset = (intf & 1) * 0x100ull;
+
+	bp_conf.u = nic_reg_read(nic, NIC_PF_INTFX_BP_CFG + offset);
+	bp_conf.s.bp_id = (intf) ? ((tns_mode) ? 0x7 : 0x9) :
+					((tns_mode) ? 0x6 : 0x8);
+	nic_reg_write(nic,  NIC_PF_INTFX_BP_CFG + offset, bp_conf.u);
+}
+
+/* Enable the BP bus for this interface; the register is read first and
+ * written back with bp_ena set if it was clear.
+ * @nic: PF device whose register space is accessed
+ * @intf: NIC interface (bit 0 selects the per-interface register)
+ */
+void bp_enable(struct nicpf *nic, u8 intf)
+{
+	union nic_pf_intfx_bp_cfg bp_conf;
+	/* Must be wider than u8: the interface 1 stride is 0x100, which a
+	 * u8 would truncate to 0 and so alias interface 0's register.
+	 */
+	u64 offset = (intf & 1) * 0x100ull;
+
+	bp_conf.u = nic_reg_read(nic, NIC_PF_INTFX_BP_CFG + offset);
+	if (!bp_conf.s.bp_ena)
+		bp_conf.s.bp_ena = 1;
+
+	nic_reg_write(nic, NIC_PF_INTFX_BP_CFG + offset, bp_conf.u);
+}
+
 /* Send a mailbox message to VF
  * @vf: vf to which this message to be sent
  * @mbx: Message to be sent
@@ -169,24 +262,53 @@ static void nic_mbx_send_ready(struct nicpf *nic, int vf)
 	union nic_mbx mbx = {};
 	int bgx_idx, lmac;
 	const char *mac;
+	int nid = nic->node;
 
 	mbx.nic_cfg.msg = NIC_MBOX_MSG_READY;
 	mbx.nic_cfg.vf_id = vf;
 
-	mbx.nic_cfg.tns_mode = NIC_TNS_BYPASS_MODE;
+	if (veb_enabled)
+		mbx.nic_cfg.tns_mode = NIC_TNS_MODE;
+	else
+		mbx.nic_cfg.tns_mode = NIC_TNS_BYPASS_MODE;
 
-	if (vf < nic->num_vf_en) {
+	if (!veb_enabled && vf < nic->num_vf_en) {
 		bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
 		lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
 
-		mac = bgx_get_lmac_mac(nic->node, bgx_idx, lmac);
+		mac = bgx_get_lmac_mac(nid, bgx_idx, lmac);
 		if (mac)
 			ether_addr_copy((u8 *)&mbx.nic_cfg.mac_addr, mac);
+	} else if (veb_enabled) {
+		int lmac = 0, bgx_idx = 0;
+
+		if (get_bgx_id(nid, vf, &bgx_idx, &lmac))
+			dev_err(&nic->pdev->dev, "!!ERROR!!Wrong BGX values\n");
+
+		if (is_pf(nid, vf)) {
+			mac = bgx_get_lmac_mac(nid, bgx_idx, lmac);
+			if (mac)
+				ether_addr_copy((u8 *)&nic->mac[vf], mac);
+		} else if (is_zero_ether_addr((u8 *)&nic->mac[vf])) {
+			eth_random_addr((u8 *)&nic->mac[vf]);
+		}
+
+		ether_addr_copy((u8 *)&mbx.nic_cfg.mac_addr,
+				(u8 *)&nic->mac[vf]);
+		mbx.nic_cfg.is_pf = is_pf(nid, vf);
+		mbx.nic_cfg.lmac = lmac;
+		mbx.nic_cfg.bgx_id = bgx_idx;
+		mbx.nic_cfg.chan = (vf < 64) ? vf : (64 + vf);
 	}
-	mbx.nic_cfg.sqs_mode = (vf >= nic->num_vf_en) ? true : false;
-	mbx.nic_cfg.node_id = nic->node;
+	mbx.nic_cfg.veb_enabled = (veb_enabled == 0) ? 0 : 1;
+	mbx.nic_cfg.node_id = nid;
 
-	mbx.nic_cfg.loopback_supported = vf < nic->num_vf_en;
+	if (veb_enabled) {
+		mbx.nic_cfg.pf_up = link_lmac[nid][vf_to_pport(nid, vf)];
+	} else {
+		mbx.nic_cfg.loopback_supported = vf < nic->num_vf_en;
+		mbx.nic_cfg.sqs_mode = (vf >= nic->num_vf_en) ? true : false;
+	}
 
 	nic_send_msg_to_vf(nic, vf, &mbx);
 }
@@ -242,8 +364,15 @@ static void nic_get_bgx_stats(struct nicpf *nic, struct bgx_stats_msg *bgx)
 	int bgx_idx, lmac;
 	union nic_mbx mbx = {};
 
-	bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[bgx->vf_id]);
-	lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[bgx->vf_id]);
+	if (veb_enabled) {
+		if (get_bgx_id(nic->node, bgx->vf_id, &bgx_idx, &lmac))
+			dev_err(&nic->pdev->dev, "Unable to get BGX index\n");
+	} else {
+		bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(
+				nic->vf_lmac_map[bgx->vf_id]);
+		lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(
+				nic->vf_lmac_map[bgx->vf_id]);
+	}
 
 	mbx.bgx_stats.msg = NIC_MBOX_MSG_BGX_STATS;
 	mbx.bgx_stats.vf_id = bgx->vf_id;
@@ -267,6 +396,16 @@ static int nic_update_hw_frs(struct nicpf *nic, int new_frs, int vf)
 	if ((new_frs > NIC_HW_MAX_FRS) || (new_frs < NIC_HW_MIN_FRS))
 		return 1;
 
+	if (veb_enabled) {
+		new_frs += ETH_HLEN;
+		if (new_frs <= nic->pkind.maxlen)
+			return 0;
+
+		nic->pkind.maxlen = new_frs;
+		nic_reg_write(nic, NIC_PF_PKIND_0_15_CFG, *(u64 *)&nic->pkind);
+		return 0;
+	}
+
 	bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
 	lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
 	lmac += bgx * MAX_LMAC_PER_BGX;
@@ -302,8 +441,14 @@ static void nic_set_tx_pkt_pad(struct nicpf *nic, int size)
 	 * Hence set this value to lessthan min pkt size of MAC+IP+TCP
 	 * headers, BGX will do the padding to transmit 64 byte pkt.
 	 */
-	if (size > 52)
-		size = 52;
+	if (size > 52) {
+		if (veb_enabled) {
+			if (size > 60)
+				size = 60;
+		} else {
+			size = 52;
+		}
+	}
 
 	pci_read_config_word(nic->pdev, PCI_SUBSYSTEM_ID, &sdevid);
 	/* 81xx's RGX has only one LMAC */
@@ -331,6 +476,10 @@ static void nic_set_lmac_vf_mapping(struct nicpf *nic)
 	u64 lmac_credit;
 
 	nic->num_vf_en = 0;
+	if (veb_enabled) {
+		nic->num_vf_en = PF_END;
+		return;
+	}
 
 	for (bgx = 0; bgx < nic->hw->bgx_cnt; bgx++) {
 		if (!(bgx_map & (1 << bgx)))
@@ -386,7 +535,8 @@ static int nic_get_hw_info(struct nicpf *nic)
 		hw->chans_per_bgx = 128;
 		hw->cpi_cnt = 2048;
 		hw->rssi_cnt = 4096;
-		hw->rss_ind_tbl_size = NIC_MAX_RSS_IDR_TBL_SIZE;
+		hw->rss_ind_tbl_size = veb_enabled ? NIC_TNS_RSS_IDR_TBL_SIZE :
+						     NIC_MAX_RSS_IDR_TBL_SIZE;
 		hw->tl3_cnt = 256;
 		hw->tl2_cnt = 64;
 		hw->tl1_cnt = 2;
@@ -451,6 +601,9 @@ static int nic_init_hw(struct nicpf *nic)
 	int i, err;
 	u64 cqm_cfg;
 
+	/* Reset NIC, in case the driver is repeatedly inserted and removed */
+	nic_reg_write(nic, NIC_PF_SOFT_RESET, 1);
+
 	/* Get HW capability info */
 	err = nic_get_hw_info(nic);
 	if (err)
@@ -462,23 +615,36 @@ static int nic_init_hw(struct nicpf *nic)
 	/* Enable backpressure */
 	nic_reg_write(nic, NIC_PF_BP_CFG, (1ULL << 6) | 0x03);
 
-	/* TNS and TNS bypass modes are present only on 88xx */
-	if (nic->pdev->subsystem_device == PCI_SUBSYS_DEVID_88XX_NIC_PF) {
-		/* Disable TNS mode on both interfaces */
+	if (veb_enabled) {
 		nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG,
-			      (NIC_TNS_BYPASS_MODE << 7) | BGX0_BLOCK);
+			      (NIC_TNS_MODE << 7) | (0x03ULL << 4) | 0x06);
 		nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG | (1 << 8),
-			      (NIC_TNS_BYPASS_MODE << 7) | BGX1_BLOCK);
+			      (NIC_TNS_MODE << 7) | (0x03ULL << 4) | 0x07);
+		nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG,
+			      (1ULL << 63) | (1ULL << 4) | 0x09);
+		nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG + (1 << 8),
+			      (1ULL << 63) | (1ULL << 4) | 0x09);
+	} else {
+		/* TNS and TNS bypass modes are present only on 88xx */
+		if (nic->pdev->subsystem_device ==
+		    PCI_SUBSYS_DEVID_88XX_NIC_PF) {
+			/* Disable TNS mode on both interfaces */
+			nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG,
+				      (NIC_TNS_BYPASS_MODE << 7) | BGX0_BLOCK);
+			nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG | (1 << 8),
+				      (NIC_TNS_BYPASS_MODE << 7) | BGX1_BLOCK);
+		}
+		nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG,
+			      (1ULL << 63) | BGX0_BLOCK);
+		nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG + (1 << 8),
+			      (1ULL << 63) | BGX1_BLOCK);
 	}
 
-	nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG,
-		      (1ULL << 63) | BGX0_BLOCK);
-	nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG + (1 << 8),
-		      (1ULL << 63) | BGX1_BLOCK);
-
 	/* PKIND configuration */
 	nic->pkind.minlen = 0;
-	nic->pkind.maxlen = NIC_HW_MAX_FRS + VLAN_ETH_HLEN + ETH_FCS_LEN + 4;
+	nic->pkind.maxlen = NIC_HW_MAX_FRS + ETH_HLEN;
+	if (!veb_enabled)
+		nic->pkind.maxlen += VLAN_HLEN + ETH_FCS_LEN + 4;
 	nic->pkind.lenerr_en = 1;
 	nic->pkind.rx_hdr = 0;
 	nic->pkind.hdr_sl = 0;
@@ -508,7 +674,7 @@ static int nic_init_hw(struct nicpf *nic)
 static void nic_config_cpi(struct nicpf *nic, struct cpi_cfg_msg *cfg)
 {
 	struct hw_info *hw = nic->hw;
-	u32 vnic, bgx, lmac, chan;
+	u32 vnic, bgx, lmac, chan = 0;
 	u32 padd, cpi_count = 0;
 	u64 cpi_base, cpi, rssi_base, rssi;
 	u8  qset, rq_idx = 0;
@@ -517,8 +683,17 @@ static void nic_config_cpi(struct nicpf *nic, struct cpi_cfg_msg *cfg)
 	bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vnic]);
 	lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vnic]);
 
-	chan = (lmac * hw->chans_per_lmac) + (bgx * hw->chans_per_bgx);
-	cpi_base = vnic * NIC_MAX_CPI_PER_LMAC;
+	if (veb_enabled) {
+		if (nic->vnic_intf_map[vnic] == 0)
+			chan = vnic;
+		else if (nic->vnic_intf_map[vnic] == 1)
+			chan = 128 + (vnic - 64);
+		cpi_base = vnic * NIC_TNS_CPI_PER_LMAC;
+	} else {
+		chan = (lmac * hw->chans_per_lmac) +
+			(bgx * hw->chans_per_bgx);
+		cpi_base = vnic * NIC_MAX_CPI_PER_LMAC;
+	}
 	rssi_base = vnic * hw->rss_ind_tbl_size;
 
 	/* Rx channel configuration */
@@ -534,7 +709,8 @@ static void nic_config_cpi(struct nicpf *nic, struct cpi_cfg_msg *cfg)
 	else if (cfg->cpi_alg == CPI_ALG_VLAN16) /* 3 bits PCP + DEI */
 		cpi_count = 16;
 	else if (cfg->cpi_alg == CPI_ALG_DIFF) /* 6bits DSCP */
-		cpi_count = NIC_MAX_CPI_PER_LMAC;
+		cpi_count = veb_enabled ? NIC_TNS_CPI_PER_LMAC :
+					  NIC_MAX_CPI_PER_LMAC;
 
 	/* RSS Qset, Qidx mapping */
 	qset = cfg->vf_id;
@@ -542,6 +718,8 @@ static void nic_config_cpi(struct nicpf *nic, struct cpi_cfg_msg *cfg)
 	for (; rssi < (rssi_base + cfg->rq_cnt); rssi++) {
 		nic_reg_write(nic, NIC_PF_RSSI_0_4097_RQ | (rssi << 3),
 			      (qset << 3) | rq_idx);
+		if (veb_enabled)
+			set_rbdr_cq_bp(nic, vnic, rq_idx);
 		rq_idx++;
 	}
 
@@ -652,8 +830,8 @@ static void nic_tx_channel_cfg(struct nicpf *nic, u8 vnic,
 			       struct sq_cfg_msg *sq)
 {
 	struct hw_info *hw = nic->hw;
-	u32 bgx, lmac, chan;
-	u32 tl2, tl3, tl4;
+	u32 bgx, lmac, chan = 0;
+	u32 tl2, tl3, tl4 = 0;
 	u32 rr_quantum;
 	u8 sq_idx = sq->sq_num;
 	u8 pqs_vnic;
@@ -670,10 +848,19 @@ static void nic_tx_channel_cfg(struct nicpf *nic, u8 vnic,
 	/* 24 bytes for FCS, IPG and preamble */
 	rr_quantum = ((NIC_HW_MAX_FRS + 24) / 4);
 
-	/* For 88xx 0-511 TL4 transmits via BGX0 and
-	 * 512-1023 TL4s transmit via BGX1.
-	 */
-	if (hw->tl1_per_bgx) {
+	if (veb_enabled) {
+		if (nic->vnic_intf_map[vnic] == 0) {
+			tl4 = (hw->tl4_cnt / hw->chans_per_bgx) * vnic;
+			chan = vnic;
+		} else if (nic->vnic_intf_map[vnic] == 1) {
+			tl4 = (hw->tl4_cnt / hw->bgx_cnt) +
+			      (hw->tl4_cnt / hw->chans_per_bgx) * (vnic - 64);
+			chan = 128 + (vnic - 64);
+		}
+	} else if (hw->tl1_per_bgx) {
+		/* For 88xx 0-511 TL4 transmits via BGX0 and
+		 * 512-1023 TL4s transmit via BGX1.
+		 */
 		tl4 = bgx * (hw->tl4_cnt / hw->bgx_cnt);
 		if (!sq->sqs_mode) {
 			tl4 += (lmac * MAX_QUEUES_PER_QSET);
@@ -686,8 +873,10 @@ static void nic_tx_channel_cfg(struct nicpf *nic, u8 vnic,
 			tl4 += (lmac * MAX_QUEUES_PER_QSET * MAX_SQS_PER_VF);
 			tl4 += (svf * MAX_QUEUES_PER_QSET);
 		}
+		chan = (lmac * hw->chans_per_lmac) + (bgx * hw->chans_per_bgx);
 	} else {
 		tl4 = (vnic * MAX_QUEUES_PER_QSET);
+		chan = (lmac * hw->chans_per_lmac) + (bgx * hw->chans_per_bgx);
 	}
 	tl4 += sq_idx;
 
@@ -706,7 +895,6 @@ static void nic_tx_channel_cfg(struct nicpf *nic, u8 vnic,
 	 * On 81xx/83xx TL3_CHAN reg should be configured with channel
 	 * within LMAC i.e 0-7 and not the actual channel number like on 88xx
 	 */
-	chan = (lmac * hw->chans_per_lmac) + (bgx * hw->chans_per_bgx);
 	if (hw->tl1_per_bgx)
 		nic_reg_write(nic, NIC_PF_TL3_0_255_CHAN | (tl3 << 3), chan);
 	else
@@ -874,6 +1062,30 @@ static void nic_enable_tunnel_parsing(struct nicpf *nic, int vf)
 		      ((0xfULL << 60) | vxlan_prot_def));
 }
 
+/* Delayed-work handler that drains the PF notification slots.
+ * Every slot flagged in valid_vf_mbx_msg[] is dispatched with the lock
+ * dropped: link-change messages go to send_link_change_to_vf(), all others
+ * to pf_notify_msg_handler(). A slot is released only after its dispatch
+ * has returned.
+ * @work: work item embedded in nicpf.notification_dwork
+ */
+void send_notifications(struct work_struct *work)
+{
+	struct nicpf *nic;
+	int i;
+
+	nic = container_of(work, struct nicpf, notification_dwork.work);
+	/* nic_submit_msg_notification() takes this lock on behalf of
+	 * nic_handle_mbx_intr(), the mailbox interrupt handler, so interrupts
+	 * are kept off while the lock is held here to avoid a self-deadlock.
+	 */
+	spin_lock_irq(&nic->vf_mbx_msg_lock);
+	for (i = 0; i < MAX_VF_MBOX_MESSAGE; i++) {
+		union nic_mbx *mbx = &nic->vf_mbx_msg[i];
+
+		if (!nic->valid_vf_mbx_msg[i])
+			continue;
+
+		/* The slot stays marked valid while unlocked, so it is not
+		 * handed out again before the dispatch below has finished.
+		 */
+		spin_unlock_irq(&nic->vf_mbx_msg_lock);
+		if (mbx->link_status.msg == NIC_MBOX_MSG_BGX_LINK_CHANGE)
+			send_link_change_to_vf(nic, (void *)mbx);
+		else
+			pf_notify_msg_handler(nic->node, (void *)mbx);
+		spin_lock_irq(&nic->vf_mbx_msg_lock);
+		nic->valid_vf_mbx_msg[i] = false;
+	}
+	spin_unlock_irq(&nic->vf_mbx_msg_lock);
+}
+
 static void nic_enable_vf(struct nicpf *nic, int vf, bool enable)
 {
 	int bgx, lmac;
@@ -889,6 +1101,187 @@ static void nic_enable_vf(struct nicpf *nic, int vf, bool enable)
 	bgx_lmac_rx_tx_enable(nic->node, bgx, lmac, enable);
 }
 
+/* Queue a link-down notification for physical port @port in the first free
+ * slot at index @from or later.
+ * Caller must hold nic->vf_mbx_msg_lock.
+ * Returns true if a slot was claimed, false if none was free.
+ */
+static bool nic_queue_link_down(struct nicpf *nic, int from, int port)
+{
+	union nic_mbx *lmbx;
+	int j;
+
+	for (j = from; j < MAX_VF_MBOX_MESSAGE; j++)
+		if (!nic->valid_vf_mbx_msg[j])
+			break;
+	if (j >= MAX_VF_MBOX_MESSAGE)
+		return false;
+
+	lmbx = &nic->vf_mbx_msg[j];
+	lmbx->link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE;
+	lmbx->link_status.lmac = port;
+	lmbx->link_status.link_up = 0;
+	nic->valid_vf_mbx_msg[j] = true;
+	return true;
+}
+
+/* Validate a VF mailbox request and queue it for the notification worker.
+ *
+ * Does nothing unless veb_enabled is set. Otherwise the request is checked
+ * (and NACKed if rejected), copied into the first free slot of
+ * nic->vf_mbx_msg[], adjusted per message type, and send_notifications()
+ * is scheduled through nic->notification_msg.
+ *
+ * @nic: PF device
+ * @vf: VF that sent the message
+ * @mbx: the message; on the CFG_DONE path for a PF-VF whose port link is
+ *       up it is rewritten in place into a BGX link-change message
+ *
+ * Return: 0 if the message was queued or needs no notification,
+ *         1 if it was NACKed, or on the PROMISC and non-queuing CFG_DONE
+ *           paths,
+ *        -1 if the notification array has no free slot.
+ */
+static int nic_submit_msg_notification(struct nicpf *nic, int vf,
+				       union nic_mbx *mbx)
+{
+	int i, ret = 0;
+
+	if (!veb_enabled)
+		return ret;
+
+	/* PF<->VF Communication Work, and request Validation */
+	switch (mbx->msg.msg) {
+	case NIC_MBOX_MSG_VLAN:
+		if (mbx->vlan_cfg.vlan_add &&
+		    (tns_filter_valid_entry(nic->node, NIC_MBOX_MSG_VLAN, vf,
+						mbx->vlan_cfg.vlan_id) ||
+		     nic->admin_vlan[vf])) {
+			nic_mbx_send_nack(nic, vf);
+			return 1;
+		}
+		break;
+	case NIC_MBOX_MSG_ADMIN_VLAN:
+		/* NOTE(review): vlan_cfg.vf_id comes from the VF's message
+		 * and indexes admin_vlan[] — confirm it is range-checked.
+		 */
+		if ((mbx->vlan_cfg.vlan_add &&
+		     (tns_filter_valid_entry(nic->node, NIC_MBOX_MSG_ADMIN_VLAN,
+						vf, mbx->vlan_cfg.vlan_id) ||
+		      nic->admin_vlan[mbx->vlan_cfg.vf_id])) ||
+		    (!is_pf(nic->node, vf) ||
+			(get_pf(nic->node, mbx->vlan_cfg.vf_id) != vf))) {
+			nic_mbx_send_nack(nic, vf);
+			return 1;
+		}
+		break;
+	case NIC_MBOX_MSG_UC_MC:
+		if (mbx->uc_mc_cfg.is_add &&
+		    tns_filter_valid_entry(nic->node, NIC_MBOX_MSG_UC_MC,
+					   vf, 0)) {
+			dev_err(&nic->pdev->dev, "MAC filter max reached\n");
+			nic_mbx_send_nack(nic, vf);
+			return 1;
+		}
+		break;
+	case NIC_MBOX_MSG_OP_UP:
+		if (!nic->vf_enabled[vf])
+			return 0;
+		break;
+	case NIC_MBOX_MSG_OP_DOWN:
+		if (!(nic->vf_enabled[vf] && nic->vf_op_enabled[vf]))
+			return 0;
+		break;
+	case NIC_MBOX_MSG_CFG_DONE:
+	{
+		int port = vf_to_pport(nic->node, vf);
+
+		/* Last message of VF config msg sequence */
+		nic->vf_enabled[vf] = true;
+		if (is_pf(nic->node, vf)) {
+			int bgx_id, lmac;
+
+			/* Only touch the BGX when the lookup succeeded, so
+			 * uninitialised indices never reach the hardware.
+			 */
+			if (get_bgx_id(nic->node, vf, &bgx_id, &lmac)) {
+				dev_err(&nic->pdev->dev,
+					"Unable to get BGX index\n");
+			} else {
+				/* ENABLE PAUSE FRAME GENERATION */
+				enable_pause_frames(nic->node, bgx_id, lmac);
+
+				bgx_lmac_rx_tx_enable(nic->node, bgx_id, lmac,
+						      true);
+			}
+		}
+		if (link_lmac[nic->node][port]) {
+			union nic_mbx reply = {};
+
+			reply.link_status.msg = NIC_MBOX_MSG_CFG_DONE;
+			reply.link_status.link_up = 1;
+			reply.link_status.duplex = pf_duplex[nic->node][port];
+			reply.link_status.speed = pf_speed[nic->node][port];
+			nic_send_msg_to_vf(nic, vf, &reply);
+		} else {
+			nic_mbx_send_ack(nic, vf);
+		}
+
+		/* A PF-VF coming up on a live port: turn the caller's message
+		 * into a link-change and queue it below.
+		 */
+		if (is_pf(nic->node, vf) && link_lmac[nic->node][port]) {
+			mbx->link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE;
+			mbx->link_status.link_up = 1;
+			mbx->link_status.speed = pf_speed[nic->node][port];
+			mbx->link_status.duplex = pf_duplex[nic->node][port];
+			mbx->link_status.lmac = port;
+			break;
+		}
+		return 1;
+	}
+	default:
+		break;
+	}
+
+	spin_lock(&nic->vf_mbx_msg_lock);
+	for (i = 0; i < MAX_VF_MBOX_MESSAGE; i++)
+		if (!nic->valid_vf_mbx_msg[i])
+			break;
+	if (i == MAX_VF_MBOX_MESSAGE) {
+		spin_unlock(&nic->vf_mbx_msg_lock);
+		dev_err(&nic->pdev->dev, "Notification array full msg: %d\n",
+			mbx->msg.msg);
+		return -1;
+	}
+
+	memcpy(&nic->vf_mbx_msg[i], mbx, sizeof(union nic_mbx));
+
+	switch (mbx->msg.msg) {
+	case NIC_MBOX_MSG_READY:
+		nic->vf_mbx_msg[i].msg.msg = NIC_MBOX_MSG_SET_MAC;
+		ether_addr_copy((u8 *)&nic->vf_mbx_msg[i].mac.mac_addr,
+				(u8 *)&nic->mac[vf]);
+		/* fall-through */
+	case NIC_MBOX_MSG_SET_MAC:
+		nic->vf_mbx_msg[i].mac.vf_id = vf;
+		break;
+	case NIC_MBOX_MSG_ADMIN_VLAN:
+		nic_send_msg_to_vf(nic, mbx->vlan_cfg.vf_id, mbx);
+		nic->admin_vlan[mbx->vlan_cfg.vf_id] = mbx->vlan_cfg.vlan_add;
+		break;
+	case NIC_MBOX_MSG_PROMISC:
+		ret = 1;
+		/* fall-through */
+	case NIC_MBOX_MSG_VLAN:
+	case NIC_MBOX_MSG_UC_MC:
+		break;
+	case NIC_MBOX_MSG_OP_UP:
+		nic->vf_op_enabled[vf] = true;
+		nic->vf_mbx_msg[i].mac.vf_id = vf;
+		break;
+	case NIC_MBOX_MSG_OP_DOWN:
+		nic->vf_mbx_msg[i].mac.vf_id = vf;
+		nic->vf_op_enabled[vf] = false;
+		break;
+	case NIC_MBOX_MSG_SHUTDOWN:
+	{
+		int submit_work = 0;
+
+#ifdef VNIC_MULTI_QSET_SUPPORT
+		if (vf >= nic->num_vf_en)
+			nic->sqs_used[vf - nic->num_vf_en] = false;
+		nic->pqs_vf[vf] = 0;
+#endif
+		if (is_pf(nic->node, vf)) {
+			int bgx_idx, lmac_idx;
+
+			/* As for CFG_DONE: skip the BGX on a failed lookup */
+			if (get_bgx_id(nic->node, vf, &bgx_idx, &lmac_idx))
+				dev_err(&nic->pdev->dev, "Unable to get BGX\n");
+			else
+				bgx_lmac_rx_tx_enable(nic->node, bgx_idx,
+						      lmac_idx, false);
+		}
+
+		if (is_pf(nic->node, vf) &&
+		    link_lmac[nic->node][vf_to_pport(nic->node, vf)]) {
+			int port = vf_to_pport(nic->node, vf);
+
+			/* Slot i is reserved for the SHUTDOWN copy but not
+			 * yet marked valid, so the link-down message needs a
+			 * second free slot; blindly using i + 1 could run off
+			 * the array or overwrite a pending message.
+			 */
+			if (nic_queue_link_down(nic, i + 1, port))
+				submit_work = 1;
+			else
+				dev_err(&nic->pdev->dev,
+					"Notification array full msg: %d\n",
+					NIC_MBOX_MSG_BGX_LINK_CHANGE);
+		}
+
+		if (nic->vf_enabled[vf] && nic->vf_op_enabled[vf]) {
+			nic->vf_mbx_msg[i].mac.vf_id = vf;
+			nic->vf_enabled[vf] = false;
+			submit_work = 1;
+		}
+		/* NOTE(review): when only the link-down was queued,
+		 * vf_enabled[vf] is left as it was — confirm this is intended.
+		 */
+		if (submit_work)
+			break;
+
+		/* First msg in VF teardown sequence */
+		nic->vf_enabled[vf] = false;
+		spin_unlock(&nic->vf_mbx_msg_lock);
+		return 0;
+	}
+	default:
+		break;
+	}
+
+	nic->valid_vf_mbx_msg[i] = true;
+	spin_unlock(&nic->vf_mbx_msg_lock);
+	queue_delayed_work(nic->notification_msg, &nic->notification_dwork, 0);
+
+	return ret;
+}
+
 /* Interrupt handler to handle mailbox messages from VFs */
 static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 {
@@ -916,12 +1309,32 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 		__func__, mbx.msg.msg, vf);
 	switch (mbx.msg.msg) {
 	case NIC_MBOX_MSG_READY:
+		if (veb_enabled) {
+			if (!is_pf(nic->node, vf) &&
+			    (vf > (get_pf(nic->node, vf) + veb_enabled))) {
+				nic_mbx_send_nack(nic, vf);
+				goto unlock;
+			}
+		}
 		nic_mbx_send_ready(nic, vf);
 		if (vf < nic->num_vf_en) {
 			nic->link[vf] = 0;
 			nic->duplex[vf] = 0;
 			nic->speed[vf] = 0;
 		}
+		//PF assigning MAC address for VF, as part of VF probe init
+		//We need to notify this to filter as VF set MAC
+		if (veb_enabled)
+			nic_submit_msg_notification(nic, vf, &mbx);
+		goto unlock;
+	case NIC_MBOX_MSG_VLAN:
+	case NIC_MBOX_MSG_ADMIN_VLAN:
+	case NIC_MBOX_MSG_UC_MC:
+		if (nic_submit_msg_notification(nic, vf, &mbx))
+			goto unlock;
+		break;
+	case NIC_MBOX_MSG_PROMISC:
+		nic_submit_msg_notification(nic, vf, &mbx);
 		goto unlock;
 	case NIC_MBOX_MSG_QS_CFG:
 		reg_addr = NIC_PF_QSET_0_127_CFG |
@@ -977,10 +1390,18 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 			ret = -1; /* NACK */
 			break;
 		}
-		lmac = mbx.mac.vf_id;
-		bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
-		lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
-		bgx_set_lmac_mac(nic->node, bgx, lmac, mbx.mac.mac_addr);
+		if (veb_enabled) {
+			nic_submit_msg_notification(nic, vf, &mbx);
+		} else {
+			int vf_lmac;
+
+			lmac = mbx.mac.vf_id;
+			vf_lmac = nic->vf_lmac_map[lmac];
+			bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(vf_lmac);
+			lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(vf_lmac);
+			bgx_set_lmac_mac(nic->node, bgx, lmac,
+					 mbx.mac.mac_addr);
+		}
 		break;
 	case NIC_MBOX_MSG_SET_MAX_FRS:
 		ret = nic_update_hw_frs(nic, mbx.frs.max_frs,
@@ -996,16 +1417,28 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 	case NIC_MBOX_MSG_RSS_CFG_CONT:
 		nic_config_rss(nic, &mbx.rss_cfg);
 		break;
+	case NIC_MBOX_MSG_OP_UP:
+	case NIC_MBOX_MSG_OP_DOWN:
+		if (nic_submit_msg_notification(nic, vf, &mbx))
+			goto unlock;
+		break;
 	case NIC_MBOX_MSG_CFG_DONE:
 		/* Last message of VF config msg sequence */
-		nic_enable_vf(nic, vf, true);
+		if (veb_enabled)
+			nic_submit_msg_notification(nic, vf, &mbx);
+		else
+			nic_enable_vf(nic, vf, true);
 		goto unlock;
 	case NIC_MBOX_MSG_SHUTDOWN:
-		/* First msg in VF teardown sequence */
-		if (vf >= nic->num_vf_en)
-			nic->sqs_used[vf - nic->num_vf_en] = false;
-		nic->pqs_vf[vf] = 0;
-		nic_enable_vf(nic, vf, false);
+		if (veb_enabled) {
+			nic_submit_msg_notification(nic, vf, &mbx);
+		} else {
+			/* First msg in VF teardown sequence */
+			if (vf >= nic->num_vf_en)
+				nic->sqs_used[vf - nic->num_vf_en] = false;
+			nic->pqs_vf[vf] = 0;
+			nic_enable_vf(nic, vf, false);
+		}
 		break;
 	case NIC_MBOX_MSG_ALLOC_SQS:
 		nic_alloc_sqs(nic, &mbx.sqs_alloc);
@@ -1228,47 +1661,148 @@ static void nic_poll_for_link(struct work_struct *work)
 	union nic_mbx mbx = {};
 	struct nicpf *nic;
 	struct bgx_link_status link;
-	u8 vf, bgx, lmac;
+	int vf, bgx, lmac;
 
 	nic = container_of(work, struct nicpf, dwork.work);
 
 	mbx.link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE;
 
-	for (vf = 0; vf < nic->num_vf_en; vf++) {
-		/* Poll only if VF is UP */
-		if (!nic->vf_enabled[vf])
-			continue;
+	if (veb_enabled) {
+		int port = 0, i;
+		union nic_mbx *mbxp;
 
-		/* Get BGX, LMAC indices for the VF */
-		bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
-		lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
-		/* Get interface link status */
-		bgx_get_lmac_link_state(nic->node, bgx, lmac, &link);
+		for (port = 0; port < TNS_MAX_LMAC; port++) {
+			int start_vf, end_vf;
 
-		/* Inform VF only if link status changed */
-		if (nic->link[vf] == link.link_up)
-			continue;
+			if (phy_port_to_bgx_lmac(nic->node, port, &bgx, &lmac))
+				continue;
 
-		if (!nic->mbx_lock[vf]) {
-			nic->link[vf] = link.link_up;
-			nic->duplex[vf] = link.duplex;
-			nic->speed[vf] = link.speed;
+			get_vf_group(nic->node, port, &start_vf, &end_vf);
+			if (!nic->vf_enabled[start_vf])
+				continue;
 
-			/* Send a mbox message to VF with current link status */
-			mbx.link_status.link_up = link.link_up;
-			mbx.link_status.duplex = link.duplex;
-			mbx.link_status.speed = link.speed;
-			nic_send_msg_to_vf(nic, vf, &mbx);
+			bgx_get_lmac_link_state(nic->node, bgx, lmac, &link);
+
+			if (link_lmac[nic->node][port] == link.link_up)
+				continue;
+
+			link_lmac[nic->node][port] = link.link_up;
+			pf_speed[nic->node][port] = link.speed;
+			pf_duplex[nic->node][port] = link.duplex;
+
+			spin_lock(&nic->vf_mbx_msg_lock);
+			for (i = 0; i < MAX_VF_MBOX_MESSAGE; i++)
+				if (!nic->valid_vf_mbx_msg[i])
+					break;
+
+			if (i == MAX_VF_MBOX_MESSAGE) {
+				spin_unlock(&nic->vf_mbx_msg_lock);
+				return;
+			}
+
+			mbxp = &nic->vf_mbx_msg[i];
+			nic->valid_vf_mbx_msg[i] = true;
+			mbxp->link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE;
+			mbxp->link_status.link_up = link.link_up;
+			mbxp->link_status.speed = link.speed;
+			mbxp->link_status.duplex = link.duplex;
+			mbxp->link_status.lmac = port;
+			spin_unlock(&nic->vf_mbx_msg_lock);
+			queue_delayed_work(nic->notification_msg,
+					   &nic->notification_dwork, 0);
+
+			break;
+		}
+	} else {
+		for (vf = 0; vf < nic->num_vf_en; vf++) {
+			int vf_lmac = nic->vf_lmac_map[vf];
+
+			/* Poll only if VF is UP */
+			if (!nic->vf_enabled[vf])
+				continue;
+
+			/* Get BGX, LMAC indices for the VF */
+			bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(vf_lmac);
+			lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(vf_lmac);
+			/* Get interface link status */
+			bgx_get_lmac_link_state(nic->node, bgx, lmac, &link);
+
+			/* Inform VF only if link status changed */
+			if (nic->link[vf] == link.link_up)
+				continue;
+
+			if (!nic->mbx_lock[vf]) {
+				nic->link[vf] = link.link_up;
+				nic->duplex[vf] = link.duplex;
+				nic->speed[vf] = link.speed;
+
+				/* Send a mbox message to VF with current
+				 * link status
+				 */
+				mbx.link_status.link_up = link.link_up;
+				mbx.link_status.duplex = link.duplex;
+				mbx.link_status.speed = link.speed;
+				nic_send_msg_to_vf(nic, vf, &mbx);
+			}
 		}
 	}
 	queue_delayed_work(nic->check_link, &nic->dwork, HZ * 2);
 }
 
+/* Apply the TNS-mode PF setup: enable the BP bus on both interfaces, map
+ * vnics 0-63 to interface 0 and the remaining ones up to PF_END to
+ * interface 1, program the TNS-mode bp_id on both interfaces, and record
+ * PF_END as the number of enabled VFs.
+ * @nic: PF device being configured
+ */
+static void set_tns_config(struct nicpf *nic)
+{
+	const u32 total_vfs = PF_END;
+	u32 vnic;
+
+	bp_enable(nic, 0);
+	bp_enable(nic, 1);
+
+	for (vnic = 0; vnic < total_vfs; vnic++)
+		nic->vnic_intf_map[vnic] = (vnic < 64) ? 0 : 1;
+
+	set_bp_id(nic, 0, 1);
+	set_bp_id(nic, 1, 1);
+
+	nic->num_vf_en = total_vfs;
+}
+
+/* Try to load the FW_NAME firmware image for @dev via request_firmware().
+ * @fw: passed through to request_firmware() to receive the image
+ * @dev: device the request is made for and errors are logged against
+ * Returns true when request_firmware() reports success; otherwise logs
+ * two error lines and returns false.
+ */
+static inline bool firmware_image_available(const struct firmware **fw,
+					    struct device *dev)
+{
+	if (!request_firmware(fw, FW_NAME, dev))
+		return true;
+
+	dev_err(dev, "firmware file %s not found\n", FW_NAME);
+	dev_err(dev, "Fall back to backward compatible mode\n");
+	return false;
+}
+
+static int tns_init_done;
+
+/* Clamp the per-port VF count to @max_vf_cnt - 1.
+ * When veb_enabled exceeds that limit, both veb_enabled and num_vfs are
+ * lowered to it and the new value is logged; otherwise nothing changes.
+ * @max_vf_cnt: upper bound supplied by the caller
+ */
+void nic_enable_valid_vf(int max_vf_cnt)
+{
+	int limit = max_vf_cnt - 1;
+
+	if (veb_enabled <= limit)
+		return;
+
+	veb_enabled = limit;
+	num_vfs = limit;
+	pr_info("Number of VF's per physical port set to %d\n", limit);
+}
+
 static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct device *dev = &pdev->dev;
 	struct nicpf *nic;
 	int    err;
+	const struct firmware *fw;
 
 	BUILD_BUG_ON(sizeof(union nic_mbx) > 16);
 
@@ -1319,6 +1853,19 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_release_regions;
 	}
 
+	if (veb_enabled && !tns_init_done) {
+		u16 sdevid;
+
+		pci_read_config_word(nic->pdev, PCI_SUBSYSTEM_ID, &sdevid);
+		if (sdevid == PCI_SUBSYS_DEVID_88XX_NIC_PF &&
+		    firmware_image_available(&fw, dev)) {
+			pr_info("Number Of VF's %d enabled per physical port\n",
+				num_vfs);
+		} else {
+			veb_enabled = 0;
+			num_vfs = 0;
+		}
+	}
 	nic->node = nic_get_node_id(pdev);
 
 	/* Initialize hardware */
@@ -1326,13 +1873,48 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto err_release_regions;
 
-	nic_set_lmac_vf_mapping(nic);
+	if (veb_enabled) {
+		nic_set_pf_vf_mapping(nic->node);
+		/* init TNS function pointers */
+		set_tns_config(nic);
+	} else {
+		nic_set_lmac_vf_mapping(nic);
+	}
 
 	/* Register interrupts */
 	err = nic_register_interrupts(nic);
 	if (err)
 		goto err_release_regions;
 
+	if (veb_enabled) {
+		int i;
+
+		for (i = 0; i < TNS_MAX_LMAC; i++)
+			link_lmac[nic->node][i] = 0;
+
+		spin_lock_init(&nic->vf_mbx_msg_lock);
+
+		nic->notification_msg = alloc_workqueue("notification_work",
+						WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+		if (!nic->notification_msg) {
+			err = -ENOMEM;
+			goto err_unregister_interrupts;
+		}
+		INIT_DELAYED_WORK(&nic->notification_dwork, send_notifications);
+		if (!tns_init_done) {
+			if (tns_init(fw, dev)) {
+				dev_err(dev, "Failed to init filter block\n");
+				err = -ENODEV;
+				goto err_unregister_interrupts;
+			}
+			tns_init_done = 1;
+			if (pf_filter_init()) {
+				pr_info("Failed to configure HW filter\n");
+				goto err_unregister_interrupts;
+			}
+		}
+	}
+
 	/* Configure SRIOV */
 	err = nic_sriov_init(pdev, nic);
 	if (err)
@@ -1356,6 +1938,10 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		pci_disable_sriov(pdev);
 err_unregister_interrupts:
 	nic_unregister_interrupts(nic);
+	if (veb_enabled && nic->notification_msg) {
+		cancel_delayed_work_sync(&nic->notification_dwork);
+		destroy_workqueue(nic->notification_msg);
+	}
 err_release_regions:
 	pci_release_regions(pdev);
 err_disable_device:
@@ -1379,10 +1965,14 @@ static void nic_remove(struct pci_dev *pdev)
 		cancel_delayed_work_sync(&nic->dwork);
 		destroy_workqueue(nic->check_link);
 	}
-
 	nic_unregister_interrupts(nic);
 	pci_release_regions(pdev);
 
+	if (veb_enabled && nic->notification_msg) {
+		cancel_delayed_work_sync(&nic->notification_dwork);
+		destroy_workqueue(nic->notification_msg);
+	}
+
 	nic_free_lmacmem(nic);
 	devm_kfree(&pdev->dev, nic->hw);
 	devm_kfree(&pdev->dev, nic);
@@ -1402,11 +1992,20 @@ static int __init nic_init_module(void)
 {
 	pr_info("%s, ver %s\n", DRV_NAME, DRV_VERSION);
 
+	veb_enabled = num_vfs;
+	if (veb_enabled)
+		nic_init_pf_vf_mapping();
+
 	return pci_register_driver(&nic_driver);
 }
 
 static void __exit nic_cleanup_module(void)
 {
+	if (veb_enabled) {
+		tns_init_done = 0;
+		tns_exit();
+	}
+
 	pci_unregister_driver(&nic_driver);
 }
 
-- 
1.8.3.1

^ permalink raw reply related

* [RFC PATCH 0/7] ThunderX Embedded switch support
From: Satha Koteswara Rao @ 2016-12-21  8:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: rvatsavayi, rric, david.daney, netdev, satha.rao, derek.chickles,
	sgoutham, davem, linux-arm-kernel, philip.romanov

Background
==========

Proposed patch configures programmable ThunderX embedded network switch to
emulate functions of e-switch found in most industry standard NICs.

Embedded switch is pre-configured by loading firmware image which exposes
several firmware defined tables allowing configuration of VLAN and MAC filters.

Embedded switch configuration profile and the driver introduce the following
features:

* Support of configurable number of VFs per physical port (see num_vfs below)

* VLAN filters per VF

* Unicast MAC-DA filters per VF

* Multicast MAC-DA filters per VF

* Support of dedicated VF allowing packet mirroring of all traffic traversing
  physical port (such VF is attached to the interface representing physical
  port)

* Administrative VLAN enforcement per VF (i.e. inserting/overwriting VLAN tag
  on all traffic originated by particular VF)

Each VF operates in two modes: a) full filter mode, where it receives only
registered MAC-DA/VLAN packets and b) multicast promiscuous mode. The latter
is enabled when VF reaches its maximum MAC-DA filter limit: in this mode VF
receives all multicast and registered unicast MAC frames.

Special effort is made to track association of interface switching groups
to underlying physical ports: entry of /sys/class/net/<intf>/phys_port_name
contains string describing underlying physical port the interface is attached
to in a form <node-id>-<port-group-id>-<port>.

The set of patches includes the following changes:

1) Patch to original NIC drivers to enable internal switch, and load firmware
   image.

2) Modification of VF driver to subscribe to interface MAC/VLAN ADD/DELETE
   notifications and send them to the PF driver.

3) Modification of PF driver to receive MBOX interrupts from VF for ADD/DELETE
   MAC/VLAN registrations.

4) E-switch initialization code

5) API to access firmware-defined embedded switch tables.

The following new parameter is introduced by the driver:

num_vfs: Number of VFs attached to each physical port, default value of this
         parameter is 0, in which case driver operates in backward-compatible
         switch bypass mode.

Set of patches uses below git branch

git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git

patchset generated against below commit

commit 69973b830859bc6529a7a0468ba0d80ee5117826
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sun Dec 11 11:17:54 2016 -0800

    Linux 4.9

Thank You!

-------------------------------------------------------------------------------

Satha Koteswara Rao (7):

* Patch 1:
  * PF driver modified to enable HW filter support; the changes work in
    backward-compatibility mode. Enable required things in Makefile.
    Enable LZ4 dependency inside config file.

* Patch 2:
  * VF driver changes to enable hooks to get kernel notifications

* Patch 3:
  * Enable pause frame support

* Patch 4:
  * HW Filter Initialization code and register access APIs

* Patch 5:
  * Multiple VFs grouped together under a single physical port, called a PF
    group. PF group maintenance APIs.

* Patch 6:
  * HW Filter Table access API's

* Patch 7:
  * Get notifications from PF driver and configure filter block based on
    requested data.

 drivers/net/ethernet/cavium/Kconfig               |    1 +
 drivers/net/ethernet/cavium/thunder/Makefile      |    2 +-
 drivers/net/ethernet/cavium/thunder/nic.h         |  203 ++-
 drivers/net/ethernet/cavium/thunder/nic_main.c    |  735 ++++++++-
 drivers/net/ethernet/cavium/thunder/nicvf_main.c  |  579 ++++++-
 drivers/net/ethernet/cavium/thunder/pf_filter.c   | 1678 +++++++++++++++++++++
 drivers/net/ethernet/cavium/thunder/pf_globals.h  |   78 +
 drivers/net/ethernet/cavium/thunder/pf_locals.h   |  365 +++++
 drivers/net/ethernet/cavium/thunder/pf_reg.c      |  660 ++++++++
 drivers/net/ethernet/cavium/thunder/pf_vf.c       |  207 +++
 drivers/net/ethernet/cavium/thunder/tbl_access.c  |  262 ++++
 drivers/net/ethernet/cavium/thunder/tbl_access.h  |   61 +
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c |   25 +
 drivers/net/ethernet/cavium/thunder/thunder_bgx.h |    7 +
 14 files changed, 4712 insertions(+), 151 deletions(-)
 create mode 100644 drivers/net/ethernet/cavium/thunder/pf_filter.c
 create mode 100644 drivers/net/ethernet/cavium/thunder/pf_globals.h
 create mode 100644 drivers/net/ethernet/cavium/thunder/pf_locals.h
 create mode 100644 drivers/net/ethernet/cavium/thunder/pf_reg.c
 create mode 100644 drivers/net/ethernet/cavium/thunder/pf_vf.c
 create mode 100644 drivers/net/ethernet/cavium/thunder/tbl_access.c
 create mode 100644 drivers/net/ethernet/cavium/thunder/tbl_access.h

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v3] rfkill: Add rfkill-any LED trigger
From: Michał Kępień @ 2016-12-21  8:45 UTC (permalink / raw)
  To: Johannes Berg, David S . Miller
  Cc: Михаил Кринкин,
	linux-wireless, netdev, linux-kernel

Add a new "global" (i.e. not per-rfkill device) LED trigger, rfkill-any,
which may be useful on laptops with a single "radio LED" and multiple
radio transmitters.  The trigger is meant to turn a LED on whenever
there is at least one radio transmitter active and turn it off
otherwise.

This requires taking rfkill_global_mutex before calling
rfkill_set_block() in rfkill_resume(): since
rfkill_any_led_trigger_event(true) is called from rfkill_set_block()
unconditionally, each caller of the latter needs to take care of locking
rfkill_global_mutex.

Signed-off-by: Michał Kępień <kernel@kempniu.pl>
---
Jonathan, I refrained from resending patch 1/2 from v2 as part of this
series as it is currently applied in mac80211-next/master along with
Arnd's fix.  Please let me know if you would like me to handle this
differently.

Mike, could you please test whether this version works fine on your
machine?  Thanks!

Changes from v2:

  - Handle the global mutex properly when rfkill_set_{hw,sw}_state() or
    rfkill_set_states() is called from within an rfkill callback.  v2
    always tried to lock the global mutex in such a case, which led to a
    deadlock when an rfkill driver called one of the above functions
    from its query or set_block callback.  This is solved by defining a
    new bitfield, RFKILL_BLOCK_SW_HASLOCK, which is set before the above
    callbacks are invoked and cleared afterwards; the functions listed
    above use this bitfield to tell rfkill_any_led_trigger_event()
    whether the global mutex is currently held or not.
    RFKILL_BLOCK_SW_SETCALL cannot be reused for this purpose as setting
    it before invoking the query callback would cause any calls to
    rfkill_set_sw_state() made from within that callback to work on
    RFKILL_BLOCK_SW_PREV instead of RFKILL_BLOCK_SW and thus change the
    way rfkill_set_block() behaves.

  - As rfkill_any_led_trigger_event() now takes a boolean argument which
    tells it whether the global mutex was already taken by the caller,
    all calls to __rfkill_any_led_trigger_event() outside
    rfkill_any_led_trigger_event() have been replaced with calls to
    rfkill_any_led_trigger_event(true).

 net/rfkill/core.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 4 deletions(-)

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index afa4f71b4c7b..688eac7b97ef 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -44,6 +44,7 @@
 #define RFKILL_BLOCK_ANY	(RFKILL_BLOCK_HW |\
 				 RFKILL_BLOCK_SW |\
 				 RFKILL_BLOCK_SW_PREV)
+#define RFKILL_BLOCK_SW_HASLOCK	BIT(30)
 #define RFKILL_BLOCK_SW_SETCALL	BIT(31)
 
 struct rfkill {
@@ -176,6 +177,51 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 	led_trigger_unregister(&rfkill->led_trigger);
 }
+
+static struct led_trigger rfkill_any_led_trigger;
+
+static void __rfkill_any_led_trigger_event(void)
+{
+	enum led_brightness brightness = LED_OFF;
+	struct rfkill *rfkill;
+
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+		if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
+			brightness = LED_FULL;
+			break;
+		}
+	}
+
+	led_trigger_event(&rfkill_any_led_trigger, brightness);
+}
+
+static void rfkill_any_led_trigger_event(bool global_locked)
+{
+	if (global_locked) {
+		__rfkill_any_led_trigger_event();
+	} else {
+		mutex_lock(&rfkill_global_mutex);
+		__rfkill_any_led_trigger_event();
+		mutex_unlock(&rfkill_global_mutex);
+	}
+}
+
+static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
+{
+	rfkill_any_led_trigger_event(false);
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+	rfkill_any_led_trigger.name = "rfkill-any";
+	rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
+	return led_trigger_register(&rfkill_any_led_trigger);
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+	led_trigger_unregister(&rfkill_any_led_trigger);
+}
 #else
 static void rfkill_led_trigger_event(struct rfkill *rfkill)
 {
@@ -189,6 +235,19 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
 static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 }
+
+static void rfkill_any_led_trigger_event(bool global_locked)
+{
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+	return 0;
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+}
 #endif /* CONFIG_RFKILL_LEDS */
 
 static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
@@ -253,6 +312,10 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 	if (unlikely(rfkill->dev.power.power_state.event & PM_EVENT_SLEEP))
 		return;
 
+	spin_lock_irqsave(&rfkill->lock, flags);
+	rfkill->state |= RFKILL_BLOCK_SW_HASLOCK;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
 	/*
 	 * Some platforms (...!) generate input events which affect the
 	 * _hard_ kill state -- whenever something tries to change the
@@ -292,11 +355,13 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 			rfkill->state &= ~RFKILL_BLOCK_SW;
 	}
 	rfkill->state &= ~RFKILL_BLOCK_SW_SETCALL;
+	rfkill->state &= ~RFKILL_BLOCK_SW_HASLOCK;
 	rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
 	curr = rfkill->state & RFKILL_BLOCK_SW;
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event(true);
 
 	if (prev != curr)
 		rfkill_event(rfkill);
@@ -463,7 +528,7 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type)
 bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 {
 	unsigned long flags;
-	bool ret, prev;
+	bool ret, prev, global_locked;
 
 	BUG_ON(!rfkill);
 
@@ -474,9 +539,11 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 	else
 		rfkill->state &= ~RFKILL_BLOCK_HW;
 	ret = !!(rfkill->state & RFKILL_BLOCK_ANY);
+	global_locked = !!(rfkill->state & RFKILL_BLOCK_SW_HASLOCK);
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event(global_locked);
 
 	if (rfkill->registered && prev != blocked)
 		schedule_work(&rfkill->uevent_work);
@@ -502,7 +569,7 @@ static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 {
 	unsigned long flags;
-	bool prev, hwblock;
+	bool prev, hwblock, global_locked;
 
 	BUG_ON(!rfkill);
 
@@ -511,6 +578,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 	__rfkill_set_sw_state(rfkill, blocked);
 	hwblock = !!(rfkill->state & RFKILL_BLOCK_HW);
 	blocked = blocked || hwblock;
+	global_locked = !!(rfkill->state & RFKILL_BLOCK_SW_HASLOCK);
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	if (!rfkill->registered)
@@ -520,6 +588,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	rfkill_led_trigger_event(rfkill);
+	rfkill_any_led_trigger_event(global_locked);
 
 	return blocked;
 }
@@ -542,7 +611,7 @@ EXPORT_SYMBOL(rfkill_init_sw_state);
 void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 {
 	unsigned long flags;
-	bool swprev, hwprev;
+	bool swprev, hwprev, global_locked;
 
 	BUG_ON(!rfkill);
 
@@ -559,6 +628,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 		rfkill->state |= RFKILL_BLOCK_HW;
 	else
 		rfkill->state &= ~RFKILL_BLOCK_HW;
+	global_locked = !!(rfkill->state & RFKILL_BLOCK_SW_HASLOCK);
 
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
@@ -569,6 +639,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 			schedule_work(&rfkill->uevent_work);
 
 		rfkill_led_trigger_event(rfkill);
+		rfkill_any_led_trigger_event(global_locked);
 	}
 }
 EXPORT_SYMBOL(rfkill_set_states);
@@ -812,8 +883,10 @@ static int rfkill_resume(struct device *dev)
 	rfkill->suspended = false;
 
 	if (!rfkill->persistent) {
+		mutex_lock(&rfkill_global_mutex);
 		cur = !!(rfkill->state & RFKILL_BLOCK_SW);
 		rfkill_set_block(rfkill, cur);
+		mutex_unlock(&rfkill_global_mutex);
 	}
 
 	if (rfkill->ops->poll && !rfkill->polling_paused)
@@ -985,6 +1058,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 #endif
 	}
 
+	rfkill_any_led_trigger_event(true);
 	rfkill_send_events(rfkill, RFKILL_OP_ADD);
 
 	mutex_unlock(&rfkill_global_mutex);
@@ -1017,6 +1091,7 @@ void rfkill_unregister(struct rfkill *rfkill)
 	mutex_lock(&rfkill_global_mutex);
 	rfkill_send_events(rfkill, RFKILL_OP_DEL);
 	list_del_init(&rfkill->node);
+	rfkill_any_led_trigger_event(true);
 	mutex_unlock(&rfkill_global_mutex);
 
 	rfkill_led_trigger_unregister(rfkill);
@@ -1269,6 +1344,10 @@ static int __init rfkill_init(void)
 	if (error)
 		goto error_misc;
 
+	error = rfkill_any_led_trigger_register();
+	if (error)
+		goto error_led_trigger;
+
 #ifdef CONFIG_RFKILL_INPUT
 	error = rfkill_handler_init();
 	if (error)
@@ -1279,8 +1358,10 @@ static int __init rfkill_init(void)
 
 #ifdef CONFIG_RFKILL_INPUT
 error_input:
-	misc_deregister(&rfkill_miscdev);
+	rfkill_any_led_trigger_unregister();
 #endif
+error_led_trigger:
+	misc_deregister(&rfkill_miscdev);
 error_misc:
 	class_unregister(&rfkill_class);
 error_class:
@@ -1293,6 +1374,7 @@ static void __exit rfkill_exit(void)
 #ifdef CONFIG_RFKILL_INPUT
 	rfkill_handler_exit();
 #endif
+	rfkill_any_led_trigger_unregister();
 	misc_deregister(&rfkill_miscdev);
 	class_unregister(&rfkill_class);
 }
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net] be2net: Increase skb headroom size to 256 bytes
From: Sriharsha Basavapatna @ 2016-12-21  8:13 UTC (permalink / raw)
  To: David Miller; +Cc: Suresh Kumar Reddy Reddygari, netdev, Kalesh Anakkur Purayil
In-Reply-To: <20161220.143032.511618714729371345.davem@davemloft.net>

On Wed, Dec 21, 2016 at 1:00 AM, David Miller <davem@davemloft.net> wrote:
>
> From: Suresh Reddy <suresh.reddy@broadcom.com>
> Date: Tue, 20 Dec 2016 10:14:30 -0500
>
> > From: Kalesh A P <kalesh-anakkur.purayil@broadcom.com>
> >
> > The driver currently allocates 128 bytes of skb headroom.
> > This was found to be insufficient with some configurations
> > like Geneve tunnels, which resulted in skb head reallocations.
> >
> > Increase the headroom to 256 bytes to fix this.
> >
> > Signed-off-by: Kalesh A P <kalesh-anakkur.purayil@broadcom.com>
> > Signed-off-by: Suresh Reddy <suresh.reddy@broadcom.com>
>
> Adding 128 bytes of headroom just for geneve seems excessive.
>
> Do you really need to add that much?

Hi David,

With geneve, there's a fixed 8 byte header followed by a variable
options header of max length 256 bytes. And with vlan+ipv6 combination
we could run out of 128 bytes headroom even with just the fixed geneve
header (+ outer and inner eth/ipv6/transport headers).

Thanks,
-Harsha

^ permalink raw reply

* Re: Soft lockup in tc_classify
From: Cong Wang @ 2016-12-21  7:03 UTC (permalink / raw)
  To: Shahar Klein
  Cc: Daniel Borkmann, Or Gerlitz, Linux Netdev List, Roi Dayan,
	David Miller, Jiri Pirko, John Fastabend, Hadar Hen Zion
In-Reply-To: <66a3cb1e-862f-70e6-fff4-5a421766c432@mellanox.com>

On Tue, Dec 20, 2016 at 10:44 PM, Shahar Klein <shahark@mellanox.com> wrote:
>
> Tried it with same results

This piece is pretty interesting:

[  408.554689] DEBUGG:SK thread-2853[cpu-1] setting tp_created to 1
tp=ffff94b5b02805a0 back=ffff94b9ea932060
[  408.574258] DEBUGG:SK thread-2853[cpu-1] add/change filter by:
fl_get [cls_flower] tp=ffff94b5b02805a0 tp->next=ffff94b9ea932060
[  408.587849] DEBUGG:SK destroy ffff94b5b0280780 tcf_destroy:1905
[  408.595862] DEBUGG:SK thread-2845[cpu-1] add/change filter by:
fl_get [cls_flower] tp=ffff94b5b02805a0 tp->next=ffff94b5b02805a0

Looks like you added a debug printk inside tcf_destroy() too,
which seems racy with filter creation, it should not happen since
in both cases we take RTNL lock.

Don't know if changing all RCU_INIT_POINTER in that file to
rcu_assign_pointer could help anything or not. Mind to try?


Thanks for debugging!

^ permalink raw reply

* Re: HalfSipHash Acceptable Usage
From: George Spelvin @ 2016-12-21  6:34 UTC (permalink / raw)
  To: eric.dumazet, linux
  Cc: ak, davem, David.Laight, djb, ebiggers3, hannes, Jason,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, luto, netdev, tom, torvalds, tytso, vegard.nossum
In-Reply-To: <1482298164.8944.8.camel@edumazet-glaptop3.roam.corp.google.com>

Eric Dumazet wrote:
> On Tue, 2016-12-20 at 22:28 -0500, George Spelvin wrote:
>> Cycles per byte on 1024 bytes of data:
>> 			Pentium	Core 2	Ivy
>> 			4	Duo	Bridge
>> SipHash-2-4		38.9	 8.3	 5.8
>> HalfSipHash-2-4	12.7	 4.5	 3.2
>> MD5			 8.3	 5.7	 4.7
>
> So definitely not faster.
> 
> 38 cycles per byte is a problem, considering IPV6 is ramping up.

As I said earlier, SipHash performance on 32-bit x86 really sucks,
because it wants an absolute minimum of 9 32-bit registers (8 for the
state plus one temporary for the rotates), and x86 has 7.

> What about SHA performance (syncookies) on P4 ?

I recompiled with -mtune=pentium4 and re-ran.  MD5 time went *up* by
0.3 cycles/byte, HalfSipHash went down by 1 cycle, and SipHash didn't
change:

Cycles per byte on 1024 bytes of data:
		Pentium	Core 2	Ivy
		4	Duo	Bridge
SipHash-2-4	38.9	 8.3	 5.8
HalfSipHash-2-4	11.5	 4.5	 3.2
MD5		 8.6	 5.7	 4.7
SHA-1		19.0	 8.0	 6.8

(This is with a verbatim copy of the lib/sha1.c code; I might be
able to optimize it with some asm hackery.)

Anyway, you see why we were looking longingly at HalfSipHash.

In fact, I have an idea.  Allow me to make the following concrete
suggestion for using HalfSipHash with 128 bits of key material:

- 64 bits are used as the key.
- The other 64 bits are used as an IV which is prepended to
  the message to be hashed.

As a matter of practical implementation, we precompute the effect
of hashing the IV and store the 128-bit HalfSipHash state, which
is used just like a 128-bit key.

Because of the way it is constructed, it is obviously no weaker than
standard HalfSipHash's 64-bit security claim.

I don't know the security of this, and it's almost certainly weaker than
128 bits, but I *hope* it's at least a few bits stronger than 64 bits.
80 would be enough to dissuade any attacker without a six-figure budget
(that's per attack, not a one-time capital investment).  96 would be
ample for our purposes.

What I do know is that it makes a brute-force attack without
significant cryptanalytic effort impossible.

To match the spec exactly, we'd need to add the 8-byte IV length to
the length byte which pads the final block, but from a security point
of view, it does not matter.  As long as we are consistent within any
single key, any unique mapping between padding byte and message length
(mod 256) is equally good.

We may choose based on implementation convenience.

(Also note my earlier comments about when it is okay to omit the padding
length byte entirely: any time all the data to be hashed with a given
key is fixed in format or self-delimiting (e.g. null-terminated).
This applies to many of the networking uses.)

^ permalink raw reply

* Re: ipv6: handle -EFAULT from skb_copy_bits
From: Cong Wang @ 2016-12-21  6:09 UTC (permalink / raw)
  To: Dave Jones; +Cc: David Miller, Linux Kernel Network Developers
In-Reply-To: <20161220221214.w3zerfiy4wu6apee@codemonkey.org.uk>

On Tue, Dec 20, 2016 at 2:12 PM, Dave Jones <davej@codemonkey.org.uk> wrote:
>         fd = socket(AF_INET6, SOCK_RAW, 7);
>
>         setsockopt(fd, SOL_IPV6, IPV6_CHECKSUM, &zero, 4);
>         setsockopt(fd, SOL_IPV6, IPV6_DSTOPTS, &buf, LEN);
>

Interesting, you set the checksum offset to be 0, but the packet size
is actually 49, transport header is located at offset 48, so apparently
the packet doesn't have room for a 16bit checksum after network header.

Your original patch seems reasonable to me, unless there is some
check in __ip6_append_data() which is supposed to catch this, but
CHECKSUM is specific to raw socket only.

^ permalink raw reply

* Re: HalfSipHash Acceptable Usage
From: Eric Dumazet @ 2016-12-21  5:29 UTC (permalink / raw)
  To: George Spelvin
  Cc: tytso, ak, davem, David.Laight, djb, ebiggers3, hannes, Jason,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, luto, netdev, tom, torvalds, vegard.nossum
In-Reply-To: <20161221032829.3031.qmail@ns.sciencehorizons.net>

On Tue, 2016-12-20 at 22:28 -0500, George Spelvin wrote:
> > I do not see why SipHash, if faster than MD5 and more secure, would be a
> > problem.
> 
> Because on 32-bit x86, it's slower.
> 
> Cycles per byte on 1024 bytes of data:
> 			Pentium	Core 2	Ivy
> 			4	Duo	Bridge
> SipHash-2-4		38.9	 8.3	 5.8
> HalfSipHash-2-4		12.7	 4.5	 3.2
> MD5			 8.3	 5.7	 4.7

So definitely not faster.

38 cycles per byte is a problem, considering IPV6 is ramping up.

But TCP session establishment on P4 is probably not a big deal.
Nobody would expect a P4 to handle gazillions of TCP flows (using a
32bit kernel)

What about SHA performance (syncookies) on P4 ?

Synfloods are probably the only case we might take care of for 2000-era
cpus.

^ permalink raw reply

* Re: Potential issues (security and otherwise) with the current cgroup-bpf API
From: Alexei Starovoitov @ 2016-12-21  4:01 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Daniel Mack, Andy Lutomirski, Mickaël Salaün, Kees Cook,
	Jann Horn, Tejun Heo, David Ahern, David S. Miller, Thomas Graf,
	Michael Kerrisk, Peter Zijlstra, Linux API,
	linux-kernel@vger.kernel.org, Network Development
In-Reply-To: <CALCETrVhjiRMSkvmMDgq_O7niWHiRcvjq-my8nYS9_s8JDye1g@mail.gmail.com>

On Tue, Dec 20, 2016 at 10:49:25AM -0800, Andy Lutomirski wrote:
> >> FWIW, everywhere I say ioctl(), the bpf() syscall would be okay, too.
> >> It doesn't make a semantic difference, except that I dislike
> >> BPF_PROG_DETACH because that particular command isn't BPF-specific at
> >> all.
> >
> > Well, I think it is; it pops the bpf program from a target and drops the
> > reference on it. It's not much code, but it's certainly bpf-specific.
> 
> I mean the interface isn't bpf-specific.  If there was something that
> wasn't bpf attached to the target, you'd still want an API to detach
> it.

This discussion won't go anywhere while you keep thinking that this api
has to be generalized. As I explained several times earlier
BPF_CGROUP_INET_SOCK_CREATE hook is bpf specific. There is nothing
in the kernel that can take advantage of it today, so by definition
the hook is bpf specific. Period. Saying that something in the future
may come along that would want to use that is like saying I want
to design the generic steering wheel for any car that will ever need it.

Hence if you want to change 'target_fd' in BPF_PROG_ATTACH/DETACH cmds
from being fd of open("cgroupdir") to fd of open("cgroupdir/cgroup.bpf")
file inside it then I'm ok with that.
All other proposals with non-extensible ioctls() and crazy text based
per-hook permissions is nack.

^ permalink raw reply

* Re: HalfSipHash Acceptable Usage
From: George Spelvin @ 2016-12-21  3:28 UTC (permalink / raw)
  To: eric.dumazet, tytso
  Cc: ak, davem, David.Laight, djb, ebiggers3, hannes, Jason,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, linux, luto, netdev, tom, torvalds, vegard.nossum
In-Reply-To: <1482278145.1521.46.camel@edumazet-glaptop3.roam.corp.google.com>

> I do not see why SipHash, if faster than MD5 and more secure, would be a
> problem.

Because on 32-bit x86, it's slower.

Cycles per byte on 1024 bytes of data:
			Pentium	Core 2	Ivy
			4	Duo	Bridge
SipHash-2-4		38.9	 8.3	 5.8
HalfSipHash-2-4		12.7	 4.5	 3.2
MD5			 8.3	 5.7	 4.7

SipHash is more parallelizable and runs faster on superscalar processors,
but MD5 is optimized for 2000-era processors, and is faster on them than
HalfSipHash even.

Now, in the applications we care about, we're hashing short blocks, and
SipHash has the advantage that it can hash less than 64 bytes.  But it
also pays a penalty on short blocks for the finalization, equivalent to
two words (16 bytes) of input.

It turns out that on both Ivy Bridge and Core 2 Duo, the crossover happens
between 23 (SipHash is faster) and 24 (MD5 is faster) bytes of input.

This is assuming you're adding the 1 byte of length padding to SipHash's
input, so 24 bytes pads to 4 64-bit words, which makes 2*4+4 = 12 rounds,
vs. one block for MD5.  (MD5 takes a similar jump between 55 and 56 bytes.)

On a P4, SipHash is *never* faster; it takes 2.5x longer than MD5 on a
12-byte block (an IPv4 address/port pair).

This is why there was discussion of using HalfSipHash on these machines.
(On a P4, the HalfSipHash/MD5 crossover is somewhere between 24 and 31
bytes; I haven't benchmarked every possible size.)

^ permalink raw reply

* Re: [PATCH] staging: octeon: Call SET_NETDEV_DEV()
From: David Miller @ 2016-12-21  3:20 UTC (permalink / raw)
  To: f.fainelli
  Cc: devel, asbjorn, aaro.koskinen, netdev, nevola, bhaktipriya96,
	jarod, gregkh, linux-kernel, tremyfr
In-Reply-To: <d5cd5832-4402-2468-9d64-0a8d6e62d965@gmail.com>

From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 20 Dec 2016 17:02:37 -0800

> On 12/14/2016 05:13 PM, Florian Fainelli wrote:
>> The Octeon driver calls into PHYLIB which now checks for
>> net_device->dev.parent, so make sure we do set it before calling into
>> any MDIO/PHYLIB related function.
>> 
>> Fixes: ec988ad78ed6 ("phy: Don't increment MDIO bus refcount unless it's a different owner")
>> Reported-by: Aaro Koskinen <aaro.koskinen@iki.fi>
>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> 
> Greg, David, since this is a fix for a regression introduced in the net
> tree, it may make sense that David take it via his tree.

Since the change in question is in Linus's tree, it's equally valid
for Greg to take it as well.

^ permalink raw reply

* Re: [PATCH net-next 1/1] driver: ipvlan: Define common functions to decrease duplicated codes used to add or del IP address
From: Gao Feng @ 2016-12-21  3:00 UTC (permalink / raw)
  To: David Miller
  Cc: Mahesh Bandewar, Eric Dumazet, Linux Kernel Network Developers
In-Reply-To: <20161220.133025.2171735752938243916.davem@davemloft.net>

On Wed, Dec 21, 2016 at 2:30 AM, David Miller <davem@davemloft.net> wrote:
> From: fgao@ikuai8.com
> Date: Mon, 19 Dec 2016 09:24:05 +0800
>
>>  It is sent again because the first email is sent during net-next closing.
>
> It is still closed, and will not open again for at least one week.

Thanks David.
I thought it only lasted one week.

I will wait for it to reopen, and then resend.

Regards
Feng

^ permalink raw reply

* Re: [PATCH] phy: check if parent device is NULL
From: Ruslan Babayev @ 2016-12-21  2:35 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev
In-Reply-To: <8cdfecf2-60c1-121d-a708-20de9f44d906@gmail.com>

Yes, I saw that with the staging Octeon driver.
Your patch works for me too.

Thanks Florian!


On Tue, Dec 20, 2016 at 4:33 PM, Florian Fainelli <f.fainelli@gmail.com> wrote:
> On 12/20/2016 03:51 PM, Ruslan Babayev wrote:
>> Fixes a crash observed on Octeon.
>>
>> Signed-off-by: Ruslan Babayev <ruslan@babayev.com>
>> Fixes: ec988ad78ed6 ("phy: Don't increment MDIO bus refcount unless it's a
>> different owner")
>
> Assuming you saw this with the staging Octeon driver, a fix has already
> been submitted:
>
> https://lkml.org/lkml/2016/12/14/756
>
> If this is with a different driver, I would rather we fix it in a
> similar way that the fix proposed above.
>
> Thanks
>
>> ---
>>  drivers/net/phy/phy_device.c | 3 ++-
>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index 9c06f8028f0c..043328b85643 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -905,7 +905,8 @@ EXPORT_SYMBOL(phy_attached_print);
>>  int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>>                     u32 flags, phy_interface_t interface)
>>  {
>> -     struct module *ndev_owner = dev->dev.parent->driver->owner;
>> +     struct device *parent = dev->dev.parent;
>> +     struct module *ndev_owner = parent ? parent->driver->owner : NULL;
>>       struct mii_bus *bus = phydev->mdio.bus;
>>       struct device *d = &phydev->mdio.dev;
>>       int err;
>>
>
>
> --
> Florian

^ permalink raw reply

* Re: [PATCH 2/3] NFC: trf7970a: Add device tree option of 1.8 Volt IO voltage
From: Mark Greer @ 2016-12-21  2:23 UTC (permalink / raw)
  To: Geoff Lansberry
  Cc: linux-wireless, lauro.venancio, aloisio.almeida, sameo, robh+dt,
	mark.rutland, netdev, devicetree, linux-kernel, justin
In-Reply-To: <1482250592-4268-2-git-send-email-glansberry@gmail.com>

On Tue, Dec 20, 2016 at 11:16:31AM -0500, Geoff Lansberry wrote:
> From: Geoff Lansberry <geoff@kuvee.com>
> 
> The TRF7970A has configuration options for supporting hardware designs
> with 1.8 Volt or 3.3 Volt IO.   This commit adds a device tree option,
> using a fixed regulator binding, for setting the io voltage to match
> the hardware configuration. If no option is supplied it defaults to
> 3.3 volt configuration.

Sign-off ??  Same comment for your other patches.

<time passes>

Okay I see you have it at the end of the patch.  It should be here.
'git commit -s' is your friend.

> ---
>  .../devicetree/bindings/net/nfc/trf7970a.txt       |  4 ++--
>  drivers/nfc/trf7970a.c                             | 28 +++++++++++++++++++++-
>  2 files changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/net/nfc/trf7970a.txt b/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
> index e262ac1..b5777d8 100644
> --- a/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
> +++ b/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
> @@ -21,9 +21,9 @@ Optional SoC Specific Properties:
>  - t5t-rmb-extra-byte-quirk: Specify that the trf7970a has the erratum
>    where an extra byte is returned by Read Multiple Block commands issued
>    to Type 5 tags.
> +- vdd-io-supply: Regulator specifying voltage for vdd-io
>  - clock-frequency: Set to specify that the input frequency to the trf7970a is 13560000Hz or 27120000Hz
>  
> -
>  Example (for ARM-based BeagleBone with TRF7970A on SPI1):
>  
>  &spi1 {
> @@ -41,11 +41,11 @@ Example (for ARM-based BeagleBone with TRF7970A on SPI1):
>  				  <&gpio2 5 GPIO_ACTIVE_LOW>;
>  		vin-supply = <&ldo3_reg>;
>  		vin-voltage-override = <5000000>;
> +		vdd-io-supply = <&ldo2_reg>;
>  		autosuspend-delay = <30000>;
>  		irq-status-read-quirk;
>  		en2-rf-quirk;
>  		t5t-rmb-extra-byte-quirk;
> -		vdd_io_1v8;

It was already mentioned but this shouldn't have been added in the
previous patch so it shouldn't be here now.

>  		clock-frequency = <27120000>;
>  		status = "okay";
>  	};
> diff --git a/drivers/nfc/trf7970a.c b/drivers/nfc/trf7970a.c
> index 4e051e9..8a88195 100644
> --- a/drivers/nfc/trf7970a.c
> +++ b/drivers/nfc/trf7970a.c

> @@ -2062,6 +2068,7 @@ static int trf7970a_probe(struct spi_device *spi)
>  		return ret;
>  	}
>  
> +

Please don't add an extra blank line.

>  	of_property_read_u32(np, "clock-frequency", &clk_freq);
>  	if ((clk_freq != TRF7970A_27MHZ_CLOCK_FREQUENCY) ||
>  		(clk_freq != TRF7970A_27MHZ_CLOCK_FREQUENCY)) {
> @@ -2105,6 +2112,25 @@ static int trf7970a_probe(struct spi_device *spi)
>  	if (uvolts > 4000000)
>  		trf->chip_status_ctrl = TRF7970A_CHIP_STATUS_VRS5_3;
>  
> +	trf->regulator = devm_regulator_get(&spi->dev, "vdd-io");
> +	if (IS_ERR(trf->regulator)) {
> +		ret = PTR_ERR(trf->regulator);
> +		dev_err(trf->dev, "Can't get VDD_IO regulator: %d\n", ret);
> +		goto err_destroy_lock;
> +	}
> +
> +	ret = regulator_enable(trf->regulator);
> +	if (ret) {
> +		dev_err(trf->dev, "Can't enable VDD_IO: %d\n", ret);
> +		goto err_destroy_lock;
> +	}
> +
> +

Please don't add an extra blank line.

> +	if (regulator_get_voltage(trf->regulator) == 1800000) {
> +		trf->io_ctrl = TRF7970A_REG_IO_CTRL_IO_LOW;
> +		dev_dbg(trf->dev, "trf7970a config vdd_io to 1.8V\n");
> +	}
> +
>  	trf->ddev = nfc_digital_allocate_device(&trf7970a_nfc_ops,
>  			TRF7970A_SUPPORTED_PROTOCOLS,
>  			NFC_DIGITAL_DRV_CAPS_IN_CRC |
> -- 
> Signed-off-by: Geoff Lansberry <geoff@kuvee.com>

Your 'Signed-off-by:' goes at the end of the commit description not here.

Overall, I think you did the right thing (unless someone disagrees).
Just some minor issues.

Mark
--

^ permalink raw reply

* Re: [PATCH net-next] ixgbevf: fix 'Etherleak' in ixgbevf
From: Alexander Duyck @ 2016-12-21  2:20 UTC (permalink / raw)
  To: Weilong Chen
  Cc: Jeff Kirsher, intel-wired-lan, Netdev,
	linux-kernel@vger.kernel.org, wangkefeng.wang
In-Reply-To: <ee9ad90d-1c4b-7358-a172-8369a0c7da8b@huawei.com>

I find it curious that only the last 4 bytes have data in them.  I'm
wondering if the NIC/driver in the Windows/Nessus system is
interpreting the 4 byte CRC on the end of the frame as padding instead
of stripping it.

Is there any chance you could capture the entire frame instead of just
the padding?  Maybe you could run something like wireshark without
enabling promiscuous mode on the VF and capture the frames it is
trying to send and receive.  What I want to verify is what the actual
amount of padding is that is needed to get to 60 bytes and where the
CRC should start.

- Alex

On Tue, Dec 20, 2016 at 5:40 PM, Weilong Chen <chenweilong@huawei.com> wrote:
> Thanks for you explanation, it's very professional.
>
> My test is like this:
> The Nessus is deployed on a windows server, the peer is a X86_64 linux host
> which run several VMs on it. The nic is Intel 82599 and SRIOV is enabled.
> VFs are passthroughed to the VMs. No DPDK.
>
> The Nessus server send small ICMP echo request packets to the VM, and
> then check the reply, and report the error:
>
> "11197 - Multiple Ethernet Driver Frame Padding Information Disclosure
> (Etherleak)"
>
> "Padding observed in one frame :
>
> 0x00: 00 00 00 00 00 00 00 00 00 00 00 00 00 57 37 28 .............W7(
> 0x10: 76 v
>
> Padding observed in another frame :
>
> 0x00: 00 00 00 00 00 00 00 00 00 00 00 00 00 D3 4D 75 ..............Mu
> 0x10: 28 ("
>
> I only have Nessus's windows version, so can't test on linux. Maybe the
> windows server does not pad small packets to 60 bytes on the receive path.
>
>
> On 2016/12/21 0:36, Alexander Duyck wrote:
>>
>> The limit of 17 is just based on the hardware.  Specifically the
>> olinfo field in the Tx descriptor has a minimum length of 17 has a
>> requirement.  The hardware itself is supposed to be capable of padding
>> short frames that are supposed to be transmitted.  The drivers are
>> supposed to pad short frames on receive to get them up to 60 bytes.
>>
>> When you are seeing this issue are you sending frames from the VF to
>> one of the local interfaces on the same port or to an external
>> interface?  Also are you receiving on another linux ixgbevf driver or
>> are you receiving the packet using a different driver interface such
>> as DPDK?  I'm just wanting to verify this as it is possible that the
>> memory leak you are seeing is on the receiver and not on the source if
>> you are transmitting to a local VF or the PF as the receiver will have
>> to pad the frame in such a case to get it up to 60 bytes.
>>
>> - Alex
>>
>> On Tue, Dec 20, 2016 at 3:50 AM, Weilong Chen <chenweilong@huawei.com>
>> wrote:
>>>
>>> Hi,
>>>
>>> Thanks for you reply.
>>> We test you patch, but the problem is still there, it seems do not work.
>>>
>>> I'm not sure why ixgbe use the limit 17. The kenel use ETH_ZLEN (60) with
>>> out FCS. A lot of drivers such as e1000 use it. Any explaination?
>>>
>>> Thanks.
>>>
>>>
>>> On 2016/12/16 0:13, Alexander Duyck wrote:
>>>>
>>>>
>>>> On Thu, Dec 15, 2016 at 3:40 AM, Weilong Chen <chenweilong@huawei.com>
>>>> wrote:
>>>>>
>>>>>
>>>>> Nessus report the vf appears to leak memory in network packets.
>>>>> Fix this by padding all small packets manually.
>>>>>
>>>>> And the CVE-2003-0001.
>>>>>
>>>>>
>>>>> https://ofirarkin.files.wordpress.com/2008/11/atstake_etherleak_report.pdf
>>>>>
>>>>> Signed-off-by: Weilong Chen <chenweilong@huawei.com>
>>>>> ---
>>>>>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 7 +++++++
>>>>>  1 file changed, 7 insertions(+)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>>>>> b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>>>>> index 6d4bef5..137a154 100644
>>>>> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>>>>> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>>>>> @@ -3654,6 +3654,13 @@ static int ixgbevf_xmit_frame(struct sk_buff
>>>>> *skb,
>>>>> struct net_device *netdev)
>>>>>                 return NETDEV_TX_OK;
>>>>>         }
>>>>>
>>>>> +       /* On PCI/PCI-X HW, if packet size is less than ETH_ZLEN,
>>>>> +        * packets may get corrupted during padding by HW.
>>>>> +        * To WA this issue, pad all small packets manually.
>>>>> +        */
>>>>> +       if (eth_skb_pad(skb))
>>>>> +               return NETDEV_TX_OK;
>>>>> +
>>>>
>>>>
>>>>
>>>> So the patch description for this probably isn't correct.  It looks
>>>> like the problem isn't leaking data it is the fact that the frames
>>>> aren't being padded to prevent malicious events.  The only issue is
>>>> the patch is padding by a bit too much.  I would recommend replacing
>>>> this with the following from ixgbe:
>>>>
>>>>         /*
>>>>          * The minimum packet size for olinfo paylen is 17 so pad the
>>>> skb
>>>>          * in order to meet this minimum size requirement.
>>>>          */
>>>>         if (skb_put_padto(skb, 17))
>>>>                 return NETDEV_TX_OK;
>>>>
>>>>
>>>>>         tx_ring = adapter->tx_ring[skb->queue_mapping];
>>>>>
>>>>>         /* need: 1 descriptor per page *
>>>>> PAGE_SIZE/IXGBE_MAX_DATA_PER_TXD,
>>>>> --
>>>>> 1.7.12
>>>>>
>>>>
>>>> .
>>>>
>>>
>>
>> .
>>
>

^ permalink raw reply

* Re: [PATCH 2/3] NFC: trf7970a: Add device tree option of 1.8 Volt IO voltage
From: Mark Greer @ 2016-12-21  2:07 UTC (permalink / raw)
  To: Geoff Lansberry
  Cc: Rob Herring, linux-wireless, Lauro Ramos Venancio,
	Aloisio Almeida Jr, Samuel Ortiz, mark.rutland-5wv7dgnIgG8,
	netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Justin Bronder
In-Reply-To: <CAO7Z3WLC7J+JzmtArOc-ZUNoLGeMX6s=1XQbzwB1zx1U3yTX2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Dec 20, 2016 at 11:13:23AM -0500, Geoff Lansberry wrote:
> On Mon, Dec 19, 2016 at 5:35 PM, Rob Herring <robh-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> > On Thu, Dec 15, 2016 at 05:30:43PM -0500, Geoff Lansberry wrote:
> >> From: Geoff Lansberry <geoff-R+k406RtEhcAvxtiuMwx3w@public.gmane.org>
> >>
> >> ---
> >>  Documentation/devicetree/bindings/net/nfc/trf7970a.txt |  2 ++
> >>  drivers/nfc/trf7970a.c                                 | 13 ++++++++++++-
> >>  2 files changed, 14 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/Documentation/devicetree/bindings/net/nfc/trf7970a.txt b/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
> >> index 9dda879..208f045 100644
> >> --- a/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
> >> +++ b/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
> >> @@ -21,6 +21,7 @@ Optional SoC Specific Properties:
> >>  - t5t-rmb-extra-byte-quirk: Specify that the trf7970a has the erratum
> >>    where an extra byte is returned by Read Multiple Block commands issued
> >>    to Type 5 tags.
> >> +- vdd_io_1v8: Set to specify that the trf7970a io voltage should be set to 1.8V
> >
> > Use the regulator binding and provide a fixed 1.8V supply.
> >
> >>  - crystal_27mhz: Set to specify that the input frequency to the trf7970a is 27.12MHz
> >>
> >>
> >> @@ -45,6 +46,7 @@ Example (for ARM-based BeagleBone with TRF7970A on SPI1):
> >>               irq-status-read-quirk;
> >>               en2-rf-quirk;
> >>               t5t-rmb-extra-byte-quirk;
> >> +             vdd_io_1v8;
> >>               crystal_27mhz;
> >>               status = "okay";
> >>       };
> 
> Rob - using the regulator binding is new to me, but I've given it a
> shot and just sent you another set of patches for your inspection.
> Please let me know if this is what you had in mind.

This is my bad.  Geoff followed my example and did something similar to
'vin-voltage-override' which shouldn't have been there in the first place.
I have this fixed (I think) locally and will submit it once I'm back from
my holiday travels.

Mark
--
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next] ixgbevf: fix 'Etherleak' in ixgbevf
From: Weilong Chen @ 2016-12-21  1:40 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Jeff Kirsher, intel-wired-lan, Netdev,
	linux-kernel@vger.kernel.org, wangkefeng.wang
In-Reply-To: <CAKgT0Uf_HXtJiZJsEYm9L19cZ+deVX=E+Y8JvU5+u0=bggpAqw@mail.gmail.com>

Thanks for your explanation, it's very professional.

My test is like this:
Nessus is deployed on a Windows server; the peer is an x86_64 Linux
host which runs several VMs. The NIC is an Intel 82599 and SR-IOV is
enabled. VFs are passed through to the VMs. No DPDK.

The Nessus server sends small ICMP echo request packets to the VM,
then checks the reply and reports the error:

"11197 - Multiple Ethernet Driver Frame Padding Information Disclosure 
(Etherleak)"

"Padding observed in one frame :

0x00: 00 00 00 00 00 00 00 00 00 00 00 00 00 57 37 28 .............W7(
0x10: 76 v

Padding observed in another frame :

0x00: 00 00 00 00 00 00 00 00 00 00 00 00 00 D3 4D 75 ..............Mu
0x10: 28 ("

I only have the Windows version of Nessus, so I can't test on Linux. Maybe the
Windows server does not pad small packets to 60 bytes on the receive path.

On 2016/12/21 0:36, Alexander Duyck wrote:
> The limit of 17 is just based on the hardware.  Specifically the
> olinfo field in the Tx descriptor has a minimum length of 17 as a
> requirement.  The hardware itself is supposed to be capable of padding
> short frames that are supposed to be transmitted.  The drivers are
> supposed to pad short frames on receive to get them up to 60 bytes.
>
> When you are seeing this issue are you sending frames from the VF to
> one of the local interfaces on the same port or to an external
> interface?  Also are you receiving on another linux ixgbevf driver or
> are you receiving the packet using a different driver interface such
> as DPDK?  I'm just wanting to verify this as it is possible that the
> memory leak you are seeing is on the receiver and not on the source if
> you are transmitting to a local VF or the PF as the receiver will have
> to pad the frame in such a case to get it up to 60 bytes.
>
> - Alex
>
> On Tue, Dec 20, 2016 at 3:50 AM, Weilong Chen <chenweilong@huawei.com> wrote:
>> Hi,
>>
>> Thanks for your reply.
>> We tested your patch, but the problem is still there; it does not seem to work.
>>
>> I'm not sure why ixgbe uses the limit 17. The kernel uses ETH_ZLEN (60)
>> without FCS. A lot of drivers such as e1000 use it. Any explanation?
>>
>> Thanks.
>>
>>
>> On 2016/12/16 0:13, Alexander Duyck wrote:
>>>
>>> On Thu, Dec 15, 2016 at 3:40 AM, Weilong Chen <chenweilong@huawei.com>
>>> wrote:
>>>>
>>>> Nessus report the vf appears to leak memory in network packets.
>>>> Fix this by padding all small packets manually.
>>>>
>>>> And the CVE-2003-0001.
>>>>
>>>> https://ofirarkin.files.wordpress.com/2008/11/atstake_etherleak_report.pdf
>>>>
>>>> Signed-off-by: Weilong Chen <chenweilong@huawei.com>
>>>> ---
>>>>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 7 +++++++
>>>>  1 file changed, 7 insertions(+)
>>>>
>>>> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>>>> b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>>>> index 6d4bef5..137a154 100644
>>>> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>>>> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
>>>> @@ -3654,6 +3654,13 @@ static int ixgbevf_xmit_frame(struct sk_buff *skb,
>>>> struct net_device *netdev)
>>>>                 return NETDEV_TX_OK;
>>>>         }
>>>>
>>>> +       /* On PCI/PCI-X HW, if packet size is less than ETH_ZLEN,
>>>> +        * packets may get corrupted during padding by HW.
>>>> +        * To WA this issue, pad all small packets manually.
>>>> +        */
>>>> +       if (eth_skb_pad(skb))
>>>> +               return NETDEV_TX_OK;
>>>> +
>>>
>>>
>>> So the patch description for this probably isn't correct.  It looks
>>> like the problem isn't leaking data it is the fact that the frames
>>> aren't being padded to prevent malicious events.  The only issue is
>>> the patch is padding by a bit too much.  I would recommend replacing
>>> this with the following from ixgbe:
>>>
>>>         /*
>>>          * The minimum packet size for olinfo paylen is 17 so pad the skb
>>>          * in order to meet this minimum size requirement.
>>>          */
>>>         if (skb_put_padto(skb, 17))
>>>                 return NETDEV_TX_OK;
>>>
>>>
>>>>         tx_ring = adapter->tx_ring[skb->queue_mapping];
>>>>
>>>>         /* need: 1 descriptor per page *
>>>> PAGE_SIZE/IXGBE_MAX_DATA_PER_TXD,
>>>> --
>>>> 1.7.12
>>>>
>>>
>>> .
>>>
>>
>
> .
>

^ permalink raw reply

* Re: [PATCH] staging: octeon: Call SET_NETDEV_DEV()
From: Florian Fainelli @ 2016-12-21  1:02 UTC (permalink / raw)
  To: netdev
  Cc: davem, Greg Kroah-Hartman, Aaro Koskinen, Laura Garcia Liebana,
	Philippe Reynes, Asbjoern Sloth Toennesen, Jarod Wilson,
	Bhaktipriya Shridhar, open list:STAGING SUBSYSTEM, open list
In-Reply-To: <20161215011326.8852-1-f.fainelli@gmail.com>

On 12/14/2016 05:13 PM, Florian Fainelli wrote:
> The Octeon driver calls into PHYLIB which now checks for
> net_device->dev.parent, so make sure we do set it before calling into
> any MDIO/PHYLIB related function.
> 
> Fixes: ec988ad78ed6 ("phy: Don't increment MDIO bus refcount unless it's a different owner")
> Reported-by: Aaro Koskinen <aaro.koskinen@iki.fi>
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Greg, David, since this is a fix for a regression introduced in the net
tree, it may make sense that David take it via his tree.

Thanks

> ---
>  drivers/staging/octeon/ethernet.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/staging/octeon/ethernet.c b/drivers/staging/octeon/ethernet.c
> index 8130dfe89745..4971aa54756a 100644
> --- a/drivers/staging/octeon/ethernet.c
> +++ b/drivers/staging/octeon/ethernet.c
> @@ -770,6 +770,7 @@ static int cvm_oct_probe(struct platform_device *pdev)
>  			/* Initialize the device private structure. */
>  			struct octeon_ethernet *priv = netdev_priv(dev);
>  
> +			SET_NETDEV_DEV(dev, &pdev->dev);
>  			dev->netdev_ops = &cvm_oct_pow_netdev_ops;
>  			priv->imode = CVMX_HELPER_INTERFACE_MODE_DISABLED;
>  			priv->port = CVMX_PIP_NUM_INPUT_PORTS;
> @@ -816,6 +817,7 @@ static int cvm_oct_probe(struct platform_device *pdev)
>  			}
>  
>  			/* Initialize the device private structure. */
> +			SET_NETDEV_DEV(dev, &pdev->dev);
>  			priv = netdev_priv(dev);
>  			priv->netdev = dev;
>  			priv->of_node = cvm_oct_node_for_port(pip, interface,
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH] phy: check if parent device is NULL
From: Florian Fainelli @ 2016-12-21  0:33 UTC (permalink / raw)
  To: Ruslan Babayev, netdev
In-Reply-To: <87oa06upl5.fsf@babayev.com>

On 12/20/2016 03:51 PM, Ruslan Babayev wrote:
> Fixes a crash observed on Octeon.
> 
> Signed-off-by: Ruslan Babayev <ruslan@babayev.com>
> Fixes: ec988ad78ed6 ("phy: Don't increment MDIO bus refcount unless it's a
> different owner")

Assuming you saw this with the staging Octeon driver, a fix has already
been submitted:

https://lkml.org/lkml/2016/12/14/756

If this is with a different driver, I would rather we fix it in a
way similar to the fix proposed above.

Thanks

> ---
>  drivers/net/phy/phy_device.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index 9c06f8028f0c..043328b85643 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -905,7 +905,8 @@ EXPORT_SYMBOL(phy_attached_print);
>  int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>  		      u32 flags, phy_interface_t interface)
>  {
> -	struct module *ndev_owner = dev->dev.parent->driver->owner;
> +	struct device *parent = dev->dev.parent;
> +	struct module *ndev_owner = parent ? parent->driver->owner : NULL;
>  	struct mii_bus *bus = phydev->mdio.bus;
>  	struct device *d = &phydev->mdio.dev;
>  	int err;
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 00/10] netcp: enhancements and minor fixes
From: David Miller @ 2016-12-21  0:03 UTC (permalink / raw)
  To: m-karicheri2
  Cc: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>


The net-next tree is not open, do not resubmit this series until it
is open again.

Thanks.

^ permalink raw reply

* Re: HalfSipHash Acceptable Usage
From: Eric Dumazet @ 2016-12-20 23:55 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Jason A. Donenfeld, Jean-Philippe Aumasson, Hannes Frederic Sowa,
	LKML, Eric Biggers, Daniel J . Bernstein, David Laight,
	David Miller, Andi Kleen, George Spelvin, kernel-hardening,
	Andy Lutomirski, Linux Crypto Mailing List, Tom Herbert,
	Vegard Nossum, Netdev, Linus Torvalds
In-Reply-To: <20161220213636.tiqj2o4uupasr4aj@thunk.org>

On Tue, 2016-12-20 at 16:36 -0500, Theodore Ts'o wrote:
> On Mon, Dec 19, 2016 at 06:32:44PM +0100, Jason A. Donenfeld wrote:
> > 1) Anything that requires actual long-term security will use
> > SipHash2-4, with the 64-bit output and the 128-bit key. This includes
> > things like TCP sequence numbers. This seems pretty uncontroversial to
> > me. Seem okay to you?
> 
> Um, why do TCP sequence numbers need long-term security?  So long as
> you rekey every 5 minutes or so, TCP sequence numbers don't need any
> more security than that, since even if you break the key used to
> generate initial sequence numbers even a minute or two later, any
> pending TCP connections will have timed out long before.
> 
> See the security analysis done in RFC 6528[1], where among other
> things, it points out why MD5 is acceptable with periodic rekeying,
> although there is the concern that this could break certain heuristics
> used when establishing new connections during the TIME-WAIT state.
> 
> [1] https://tools.ietf.org/html/rfc6528


We do not use rekeying for TCP ISN, not anymore after commit
6e5714eaf77d79ae1 (where we switched from MD4 to MD5 )

It might hurt some common cases and I do not believe it is mandated by a
current (ie not obsolete) RFC.

Our clock has a 64 ns resolution and 274 second period (commit
9b42c336d0641) (compared to 4 usec one in RFC 6528)

I do not see why SipHash, if faster than MD5 and more secure, would be a
problem.

Same for syncookies.

BTW, we probably should add a ratelimit on SYNACK retransmits,
because it seems that attackers understood that Linux kernels are resistant
to synfloods, and they (the bad guys) use reflection attacks.

^ permalink raw reply

* [PATCH] phy: check if parent device is NULL
From: Ruslan Babayev @ 2016-12-20 23:51 UTC (permalink / raw)
  To: netdev; +Cc: f.fainelli

Fixes a crash observed on Octeon.

Signed-off-by: Ruslan Babayev <ruslan@babayev.com>
Fixes: ec988ad78ed6 ("phy: Don't increment MDIO bus refcount unless it's a
different owner")
---
 drivers/net/phy/phy_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 9c06f8028f0c..043328b85643 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -905,7 +905,8 @@ EXPORT_SYMBOL(phy_attached_print);
 int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 		      u32 flags, phy_interface_t interface)
 {
-	struct module *ndev_owner = dev->dev.parent->driver->owner;
+	struct device *parent = dev->dev.parent;
+	struct module *ndev_owner = parent ? parent->driver->owner : NULL;
 	struct mii_bus *bus = phydev->mdio.bus;
 	struct device *d = &phydev->mdio.dev;
 	int err;
-- 
2.7.4

^ permalink raw reply related

* Re: HalfSipHash Acceptable Usage
From: George Spelvin @ 2016-12-20 23:07 UTC (permalink / raw)
  To: Jason, tytso
  Cc: ak, davem, David.Laight, djb, ebiggers3, hannes,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, linux, luto, netdev, tom, torvalds, vegard.nossum
In-Reply-To: <20161220213636.tiqj2o4uupasr4aj@thunk.org>

Theodore Ts'o wrote:
> On Mon, Dec 19, 2016 at 06:32:44PM +0100, Jason A. Donenfeld wrote:
>> 1) Anything that requires actual long-term security will use
>> SipHash2-4, with the 64-bit output and the 128-bit key. This includes
>> things like TCP sequence numbers. This seems pretty uncontroversial to
>> me. Seem okay to you?

> Um, why do TCP sequence numbers need long-term security?  So long as
> you rekey every 5 minutes or so, TCP sequence numbers don't need any
> more security than that, since even if you break the key used to
> generate initial sequence numbers even a minute or two later, any
> pending TCP connections will have timed out long before.
> 
> See the security analysis done in RFC 6528[1], where among other
> things, it points out why MD5 is acceptable with periodic rekeying,
> although there is the concern that this could break certain heuristics
> used when establishing new connections during the TIME-WAIT state.

Because we don't rekey TCP sequence numbers, ever.  See commit
6e5714eaf77d79ae1c8b47e3e040ff5411b717ec

To rekey them requires dividing the sequence number base into a "random"
part and some "generation" msbits.  While we can do better than the
previous 8+24 split (I'd suggest 4+28 or 3+29), only 2 is tricky, and
1 generation bit isn't enough.

So while it helps in the long term, it reduces the security offered by
the random part in the short term.  (If I know 4 bits of your ISN,
I only need to send 256 MB to hit your TCP window.)

At the time, I objected, and suggested doing two hashes, with a fixed
32-bit base plus a split rekeyed portion, but that was vetoed on the
grounds of performance.

On further consideration, the fixed base doesn't help much.
(Details below for anyone that cares.)

Suppose we let the TCP initial sequence number be:

(Hash(<srcIP,dstIP,srcPort,dstPort>, fixed_key) & 0xffffffff) +
(i << 28) + (Hash(<srcIP,dstIP,srcPort,dstPort>, key[i]) & 0x0fffffff) +
(current_time_in_nanoseconds / 64)

It's not hugely difficult to mount an effective attack against a
64-bit fixed_key.

As an attacker, I can ask the target to send me these numbers for dstPort
values I control and other values I know.  I can (with high probability)
detect the large jumps when the generation changes, so I can make a
significant number of queries with the same generation.  After 23-ish
queries, I have enough information to identify a 64-bit fixed_key.

I don't know the current generation counter "i", but I know it's the
same for all my queries, so for any two queries, the maximum difference
between the 28-bit hash values is 29 bits.  (We can also add a small
margin to allow for timing uncertainty, but that's even less.)

So if I guess a fixed key, hash my known plaintexts with that guess,
subtract the ciphertexts from the observed sequence numbers, and the
difference between the remaining (unknown) 28-bit hash values plus
timestamps exceeds what's possible, my guess is wrong.

I can then repeat with additional known plaintexts, reducing the space
of admissible keys by about 3 bits each time.

Assuming I can rent GPU horsepower from a bitcoin miner to do this in a
reasonable period of time, after 22 known plaintext differences, I have
uniquely identified the key.

Of course, what I'd do in practice is a first pass with maybe 6 plaintexts
on the GPU, and then deal with the candidates found in a second pass.
But either way, it's about 2.3 SipHash evaluations per key tested.
As I noted earlier, a bitcoin blockchain block, worth 25 bitcoins,
currently costs 2^71 evaluations of SHA-2 (2^70 evaluations of double
SHA-2), and that's accomplished every 10 minutes, so this is definitely
practical.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox