Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC v2 04/10] IB/hfi-vnic: VNIC Ethernet Management (EM) structure definitions
From: Vishwanathapura, Niranjana @ 2016-12-15  7:59 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	ira.weiny-ral2JQCrhuEAvxtiuMwx3w, Niranjana Vishwanathapura,
	Sadanand Warrier, Tanya K Jajodia
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Define VNIC EM MAD structures and the associated macros. These structures
are used for information exchange between VNIC EM agent (EMA) on the HFI
host and the Ethernet manager. These include the virtual ethernet switch
(vesw) port information, vesw port mac table, summay and error counters,
vesw port interface mac lists and the EMA trap.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Reviewed-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Sadanand Warrier <sadanand.warrier-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Tanya K Jajodia <tanya.k.jajodia-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 .../infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h  | 444 +++++++++++++++++++++
 .../sw/intel/hfi_vnic/hfi_vnic_internal.h          |  33 ++
 2 files changed, 477 insertions(+)

diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h
index 6786cce..a6770ef 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h
@@ -52,11 +52,455 @@
  * and decapsulation of Ethernet packets
  */
 
+#include <linux/types.h>
+#include <rdma/ib_mad.h>
+
+/* Maximum number of vnics supported */
+#define HFI_MAX_VPORTS_SUPPORTED 256
+
+/* EMA class version */
+#define HFI_EMA_CLASS_VERSION               0x80
+
+/*
+ * Define the Intel vendor management class for HFI
+ * ETHERNET MANAGEMENT
+ */
+#define HFI_MGMT_CLASS_INTEL_EMA            0x34
+
+/* EM attribute IDs */
+#define HFI_EM_ATTR_CLASS_PORT_INFO                 0x0001
+#define HFI_EM_ATTR_VESWPORT_INFO                   0x0011
+#define HFI_EM_ATTR_VESWPORT_MAC_ENTRIES            0x0012
+#define HFI_EM_ATTR_IFACE_UCAST_MACS                0x0013
+#define HFI_EM_ATTR_IFACE_MCAST_MACS                0x0014
+#define HFI_EM_ATTR_DELETE_VESW                     0x0015
+#define HFI_EM_ATTR_VESWPORT_SUMMARY_COUNTERS       0x0020
+#define HFI_EM_ATTR_VESWPORT_ERROR_COUNTERS         0x0022
+
 #define HFI_VESW_MAX_NUM_DEF_PORT   16
 #define HFI_VNIC_MAX_NUM_PCP        8
 
+#define HFI_VNIC_EMA_DATA    (OPA_MGMT_MAD_SIZE - IB_MGMT_VENDOR_HDR)
+
+/* Defines for vendor specific notice(trap) attributes */
+#define HFI_INTEL_EMA_NOTICE_TYPE_INFO 0x04
+
+/* INTEL OUI */
+#define INTEL_OUI_1 0x00
+#define INTEL_OUI_2 0x06
+#define INTEL_OUI_3 0x6a
+
+/* Trap opcodes sent from VNIC */
+#define HFI_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE 0x1
+#define HFI_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE 0x2
+#define HFI_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE 0x3
+
 /* VNIC configured and operational state values */
 #define HFI_VNIC_STATE_DROP_ALL        0x1
 #define HFI_VNIC_STATE_FORWARDING      0x3
 
+/**
+ * struct hfi_vesw_info - HFI vnic switch information
+ * @fabric_id: 10-bit fabric id
+ * @vesw_id: 12-bit virtual ethernet switch id
+ * @def_port_mask: bitmask of default ports
+ * @pkey: partition key
+ * @u_mcast_dlid: unknown multicast dlid
+ * @u_ucast_dlid: array of unknown unicast dlids
+ * @eth_mtu: MTUs for each vlan PCP
+ * @eth_mtu_non_vlan: MTU for non vlan packets
+ */
+struct hfi_vesw_info {
+	__be16  fabric_id;
+	__be16  vesw_id;
+
+	u8      rsvd0[6];
+	__be16  def_port_mask;
+
+	u8      rsvd1[2];
+	__be16  pkey;
+
+	u8      rsvd2[4];
+	__be32  u_mcast_dlid;
+	__be32  u_ucast_dlid[HFI_VESW_MAX_NUM_DEF_PORT];
+
+	u8      rsvd3[44];
+	__be16  eth_mtu[HFI_VNIC_MAX_NUM_PCP];
+	__be16  eth_mtu_non_vlan;
+	u8      rsvd4[2];
+} __packed;
+
+/**
+ * struct hfi_per_veswport_info - HFI vnic per port information
+ * @port_num: port number
+ * @eth_link_status: current ethernet link state
+ * @base_mac_addr: base mac address
+ * @config_state: configured port state
+ * @oper_state: operational port state
+ * @max_mac_tbl_ent: max number of mac table entries
+ * @max_smac_ent: max smac entries in mac table
+ * @mac_tbl_digest: mac table digest
+ * @encap_slid: base slid for the port
+ * @pcp_to_sc_uc: sc by pcp index for unicast ethernet packets
+ * @pcp_to_vl_uc: vl by pcp index for unicast ethernet packets
+ * @pcp_to_sc_mc: sc by pcp index for multicast ethernet packets
+ * @pcp_to_vl_mc: vl by pcp index for multicast ethernet packets
+ * @non_vlan_sc_uc: sc for non-vlan unicast ethernet packets
+ * @non_vlan_vl_uc: vl for non-vlan unicast ethernet packets
+ * @non_vlan_sc_mc: sc for non-vlan multicast ethernet packets
+ * @non_vlan_vl_mc: vl for non-vlan multicast ethernet packets
+ * @uc_macs_gen_count: generation count for unicast macs list
+ * @mc_macs_gen_count: generation count for multicast macs list
+ */
+struct hfi_per_veswport_info {
+	__be32  port_num;
+
+	u8      eth_link_status;
+	u8      rsvd0[3];
+
+	u8      base_mac_addr[ETH_ALEN];
+	u8      config_state;
+	u8      oper_state;
+
+	__be16  max_mac_tbl_ent;
+	__be16  max_smac_ent;
+	__be32  mac_tbl_digest;
+	u8      rsvd1[4];
+
+	__be32  encap_slid;
+
+	u8      pcp_to_sc_uc[HFI_VNIC_MAX_NUM_PCP];
+	u8      pcp_to_vl_uc[HFI_VNIC_MAX_NUM_PCP];
+	u8      pcp_to_sc_mc[HFI_VNIC_MAX_NUM_PCP];
+	u8      pcp_to_vl_mc[HFI_VNIC_MAX_NUM_PCP];
+
+	u8      non_vlan_sc_uc;
+	u8      non_vlan_vl_uc;
+	u8      non_vlan_sc_mc;
+	u8      non_vlan_vl_mc;
+
+	u8      rsvd2[48];
+
+	__be16  uc_macs_gen_count;
+	__be16  mc_macs_gen_count;
+
+	u8      rsvd3[8];
+} __packed;
+
+/**
+ * struct hfi_veswport_info - HFI vnic port information
+ * @vesw: HFI vnic switch information
+ * @vport: HFI vnic per port information
+ *
+ * On host, each of the virtual ethernet ports belongs
+ * to a different virtual ethernet switches.
+ */
+struct hfi_veswport_info {
+	struct hfi_vesw_info          vesw;
+	struct hfi_per_veswport_info  vport;
+};
+
+/**
+ * union __hfi_vnic_dlid_sd - vnic dlid and side data needed.
+ * @sd_is_src_mac: 1 = entry is SMAC, 0 = not SMAC
+ * @dlid: Destination lid corresponding to MAC addr
+ */
+union __hfi_vnic_dlid_sd {
+	struct {
+		u32  sd_reserved    : 5;
+		u32  sd_is_src_mac  : 1;
+		u32  rsvd0          : 2;
+		u32  dlid           : 24;
+	};
+	u32 dw;
+};
+
+/* Same as __hfi_vnic_dlid_sd, but with a big endian attribute */
+union hfi_vnic_dlid_sd {
+	union __hfi_vnic_dlid_sd u;
+	__be32 dw;
+};
+
+/**
+ * struct hfi_veswport_mactable_entry - single entry in the forwarding table
+ * @mac_addr: MAC address
+ * @mac_addr_mask: MAC address bit mask
+ * @dlid_sd: Matching DLID and side data
+ *
+ * On the host each virtual ethernet port will have
+ * a forwarding table. These tables are used to
+ * map a MAC to a LID and other data. For more
+ * details see struct hfi_veswport_mactable_entries.
+ * This is the structure of a single mactable entry
+ */
+struct hfi_veswport_mactable_entry {
+	u8                      mac_addr[ETH_ALEN];
+	u8                      mac_addr_mask[ETH_ALEN];
+	union hfi_vnic_dlid_sd  dlid_sd;
+} __packed;
+
+/**
+ * struct hfi_veswport_mactable - Forwarding table array
+ * @offset: mac table starting offset
+ * @num_entries: Number of entries to get or set
+ * @mac_tbl_digest: mac table digest
+ * @tbl_entries[]: Array of table entries
+ *
+ * The EM sends down this structure in a MAD indicating
+ * the starting offset in the forwarding table that this
+ * entry is to be loaded into and the number of entries
+ * that that this MAD instance contains
+ * The mac_tbl_digest has been added to this MAD structure. It will be set by
+ * the EM and it will be used by the EM to check if there are any
+ * discrepancies with this value and the value
+ * maintained by the EM in the case of VNIC port being deleted or unloaded
+ * A new instantiation of a VNIC will always have a value of zero.
+ * This value is stored as part of the vnic adapter structure and will be
+ * accessed by the GET and SET routines for both the mactable entries and the
+ * veswport info.
+ */
+struct hfi_veswport_mactable {
+	__be16                              offset;
+	__be16                              num_entries;
+	__be32                              mac_tbl_digest;
+	struct hfi_veswport_mactable_entry  tbl_entries[0];
+} __packed;
+
+/**
+ * struct hfi_veswport_summary_counters - summary counters
+ * @vp_instance: vport instance on the HFI port
+ * @vesw_id: virtual ethernet switch id
+ * @veswport_num: virtual ethernet switch port number
+ * @tx_errors: transmit errors
+ * @rx_errors: receive errors
+ * @tx_packets: transmit packets
+ * @rx_packets: receive packets
+ * @tx_bytes: transmit bytes
+ * @rx_bytes: receive bytes
+ * @tx_unicast: unicast packets transmitted
+ * @tx_mcastbcast: multicast/broadcast packets transmitted
+ * @tx_untagged: non-vlan packets transmitted
+ * @tx_vlan: vlan packets transmitted
+ * @tx_64_size: transmit packet length is 64 bytes
+ * @tx_65_127: transmit packet length is >=65 and < 127 bytes
+ * @tx_128_255: transmit packet length is >=128 and < 255 bytes
+ * @tx_256_511: transmit packet length is >=256 and < 511 bytes
+ * @tx_512_1023: transmit packet length is >=512 and < 1023 bytes
+ * @tx_1024_1518: transmit packet length is >=1024 and < 1518 bytes
+ * @tx_1519_max: transmit packet length >= 1519 bytes
+ * @rx_unicast: unicast packets received
+ * @rx_mcastbcast: multicast/broadcast packets received
+ * @rx_untagged: non-vlan packets received
+ * @rx_vlan: vlan packets received
+ * @rx_64_size: received packet length is 64 bytes
+ * @rx_65_127: received packet length is >=65 and < 127 bytes
+ * @rx_128_255: received packet length is >=128 and < 255 bytes
+ * @rx_256_511: received packet length is >=256 and < 511 bytes
+ * @rx_512_1023: received packet length is >=512 and < 1023 bytes
+ * @rx_1024_1518: received packet length is >=1024 and < 1518 bytes
+ * @rx_1519_max: received packet length >= 1519 bytes
+ *
+ * All the above are counters of corresponding conditions.
+ */
+struct hfi_veswport_summary_counters {
+	__be16  vp_instance;
+	__be16  vesw_id;
+	__be32  veswport_num;
+
+	__be64  tx_errors;
+	__be64  rx_errors;
+	__be64  tx_packets;
+	__be64  rx_packets;
+	__be64  tx_bytes;
+	__be64  rx_bytes;
+
+	__be64  tx_unicast;
+	__be64  tx_mcastbcast;
+
+	__be64  tx_untagged;
+	__be64  tx_vlan;
+
+	__be64  tx_64_size;
+	__be64  tx_65_127;
+	__be64  tx_128_255;
+	__be64  tx_256_511;
+	__be64  tx_512_1023;
+	__be64  tx_1024_1518;
+	__be64  tx_1519_max;
+
+	__be64  rx_unicast;
+	__be64  rx_mcastbcast;
+
+	__be64  rx_untagged;
+	__be64  rx_vlan;
+
+	__be64  rx_64_size;
+	__be64  rx_65_127;
+	__be64  rx_128_255;
+	__be64  rx_256_511;
+	__be64  rx_512_1023;
+	__be64  rx_1024_1518;
+	__be64  rx_1519_max;
+
+	__be64  reserved[16];
+} __packed;
+
+/**
+ * struct hfi_veswport_error_counters - error counters
+ * @vp_instance: vport instance on the HFI port
+ * @vesw_id: virtual ethernet switch id
+ * @veswport_num: virtual ethernet switch port number
+ * @tx_errors: transmit errors
+ * @rx_errors: receive errors
+ * @tx_smac_filt: smac filter errors
+ * @tx_dlid_zero: transmit packets with invalid dlid
+ * @tx_logic: other transmit errors
+ * @tx_drop_state: packet tansmission in non-forward port state
+ * @rx_bad_veswid: received packet with invalid vesw id
+ * @rx_runt: received ethernet packet with length < 64 bytes
+ * @rx_oversize: received ethernet packet with length > MTU size
+ * @rx_eth_down: received packets when interface is down
+ * @rx_drop_state: received packets in non-forwarding port state
+ * @rx_logic: other receive errors
+ *
+ * All the above are counters of corresponding erorr conditions.
+ */
+struct hfi_veswport_error_counters {
+	__be16  vp_instance;
+	__be16  vesw_id;
+	__be32  veswport_num;
+
+	__be64  tx_errors;
+	__be64  rx_errors;
+
+	__be64  rsvd0;
+	__be64  tx_smac_filt;
+	__be64  rsvd1;
+	__be64  rsvd2;
+	__be64  rsvd3;
+	__be64  tx_dlid_zero;
+	__be64  rsvd4;
+	__be64  tx_logic;
+	__be64  rsvd5;
+	__be64  tx_drop_state;
+
+	__be64  rx_bad_veswid;
+	__be64  rsvd6;
+	__be64  rx_runt;
+	__be64  rx_oversize;
+	__be64  rsvd7;
+	__be64  rx_eth_down;
+	__be64  rx_drop_state;
+	__be64  rx_logic;
+	__be64  rsvd8;
+
+	__be64  rsvd9[16];
+} __packed;
+
+/**
+ * struct hfi_veswport_trap - Trap message sent to EM by VNIC
+ * @fabric_id: 10 bit fabric id
+ * @veswid: 12 bit virtual ethernet switch id
+ * @veswportnum: logical port number on the Virtual switch
+ * @hfiportnum: physical port num (redundant on host)
+ * @veswportindex: switch port index on hfi port 0 based
+ * @opcode: operation
+ * @reserved: 32 bit for alignment
+ *
+ * The VNIC will send trap messages to the Ethernet manager to
+ * inform it about changes to the VNIC config, behaviour etc.
+ * This is the format of the trap payload.
+ */
+struct hfi_veswport_trap {
+	__be16  fabric_id;
+	__be16  veswid;
+	__be32  veswportnum;
+	__be16  hfiportnum;
+	u8      veswportindex;
+	u8      opcode;
+	__be32  reserved;
+} __packed;
+
+/**
+ * struct hfi_vnic_iface_macs_entry - single entry in the mac list
+ * @mac_addr: MAC address
+ */
+struct hfi_vnic_iface_mac_entry {
+	u8 mac_addr[ETH_ALEN];
+};
+
+/**
+ * struct hfi_veswport_iface_macs - Msg to set globally administered MAC
+ * @start_idx: position of first entry (0 based)
+ * @num_macs_in_msg: number of MACs in this message
+ * @tot_macs_in_lst: The total number of MACs the agent has
+ * @gen_count: gen_count to indicate change
+ * @entry: The mac list entry
+ *
+ * Same attribute IDS and attribute modifiers as in locally administered
+ * addresses used to set globally administered addresses
+ */
+struct hfi_veswport_iface_macs {
+	__be16 start_idx;
+	__be16 num_macs_in_msg;
+	__be16 tot_macs_in_lst;
+	__be16 gen_count;
+	struct hfi_vnic_iface_mac_entry entry[0];
+} __packed;
+
+/**
+ * struct hfi_vnic_vema_mad - Generic VEMA MAD
+ * @mad_hdr: Generic MAD header
+ * @rmpp_hdr: RMPP header for vendor specific MADs
+ * @oui: Unique org identifier
+ * @data: MAD data
+ */
+struct hfi_vnic_vema_mad {
+	struct ib_mad_hdr  mad_hdr;
+	struct ib_rmpp_hdr rmpp_hdr;
+	u8                 reserved;
+	u8                 oui[3];
+	u8                 data[HFI_VNIC_EMA_DATA];
+};
+
+/**
+ * struct hfi_vnic_notice_attr - Generic Notice MAD
+ * @gen_type: Generic/Specific bit and type of notice
+ * @oui_1: Vendor ID byte 1
+ * @oui_2: Vendor ID byte 2
+ * @oui_3: Vendor ID byte 3
+ * @trap_num: Trap number
+ * @toggle_count: Notice toggle bit and count value
+ * @issuer_lid: Trap issuer's lid
+ * @issuer_gid: Issuer GID (only if Report method)
+ * @raw_data: Trap message body
+ */
+struct hfi_vnic_notice_attr {
+	u8     gen_type;
+	u8     oui_1;
+	u8     oui_2;
+	u8     oui_3;
+	__be16 trap_num;
+	__be16 toggle_count;
+	__be32 issuer_lid;
+	__be32 reserved;
+	u8     issuer_gid[16];
+	u8     raw_data[64];
+} __packed;
+
+/**
+ * struct hfi_vnic_vema_mad_trap - Generic VEMA MAD Trap
+ * @mad_hdr: Generic MAD header
+ * @rmpp_hdr: RMPP header for vendor specific MADs
+ * @oui: Unique org identifier
+ * @notice: Notice structure
+ */
+struct hfi_vnic_vema_mad_trap {
+	struct ib_mad_hdr            mad_hdr;
+	struct ib_rmpp_hdr           rmpp_hdr;
+	u8                           reserved;
+	u8                           oui[3];
+	struct hfi_vnic_notice_attr  notice;
+};
+
 #endif /* _HFI_VNIC_ENCAP_H */
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
index 30731b4..63d6db6 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
@@ -96,6 +96,8 @@ enum hfi_vnic_flags_t {
 
 /**
  * struct __hfi_vesw_info - HFI vnic virtual switch info
+ *
+ * Same as hfi_vesw_info without bitwise attribute.
  */
 struct __hfi_vesw_info {
 	u16  fabric_id;
@@ -119,6 +121,8 @@ struct __hfi_vesw_info {
 
 /**
  * struct __hfi_per_veswport_info - HFI vnic per port info
+ *
+ * Same as hfi_per_veswport_info without bitwise attribute.
  */
 struct __hfi_per_veswport_info {
 	u32  port_num;
@@ -157,6 +161,8 @@ struct __hfi_per_veswport_info {
 
 /**
  * struct __hfi_veswport_info - HFI vnic port info
+ *
+ * Same as hfi_veswport_info without bitwise attribute.
  */
 struct __hfi_veswport_info {
 	struct __hfi_vesw_info            vesw;
@@ -164,6 +170,21 @@ struct __hfi_veswport_info {
 };
 
 /**
+ * struct __hfi_veswport_trap - HFI vnic trap info
+ *
+ * Same as hfi_veswport_trap without bitwise attribute.
+ */
+struct __hfi_veswport_trap {
+	u16	fabric_id;
+	u16	veswid;
+	u32	veswportnum;
+	u16	hfiportnum;
+	u8	veswportindex;
+	u8	opcode;
+	u32	reserved;
+} __packed;
+
+/**
  * struct hfi_vnic_rx_queue - HFI VNIC receive queue
  * @idx: queue index
  * @adapter: netdev adapter
@@ -209,6 +230,18 @@ struct hfi_vnic_adapter {
 #define v_warn(format, arg...) \
 	netdev_warn(adapter->netdev, format, ## arg)
 
+/* The maximum allowed entries in the mac table */
+#define HFI_VNIC_MAC_TBL_MAX_ENTRIES  2048
+/* Limit of smac entries in mac table */
+#define HFI_VNIC_MAX_SMAC_LIMIT       256
+
+/* The last octet of the MAC address is used as the key to the hash table */
+#define HFI_VNIC_MAC_HASH_IDX         5
+
+/* The VNIC MAC hash table is of size 2^8 */
+#define HFI_VNIC_MAC_TBL_HASH_BITS    8
+#define HFI_VNIC_MAC_TBL_SIZE  BIT(HFI_VNIC_MAC_TBL_HASH_BITS)
+
 struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 					     struct device *parent);
 void hfi_vnic_rem_netdev(struct hfi_vnic_port *vport);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC v2 06/10] IB/hfi-vnic: VNIC MAC table support
From: Vishwanathapura, Niranjana @ 2016-12-15  7:59 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	ira.weiny-ral2JQCrhuEAvxtiuMwx3w, Niranjana Vishwanathapura,
	Sadanand Warrier
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

HFI VNIC MAC table contains the MAC address to DLID mappings provided by
the Ethernet manager. During transmission, the MAC table provides the MAC
address to DLID translation. Implement MAC table using simple hash list.
Also provide support to update/query the MAC table by Ethernet manager.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Reviewed-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Sadanand Warrier <sadanand.warrier-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 .../infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c  | 236 +++++++++++++++++++++
 .../sw/intel/hfi_vnic/hfi_vnic_internal.h          |  53 ++++-
 .../infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c |   4 +
 3 files changed, 292 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c
index 3fdfb7b..e45cff8 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c
@@ -104,6 +104,238 @@
 
 #define HFI_VNIC_SC_MASK 0x1f
 
+/*
+ * Using a simple hash table for mac table implementation with the last octet
+ * of mac address as a key.
+ */
+static void hfi_vnic_free_mac_tbl(struct hlist_head *mactbl)
+{
+	struct hfi_vnic_mac_tbl_node *node;
+	struct hlist_node *tmp;
+	int bkt;
+
+	if (!mactbl)
+		return;
+
+	vnic_hash_for_each_safe(mactbl, bkt, tmp, node, hlist) {
+		hash_del(&node->hlist);
+		kfree(node);
+	}
+	kfree(mactbl);
+}
+
+static struct hlist_head *hfi_vnic_alloc_mac_tbl(void)
+{
+	u32 size = sizeof(struct hlist_head) * HFI_VNIC_MAC_TBL_SIZE;
+	struct hlist_head *mactbl;
+
+	mactbl = kzalloc(size, GFP_KERNEL);
+	if (!mactbl)
+		return ERR_PTR(-ENOMEM);
+
+	vnic_hash_init(mactbl);
+	return mactbl;
+}
+
+/* hfi_vnic_release_mac_tbl - empty and free the mac table */
+void hfi_vnic_release_mac_tbl(struct hfi_vnic_adapter *adapter)
+{
+	struct hlist_head *mactbl;
+
+	mutex_lock(&adapter->mactbl_lock);
+	mactbl = rcu_access_pointer(adapter->mactbl);
+	rcu_assign_pointer(adapter->mactbl, NULL);
+	synchronize_rcu();
+	hfi_vnic_free_mac_tbl(mactbl);
+	mutex_unlock(&adapter->mactbl_lock);
+}
+
+/*
+ * hfi_vnic_query_mac_tbl - query the mac table for a section
+ *
+ * This function implements query of specific function of the mac table.
+ * The function also expects the requested range to be valid.
+ */
+void hfi_vnic_query_mac_tbl(struct hfi_vnic_adapter *adapter,
+			    struct hfi_veswport_mactable *tbl)
+{
+	struct hfi_vnic_mac_tbl_node *node;
+	struct hlist_head *mactbl;
+	int bkt;
+	u16 loffset, lnum_entries;
+
+	rcu_read_lock();
+	mactbl = rcu_dereference(adapter->mactbl);
+	if (!mactbl)
+		goto get_mac_done;
+
+	loffset = be16_to_cpu(tbl->offset);
+	lnum_entries = be16_to_cpu(tbl->num_entries);
+
+	vnic_hash_for_each(mactbl, bkt, node, hlist) {
+		struct __hfi_vnic_mactable_entry *nentry = &node->entry;
+		struct hfi_veswport_mactable_entry *entry;
+
+		if ((node->index < loffset) ||
+		    (node->index >= (loffset + lnum_entries)))
+			continue;
+
+		/* populate entry in the tbl corresponding to the index */
+		entry = &tbl->tbl_entries[node->index - loffset];
+		memcpy(entry->mac_addr, nentry->mac_addr,
+		       ARRAY_SIZE(entry->mac_addr));
+		memcpy(entry->mac_addr_mask, nentry->mac_addr_mask,
+		       ARRAY_SIZE(entry->mac_addr_mask));
+		entry->dlid_sd.dw = cpu_to_be32(nentry->dlid_sd.dw);
+	}
+	tbl->mac_tbl_digest = cpu_to_be32(adapter->info.vport.mac_tbl_digest);
+get_mac_done:
+	rcu_read_unlock();
+}
+
+/*
+ * hfi_vnic_update_mac_tbl - update mac table section
+ *
+ * This function updates the specified section of the mac table.
+ * The procedure includes following steps.
+ *  - Allocate a new mac (hash) table.
+ *  - Add the specified entries to the new table.
+ *    (except the ones that are requested to be deleted).
+ *  - Add all the other entries from the old mac table.
+ *  - If there is a failure, free the new table and return.
+ *  - Switch to the new table.
+ *  - Free the old table and return.
+ *
+ * The function also expects the requested range to be valid.
+ */
+int hfi_vnic_update_mac_tbl(struct hfi_vnic_adapter *adapter,
+			    struct hfi_veswport_mactable *tbl)
+{
+	struct hfi_vnic_mac_tbl_node *node, *new_node;
+	struct hlist_head *new_mactbl, *old_mactbl;
+	int i, bkt, rc = 0;
+	u8 key;
+	u16 loffset, lnum_entries;
+
+	mutex_lock(&adapter->mactbl_lock);
+	/* allocate new mac table */
+	new_mactbl = hfi_vnic_alloc_mac_tbl();
+	if (IS_ERR(new_mactbl)) {
+		mutex_unlock(&adapter->mactbl_lock);
+		return PTR_ERR(new_mactbl);
+	}
+
+	loffset = be16_to_cpu(tbl->offset);
+	lnum_entries = be16_to_cpu(tbl->num_entries);
+
+	/* add updated entries to the new mac table */
+	for (i = 0; i < lnum_entries; i++) {
+		struct __hfi_vnic_mactable_entry *nentry;
+		struct hfi_veswport_mactable_entry *entry =
+							&tbl->tbl_entries[i];
+		u8 *mac_addr = entry->mac_addr;
+		u8 empty_mac[ETH_ALEN] = { 0 };
+
+		v_dbg("new mac entry %4d: %02x:%02x:%02x:%02x:%02x:%02x %x\n",
+		      loffset + i, mac_addr[0], mac_addr[1], mac_addr[2],
+		      mac_addr[3], mac_addr[4], mac_addr[5],
+		      entry->dlid_sd.dw);
+
+		/* if the entry is being removed, do not add it */
+		if (!memcmp(mac_addr, empty_mac, ARRAY_SIZE(empty_mac)))
+			continue;
+
+		node = kzalloc(sizeof(*node), GFP_KERNEL);
+		if (!node) {
+			rc = -ENOMEM;
+			goto updt_done;
+		}
+
+		node->index = loffset + i;
+		nentry = &node->entry;
+		memcpy(nentry->mac_addr, entry->mac_addr,
+		       ARRAY_SIZE(nentry->mac_addr));
+		memcpy(nentry->mac_addr_mask, entry->mac_addr_mask,
+		       ARRAY_SIZE(nentry->mac_addr_mask));
+		nentry->dlid_sd.dw = be32_to_cpu(entry->dlid_sd.dw);
+		key = node->entry.mac_addr[HFI_VNIC_MAC_HASH_IDX];
+		vnic_hash_add(new_mactbl, &node->hlist, key);
+	}
+
+	/* add other entries from current mac table to new mac table */
+	old_mactbl = rcu_access_pointer(adapter->mactbl);
+	if (!old_mactbl)
+		goto switch_tbl;
+
+	vnic_hash_for_each(old_mactbl, bkt, node, hlist) {
+		if ((node->index >= loffset) &&
+		    (node->index < (loffset + lnum_entries)))
+			continue;
+
+		new_node = kzalloc(sizeof(*new_node), GFP_KERNEL);
+		if (!new_node) {
+			rc = -ENOMEM;
+			goto updt_done;
+		}
+
+		new_node->index = node->index;
+		memcpy(&new_node->entry, &node->entry, sizeof(node->entry));
+		key = new_node->entry.mac_addr[HFI_VNIC_MAC_HASH_IDX];
+		vnic_hash_add(new_mactbl, &new_node->hlist, key);
+	}
+
+switch_tbl:
+	/* switch to new table */
+	rcu_assign_pointer(adapter->mactbl, new_mactbl);
+	synchronize_rcu();
+
+	adapter->info.vport.mac_tbl_digest = be32_to_cpu(tbl->mac_tbl_digest);
+updt_done:
+	/* upon failure, free the new table; otherwise, free the old table */
+	if (rc)
+		hfi_vnic_free_mac_tbl(new_mactbl);
+	else
+		hfi_vnic_free_mac_tbl(old_mactbl);
+
+	mutex_unlock(&adapter->mactbl_lock);
+	return rc;
+}
+
+/* hfi_vnic_chk_mac_tbl - check mac table for dlid */
+static uint32_t hfi_vnic_chk_mac_tbl(struct hfi_vnic_adapter *adapter,
+				     struct ethhdr *mac_hdr)
+{
+	struct hfi_vnic_mac_tbl_node *node;
+	struct hlist_head *mactbl;
+	u32 dlid = 0;
+	u8 key;
+
+	rcu_read_lock();
+	mactbl = rcu_dereference(adapter->mactbl);
+	if (!mactbl)
+		goto chk_done;
+
+	key = mac_hdr->h_dest[HFI_VNIC_MAC_HASH_IDX];
+	vnic_hash_for_each_possible(mactbl, node, hlist, key) {
+		struct __hfi_vnic_mactable_entry *entry = &node->entry;
+
+		/* if related to source mac, skip */
+		if (entry->dlid_sd.sd_is_src_mac)
+			continue;
+
+		if (!memcmp(node->entry.mac_addr, mac_hdr->h_dest,
+			    ARRAY_SIZE(node->entry.mac_addr))) {
+			/* mac address found */
+			dlid = node->entry.dlid_sd.dlid;
+			break;
+		}
+	}
+
+chk_done:
+	rcu_read_unlock();
+	return dlid;
+}
+
 /* hfi_vnic_get_dlid - find and return the DLID */
 static uint32_t hfi_vnic_get_dlid(struct hfi_vnic_adapter *adapter,
 				  struct sk_buff *skb, u8 def_port)
@@ -112,6 +344,10 @@ static uint32_t hfi_vnic_get_dlid(struct hfi_vnic_adapter *adapter,
 	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
 	u32 dlid;
 
+	dlid = hfi_vnic_chk_mac_tbl(adapter, mac_hdr);
+	if (dlid)
+		return dlid;
+
 	if (is_multicast_ether_addr(mac_hdr->h_dest)) {
 		dlid = info->vesw.u_mcast_dlid;
 	} else {
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
index af3ff0e..6d5c5f8 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
@@ -262,6 +262,8 @@ struct hfi_vnic_rx_queue {
  * @lock: adapter lock
  * @rxq: receive queue array
  * @info: virtual ethernet switch port information
+ * @mactbl: hash table of MAC entries
+ * @mactbl_lock: mac table lock
  * @stats_lock: statistics lock
  * @flow_tbl: flow to default port redirection table
  * @q_sum_cntrs: per queue EM summary counters
@@ -284,7 +286,11 @@ struct hfi_vnic_adapter {
 
 	struct hfi_vnic_rx_queue  rxq[HFI_VNIC_MAX_QUEUE];
 
-	struct __hfi_veswport_info info;
+	struct __hfi_veswport_info  info;
+	struct hlist_head  __rcu   *mactbl;
+
+	/* Lock used to protect updates to mac table */
+	struct mutex mactbl_lock;
 
 	/* Lock used to protect access to vnic counters */
 	struct mutex stats_lock;
@@ -304,6 +310,25 @@ struct hfi_vnic_adapter {
 	struct __hfi_vnic_error_counters    err_cntrs;
 };
 
+/* Same as hfi_veswport_mactable_entry, but without bitwise attribute */
+struct __hfi_vnic_mactable_entry {
+	u8                         mac_addr[ETH_ALEN];
+	u8                         mac_addr_mask[ETH_ALEN];
+	union __hfi_vnic_dlid_sd   dlid_sd;
+} __packed;
+
+/**
+ * struct hfi_vnic_mac_tbl_node - HFI VNIC mac table node
+ * @hlist: hash list handle
+ * @index: index of entry in the mac table
+ * @entry: entry in the table
+ */
+struct hfi_vnic_mac_tbl_node {
+	struct hlist_node                    hlist;
+	u16                                  index;
+	struct __hfi_vnic_mactable_entry     entry;
+};
+
 #define v_dbg(format, arg...) \
 	netdev_dbg(adapter->netdev, format, ## arg)
 #define v_err(format, arg...) \
@@ -325,12 +350,38 @@ struct hfi_vnic_adapter {
 #define HFI_VNIC_MAC_TBL_HASH_BITS    8
 #define HFI_VNIC_MAC_TBL_SIZE  BIT(HFI_VNIC_MAC_TBL_HASH_BITS)
 
+/* VNIC HASH MACROS */
+#define vnic_hash_init(hashtable) __hash_init(hashtable, HFI_VNIC_MAC_TBL_SIZE)
+
+#define vnic_hash_add(hashtable, node, key)                                   \
+	hlist_add_head(node,                                                  \
+		&hashtable[hash_min(key, ilog2(HFI_VNIC_MAC_TBL_SIZE))])
+
+#define vnic_hash_for_each_safe(name, bkt, tmp, obj, member)                  \
+	for ((bkt) = 0, obj = NULL;                                           \
+		    !obj && (bkt) < HFI_VNIC_MAC_TBL_SIZE; (bkt)++)           \
+		hlist_for_each_entry_safe(obj, tmp, &name[bkt], member)
+
+#define vnic_hash_for_each_possible(name, obj, member, key)                   \
+	hlist_for_each_entry(obj,                                             \
+		&name[hash_min(key, ilog2(HFI_VNIC_MAC_TBL_SIZE))], member)
+
+#define vnic_hash_for_each(name, bkt, obj, member)                            \
+	for ((bkt) = 0, obj = NULL;                                           \
+		    !obj && (bkt) < HFI_VNIC_MAC_TBL_SIZE; (bkt)++)           \
+		hlist_for_each_entry(obj, &name[bkt], member)
+
 struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 					     struct device *parent);
 void hfi_vnic_rem_netdev(struct hfi_vnic_port *vport);
 int hfi_vnic_encap_skb(struct hfi_vnic_adapter *adapter, struct sk_buff *skb);
 int hfi_vnic_decap_skb(struct hfi_vnic_rx_queue *rxq, struct sk_buff *skb);
 u8 hfi_vnic_calc_entropy(struct hfi_vnic_adapter *adapter, struct sk_buff *skb);
+void hfi_vnic_release_mac_tbl(struct hfi_vnic_adapter *adapter);
+void hfi_vnic_query_mac_tbl(struct hfi_vnic_adapter *adapter,
+			    struct hfi_veswport_mactable *tbl);
+int hfi_vnic_update_mac_tbl(struct hfi_vnic_adapter *adapter,
+			    struct hfi_veswport_mactable *tbl);
 void hfi_vnic_update_stats(struct net_device *netdev);
 void hfi_vnic_set_ethtool_ops(struct net_device *netdev);
 
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
index 1626e44..04edafa 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
@@ -616,6 +616,7 @@ struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 	netdev->netdev_ops = &hfi_netdev_ops;
 	netdev->hard_header_len += HFI_VNIC_SKB_HEADROOM;
 	mutex_init(&adapter->lock);
+	mutex_init(&adapter->mactbl_lock);
 	mutex_init(&adapter->stats_lock);
 	strcpy(netdev->name, "veth%d");
 
@@ -638,6 +639,7 @@ struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 	return adapter;
 netdev_err:
 	mutex_destroy(&adapter->lock);
+	mutex_destroy(&adapter->mactbl_lock);
 	mutex_destroy(&adapter->stats_lock);
 	free_netdev(netdev);
 
@@ -651,7 +653,9 @@ void hfi_vnic_rem_netdev(struct hfi_vnic_port *vport)
 
 	v_info("removing\n");
 	unregister_netdev(vport->netdev);
+	hfi_vnic_release_mac_tbl(adapter);
 	mutex_destroy(&adapter->lock);
+	mutex_destroy(&adapter->mactbl_lock);
 	mutex_destroy(&adapter->stats_lock);
 	free_netdev(vport->netdev);
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC v2 07/10] IB/hfi-vnic: VNIC Ethernet Management Agent (VEMA) interface
From: Vishwanathapura, Niranjana @ 2016-12-15  7:59 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	ira.weiny-ral2JQCrhuEAvxtiuMwx3w, Niranjana Vishwanathapura,
	Sadanand Warrier, Tanya K Jajodia
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

HFI VNIC EMA interface functions are the management interfaces to the HFI
VNIC netdev. Add support to add and remove VNIC ports. Implement the
required GET/SET management interface functions and processing of new
management information. Add support to send trap notifications upon various
events like interface status change, unicast/multicast mac list update and
mac address change.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Reviewed-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Sadanand Warrier <sadanand.warrier-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Tanya K Jajodia <tanya.k.jajodia-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 drivers/infiniband/sw/intel/hfi_vnic/Makefile      |   3 +-
 .../infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h  |   4 +
 .../sw/intel/hfi_vnic/hfi_vnic_internal.h          |  44 +++
 .../infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c | 153 +++++++-
 .../sw/intel/hfi_vnic/hfi_vnic_vema_iface.c        | 432 +++++++++++++++++++++
 5 files changed, 633 insertions(+), 3 deletions(-)
 create mode 100644 drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema_iface.c

diff --git a/drivers/infiniband/sw/intel/hfi_vnic/Makefile b/drivers/infiniband/sw/intel/hfi_vnic/Makefile
index 8e3dca7..a0562af 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/Makefile
+++ b/drivers/infiniband/sw/intel/hfi_vnic/Makefile
@@ -3,4 +3,5 @@
 #
 obj-$(CONFIG_HFI_VNIC) += hfi_vnic.o
 
-hfi_vnic-y := hfi_vnic_netdev.o hfi_vnic_encap.o hfi_vnic_ethtool.o
+hfi_vnic-y := hfi_vnic_netdev.o hfi_vnic_encap.o hfi_vnic_ethtool.o \
+              hfi_vnic_vema_iface.o
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h
index a6770ef..54e9081 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.h
@@ -99,6 +99,10 @@
 #define HFI_VNIC_STATE_DROP_ALL        0x1
 #define HFI_VNIC_STATE_FORWARDING      0x3
 
+/* VNIC Ethernet link status */
+#define HFI_VNIC_ETH_LINK_UP     1
+#define HFI_VNIC_ETH_LINK_DOWN   2
+
 /**
  * struct hfi_vesw_info - HFI vnic switch information
  * @fabric_id: 10-bit fabric id
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
index 6d5c5f8..7723a4e 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
@@ -243,6 +243,16 @@ struct __hfi_veswport_trap {
 } __packed;
 
 /**
+ * struct hfi_vnic_ctrl_port - HFI virtual NIC control port
+ * @ibdev: pointer to ib device
+ * @ops: hfi vnic control operations
+ */
+struct hfi_vnic_ctrl_port {
+	struct ib_device           *ibdev;
+	struct hfi_vnic_ctrl_ops   *ops;
+};
+
+/**
  * struct hfi_vnic_rx_queue - HFI VNIC receive queue
  * @idx: queue index
  * @adapter: netdev adapter
@@ -257,11 +267,15 @@ struct hfi_vnic_rx_queue {
 /**
  * struct hfi_vnic_adapter - HFI VNIC netdev private data structure
  * @netdev: pointer to associated netdev
+ * @cport: pointer to hfi vnic control port
  * @vport: pointer to hfi vnic port
  * @flags: flags indicating various states
  * @lock: adapter lock
  * @rxq: receive queue array
  * @info: virtual ethernet switch port information
+ * @vema_mac_addr: mac address configured by vema
+ * @umac_hash: unicast maclist hash
+ * @mmac_hash: multicast maclist hash
  * @mactbl: hash table of MAC entries
  * @mactbl_lock: mac table lock
  * @stats_lock: statistics lock
@@ -278,6 +292,7 @@ struct hfi_vnic_rx_queue {
  */
 struct hfi_vnic_adapter {
 	struct net_device             *netdev;
+	struct hfi_vnic_ctrl_port     *cport;
 	struct hfi_vnic_port          *vport;
 	unsigned long                  flags;
 
@@ -287,6 +302,9 @@ struct hfi_vnic_adapter {
 	struct hfi_vnic_rx_queue  rxq[HFI_VNIC_MAX_QUEUE];
 
 	struct __hfi_veswport_info  info;
+	u8                          vema_mac_addr[ETH_ALEN];
+	u32                         umac_hash;
+	u32                         mmac_hash;
 	struct hlist_head  __rcu   *mactbl;
 
 	/* Lock used to protect updates to mac table */
@@ -338,6 +356,11 @@ struct hfi_vnic_mac_tbl_node {
 #define v_warn(format, arg...) \
 	netdev_warn(adapter->netdev, format, ## arg)
 
+#define c_err(format, arg...) \
+	dev_err(&cport->ibdev->dev, format, ## arg)
+#define c_info(format, arg...) \
+	dev_info(&cport->ibdev->dev, format, ## arg)
+
 /* The maximum allowed entries in the mac table */
 #define HFI_VNIC_MAC_TBL_MAX_ENTRIES  2048
 /* Limit of smac entries in mac table */
@@ -377,12 +400,33 @@ struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 int hfi_vnic_encap_skb(struct hfi_vnic_adapter *adapter, struct sk_buff *skb);
 int hfi_vnic_decap_skb(struct hfi_vnic_rx_queue *rxq, struct sk_buff *skb);
 u8 hfi_vnic_calc_entropy(struct hfi_vnic_adapter *adapter, struct sk_buff *skb);
+void hfi_vnic_process_vema_config(struct hfi_vnic_adapter *adapter);
 void hfi_vnic_release_mac_tbl(struct hfi_vnic_adapter *adapter);
 void hfi_vnic_query_mac_tbl(struct hfi_vnic_adapter *adapter,
 			    struct hfi_veswport_mactable *tbl);
 int hfi_vnic_update_mac_tbl(struct hfi_vnic_adapter *adapter,
 			    struct hfi_veswport_mactable *tbl);
+void hfi_vnic_query_ucast_macs(struct hfi_vnic_adapter *adapter,
+			       struct hfi_veswport_iface_macs *macs);
+void hfi_vnic_query_mcast_macs(struct hfi_vnic_adapter *adapter,
+			       struct hfi_veswport_iface_macs *macs);
 void hfi_vnic_update_stats(struct net_device *netdev);
+void hfi_vnic_get_summary_counters(struct hfi_vnic_adapter *adapter,
+				   struct hfi_veswport_summary_counters *cntrs);
+void hfi_vnic_get_error_counters(struct hfi_vnic_adapter *adapter,
+				 struct hfi_veswport_error_counters *cntrs);
+void hfi_vnic_get_vesw_info(struct hfi_vnic_adapter *adapter,
+			    struct hfi_vesw_info *info);
+void hfi_vnic_set_vesw_info(struct hfi_vnic_adapter *adapter,
+			    struct hfi_vesw_info *info);
+void hfi_vnic_get_per_veswport_info(struct hfi_vnic_adapter *adapter,
+				    struct hfi_per_veswport_info *info);
+void hfi_vnic_set_per_veswport_info(struct hfi_vnic_adapter *adapter,
+				    struct hfi_per_veswport_info *info);
+void hfi_vnic_vema_report_event(struct hfi_vnic_adapter *adapter, u8 event);
+struct hfi_vnic_adapter *hfi_vnic_add_vport(struct hfi_vnic_ctrl_port *cport,
+					    u8 port_num, u8 vport_num);
+void hfi_vnic_rem_vport(struct hfi_vnic_adapter *adapter);
 void hfi_vnic_set_ethtool_ops(struct net_device *netdev);
 
 #endif /* _HFI_VNIC_INTERNAL_H */
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
index 04edafa..7e58c1c 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
@@ -51,6 +51,7 @@
 
 #include <linux/module.h>
 #include <linux/if_vlan.h>
+#include <linux/crc32.h>
 
 #include "hfi_vnic_internal.h"
 
@@ -528,7 +529,90 @@ static void hfi_vnic_down(struct hfi_vnic_adapter *adapter)
 	clear_bit(HFI_VNIC_UP, &adapter->flags);
 }
 
-/* hfi_vnic_set_mac_addr - change mac address */
+/* hfi_vnic_process_vema_config - process vema configuration updates */
+void hfi_vnic_process_vema_config(struct hfi_vnic_adapter *adapter)
+{
+	struct __hfi_veswport_info *info = &adapter->info;
+	struct hfi_vnic_port *vport = adapter->vport;
+	u8 port_num[HFI_VESW_MAX_NUM_DEF_PORT] = { 0 };
+	u16 port_mask, mtu_limit = ETH_ZLEN - ETH_HLEN;
+	u8 i, port_count = 0;
+
+	/* If the base_mac_addr is changed, update the interface mac address */
+	if (memcmp(info->vport.base_mac_addr, adapter->vema_mac_addr,
+		   ARRAY_SIZE(info->vport.base_mac_addr))) {
+		struct sockaddr saddr;
+
+		memcpy(saddr.sa_data, info->vport.base_mac_addr,
+		       ARRAY_SIZE(info->vport.base_mac_addr));
+		mutex_lock(&adapter->lock);
+		eth_mac_addr(adapter->netdev, &saddr);
+		memcpy(adapter->vema_mac_addr,
+		       info->vport.base_mac_addr, ETH_ALEN);
+		mutex_unlock(&adapter->lock);
+	}
+
+	/*
+	 * If vesw_id is being changed, and if the vnic interface
+	 * is up, reset the hfi interface to ensure new vesw_id
+	 * is picked by hfi driver
+	 */
+	if (vport->vesw_id != info->vesw.vesw_id) {
+		mutex_lock(&adapter->lock);
+		if (test_bit(HFI_VNIC_UP, &adapter->flags))
+			hfi_vnic_down(adapter);
+
+		vport->vesw_id = info->vesw.vesw_id;
+		if (test_bit(HFI_VNIC_OPEN, &adapter->flags))
+			hfi_vnic_up(adapter);
+
+		mutex_unlock(&adapter->lock);
+	}
+
+	/* Handle MTU limit change */
+	mtu_limit = max(info->vesw.eth_mtu_non_vlan, mtu_limit);
+	rtnl_lock();
+	if (adapter->netdev->mtu > mtu_limit)
+		dev_set_mtu(adapter->netdev, mtu_limit);
+	rtnl_unlock();
+
+	/* Update flow to default port redirection table */
+	port_mask = info->vesw.def_port_mask;
+	for (i = 0; i < HFI_VESW_MAX_NUM_DEF_PORT; i++) {
+		if (port_mask & 1)
+			port_num[port_count++] = i;
+		port_mask >>= 1;
+	}
+
+	/*
+	 * Build the flow table. Flow table is required when destination LID
+	 * is not available. Up to HFI_VNIC_FLOW_TBL_SIZE flows supported.
+	 * Each flow need a default port number to get its dlid from the
+	 * u_ucast_dlid array.
+	 */
+	for (i = 0; i < HFI_VNIC_FLOW_TBL_SIZE; i++)
+		adapter->flow_tbl[i] = port_count ? port_num[i % port_count] :
+						    HFI_VNIC_INVALID_PORT;
+
+	/* Operational state can only be DROP_ALL or FORWARDING */
+	if (info->vport.config_state == HFI_VNIC_STATE_FORWARDING)
+		info->vport.oper_state = HFI_VNIC_STATE_FORWARDING;
+	else
+		info->vport.oper_state = HFI_VNIC_STATE_DROP_ALL;
+}
+
+/*
+ * Set the power on default values in adapter's vema interface structure.
+ */
+static inline void hfi_vnic_set_pod_values(struct hfi_vnic_adapter *adapter)
+{
+	adapter->info.vport.max_mac_tbl_ent = HFI_VNIC_MAC_TBL_MAX_ENTRIES;
+	adapter->info.vport.max_smac_ent = HFI_VNIC_MAX_SMAC_LIMIT;
+	adapter->info.vport.config_state = HFI_VNIC_STATE_DROP_ALL;
+	adapter->info.vport.eth_link_status = HFI_VNIC_ETH_LINK_DOWN;
+}
+
+/* hfi_vnic_set_mac_addr - change mac address and send trap */
 static int hfi_vnic_set_mac_addr(struct net_device *netdev, void *addr)
 {
 	struct hfi_vnic_adapter *adapter = netdev_priv(netdev);
@@ -541,8 +625,62 @@ static int hfi_vnic_set_mac_addr(struct net_device *netdev, void *addr)
 	mutex_lock(&adapter->lock);
 	rc = eth_mac_addr(netdev, addr);
 	mutex_unlock(&adapter->lock);
+	if (rc)
+		return rc;
 
-	return rc;
+	adapter->info.vport.uc_macs_gen_count++;
+	hfi_vnic_vema_report_event(adapter,
+				   HFI_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE);
+	return 0;
+}
+
+/*
+ * hfi_vnic_mac_send_event - post event on possible mac list exchange
+ *  Send trap when digest from uc/mc mac list differs from previous run.
+ *  Digest is evaluated similar to how cksum does.
+ */
+static void hfi_vnic_mac_send_event(struct net_device *netdev, u8 event)
+{
+	struct hfi_vnic_adapter *adapter = netdev_priv(netdev);
+	struct netdev_hw_addr *ha;
+	struct netdev_hw_addr_list *hw_list;
+	u32 *ref_crc;
+	u32 l, crc = 0;
+
+	switch (event) {
+	case HFI_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE:
+		hw_list = &netdev->uc;
+		adapter->info.vport.uc_macs_gen_count++;
+		ref_crc = &adapter->umac_hash;
+		break;
+	case HFI_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE:
+		hw_list = &netdev->mc;
+		adapter->info.vport.mc_macs_gen_count++;
+		ref_crc = &adapter->mmac_hash;
+		break;
+	default:
+		return;
+	}
+	netdev_hw_addr_list_for_each(ha, hw_list) {
+		crc = crc32_le(crc, ha->addr, ETH_ALEN);
+	}
+	l = netdev_hw_addr_list_count(hw_list) * ETH_ALEN;
+	crc = ~crc32_le(crc, (void *)&l, sizeof(l));
+
+	if (crc != *ref_crc) {
+		*ref_crc = crc;
+		hfi_vnic_vema_report_event(adapter, event);
+	}
+}
+
+/* hfi_vnic_set_rx_mode - handle uc/mc mac list change */
+static void hfi_vnic_set_rx_mode(struct net_device *netdev)
+{
+	hfi_vnic_mac_send_event(netdev,
+				HFI_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE);
+
+	hfi_vnic_mac_send_event(netdev,
+				HFI_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE);
 }
 
 /* hfi_netdev_open - activate network interface */
@@ -556,6 +694,10 @@ static int hfi_netdev_open(struct net_device *netdev)
 	if (rc)
 		goto open_done;
 
+	/* Update eth link status and send trap */
+	adapter->info.vport.eth_link_status = HFI_VNIC_ETH_LINK_UP;
+	hfi_vnic_vema_report_event(adapter,
+				   HFI_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE);
 	set_bit(HFI_VNIC_OPEN, &adapter->flags);
 	v_info("opened\n");
 open_done:
@@ -572,6 +714,10 @@ static int hfi_netdev_close(struct net_device *netdev)
 	if (test_bit(HFI_VNIC_UP, &adapter->flags))
 		hfi_vnic_down(adapter);
 
+	/* Update eth link status and send trap */
+	adapter->info.vport.eth_link_status = HFI_VNIC_ETH_LINK_DOWN;
+	hfi_vnic_vema_report_event(adapter,
+				   HFI_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE);
 	clear_bit(HFI_VNIC_OPEN, &adapter->flags);
 	mutex_unlock(&adapter->lock);
 	v_info("closed\n");
@@ -585,6 +731,7 @@ static int hfi_netdev_close(struct net_device *netdev)
 	.ndo_start_xmit = hfi_netdev_start_xmit,
 	.ndo_change_mtu = hfi_netdev_change_mtu,
 	.ndo_get_stats64 = hfi_vnic_get_stats64,
+	.ndo_set_rx_mode = hfi_vnic_set_rx_mode,
 	.ndo_select_queue = hfi_vnic_select_queue,
 	.ndo_set_mac_address = hfi_vnic_set_mac_addr,
 };
@@ -629,6 +776,8 @@ struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 		netif_napi_add(netdev, &adapter->rxq[i].napi, vnic_napi, 64);
 	}
 
+	hfi_vnic_set_pod_values(adapter);
+
 	rc = register_netdev(netdev);
 	if (rc)
 		goto netdev_err;
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema_iface.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema_iface.c
new file mode 100644
index 0000000..f912171f
--- /dev/null
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema_iface.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains HFI VNIC EMA Interface functions.
+ */
+
+#include "hfi_vnic_internal.h"
+
+/**
+ * hfi_vnic_vema_report_event - sent trap to report the specified event
+ * @adapter: vnic port adapter
+ * @event: event to be reported
+ *
+ * This function calls vema api to sent a trap for the given event.
+ */
+void hfi_vnic_vema_report_event(struct hfi_vnic_adapter *adapter, u8 event)
+{
+	struct __hfi_veswport_info *info = &adapter->info;
+	struct hfi_vnic_port *vport = adapter->vport;
+	struct __hfi_veswport_trap trap_data;
+
+	trap_data.fabric_id = info->vesw.fabric_id;
+	trap_data.veswid = info->vesw.vesw_id;
+	trap_data.veswportnum = info->vport.port_num;
+	trap_data.hfiportnum = vport->port_num;
+	trap_data.veswportindex = vport->vport_num;
+	trap_data.opcode = event;
+
+	/* Need to send trap here */
+}
+
+/**
+ * hfi_vnic_get_error_counters - get summary counters
+ * @adapter: vnic port adapter
+ * @cntrs: pointer to destination summary counters structure
+ *
+ * This function populates the summary counters that is maintained by the
+ * given adapter to destination address provided.
+ */
+void hfi_vnic_get_summary_counters(struct hfi_vnic_adapter *adapter,
+				   struct hfi_veswport_summary_counters *cntrs)
+{
+	__be64 *dst;
+	u64 *src;
+
+	mutex_lock(&adapter->stats_lock);
+	/* update stats */
+	hfi_vnic_update_stats(adapter->netdev);
+
+	cntrs->vp_instance = cpu_to_be16(adapter->vport->vport_num);
+	cntrs->vesw_id = cpu_to_be16(adapter->vport->vesw_id);
+	cntrs->veswport_num = cpu_to_be32(adapter->info.vport.port_num);
+
+	/*
+	 * This loop depends on layout of
+	 * struct hfi_veswport_summary_counters and
+	 * struct __hfi_vnic_summary_counter
+	 */
+	for (dst = &cntrs->tx_errors, src = &adapter->sum_cntrs.tx_errors;
+	     dst < &cntrs->reserved[0]; dst++, src++) {
+		*dst = cpu_to_be64(*src);
+	}
+
+	mutex_unlock(&adapter->stats_lock);
+}
+
+/**
+ * hfi_vnic_get_error_counters - get error counters
+ * @adapter: vnic port adapter
+ * @cntrs: pointer to destination error counters structure
+ *
+ * This function populates the error counters that is maintained by the
+ * given adapter to destination address provided.
+ */
+void hfi_vnic_get_error_counters(struct hfi_vnic_adapter *adapter,
+				 struct hfi_veswport_error_counters *cntrs)
+{
+	mutex_lock(&adapter->stats_lock);
+	/* update stats */
+	hfi_vnic_update_stats(adapter->netdev);
+
+	cntrs->vp_instance = cpu_to_be16(adapter->vport->vport_num);
+	cntrs->vesw_id = cpu_to_be16(adapter->vport->vesw_id);
+	cntrs->veswport_num = cpu_to_be32(adapter->info.vport.port_num);
+
+	cntrs->tx_errors = cpu_to_be64(adapter->err_cntrs.tx_errors);
+	cntrs->rx_errors = cpu_to_be64(adapter->err_cntrs.rx_errors);
+	cntrs->tx_smac_filt = cpu_to_be64(adapter->err_cntrs.tx_smac_filt);
+	cntrs->tx_dlid_zero = cpu_to_be64(adapter->err_cntrs.tx_dlid_zero);
+	cntrs->tx_logic = cpu_to_be64(adapter->err_cntrs.tx_logic);
+	cntrs->tx_drop_state = cpu_to_be64(adapter->err_cntrs.tx_drop_state);
+
+	cntrs->rx_bad_veswid = cpu_to_be64(adapter->err_cntrs.rx_bad_veswid);
+	cntrs->rx_runt = cpu_to_be64(adapter->err_cntrs.rx_runt);
+	cntrs->rx_oversize = cpu_to_be64(adapter->err_cntrs.rx_oversize);
+	cntrs->rx_eth_down = cpu_to_be64(adapter->err_cntrs.rx_eth_down);
+	cntrs->rx_drop_state = cpu_to_be64(adapter->err_cntrs.rx_drop_state);
+	cntrs->rx_logic = cpu_to_be64(adapter->err_cntrs.rx_logic);
+	mutex_unlock(&adapter->stats_lock);
+}
+
+/**
+ * hfi_vnic_get_vesw_info -- Get the vesw information
+ * @adapter: vnic port adapter
+ * @info: pointer to destination vesw info structure
+ *
+ * This function copies the vesw info that is maintained by the
+ * given adapter to destination address provided.
+ */
+void hfi_vnic_get_vesw_info(struct hfi_vnic_adapter *adapter,
+			    struct hfi_vesw_info *info)
+{
+	struct __hfi_vesw_info *src = &adapter->info.vesw;
+	int i;
+
+	info->fabric_id = cpu_to_be16(src->fabric_id);
+	info->vesw_id = cpu_to_be16(src->vesw_id);
+	memcpy(info->rsvd0, src->rsvd0, ARRAY_SIZE(src->rsvd0));
+	info->def_port_mask = cpu_to_be16(src->def_port_mask);
+	memcpy(info->rsvd1, src->rsvd1, ARRAY_SIZE(src->rsvd1));
+	info->pkey = cpu_to_be16(src->pkey);
+
+	memcpy(info->rsvd2, src->rsvd2, ARRAY_SIZE(src->rsvd2));
+	info->u_mcast_dlid = cpu_to_be32(src->u_mcast_dlid);
+	for (i = 0; i < HFI_VESW_MAX_NUM_DEF_PORT; i++)
+		info->u_ucast_dlid[i] = cpu_to_be32(src->u_ucast_dlid[i]);
+
+	memcpy(info->rsvd3, src->rsvd3, ARRAY_SIZE(src->rsvd3));
+	for (i = 0; i < HFI_VNIC_MAX_NUM_PCP; i++)
+		info->eth_mtu[i] = cpu_to_be16(src->eth_mtu[i]);
+
+	info->eth_mtu_non_vlan = cpu_to_be16(src->eth_mtu_non_vlan);
+	memcpy(info->rsvd4, src->rsvd4, ARRAY_SIZE(src->rsvd4));
+}
+
+/**
+ * hfi_vnic_set_vesw_info -- Set the vesw information
+ * @adapter: vnic port adapter
+ * @info: pointer to vesw info structure
+ *
+ * This function updates the vesw info that is maintained by the
+ * given adapter with vesw info provided. Reserved fields are stored
+ * and returned back to EM as is.
+ */
+void hfi_vnic_set_vesw_info(struct hfi_vnic_adapter *adapter,
+			    struct hfi_vesw_info *info)
+{
+	struct __hfi_vesw_info *dst = &adapter->info.vesw;
+	int i;
+
+	dst->fabric_id = be16_to_cpu(info->fabric_id);
+	dst->vesw_id = be16_to_cpu(info->vesw_id);
+	memcpy(dst->rsvd0, info->rsvd0, ARRAY_SIZE(info->rsvd0));
+	dst->def_port_mask = be16_to_cpu(info->def_port_mask);
+	memcpy(dst->rsvd1, info->rsvd1, ARRAY_SIZE(info->rsvd1));
+	dst->pkey = be16_to_cpu(info->pkey);
+
+	memcpy(dst->rsvd2, info->rsvd2, ARRAY_SIZE(info->rsvd2));
+	dst->u_mcast_dlid = be32_to_cpu(info->u_mcast_dlid);
+	for (i = 0; i < HFI_VESW_MAX_NUM_DEF_PORT; i++)
+		dst->u_ucast_dlid[i] = be32_to_cpu(info->u_ucast_dlid[i]);
+
+	memcpy(dst->rsvd3, info->rsvd3, ARRAY_SIZE(info->rsvd3));
+	for (i = 0; i < HFI_VNIC_MAX_NUM_PCP; i++)
+		dst->eth_mtu[i] = be16_to_cpu(info->eth_mtu[i]);
+
+	dst->eth_mtu_non_vlan = be16_to_cpu(info->eth_mtu_non_vlan);
+	memcpy(dst->rsvd4, info->rsvd4, ARRAY_SIZE(info->rsvd4));
+}
+
+/**
+ * hfi_vnic_get_per_veswport_info -- Get the vesw per port information
+ * @adapter: vnic port adapter
+ * @info: pointer to destination vport info structure
+ *
+ * This function copies the vesw per port info that is maintained by the
+ * given adapter to destination address provided.
+ * Note that the read only fields are not copied.
+ */
+void hfi_vnic_get_per_veswport_info(struct hfi_vnic_adapter *adapter,
+				    struct hfi_per_veswport_info *info)
+{
+	struct __hfi_per_veswport_info *src = &adapter->info.vport;
+
+	info->port_num = cpu_to_be32(src->port_num);
+	info->eth_link_status = src->eth_link_status;
+	memcpy(info->rsvd0, src->rsvd0, ARRAY_SIZE(src->rsvd0));
+
+	memcpy(info->base_mac_addr, src->base_mac_addr,
+	       ARRAY_SIZE(info->base_mac_addr));
+	info->config_state = src->config_state;
+	info->oper_state = src->oper_state;
+	info->max_mac_tbl_ent = cpu_to_be16(src->max_mac_tbl_ent);
+	info->max_smac_ent = cpu_to_be16(src->max_smac_ent);
+	info->mac_tbl_digest = cpu_to_be32(src->mac_tbl_digest);
+	memcpy(info->rsvd1, src->rsvd1, ARRAY_SIZE(src->rsvd1));
+
+	info->encap_slid = cpu_to_be32(src->encap_slid);
+	memcpy(info->pcp_to_sc_uc, src->pcp_to_sc_uc,
+	       ARRAY_SIZE(info->pcp_to_sc_uc));
+	memcpy(info->pcp_to_vl_uc, src->pcp_to_vl_uc,
+	       ARRAY_SIZE(info->pcp_to_vl_uc));
+	memcpy(info->pcp_to_sc_mc, src->pcp_to_sc_mc,
+	       ARRAY_SIZE(info->pcp_to_sc_mc));
+	memcpy(info->pcp_to_vl_mc, src->pcp_to_vl_mc,
+	       ARRAY_SIZE(info->pcp_to_vl_mc));
+	info->non_vlan_sc_uc = src->non_vlan_sc_uc;
+	info->non_vlan_vl_uc = src->non_vlan_vl_uc;
+	info->non_vlan_sc_mc = src->non_vlan_sc_mc;
+	info->non_vlan_vl_mc = src->non_vlan_vl_mc;
+	memcpy(info->rsvd2, src->rsvd2, ARRAY_SIZE(src->rsvd2));
+
+	info->uc_macs_gen_count = cpu_to_be16(src->uc_macs_gen_count);
+	info->mc_macs_gen_count = cpu_to_be16(src->mc_macs_gen_count);
+	memcpy(info->rsvd3, src->rsvd3, ARRAY_SIZE(src->rsvd3));
+}
+
+/**
+ * hfi_vnic_set_per_veswport_info -- Set vesw per port information
+ * @adapter: vnic port adapter
+ * @info: pointer to vport info structure
+ *
+ * This function updates the vesw per port info that is maintained by the
+ * given adapter with vesw per port info provided. Reserved fields are
+ * stored and returned back to EM as is.
+ */
+void hfi_vnic_set_per_veswport_info(struct hfi_vnic_adapter *adapter,
+				    struct hfi_per_veswport_info *info)
+{
+	struct __hfi_per_veswport_info *dst = &adapter->info.vport;
+
+	dst->port_num = be32_to_cpu(info->port_num);
+	memcpy(dst->rsvd0, info->rsvd0, ARRAY_SIZE(info->rsvd0));
+
+	memcpy(dst->base_mac_addr, info->base_mac_addr,
+	       ARRAY_SIZE(dst->base_mac_addr));
+	dst->config_state = info->config_state;
+	memcpy(dst->rsvd1, info->rsvd1, ARRAY_SIZE(info->rsvd1));
+
+	dst->encap_slid = be32_to_cpu(info->encap_slid);
+	memcpy(dst->pcp_to_sc_uc, info->pcp_to_sc_uc,
+	       ARRAY_SIZE(dst->pcp_to_sc_uc));
+	memcpy(dst->pcp_to_vl_uc, info->pcp_to_vl_uc,
+	       ARRAY_SIZE(dst->pcp_to_vl_uc));
+	memcpy(dst->pcp_to_sc_mc, info->pcp_to_sc_mc,
+	       ARRAY_SIZE(dst->pcp_to_sc_mc));
+	memcpy(dst->pcp_to_vl_mc, info->pcp_to_vl_mc,
+	       ARRAY_SIZE(dst->pcp_to_vl_mc));
+	dst->non_vlan_sc_uc = info->non_vlan_sc_uc;
+	dst->non_vlan_vl_uc = info->non_vlan_vl_uc;
+	dst->non_vlan_sc_mc = info->non_vlan_sc_mc;
+	dst->non_vlan_vl_mc = info->non_vlan_vl_mc;
+	memcpy(dst->rsvd2, info->rsvd2, ARRAY_SIZE(info->rsvd2));
+	memcpy(dst->rsvd3, info->rsvd3, ARRAY_SIZE(info->rsvd3));
+}
+
+/**
+ * hfi_vnic_query_mcast_macs - query multicast mac list
+ * @adapter: vnic port adapter
+ * @macs: pointer mac list
+ *
+ * This function populates the provided mac list with the configured
+ * multicast addresses in the adapter.
+ */
+void hfi_vnic_query_mcast_macs(struct hfi_vnic_adapter *adapter,
+			       struct hfi_veswport_iface_macs *macs)
+{
+	u16 start_idx, num_macs, idx = 0, count = 0;
+	struct netdev_hw_addr *ha;
+
+	start_idx = be16_to_cpu(macs->start_idx);
+	num_macs = be16_to_cpu(macs->num_macs_in_msg);
+	netdev_for_each_mc_addr(ha, adapter->netdev) {
+		struct hfi_vnic_iface_mac_entry *entry = &macs->entry[count];
+
+		if (start_idx > idx++)
+			continue;
+		else if (num_macs == count)
+			break;
+		memcpy(entry, ha->addr, sizeof(*entry));
+		count++;
+	}
+
+	macs->tot_macs_in_lst = cpu_to_be16(netdev_mc_count(adapter->netdev));
+	macs->num_macs_in_msg = cpu_to_be16(count);
+	macs->gen_count = cpu_to_be16(adapter->info.vport.mc_macs_gen_count);
+}
+
+/**
+ * hfi_vnic_query_ucast_macs - query unicast mac list
+ * @adapter: vnic port adapter
+ * @macs: pointer mac list
+ *
+ * This function populates the provided mac list with the configured
+ * unicast addresses in the adapter.
+ */
+void hfi_vnic_query_ucast_macs(struct hfi_vnic_adapter *adapter,
+			       struct hfi_veswport_iface_macs *macs)
+{
+	u16 start_idx, tot_macs, num_macs, idx = 0, count = 0;
+	struct netdev_hw_addr *ha;
+
+	start_idx = be16_to_cpu(macs->start_idx);
+	num_macs = be16_to_cpu(macs->num_macs_in_msg);
+	/* loop through dev_addrs list first */
+	for_each_dev_addr(adapter->netdev, ha) {
+		struct hfi_vnic_iface_mac_entry *entry = &macs->entry[count];
+
+		/* Do not include EM specified MAC address */
+		if (!memcmp(adapter->info.vport.base_mac_addr, ha->addr,
+			    ARRAY_SIZE(adapter->info.vport.base_mac_addr)))
+			continue;
+
+		if (start_idx > idx++)
+			continue;
+		else if (num_macs == count)
+			break;
+		memcpy(entry, ha->addr, sizeof(*entry));
+		count++;
+	}
+
+	/* loop through uc list */
+	netdev_for_each_uc_addr(ha, adapter->netdev) {
+		struct hfi_vnic_iface_mac_entry *entry = &macs->entry[count];
+
+		if (start_idx > idx++)
+			continue;
+		else if (num_macs == count)
+			break;
+		memcpy(entry, ha->addr, sizeof(*entry));
+		count++;
+	}
+
+	tot_macs = netdev_hw_addr_list_count(&adapter->netdev->dev_addrs) +
+		   netdev_uc_count(adapter->netdev);
+	macs->tot_macs_in_lst = cpu_to_be16(tot_macs);
+	macs->num_macs_in_msg = cpu_to_be16(count);
+	macs->gen_count = cpu_to_be16(adapter->info.vport.uc_macs_gen_count);
+}
+
+/**
+ * hfi_vnic_add_vport - Add a new vnic port
+ * @cport: vnic control port
+ * @port_num: OPA port number
+ * @vport_num: vnic port number
+ *
+ * Return pointer to adapter of newly created vnic port.
+ */
+struct hfi_vnic_adapter *hfi_vnic_add_vport(struct hfi_vnic_ctrl_port *cport,
+					    u8 port_num, u8 vport_num)
+{
+	struct hfi_vnic_adapter *adapter;
+	struct hfi_vnic_port *vport;
+
+	vport = cport->ops->add_vport(cport->ibdev, port_num, vport_num);
+	if (IS_ERR(vport))
+		return ERR_CAST(vport);
+
+	if (vport->hfi_info.num_rx_q > HFI_VNIC_MAX_QUEUE ||
+	    vport->hfi_info.num_tx_q > HFI_VNIC_MAX_QUEUE) {
+		c_err("Number of VNIC (rx %d, tx %d) queues > Max Queue Size (%d)",
+		      vport->hfi_info.num_rx_q, vport->hfi_info.num_tx_q,
+		      HFI_VNIC_MAX_QUEUE);
+		return ERR_PTR(-EINVAL);
+	}
+
+	adapter = hfi_vnic_add_netdev(vport, cport->ibdev->dma_device);
+	if (IS_ERR(adapter))
+		cport->ops->rem_vport(vport);
+	else
+		adapter->cport = cport;
+
+	return adapter;
+}
+
+/**
+ * hfi_vnic_rem_vport - Remove a new vnic port
+ * @adapter: vnic adapter
+ */
+void hfi_vnic_rem_vport(struct hfi_vnic_adapter *adapter)
+{
+	struct hfi_vnic_ctrl_port *cport = adapter->cport;
+	struct hfi_vnic_port *vport = adapter->vport;
+
+	hfi_vnic_rem_netdev(vport);
+	cport->ops->rem_vport(vport);
+}
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [RFC v2 01/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) documentation
From: Vishwanathapura, Niranjana @ 2016-12-15  7:59 UTC (permalink / raw)
  To: dledford
  Cc: linux-rdma, netdev, dennis.dalessandro, ira.weiny,
	Niranjana Vishwanathapura
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura@intel.com>

Add HFI VNIC design document explaining the VNIC architecture and the
driver design.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
---
 Documentation/infiniband/hfi_vnic.txt | 95 +++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 Documentation/infiniband/hfi_vnic.txt

diff --git a/Documentation/infiniband/hfi_vnic.txt b/Documentation/infiniband/hfi_vnic.txt
new file mode 100644
index 0000000..1f39d8b
--- /dev/null
+++ b/Documentation/infiniband/hfi_vnic.txt
@@ -0,0 +1,95 @@
+Intel Omni-Path Host Fabric Interface (HFI) Virtual Network Interface
+Controller (VNIC) feature supports Ethernet functionality over Omni-Path
+fabric by encapsulating the Ethernet packets between HFI nodes.
+
+The patterns of exchanges of Omni-Path encapsulated Ethernet packets
+involves one or more virtual Ethernet switches overlaid on the Omni-Path
+fabric topology. A subset of HFI nodes on the Omni-Path fabric are
+permitted to exchange encapsulated Ethernet packets across a particular
+virtual Ethernet switch. The virtual Ethernet switches are logical
+abstractions achieved by configuring the HFI nodes on the fabric for
+header generation and processing. In the simplest configuration all HFI
+nodes across the fabric exchange encapsulated Ethernet packets over a
+single virtual Ethernet switch. A virtual Ethernet switch, is effectively
+an independent Ethernet network. The configuration is performed by an
+Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM)
+application. HFI nodes can have multiple VNICs each connected to a
+different virtual Ethernet switch. The below diagram presents a case
+of two virtual Ethernet switches with two HFI nodes.
+
+                             +-------------------+
+                             |      Subnet/      |
+                             |     Ethernet      |
+                             |      Manager      |
+                             +-------------------+
+                                /          /
+                              /           /
+                            /            /
+                          /             /
++-----------------------------+  +------------------------------+
+|  Virtual Ethernet Switch    |  |  Virtual Ethernet Switch     |
+|  +---------+    +---------+ |  | +---------+    +---------+   |
+|  | VPORT   |    |  VPORT  | |  | |  VPORT  |    |  VPORT  |   |
++--+---------+----+---------+-+  +-+---------+----+---------+---+
+         |                 \        /                 |
+         |                   \    /                   |
+         |                     \/                     |
+         |                    /  \                    |
+         |                  /      \                  |
+     +-----------+------------+  +-----------+------------+
+     |   VNIC    |    VNIC    |  |    VNIC   |    VNIC    |
+     +-----------+------------+  +-----------+------------+
+     |          HFI           |  |          HFI           |
+     +------------------------+  +------------------------+
+
+Intel HFI VNIC software design is presented in the below diagram.
+HFI VNIC functionality has a HW dependent component and a HW
+independent component.
+
+The HW dependent VNIC functionality is part of the HFI1 driver. It
+implements the callback functions to do various tasks which includes
+adding and removing of VNIC ports, HW resource allocation for VNIC
+functionality and actual transmission and reception of encapsulated
+Ethernet packets over the fabric. Each VNIC port is addressed by the
+HFI port number, and the VNIC port number on that HFI port.
+
+The HFI VNIC module implements the HW independent VNIC functionality.
+It consists of two parts. The VNIC Ethernet Management Agent (VEMA)
+registers itself with IB core as an IB client and interfaces with the
+IB MAD stack. It exchanges the management information with the Ethernet
+Manager (EM) and the VNIC netdev. The VNIC netdev part interfaces with
+the Linux network stack, thus providing standard Ethernet network
+interfaces. It invokes HFI device's VNIC callback functions for HW access.
+The VNIC netdev encapsulates the Ethernet packets with an Omni-Path
+header before passing them to the HFI1 driver for transmission.
+Similarly, it de-encapsulates the received Omni-Path packets before
+passing them to the network stack. For each VNIC interface, the
+information required for encapsulation is configured by EM via VEMA MAD
+interface.
+
+
+        +-------------------+ +----------------------+
+        |                   | |       Linux          |
+        |     IB MAD        | |      Network         |
+        |                   | |       Stack          |
+        +-------------------+ +----------------------+
+                 |                       |
+                 |                       |
+        +--------------------------------------------+
+        |                                            |
+        |             HFI VNIC Module                |
+        |    (HFI VNIC Netdev and EMA drivers)       |
+        |                                            |
+        +--------------------------------------------+
+                             |
+                             |
+                    +------------------+
+                    |      IB core     |
+                    +------------------+
+                             |
+                             |
+        +--------------------------------------------+
+        |                                            |
+        |      HFI1 Driver with VNIC support         |
+        |                                            |
+        +--------------------------------------------+
-- 
1.8.3.1

^ permalink raw reply related

* [RFC v2 05/10] IB/hfi-vnic: VNIC statistics support
From: Vishwanathapura, Niranjana @ 2016-12-15  7:59 UTC (permalink / raw)
  To: dledford
  Cc: linux-rdma, netdev, dennis.dalessandro, ira.weiny,
	Niranjana Vishwanathapura
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura@intel.com>

HFI VNIC driver statistics support maintains various counters including
standard netdev counters and the Ethernet manager defined counters.
Add the Ethtool hook to read the counters.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
---
 .../infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c  |  19 +-
 .../sw/intel/hfi_vnic/hfi_vnic_ethtool.c           | 131 +++++++++++
 .../sw/intel/hfi_vnic/hfi_vnic_internal.h          |  84 +++++++
 .../infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c | 260 ++++++++++++++++++++-
 4 files changed, 486 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c
index 093df67..3fdfb7b 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_encap.c
@@ -209,8 +209,10 @@ int hfi_vnic_encap_skb(struct hfi_vnic_adapter *adapter, struct sk_buff *skb)
 	hdr->slid_high = info->vport.encap_slid >> 20;
 
 	dlid = hfi_vnic_get_dlid(adapter, skb, def_port);
-	if (unlikely(!dlid))
+	if (unlikely(!dlid)) {
+		adapter->q_err_cntrs[skb->queue_mapping].tx_dlid_zero++;
 		return -EFAULT;
+	}
 
 	hdr->dlid = dlid;
 	hdr->dlid_high = dlid >> 20;
@@ -233,6 +235,19 @@ int hfi_vnic_encap_skb(struct hfi_vnic_adapter *adapter, struct sk_buff *skb)
 /* hfi_vnic_decap_skb - strip OPA header from the skb (ethernet) packet */
 int hfi_vnic_decap_skb(struct hfi_vnic_rx_queue *rxq, struct sk_buff *skb)
 {
+	struct hfi_vnic_adapter *adapter = rxq->adapter;
+	int max_len = adapter->netdev->mtu + VLAN_ETH_HLEN;
+	int rc = -EFAULT;
+
 	skb_pull(skb, HFI_VNIC_HDR_LEN);
-	return 0;
+
+	/* Validate Packet length */
+	if (skb->len > max_len)
+		adapter->q_err_cntrs[rxq->idx].rx_oversize++;
+	else if (skb->len < ETH_ZLEN)
+		adapter->q_err_cntrs[rxq->idx].rx_runt++;
+	else
+		rc = 0;
+
+	return rc;
 }
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_ethtool.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_ethtool.c
index 0b4da5e..9289ab2 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_ethtool.c
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_ethtool.c
@@ -53,9 +53,140 @@
 
 #include "hfi_vnic_internal.h"
 
+enum {NETDEV_STATS, VNIC_STATS};
+
+struct vnic_stats {
+	char stat_string[ETH_GSTRING_LEN];
+	struct {
+		int type;
+		int sizeof_stat;
+		int stat_offset;
+	};
+};
+
+#define VNIC_STAT(m)            { VNIC_STATS,                               \
+				  FIELD_SIZEOF(struct hfi_vnic_adapter, m), \
+				  offsetof(struct hfi_vnic_adapter, m) }
+#define VNIC_NETDEV_STAT(m)     { NETDEV_STATS,                             \
+				  FIELD_SIZEOF(struct net_device, m),       \
+				  offsetof(struct net_device, m) }
+
+static struct vnic_stats vnic_gstrings_stats[] = {
+	/* NETDEV stats */
+	{"rx_packets", VNIC_NETDEV_STAT(stats.rx_packets)},
+	{"tx_packets", VNIC_NETDEV_STAT(stats.tx_packets)},
+	{"rx_bytes", VNIC_NETDEV_STAT(stats.rx_bytes)},
+	{"tx_bytes", VNIC_NETDEV_STAT(stats.tx_bytes)},
+	{"rx_errors", VNIC_NETDEV_STAT(stats.rx_errors)},
+	{"tx_errors", VNIC_NETDEV_STAT(stats.tx_errors)},
+	{"rx_dropped", VNIC_NETDEV_STAT(stats.rx_dropped)},
+	{"tx_dropped", VNIC_NETDEV_STAT(stats.tx_dropped)},
+
+	{"rx_fifo_errors", VNIC_NETDEV_STAT(stats.rx_fifo_errors)},
+	{"rx_missed_errors", VNIC_NETDEV_STAT(stats.rx_missed_errors)},
+	{"tx_carrier_errors", VNIC_NETDEV_STAT(stats.tx_carrier_errors)},
+	{"tx_fifo_errors", VNIC_NETDEV_STAT(stats.tx_fifo_errors)},
+
+	/* SUMMARY counters */
+	{"tx_unicast", VNIC_STAT(sum_cntrs.tx_grp.unicast)},
+	{"tx_mcastbcast", VNIC_STAT(sum_cntrs.tx_grp.mcastbcast)},
+	{"tx_untagged", VNIC_STAT(sum_cntrs.tx_grp.untagged)},
+	{"tx_vlan", VNIC_STAT(sum_cntrs.tx_grp.vlan)},
+
+	{"tx_64_size", VNIC_STAT(sum_cntrs.tx_grp.xx_64_size)},
+	{"tx_65_127", VNIC_STAT(sum_cntrs.tx_grp.xx_65_127)},
+	{"tx_128_255", VNIC_STAT(sum_cntrs.tx_grp.xx_128_255)},
+	{"tx_256_511", VNIC_STAT(sum_cntrs.tx_grp.xx_256_511)},
+	{"tx_512_1023", VNIC_STAT(sum_cntrs.tx_grp.xx_512_1023)},
+	{"tx_1024_1518", VNIC_STAT(sum_cntrs.tx_grp.xx_1024_1518)},
+	{"tx_1519_max", VNIC_STAT(sum_cntrs.tx_grp.xx_1519_max)},
+
+	{"rx_unicast", VNIC_STAT(sum_cntrs.rx_grp.unicast)},
+	{"rx_mcastbcast", VNIC_STAT(sum_cntrs.rx_grp.mcastbcast)},
+	{"rx_untagged", VNIC_STAT(sum_cntrs.rx_grp.untagged)},
+	{"rx_vlan", VNIC_STAT(sum_cntrs.rx_grp.vlan)},
+
+	{"rx_64_size", VNIC_STAT(sum_cntrs.rx_grp.xx_64_size)},
+	{"rx_65_127", VNIC_STAT(sum_cntrs.rx_grp.xx_65_127)},
+	{"rx_128_255", VNIC_STAT(sum_cntrs.rx_grp.xx_128_255)},
+	{"rx_256_511", VNIC_STAT(sum_cntrs.rx_grp.xx_256_511)},
+	{"rx_512_1023", VNIC_STAT(sum_cntrs.rx_grp.xx_512_1023)},
+	{"rx_1024_1518", VNIC_STAT(sum_cntrs.rx_grp.xx_1024_1518)},
+	{"rx_1519_max", VNIC_STAT(sum_cntrs.rx_grp.xx_1519_max)},
+
+	/* ERROR counters */
+	{"tx_smac_filt", VNIC_STAT(err_cntrs.tx_smac_filt)},
+	{"tx_dlid_zero", VNIC_STAT(err_cntrs.tx_dlid_zero)},
+	{"tx_logic", VNIC_STAT(err_cntrs.tx_logic)},
+	{"tx_drop_state", VNIC_STAT(err_cntrs.tx_drop_state)},
+
+	{"rx_bad_veswid", VNIC_STAT(err_cntrs.rx_bad_veswid)},
+	{"rx_runt", VNIC_STAT(err_cntrs.rx_runt)},
+	{"rx_oversize", VNIC_STAT(err_cntrs.rx_oversize)},
+	{"rx_eth_down", VNIC_STAT(err_cntrs.rx_eth_down)},
+	{"rx_drop_state", VNIC_STAT(err_cntrs.rx_drop_state)},
+	{"rx_logic", VNIC_STAT(err_cntrs.rx_logic)},
+};
+
+#define VNIC_STATS_LEN  ARRAY_SIZE(vnic_gstrings_stats)
+
+/* vnic_get_sset_count - get string set count */
+static int vnic_get_sset_count(struct net_device *netdev, int sset)
+{
+	return (sset == ETH_SS_STATS) ? VNIC_STATS_LEN : -EOPNOTSUPP;
+}
+
+/* vnic_get_ethtool_stats - get statistics */
+static void vnic_get_ethtool_stats(struct net_device *netdev,
+				   struct ethtool_stats *stats, u64 *data)
+{
+	struct hfi_vnic_adapter *adapter = netdev_priv(netdev);
+	int i;
+	char *p = NULL;
+
+	mutex_lock(&adapter->stats_lock);
+	hfi_vnic_update_stats(netdev);
+	for (i = 0; i < VNIC_STATS_LEN; i++) {
+		switch (vnic_gstrings_stats[i].type) {
+		case NETDEV_STATS:
+			p = (char *)netdev +
+			  vnic_gstrings_stats[i].stat_offset;
+			break;
+		case VNIC_STATS:
+			p = (char *)adapter +
+			  vnic_gstrings_stats[i].stat_offset;
+			break;
+		default:
+			p = NULL;
+		}
+
+		if (p)
+			data[i] = (vnic_gstrings_stats[i].sizeof_stat ==
+			   sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
+	}
+	mutex_unlock(&adapter->stats_lock);
+}
+
+/* vnic_get_strings - get strings */
+static void vnic_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+{
+	int i;
+
+	if (stringset != ETH_SS_STATS)
+		return;
+
+	for (i = 0; i < VNIC_STATS_LEN; i++)
+		memcpy(data + i * ETH_GSTRING_LEN,
+		       vnic_gstrings_stats[i].stat_string,
+		       ETH_GSTRING_LEN);
+}
+
 /* ethtool ops */
 static const struct ethtool_ops hfi_vnic_ethtool_ops = {
 	.get_link = ethtool_op_get_link,
+	.get_strings = vnic_get_strings,
+	.get_sset_count = vnic_get_sset_count,
+	.get_ethtool_stats = vnic_get_ethtool_stats,
 };
 
 /* hfi_vnic_set_ethtool_ops - set ethtool ops */
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
index 63d6db6..af3ff0e 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
@@ -95,6 +95,64 @@ enum hfi_vnic_flags_t {
 struct hfi_vnic_adapter;
 
 /**
+ * struct __hfi_vnic_summary_counters - HFI summary counters
+ *
+ * Same as __hfi_veswport_summary_counters without bitwise
+ * attribute and reserved fields.
+ */
+struct __hfi_vnic_summary_counters {
+	u64  tx_errors;
+	u64  rx_errors;
+	u64  tx_packets;
+	u64  rx_packets;
+	u64  tx_bytes;
+	u64  rx_bytes;
+
+	/* Group of histogram statistic counters */
+	struct __hfi_vnic_group_scs {
+		u64  unicast;
+		u64  mcastbcast;
+
+		u64  untagged;
+		u64  vlan;
+
+		u64  xx_64_size;
+		u64  xx_65_127;
+		u64  xx_128_255;
+		u64  xx_256_511;
+		u64  xx_512_1023;
+		u64  xx_1024_1518;
+		u64  xx_1519_max;
+	} tx_grp;
+
+	struct __hfi_vnic_group_scs rx_grp;
+
+} __packed;
+
+/**
+ * struct __hfi_vnic_error_counters - HFI error counters
+ *
+ * Same as hfi_veswport_error_counters without bitwise
+ * attribute and reserved fields.
+ */
+struct __hfi_vnic_error_counters {
+	u64  tx_errors;
+	u64  rx_errors;
+
+	u64  tx_smac_filt;
+	u64  tx_dlid_zero;
+	u64  tx_logic;
+	u64  tx_drop_state;
+
+	u64  rx_bad_veswid;
+	u64  rx_runt;
+	u64  rx_oversize;
+	u64  rx_eth_down;
+	u64  rx_drop_state;
+	u64  rx_logic;
+} __packed;
+
+/**
  * struct __hfi_vesw_info - HFI vnic virtual switch info
  *
  * Same as hfi_vesw_info without bitwise attribute.
@@ -204,7 +262,17 @@ struct hfi_vnic_rx_queue {
  * @lock: adapter lock
  * @rxq: receive queue array
  * @info: virtual ethernet switch port information
+ * @stats_lock: statistics lock
  * @flow_tbl: flow to default port redirection table
+ * @q_sum_cntrs: per queue EM summary counters
+ * @q_err_cntrs: per queue EM error counters
+ * @q_rx_logic_errors: per queue rx logic (default) errors
+ * @q_tx_logic_errors: per queue tx logic (default) errors
+ * @q_tx_halt: per queue tx halt counts
+ * @q_tx_restart: per queue tx restart counts
+ * @q_tx_wakeup: per queue tx wakeup counts
+ * @sum_cntrs: Total EM summary counters (from all queues)
+ * @err_cntrs: Total EM error counters (from all queues)
  */
 struct hfi_vnic_adapter {
 	struct net_device             *netdev;
@@ -218,7 +286,22 @@ struct hfi_vnic_adapter {
 
 	struct __hfi_veswport_info info;
 
+	/* Lock used to protect access to vnic counters */
+	struct mutex stats_lock;
+
 	u8 flow_tbl[HFI_VNIC_FLOW_TBL_SIZE];
+
+	struct __hfi_vnic_summary_counters  q_sum_cntrs[HFI_VNIC_MAX_QUEUE];
+	struct __hfi_vnic_error_counters    q_err_cntrs[HFI_VNIC_MAX_QUEUE];
+	u64 q_rx_logic_errors[HFI_VNIC_MAX_QUEUE];
+	u64 q_tx_logic_errors[HFI_VNIC_MAX_QUEUE];
+
+	u64 q_tx_halt[HFI_VNIC_MAX_QUEUE];
+	u64 q_tx_restart[HFI_VNIC_MAX_QUEUE];
+	u64 q_tx_wakeup[HFI_VNIC_MAX_QUEUE];
+
+	struct __hfi_vnic_summary_counters  sum_cntrs;
+	struct __hfi_vnic_error_counters    err_cntrs;
 };
 
 #define v_dbg(format, arg...) \
@@ -248,6 +331,7 @@ struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 int hfi_vnic_encap_skb(struct hfi_vnic_adapter *adapter, struct sk_buff *skb);
 int hfi_vnic_decap_skb(struct hfi_vnic_rx_queue *rxq, struct sk_buff *skb);
 u8 hfi_vnic_calc_entropy(struct hfi_vnic_adapter *adapter, struct sk_buff *skb);
+void hfi_vnic_update_stats(struct net_device *netdev);
 void hfi_vnic_set_ethtool_ops(struct net_device *netdev);
 
 #endif /* _HFI_VNIC_INTERNAL_H */
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
index 6360d37..1626e44 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_netdev.c
@@ -58,6 +58,235 @@
 
 #define HFI_VNIC_MIN_ETH_MTU (ETH_ZLEN - ETH_HLEN)
 
+#define SUM_GRP_COUNTERS(adpt, summary, x_grp) do {                     \
+		u64 *src64, *dst64;                                     \
+		for (src64 = &summary->x_grp.unicast,                   \
+			dst64 = &adpt->sum_cntrs.x_grp.unicast;         \
+			dst64 <= &adpt->sum_cntrs.x_grp.xx_1519_max;) { \
+			*dst64++ += *src64++;                           \
+		}                                                       \
+	} while (0)
+
+/* hfi_vnic_update_stats - update statistics */
+void hfi_vnic_update_stats(struct net_device *netdev)
+{
+	struct hfi_vnic_adapter *adapter = netdev_priv(netdev);
+	struct hfi_vnic_port *vport = adapter->vport;
+	struct hfi_vnic_stats h_stats = { 0 };
+	u64 tx_logic_errors = 0;
+	u64 rx_logic_errors = 0;
+	u8 i;
+
+	/* first clear the total counters */
+	memset(&adapter->sum_cntrs, 0, sizeof(adapter->sum_cntrs));
+	memset(&adapter->err_cntrs, 0, sizeof(adapter->err_cntrs));
+
+	/* add tx counters on different queues */
+	for (i = 0; i < vport->hfi_info.num_tx_q; i++) {
+		struct hfi_vnic_stats *hfi_stats = &vport->hfi_stats[i];
+		struct __hfi_vnic_summary_counters *sum_cntrs =
+						&adapter->q_sum_cntrs[i];
+		struct __hfi_vnic_error_counters *err_cntrs =
+						&adapter->q_err_cntrs[i];
+
+		h_stats.tx_fifo_errors += hfi_stats->tx_fifo_errors;
+		h_stats.tx_carrier_errors += hfi_stats->tx_carrier_errors;
+		h_stats.tx_logic_errors += hfi_stats->tx_logic_errors;
+
+		SUM_GRP_COUNTERS(adapter, sum_cntrs, tx_grp);
+		adapter->sum_cntrs.tx_packets += sum_cntrs->tx_packets;
+		adapter->sum_cntrs.tx_bytes += sum_cntrs->tx_bytes;
+
+		adapter->err_cntrs.tx_smac_filt += err_cntrs->tx_smac_filt;
+		adapter->err_cntrs.tx_dlid_zero += err_cntrs->tx_dlid_zero;
+		adapter->err_cntrs.tx_drop_state += err_cntrs->tx_drop_state;
+
+		tx_logic_errors += adapter->q_tx_logic_errors[i];
+	}
+
+	/* add rx counters on different queues */
+	for (i = 0; i < vport->hfi_info.num_rx_q; i++) {
+		struct hfi_vnic_stats *hfi_stats = &vport->hfi_stats[i];
+		struct __hfi_vnic_summary_counters *sum_cntrs =
+						&adapter->q_sum_cntrs[i];
+		struct __hfi_vnic_error_counters *err_cntrs =
+						&adapter->q_err_cntrs[i];
+
+		h_stats.rx_fifo_errors += hfi_stats->rx_fifo_errors;
+		h_stats.rx_missed_errors += hfi_stats->rx_missed_errors;
+		h_stats.rx_bad_veswid += hfi_stats->rx_bad_veswid;
+		h_stats.rx_logic_errors += hfi_stats->rx_logic_errors;
+
+		SUM_GRP_COUNTERS(adapter, sum_cntrs, rx_grp);
+		adapter->sum_cntrs.rx_packets += sum_cntrs->rx_packets;
+		adapter->sum_cntrs.rx_bytes += sum_cntrs->rx_bytes;
+
+		adapter->err_cntrs.rx_drop_state += err_cntrs->rx_drop_state;
+		adapter->err_cntrs.rx_runt += err_cntrs->rx_runt;
+		adapter->err_cntrs.rx_oversize += err_cntrs->rx_oversize;
+
+		rx_logic_errors += adapter->q_rx_logic_errors[i];
+	}
+
+	/* update hfi errors */
+	netdev->stats.rx_fifo_errors = h_stats.rx_fifo_errors;
+	netdev->stats.tx_fifo_errors = h_stats.tx_fifo_errors;
+	netdev->stats.rx_missed_errors = h_stats.rx_missed_errors;
+	netdev->stats.tx_carrier_errors = h_stats.tx_carrier_errors;
+	adapter->err_cntrs.rx_bad_veswid = h_stats.rx_bad_veswid;
+
+	/* update tx counters */
+	netdev->stats.tx_packets = adapter->sum_cntrs.tx_packets;
+	netdev->stats.tx_bytes = adapter->sum_cntrs.tx_bytes;
+
+	adapter->err_cntrs.tx_logic = netdev->stats.tx_carrier_errors +
+				      netdev->stats.tx_fifo_errors +
+				      h_stats.tx_logic_errors +
+				      tx_logic_errors;
+
+	netdev->stats.tx_errors = adapter->err_cntrs.tx_smac_filt +
+				  adapter->err_cntrs.tx_dlid_zero +
+				  adapter->err_cntrs.tx_drop_state +
+				  adapter->err_cntrs.tx_logic;
+
+	netdev->stats.tx_dropped = netdev->stats.tx_errors;
+	adapter->sum_cntrs.tx_errors = netdev->stats.tx_errors;
+	adapter->err_cntrs.tx_errors = netdev->stats.tx_errors;
+
+	/* update rx counters */
+	netdev->stats.rx_packets = adapter->sum_cntrs.rx_packets;
+	netdev->stats.rx_bytes = adapter->sum_cntrs.rx_bytes;
+	netdev->stats.multicast = adapter->sum_cntrs.rx_grp.mcastbcast;
+	netdev->stats.rx_over_errors = adapter->err_cntrs.rx_oversize;
+	netdev->stats.rx_length_errors = adapter->err_cntrs.rx_oversize +
+					 adapter->err_cntrs.rx_runt;
+
+	adapter->err_cntrs.rx_logic = netdev->stats.rx_missed_errors +
+				      netdev->stats.rx_fifo_errors +
+				      h_stats.rx_logic_errors +
+				      rx_logic_errors;
+
+	netdev->stats.rx_errors = adapter->err_cntrs.rx_bad_veswid +
+				  adapter->err_cntrs.rx_runt +
+				  adapter->err_cntrs.rx_oversize +
+				  adapter->err_cntrs.rx_eth_down +
+				  adapter->err_cntrs.rx_drop_state +
+				  adapter->err_cntrs.rx_logic;
+
+	netdev->stats.rx_dropped = netdev->stats.rx_errors;
+	adapter->sum_cntrs.rx_errors = netdev->stats.rx_errors;
+	adapter->err_cntrs.rx_errors = netdev->stats.rx_errors;
+}
+
+/* update_len_counters - update pkt's len histogram counters */
+static inline void update_len_counters(struct __hfi_vnic_group_scs *grp,
+				       int len)
+{
+	/* account for 4 byte FCS */
+	if (len >= 1515)
+		grp->xx_1519_max++;
+	else if (len >= 1020)
+		grp->xx_1024_1518++;
+	else if (len >= 508)
+		grp->xx_512_1023++;
+	else if (len >= 252)
+		grp->xx_256_511++;
+	else if (len >= 124)
+		grp->xx_128_255++;
+	else if (len >= 61)
+		grp->xx_65_127++;
+	else
+		grp->xx_64_size++;
+}
+
+/* hfi_vnic_update_tx_counters - update transmit counters */
+static void hfi_vnic_update_tx_counters(struct net_device *netdev, u8 q_idx,
+					struct sk_buff *skb, int err)
+{
+	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
+	struct hfi_vnic_adapter *adapter = netdev_priv(netdev);
+	struct __hfi_vnic_group_scs *grp_cntrs =
+			&adapter->q_sum_cntrs[q_idx].tx_grp;
+	u16 vlan_tci;
+
+	adapter->q_sum_cntrs[q_idx].tx_packets++;
+	adapter->q_sum_cntrs[q_idx].tx_bytes += skb->len + ETH_FCS_LEN;
+
+	update_len_counters(grp_cntrs, skb->len);
+
+	/* rest of the counts are for good packets only */
+	if (err)
+		return;
+
+	if (is_multicast_ether_addr(mac_hdr->h_dest))
+		grp_cntrs->mcastbcast++;
+	else
+		grp_cntrs->unicast++;
+
+	if (!__vlan_get_tag(skb, &vlan_tci))
+		grp_cntrs->vlan++;
+	else
+		grp_cntrs->untagged++;
+}
+
+/* hfi_vnic_update_rx_counters - update receive counters */
+static void hfi_vnic_update_rx_counters(struct net_device *netdev, u8 q_idx,
+					struct sk_buff *skb, int err)
+{
+	struct ethhdr *mac_hdr = (struct ethhdr *)skb->data;
+	struct hfi_vnic_adapter *adapter = netdev_priv(netdev);
+	struct __hfi_vnic_group_scs *grp_cntrs =
+			&adapter->q_sum_cntrs[q_idx].rx_grp;
+	u16 vlan_tci;
+
+	adapter->q_sum_cntrs[q_idx].rx_packets++;
+	adapter->q_sum_cntrs[q_idx].rx_bytes += skb->len + ETH_FCS_LEN;
+
+	update_len_counters(grp_cntrs, skb->len);
+
+	/* rest of the counts are for good packets only */
+	if (err)
+		return;
+
+	if (is_multicast_ether_addr(mac_hdr->h_dest))
+		grp_cntrs->mcastbcast++;
+	else
+		grp_cntrs->unicast++;
+
+	if (!__vlan_get_tag(skb, &vlan_tci))
+		grp_cntrs->vlan++;
+	else
+		grp_cntrs->untagged++;
+}
+
+static struct rtnl_link_stats64 *
+hfi_vnic_get_stats64(struct net_device *netdev,
+		     struct rtnl_link_stats64 *stats)
+{
+	struct hfi_vnic_adapter *adapter = netdev_priv(netdev);
+
+	mutex_lock(&adapter->stats_lock);
+	hfi_vnic_update_stats(netdev);
+
+	stats->rx_packets = netdev->stats.rx_packets;
+	stats->tx_packets = netdev->stats.tx_packets;
+	stats->rx_bytes = netdev->stats.rx_bytes;
+	stats->tx_bytes = netdev->stats.tx_bytes;
+	stats->rx_errors = netdev->stats.rx_errors;
+	stats->tx_errors = netdev->stats.tx_errors;
+	stats->rx_dropped = netdev->stats.rx_dropped;
+	stats->tx_dropped = netdev->stats.tx_dropped;
+	stats->multicast = netdev->stats.multicast;
+	stats->rx_length_errors = netdev->stats.rx_length_errors;
+	stats->rx_over_errors = netdev->stats.rx_over_errors;
+	stats->rx_fifo_errors = netdev->stats.rx_fifo_errors;
+	stats->rx_missed_errors = netdev->stats.rx_missed_errors;
+	stats->tx_carrier_errors = netdev->stats.tx_carrier_errors;
+	stats->tx_fifo_errors = netdev->stats.tx_fifo_errors;
+	mutex_unlock(&adapter->stats_lock);
+	return stats;
+}
+
 /* hfi_vnic_maybe_stop_tx - stop tx queue if required */
 static void hfi_vnic_maybe_stop_tx(struct hfi_vnic_adapter *adapter, u8 q_idx)
 {
@@ -67,6 +296,7 @@ static void hfi_vnic_maybe_stop_tx(struct hfi_vnic_adapter *adapter, u8 q_idx)
 	if (!vport->ops->get_write_avail(vport, q_idx))
 		return;
 
+	adapter->q_tx_restart[q_idx]++;
 	netif_start_subqueue(vport->netdev, q_idx);
 }
 
@@ -82,12 +312,15 @@ static netdev_tx_t hfi_netdev_start_xmit(struct sk_buff *skb,
 
 	v_dbg("xmit: queue %d skb len %d\n", q_idx, skb->len);
 	if (unlikely(adapter->info.vport.oper_state !=
-		     HFI_VNIC_STATE_FORWARDING))
+		     HFI_VNIC_STATE_FORWARDING)) {
+		adapter->q_err_cntrs[q_idx].tx_drop_state++;
 		goto tx_finish;
+	}
 
 	/* pad to ensure mininum ethernet packet length */
 	if (unlikely(skb->len < ETH_ZLEN)) {
 		if (skb_padto(skb, ETH_ZLEN)) {
+			adapter->q_tx_logic_errors[q_idx]++;
 			skip_skb_free = true;
 			goto tx_finish;
 		}
@@ -101,16 +334,19 @@ static netdev_tx_t hfi_netdev_start_xmit(struct sk_buff *skb,
 	/* Get reference to skb as hfi driver might release it */
 	skb_get(skb);
 	rc = vport->ops->put_skb(vport, q_idx, skb);
-	/* remove the header */
+	/* remove the header before updating tx counters */
 	skb_pull(skb, HFI_VNIC_HDR_LEN);
 
 tx_finish:
 	if (unlikely(rc == -EBUSY)) {
 		hfi_vnic_maybe_stop_tx(adapter, q_idx);
+		adapter->q_tx_halt[q_idx]++;
 		dev_kfree_skb_any(skb);
 		return NETDEV_TX_BUSY;
 	}
 
+	/* update tx counters */
+	hfi_vnic_update_tx_counters(netdev, q_idx, skb, rc);
 	if (!skip_skb_free)
 		dev_kfree_skb_any(skb);
 	return NETDEV_TX_OK;
@@ -123,6 +359,7 @@ static void vnic_handle_rx(struct hfi_vnic_rx_queue *rxq,
 	struct hfi_vnic_adapter *adapter = rxq->adapter;
 	struct hfi_vnic_port *vport = adapter->vport;
 	struct sk_buff *skb;
+	int rc;
 
 	while (1) {
 		if (*work_done >= work_to_do)
@@ -132,7 +369,11 @@ static void vnic_handle_rx(struct hfi_vnic_rx_queue *rxq,
 		if (!skb)
 			break;
 
-		if (hfi_vnic_decap_skb(rxq, skb)) {
+		rc = hfi_vnic_decap_skb(rxq, skb);
+
+		/* update rx counters */
+		hfi_vnic_update_rx_counters(adapter->netdev, rxq->idx, skb, rc);
+		if (rc) {
 			dev_kfree_skb_any(skb);
 			continue;
 		}
@@ -178,8 +419,10 @@ static void vnic_event_cb(struct hfi_vnic_port *vport, u8 evt)
 	if (evt < vport->hfi_info.num_rx_q) {
 		q_idx = evt;
 		if (unlikely(adapter->info.vport.oper_state !=
-			     HFI_VNIC_STATE_FORWARDING))
+			     HFI_VNIC_STATE_FORWARDING)) {
+			adapter->q_err_cntrs[q_idx].rx_drop_state++;
 			return;
+		}
 
 		rxq = &adapter->rxq[q_idx];
 		if (napi_schedule_prep(&rxq->napi)) {
@@ -193,9 +436,10 @@ static void vnic_event_cb(struct hfi_vnic_port *vport, u8 evt)
 	    (evt < (HFI_VNIC_EVT_TX0 + vport->hfi_info.num_tx_q))) {
 		q_idx = evt - HFI_VNIC_EVT_TX0;
 
-		if (__netif_subqueue_stopped(vport->netdev, q_idx))
+		if (__netif_subqueue_stopped(vport->netdev, q_idx)) {
 			netif_wake_subqueue(vport->netdev, q_idx);
-
+			adapter->q_tx_wakeup[q_idx]++;
+		}
 		return;
 	}
 	v_err("Invalid event\n");
@@ -340,6 +584,7 @@ static int hfi_netdev_close(struct net_device *netdev)
 	.ndo_stop = hfi_netdev_close,
 	.ndo_start_xmit = hfi_netdev_start_xmit,
 	.ndo_change_mtu = hfi_netdev_change_mtu,
+	.ndo_get_stats64 = hfi_vnic_get_stats64,
 	.ndo_select_queue = hfi_vnic_select_queue,
 	.ndo_set_mac_address = hfi_vnic_set_mac_addr,
 };
@@ -371,6 +616,7 @@ struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 	netdev->netdev_ops = &hfi_netdev_ops;
 	netdev->hard_header_len += HFI_VNIC_SKB_HEADROOM;
 	mutex_init(&adapter->lock);
+	mutex_init(&adapter->stats_lock);
 	strcpy(netdev->name, "veth%d");
 
 	SET_NETDEV_DEV(netdev, parent);
@@ -392,6 +638,7 @@ struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 	return adapter;
 netdev_err:
 	mutex_destroy(&adapter->lock);
+	mutex_destroy(&adapter->stats_lock);
 	free_netdev(netdev);
 
 	return ERR_PTR(rc);
@@ -405,5 +652,6 @@ void hfi_vnic_rem_netdev(struct hfi_vnic_port *vport)
 	v_info("removing\n");
 	unregister_netdev(vport->netdev);
 	mutex_destroy(&adapter->lock);
+	mutex_destroy(&adapter->stats_lock);
 	free_netdev(vport->netdev);
 }
-- 
1.8.3.1

^ permalink raw reply related

* [RFC v2 08/10] IB/hfi-vnic: VNIC Ethernet Management Agent (VEMA) function
From: Vishwanathapura, Niranjana @ 2016-12-15  7:59 UTC (permalink / raw)
  To: dledford
  Cc: linux-rdma, netdev, dennis.dalessandro, ira.weiny,
	Sadanand Warrier, Niranjana Vishwanathapura, Tanya K Jajodia,
	Sudeep Dutt
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura@intel.com>

HFI VEMA function interfaces with the Infiniband MAD stack to exchange the
management information packets with the Ethernet Manager (EM).
It interfaces with the HFI VNIC netdev function to SET/GET the management
information. The information exchanged with the EM includes class port
details, encapsulation configuration, various counters, unicast and
multicast MAC list and the MAC table. It also supports sending traps
to the EM.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Sadanand Warrier <sadanand.warrier@intel.com>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Signed-off-by: Tanya K Jajodia <tanya.k.jajodia@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
---
 drivers/infiniband/sw/intel/hfi_vnic/Makefile      |    2 +-
 .../sw/intel/hfi_vnic/hfi_vnic_ethtool.c           |   12 +
 .../sw/intel/hfi_vnic/hfi_vnic_internal.h          |   11 +
 .../infiniband/sw/intel/hfi_vnic/hfi_vnic_vema.c   | 1024 ++++++++++++++++++++
 .../sw/intel/hfi_vnic/hfi_vnic_vema_iface.c        |    4 +-
 5 files changed, 1050 insertions(+), 3 deletions(-)
 create mode 100644 drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema.c

diff --git a/drivers/infiniband/sw/intel/hfi_vnic/Makefile b/drivers/infiniband/sw/intel/hfi_vnic/Makefile
index a0562af..16c0830 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/Makefile
+++ b/drivers/infiniband/sw/intel/hfi_vnic/Makefile
@@ -4,4 +4,4 @@
 obj-$(CONFIG_HFI_VNIC) += hfi_vnic.o
 
 hfi_vnic-y := hfi_vnic_netdev.o hfi_vnic_encap.o hfi_vnic_ethtool.o \
-              hfi_vnic_vema_iface.o
+              hfi_vnic_vema.o hfi_vnic_vema_iface.o
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_ethtool.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_ethtool.c
index 9289ab2..9c2ed37 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_ethtool.c
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_ethtool.c
@@ -130,6 +130,17 @@ struct vnic_stats {
 
 #define VNIC_STATS_LEN  ARRAY_SIZE(vnic_gstrings_stats)
 
+/* vnic_get_drvinfo - get driver info */
+static void vnic_get_drvinfo(struct net_device *netdev,
+			     struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->driver, hfi_vnic_driver_name, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, hfi_vnic_driver_version,
+		sizeof(drvinfo->version));
+	strlcpy(drvinfo->bus_info, dev_name(netdev->dev.parent),
+		sizeof(drvinfo->bus_info));
+}
+
 /* vnic_get_sset_count - get string set count */
 static int vnic_get_sset_count(struct net_device *netdev, int sset)
 {
@@ -183,6 +194,7 @@ static void vnic_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 
 /* ethtool ops */
 static const struct ethtool_ops hfi_vnic_ethtool_ops = {
+	.get_drvinfo = vnic_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_strings = vnic_get_strings,
 	.get_sset_count = vnic_get_sset_count,
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
index 7723a4e..b36bb76 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_internal.h
@@ -246,10 +246,12 @@ struct __hfi_veswport_trap {
  * struct hfi_vnic_ctrl_port - HFI virtual NIC control port
  * @ibdev: pointer to ib device
  * @ops: hfi vnic control operations
+ * @num_ports: number of hfi ports
  */
 struct hfi_vnic_ctrl_port {
 	struct ib_device           *ibdev;
 	struct hfi_vnic_ctrl_ops   *ops;
+	u8                          num_ports;
 };
 
 /**
@@ -280,6 +282,8 @@ struct hfi_vnic_rx_queue {
  * @mactbl_lock: mac table lock
  * @stats_lock: statistics lock
  * @flow_tbl: flow to default port redirection table
+ * @trap_timeout: trap timeout
+ * @trap_count: no. of traps allowed within timeout period
  * @q_sum_cntrs: per queue EM summary counters
  * @q_err_cntrs: per queue EM error counters
  * @q_rx_logic_errors: per queue rx logic (default) errors
@@ -314,6 +318,8 @@ struct hfi_vnic_adapter {
 	struct mutex stats_lock;
 
 	u8 flow_tbl[HFI_VNIC_FLOW_TBL_SIZE];
+	unsigned long trap_timeout;
+	u8            trap_count;
 
 	struct __hfi_vnic_summary_counters  q_sum_cntrs[HFI_VNIC_MAX_QUEUE];
 	struct __hfi_vnic_error_counters    q_err_cntrs[HFI_VNIC_MAX_QUEUE];
@@ -394,6 +400,9 @@ struct hfi_vnic_mac_tbl_node {
 		    !obj && (bkt) < HFI_VNIC_MAC_TBL_SIZE; (bkt)++)           \
 		hlist_for_each_entry(obj, &name[bkt], member)
 
+extern char hfi_vnic_driver_name[];
+extern const char hfi_vnic_driver_version[];
+
 struct hfi_vnic_adapter *hfi_vnic_add_netdev(struct hfi_vnic_port *vport,
 					     struct device *parent);
 void hfi_vnic_rem_netdev(struct hfi_vnic_port *vport);
@@ -428,5 +437,7 @@ struct hfi_vnic_adapter *hfi_vnic_add_vport(struct hfi_vnic_ctrl_port *cport,
 					    u8 port_num, u8 vport_num);
 void hfi_vnic_rem_vport(struct hfi_vnic_adapter *adapter);
 void hfi_vnic_set_ethtool_ops(struct net_device *netdev);
+void hfi_vnic_vema_send_trap(struct hfi_vnic_adapter *adapter,
+			     struct __hfi_veswport_trap *data, u32 lid);
 
 #endif /* _HFI_VNIC_INTERNAL_H */
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema.c
new file mode 100644
index 0000000..f7d4bc1
--- /dev/null
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema.c
@@ -0,0 +1,1024 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains HFI Virtual Network Interface Controller (VNIC)
+ * Ethernet Management Agent (EMA) driver
+ */
+
+#include <linux/module.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi_vnic_internal.h"
+
+#define DRV_VERSION "1.0"
+char hfi_vnic_driver_name[] = "hfi_vnic";
+const char hfi_vnic_driver_version[] = DRV_VERSION;
+
+/*
+ * The trap service level is kept in bits 3 to 7 in the trap_sl_rsvd
+ * field in the class port info MAD.
+ */
+#define GET_TRAP_SL_FROM_CLASS_PORT_INFO(x)  (((x) >> 3) & 0x1f)
+
+/* Cap trap bursts to a reasonable limit good for normal cases */
+#define HFI_VNIC_TRAP_BURST_LIMIT 4
+
+/*
+ * VNIC trap limit timeout.
+ * Inverse of cap2_mask response time out (1.0737 secs) = 0.9
+ * secs approx IB spec 13.4.6.2.1 PortInfoSubnetTimeout and
+ * 13.4.9 Traps.
+ */
+#define HFI_VNIC_TRAP_TIMEOUT  ((4096 * (1UL << 18)) / 1000)
+
+#define HFI_VNIC_UNSUP_ATTR  \
+		cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB)
+
+#define HFI_VNIC_INVAL_ATTR  \
+		cpu_to_be16(IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE)
+
+#define HFI_VNIC_CLASS_CAP_TRAP  cpu_to_be16(1 << 8)
+
+struct hfi_class_port_info {
+	u8 base_version;
+	u8 class_version;
+	__be16 cap_mask;
+	__be32 cap_mask2_resp_time;
+
+	u8 redirect_gid[16];
+	__be32 redirect_tc_fl;
+	__be32 redirect_lid;
+	__be32 redirect_sl_qp;
+	__be32 redirect_qkey;
+
+	u8 trap_gid[16];
+	__be32 trap_tc_fl;
+	__be32 trap_lid;
+	__be32 trap_hl_qp;
+	__be32 trap_qkey;
+
+	__be16 trap_pkey;
+	__be16 redirect_pkey;
+
+	u8 trap_sl_rsvd;
+	u8 reserved[3];
+} __packed;
+
+/**
+ * struct hfi_vnic_vema_port -- VNIC VEMA port details
+ * @cport: pointer to port
+ * @mad_agent: pointer to mad agent for port
+ * @class_port_info: Class port info information.
+ * @tid: Transaction id
+ * @port_num: HFI port number
+ * @vport_idr: vnic ports idr
+ * @lock: adapter interface lock
+ */
+struct hfi_vnic_vema_port {
+	struct hfi_vnic_ctrl_port      *cport;
+	struct ib_mad_agent            *mad_agent;
+	struct hfi_class_port_info      class_port_info;
+	u64                             tid;
+	u8                              port_num;
+	struct idr                      vport_idr;
+
+	/* Lock to query/update network adapter */
+	struct mutex                    lock;
+};
+
+static void hfi_vnic_vema_add_one(struct ib_device *device);
+static void hfi_vnic_vema_rem_one(struct ib_device *device,
+				  void *client_data);
+
+static struct ib_client hfi_vnic_client = {
+	.name   = hfi_vnic_driver_name,
+	.add    = hfi_vnic_vema_add_one,
+	.remove = hfi_vnic_vema_rem_one,
+};
+
+/**
+ * vema_get_vport_num -- Get the vnic from the mad
+ * @recvd_mad:  Received mad
+ *
+ * Return: returns value of the vnic port number
+ */
+static inline u8 vema_get_vport_num(struct hfi_vnic_vema_mad *recvd_mad)
+{
+	return be32_to_cpu(recvd_mad->mad_hdr.attr_mod) >> 16 & 0xff;
+}
+
+/**
+ * vema_get_vport_adapter -- Get vnic port adapter from recvd mad
+ * @recvd_mad: received mad
+ * @port: ptr to port struct on which MAD was recvd
+ *
+ * Return: vnic adapter
+ */
+static inline struct hfi_vnic_adapter *
+vema_get_vport_adapter(struct hfi_vnic_vema_mad *recvd_mad,
+		       struct hfi_vnic_vema_port *port)
+{
+	u8 vport_num = vema_get_vport_num(recvd_mad);
+
+	return idr_find(&port->vport_idr, vport_num);
+}
+
+/**
+ * vema_mac_tbl_req_ok -- Check if mac request has correct values
+ * @mac_tbl: mac table
+ *
+ * This function checks for the validity of the offset and number of
+ * entries required.
+ *
+ * Return: true if offset and num_entries are valid
+ */
+static inline bool vema_mac_tbl_req_ok(struct hfi_veswport_mactable *mac_tbl)
+{
+	u16 offset, num_entries;
+	u16 req_entries = ((HFI_VNIC_EMA_DATA - sizeof(*mac_tbl)) /
+			   sizeof(mac_tbl->tbl_entries[0]));
+
+	offset = be16_to_cpu(mac_tbl->offset);
+	num_entries = be16_to_cpu(mac_tbl->num_entries);
+
+	return ((num_entries <= req_entries) &&
+		(offset + num_entries <= HFI_VNIC_MAC_TBL_MAX_ENTRIES));
+}
+
+/*
+ * Return the power on default values in the port info structure
+ * in big endian format as required by MAD.
+ */
+static inline void vema_get_pod_values(struct hfi_veswport_info *port_info)
+{
+	memset(port_info, 0, sizeof(*port_info));
+	port_info->vport.max_mac_tbl_ent =
+		cpu_to_be16(HFI_VNIC_MAC_TBL_MAX_ENTRIES);
+	port_info->vport.max_smac_ent =
+		cpu_to_be16(HFI_VNIC_MAX_SMAC_LIMIT);
+	port_info->vport.oper_state = HFI_VNIC_STATE_DROP_ALL;
+	port_info->vport.config_state = HFI_VNIC_STATE_DROP_ALL;
+}
+
+/**
+ * vema_add_vport -- Add a new vnic port
+ * @port: ptr to hfi_vnic_vema_port struct
+ * @vport_num: vnic port number (to be added)
+ *
+ * Return a pointer to the vnic adapter structure
+ */
+static struct hfi_vnic_adapter *vema_add_vport(struct hfi_vnic_vema_port *port,
+					       u8 vport_num)
+{
+	struct hfi_vnic_ctrl_port *cport = port->cport;
+	struct hfi_vnic_adapter *adapter;
+
+	adapter = hfi_vnic_add_vport(cport, port->port_num, vport_num);
+	if (!IS_ERR(adapter)) {
+		int rc;
+
+		rc = idr_alloc(&port->vport_idr, adapter, vport_num,
+			       vport_num + 1, GFP_NOWAIT);
+		if (rc < 0) {
+			hfi_vnic_rem_vport(adapter);
+			adapter = ERR_PTR(rc);
+		}
+	}
+
+	return adapter;
+}
+
+/**
+ * vema_get_class_port_info -- Get class info for port
+ * @port:  Port on whic MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad:   pointer to respose mad
+ *
+ * This function copies the latest class port info value set for the
+ * port and stores it for generating traps
+ */
+static void vema_get_class_port_info(struct hfi_vnic_vema_port *port,
+				     struct hfi_vnic_vema_mad *recvd_mad,
+				     struct hfi_vnic_vema_mad *rsp_mad)
+{
+	struct hfi_class_port_info *port_info;
+
+	port_info = (struct hfi_class_port_info *)rsp_mad->data;
+	memcpy(port_info, &port->class_port_info, sizeof(*port_info));
+	port_info->base_version = OPA_MGMT_BASE_VERSION,
+	port_info->class_version = HFI_EMA_CLASS_VERSION;
+
+	/* Agent generates traps */
+	port_info->cap_mask = HFI_VNIC_CLASS_CAP_TRAP;
+
+	/*
+	 * Since a get routine is always sent by the EM first we
+	 * set the expected response time to
+	 * 4.096 usec * 2^18 == 1.0737 sec here.
+	 */
+	port_info->cap_mask2_resp_time = cpu_to_be32(18);
+}
+
+/**
+ * vema_set_class_port_info -- Get class info for port
+ * @port:  Port on whic MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad:   pointer to respose mad
+ *
+ * This function updates the port class info for the specific vnic
+ * and sets up the response mad data
+ */
+static void vema_set_class_port_info(struct hfi_vnic_vema_port *port,
+				     struct hfi_vnic_vema_mad *recvd_mad,
+				     struct hfi_vnic_vema_mad *rsp_mad)
+{
+	memcpy(&port->class_port_info, recvd_mad->data,
+	       sizeof(port->class_port_info));
+
+	vema_get_class_port_info(port, recvd_mad, rsp_mad);
+}
+
+/**
+ * vema_get_veswport_info -- Get veswport info
+ * @port:      source port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad:   pointer to respose mad
+ */
+static void vema_get_veswport_info(struct hfi_vnic_vema_port *port,
+				   struct hfi_vnic_vema_mad *recvd_mad,
+				   struct hfi_vnic_vema_mad *rsp_mad)
+{
+	struct hfi_veswport_info *port_info =
+				  (struct hfi_veswport_info *)rsp_mad->data;
+	struct hfi_vnic_adapter *adapter;
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (adapter) {
+		memset(port_info, 0, sizeof(*port_info));
+		hfi_vnic_get_vesw_info(adapter, &port_info->vesw);
+		hfi_vnic_get_per_veswport_info(adapter,
+					       &port_info->vport);
+	} else {
+		vema_get_pod_values(port_info);
+	}
+}
+
+/**
+ * vema_set_veswport_info -- Set veswport info
+ * @port:      source port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad:   pointer to respose mad
+ *
+ * This function gets the port class infor for vnic
+ */
+static void vema_set_veswport_info(struct hfi_vnic_vema_port *port,
+				   struct hfi_vnic_vema_mad *recvd_mad,
+				   struct hfi_vnic_vema_mad *rsp_mad)
+{
+	struct hfi_vnic_ctrl_port *cport = port->cport;
+	struct hfi_veswport_info *port_info;
+	struct hfi_vnic_adapter *adapter;
+	u8 vport_num;
+
+	vport_num = vema_get_vport_num(recvd_mad);
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (!adapter) {
+		adapter = vema_add_vport(port, vport_num);
+		if (IS_ERR(adapter)) {
+			c_err("failed to add vport %d: %ld\n",
+			      vport_num, PTR_ERR(adapter));
+			goto err_exit;
+		}
+	}
+
+	port_info = (struct hfi_veswport_info *)recvd_mad->data;
+	hfi_vnic_set_vesw_info(adapter, &port_info->vesw);
+	hfi_vnic_set_per_veswport_info(adapter, &port_info->vport);
+
+	/* Process the new config settings */
+	hfi_vnic_process_vema_config(adapter);
+
+	vema_get_veswport_info(port, recvd_mad, rsp_mad);
+	return;
+
+err_exit:
+	rsp_mad->mad_hdr.status = HFI_VNIC_INVAL_ATTR;
+}
+
+/**
+ * vema_get_mac_entries -- Get MAC entries in VNIC MAC table
+ * @port:      source port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad:   pointer to respose mad
+ *
+ * This function gets the MAC entries that are programmed into
+ * the VNIC MAC forwarding table. It checks for the validity of
+ * the index into the MAC table and the number of entries that
+ * are to be retrieved.
+ */
+static void vema_get_mac_entries(struct hfi_vnic_vema_port *port,
+				 struct hfi_vnic_vema_mad *recvd_mad,
+				 struct hfi_vnic_vema_mad *rsp_mad)
+{
+	struct hfi_veswport_mactable *mac_tbl_in, *mac_tbl_out;
+	struct hfi_vnic_adapter *adapter;
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (!adapter) {
+		rsp_mad->mad_hdr.status = HFI_VNIC_INVAL_ATTR;
+		return;
+	}
+
+	mac_tbl_in = (struct hfi_veswport_mactable *)recvd_mad->data;
+	mac_tbl_out = (struct hfi_veswport_mactable *)rsp_mad->data;
+
+	if (vema_mac_tbl_req_ok(mac_tbl_in)) {
+		mac_tbl_out->offset = mac_tbl_in->offset;
+		mac_tbl_out->num_entries = mac_tbl_in->num_entries;
+		hfi_vnic_query_mac_tbl(adapter, mac_tbl_out);
+	} else {
+		rsp_mad->mad_hdr.status = HFI_VNIC_INVAL_ATTR;
+	}
+}
+
+/**
+ * vema_set_mac_entries -- Set MAC entries in VNIC MAC table
+ * @port:      source port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad:   pointer to respose mad
+ *
+ * This function sets the MAC entries in the VNIC forwarding table
+ * It checks for the validity of the index and the number of forwarding
+ * table entries to be programmed.
+ */
+static void vema_set_mac_entries(struct hfi_vnic_vema_port *port,
+				 struct hfi_vnic_vema_mad *recvd_mad,
+				 struct hfi_vnic_vema_mad *rsp_mad)
+{
+	struct hfi_veswport_mactable *mac_tbl;
+	struct hfi_vnic_adapter *adapter;
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (!adapter) {
+		rsp_mad->mad_hdr.status = HFI_VNIC_INVAL_ATTR;
+		return;
+	}
+
+	mac_tbl = (struct hfi_veswport_mactable *)recvd_mad->data;
+	if (vema_mac_tbl_req_ok(mac_tbl)) {
+		if (hfi_vnic_update_mac_tbl(adapter, mac_tbl))
+			rsp_mad->mad_hdr.status = HFI_VNIC_UNSUP_ATTR;
+	} else {
+		rsp_mad->mad_hdr.status = HFI_VNIC_UNSUP_ATTR;
+	}
+	vema_get_mac_entries(port, recvd_mad, rsp_mad);
+}
+
+/**
+ * vema_set_delete_vesw -- Reset VESW info to POD values
+ * @port:      source port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad:   pointer to respose mad
+ *
+ * This function clears all the fields of veswport info for the requested vesw
+ * and sets them back to the power-on default values. It does not delete the
+ * vesw.
+ */
+static void vema_set_delete_vesw(struct hfi_vnic_vema_port *port,
+				 struct hfi_vnic_vema_mad *recvd_mad,
+				 struct hfi_vnic_vema_mad *rsp_mad)
+{
+	struct hfi_veswport_info *port_info =
+				  (struct hfi_veswport_info *)rsp_mad->data;
+	struct hfi_vnic_adapter *adapter;
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (!adapter) {
+		rsp_mad->mad_hdr.status = HFI_VNIC_INVAL_ATTR;
+		return;
+	}
+
+	vema_get_pod_values(port_info);
+	hfi_vnic_set_vesw_info(adapter, &port_info->vesw);
+	hfi_vnic_set_per_veswport_info(adapter, &port_info->vport);
+
+	/* Process the new config settings */
+	hfi_vnic_process_vema_config(adapter);
+
+	hfi_vnic_release_mac_tbl(adapter);
+
+	vema_get_veswport_info(port, recvd_mad, rsp_mad);
+}
+
+/**
+ * vema_get_mac_list -- Get the unicast/multicast macs.
+ * @port:      source port on which MAD was received
+ * @recvd_mad: Received mad contains fields to set vnic parameters
+ * @rsp_mad:   Response mad to be built
+ * @attr_id:   Attribute ID indicating multicast or unicast mac list
+ */
+static void vema_get_mac_list(struct hfi_vnic_vema_port *port,
+			      struct hfi_vnic_vema_mad *recvd_mad,
+			      struct hfi_vnic_vema_mad *rsp_mad,
+			      u16 attr_id)
+{
+	struct hfi_veswport_iface_macs *macs_in, *macs_out;
+	int max_entries = (HFI_VNIC_EMA_DATA - sizeof(*macs_out)) / ETH_ALEN;
+	struct hfi_vnic_adapter *adapter;
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (!adapter) {
+		rsp_mad->mad_hdr.status = HFI_VNIC_INVAL_ATTR;
+		return;
+	}
+
+	macs_in = (struct hfi_veswport_iface_macs *)recvd_mad->data;
+	macs_out = (struct hfi_veswport_iface_macs *)rsp_mad->data;
+
+	macs_out->start_idx = macs_in->start_idx;
+	if (macs_in->num_macs_in_msg)
+		macs_out->num_macs_in_msg = macs_in->num_macs_in_msg;
+	else
+		macs_out->num_macs_in_msg = cpu_to_be16(max_entries);
+
+	if (attr_id == HFI_EM_ATTR_IFACE_MCAST_MACS)
+		hfi_vnic_query_mcast_macs(adapter, macs_out);
+	else
+		hfi_vnic_query_ucast_macs(adapter, macs_out);
+}
+
+/**
+ * vema_get_summary_counters -- Gets summary counters.
+ * @port:      source port on which MAD was received
+ * @recvd_mad: Received mad contains fields to set vnic parameters
+ * @rsp_mad:   Response mad to be built
+ */
+static void vema_get_summary_counters(struct hfi_vnic_vema_port *port,
+				      struct hfi_vnic_vema_mad *recvd_mad,
+				      struct hfi_vnic_vema_mad *rsp_mad)
+{
+	struct hfi_veswport_summary_counters *cntrs;
+	struct hfi_vnic_adapter *adapter;
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (adapter) {
+		cntrs = (struct hfi_veswport_summary_counters *)rsp_mad->data;
+		hfi_vnic_get_summary_counters(adapter, cntrs);
+	} else {
+		rsp_mad->mad_hdr.status = HFI_VNIC_INVAL_ATTR;
+	}
+}
+
+/**
+ * vema_get_error_counters -- Gets summary counters.
+ * @port:      source port on which MAD was received
+ * @recvd_mad: Received mad contains fields to set vnic parameters
+ * @rsp_mad:   Response mad to be built
+ */
+static void vema_get_error_counters(struct hfi_vnic_vema_port *port,
+				    struct hfi_vnic_vema_mad *recvd_mad,
+				    struct hfi_vnic_vema_mad *rsp_mad)
+{
+	struct hfi_veswport_error_counters *cntrs;
+	struct hfi_vnic_adapter *adapter;
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (adapter) {
+		cntrs = (struct hfi_veswport_error_counters *)rsp_mad->data;
+		hfi_vnic_get_error_counters(adapter, cntrs);
+	} else {
+		rsp_mad->mad_hdr.status = HFI_VNIC_INVAL_ATTR;
+	}
+}
+
+/**
+ * vema_get -- Process received get MAD
+ * @port:      source port on which MAD was received
+ * @recvd_mad: Received mad
+ * @rsp_mad:   Response mad to be built
+ */
+static void vema_get(struct hfi_vnic_vema_port *port,
+		     struct hfi_vnic_vema_mad *recvd_mad,
+		     struct hfi_vnic_vema_mad *rsp_mad)
+{
+	u16 attr_id = be16_to_cpu(recvd_mad->mad_hdr.attr_id);
+
+	switch (attr_id) {
+	case HFI_EM_ATTR_CLASS_PORT_INFO:
+		vema_get_class_port_info(port, recvd_mad, rsp_mad);
+		break;
+	case HFI_EM_ATTR_VESWPORT_INFO:
+		vema_get_veswport_info(port, recvd_mad, rsp_mad);
+		break;
+	case HFI_EM_ATTR_VESWPORT_MAC_ENTRIES:
+		vema_get_mac_entries(port, recvd_mad, rsp_mad);
+		break;
+	case HFI_EM_ATTR_IFACE_UCAST_MACS:
+		/* fall through */
+	case HFI_EM_ATTR_IFACE_MCAST_MACS:
+		vema_get_mac_list(port, recvd_mad, rsp_mad, attr_id);
+		break;
+	case HFI_EM_ATTR_VESWPORT_SUMMARY_COUNTERS:
+		vema_get_summary_counters(port, recvd_mad, rsp_mad);
+		break;
+	case HFI_EM_ATTR_VESWPORT_ERROR_COUNTERS:
+		vema_get_error_counters(port, recvd_mad, rsp_mad);
+		break;
+	default:
+		rsp_mad->mad_hdr.status = HFI_VNIC_UNSUP_ATTR;
+		break;
+	}
+}
+
+/**
+ * vema_set -- Process received set MAD
+ * @port:      source port on which MAD was received
+ * @recvd_mad: Received mad contains fields to set vnic parameters
+ * @rsp_mad:   Response mad to be built
+ */
+static void vema_set(struct hfi_vnic_vema_port *port,
+		     struct hfi_vnic_vema_mad *recvd_mad,
+		     struct hfi_vnic_vema_mad *rsp_mad)
+{
+	u16 attr_id = be16_to_cpu(recvd_mad->mad_hdr.attr_id);
+
+	switch (attr_id) {
+	case HFI_EM_ATTR_CLASS_PORT_INFO:
+		vema_set_class_port_info(port, recvd_mad, rsp_mad);
+		break;
+	case HFI_EM_ATTR_VESWPORT_INFO:
+		vema_set_veswport_info(port, recvd_mad, rsp_mad);
+		break;
+	case HFI_EM_ATTR_VESWPORT_MAC_ENTRIES:
+		vema_set_mac_entries(port, recvd_mad, rsp_mad);
+		break;
+	case HFI_EM_ATTR_DELETE_VESW:
+		vema_set_delete_vesw(port, recvd_mad, rsp_mad);
+		break;
+	default:
+		rsp_mad->mad_hdr.status = HFI_VNIC_UNSUP_ATTR;
+		break;
+	}
+}
+
+/**
+ * vema_send -- Send handler for VEMA MAD agent
+ * @mad_agent: pointer to the mad agent
+ * @mad_wc:    pointer to mad send work completion information
+ *
+ * Free all the data structures associated with the sent MAD
+ */
+static void vema_send(struct ib_mad_agent *mad_agent,
+		      struct ib_mad_send_wc *mad_wc)
+{
+	ib_destroy_ah(mad_wc->send_buf->ah);
+	ib_free_send_mad(mad_wc->send_buf);
+}
+
+/**
+ * vema_recv -- Recv handler for VEMA MAD agent
+ * @mad_agent: pointer to the mad agent
+ * @send_buf: Send buffer if found, else NULL
+ * @mad_wc:    pointer to mad send work completion information
+ *
+ * Handle only set and get methods and respond to other methods
+ * as unsupported. Allocate response buffer and address handle
+ * for the response MAD.
+ */
+static void vema_recv(struct ib_mad_agent *mad_agent,
+		      struct ib_mad_send_buf *send_buf,
+		      struct ib_mad_recv_wc *mad_wc)
+{
+	struct hfi_vnic_vema_port *port;
+	struct ib_ah              *ah;
+	struct ib_mad_send_buf    *rsp;
+	struct hfi_vnic_vema_mad  *vema_mad;
+
+	if (!mad_wc || !mad_wc->recv_buf.mad)
+		return;
+
+	port = mad_agent->context;
+	ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc,
+				  mad_wc->recv_buf.grh, mad_agent->port_num);
+	if (IS_ERR(ah))
+		goto free_recv_mad;
+
+	rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp,
+				 mad_wc->wc->pkey_index, 0,
+				 IB_MGMT_VENDOR_HDR, HFI_VNIC_EMA_DATA,
+				 GFP_KERNEL, OPA_MGMT_BASE_VERSION);
+	if (IS_ERR(rsp))
+		goto err_rsp;
+
+	rsp->ah = ah;
+	vema_mad = rsp->mad;
+	memcpy(vema_mad, mad_wc->recv_buf.mad, IB_MGMT_VENDOR_HDR);
+	vema_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+	vema_mad->mad_hdr.status = 0;
+
+	/* Lock ensures network adapter is not removed */
+	mutex_lock(&port->lock);
+
+	switch (mad_wc->recv_buf.mad->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+		vema_get(port, (struct hfi_vnic_vema_mad *)mad_wc->recv_buf.mad,
+			 vema_mad);
+		break;
+	case IB_MGMT_METHOD_SET:
+		vema_set(port, (struct hfi_vnic_vema_mad *)mad_wc->recv_buf.mad,
+			 vema_mad);
+		break;
+	default:
+		vema_mad->mad_hdr.status = HFI_VNIC_UNSUP_ATTR;
+		break;
+	}
+	mutex_unlock(&port->lock);
+
+	if (!ib_post_send_mad(rsp, NULL)) {
+		/*
+		 * with post send successful ah and send mad
+		 * will be destroyed in send handler
+		 */
+		goto free_recv_mad;
+	}
+
+	ib_free_send_mad(rsp);
+
+err_rsp:
+	ib_destroy_ah(ah);
+free_recv_mad:
+	ib_free_recv_mad(mad_wc);
+}
+
+/**
+ * vema_get_port -- Gets the hfi_vnic_vema_port
+ * @cport: pointer to control dev
+ * @port_num: Port number
+ *
+ * This function loops through the ports and returns
+ * the hfi_vnic_vema port structure that is associated
+ * with the HFI port number
+ *
+ * Return: ptr to requested hfi_vnic_vema_port strucure
+ *         if success, NULL if not
+ */
+static struct hfi_vnic_vema_port *
+vema_get_port(struct hfi_vnic_ctrl_port *cport, u8 port_num)
+{
+	struct hfi_vnic_vema_port *port = (void *)cport + sizeof(*cport);
+
+	if (port_num > cport->num_ports)
+		return NULL;
+
+	return port + (port_num - 1);
+}
+
+static int vema_rem_vport(int id, void *p, void *data)
+{
+	struct hfi_vnic_adapter *adapter = p;
+
+	hfi_vnic_rem_vport(adapter);
+	return 0;
+}
+
+/**
+ * vema_unregister -- Unregisters agent
+ * @cport: pointer to control port
+ *
+ * This deletes the registration by VEMA for MADs
+ */
+static void vema_unregister(struct hfi_vnic_ctrl_port *cport)
+{
+	int i;
+
+	for (i = 1; i <= cport->num_ports; i++) {
+		struct hfi_vnic_vema_port *port = vema_get_port(cport, i);
+
+		if (!port->mad_agent)
+			continue;
+
+		/* Lock ensures no MAD is being processed */
+		mutex_lock(&port->lock);
+		idr_for_each(&port->vport_idr, vema_rem_vport, NULL);
+		mutex_unlock(&port->lock);
+
+		ib_unregister_mad_agent(port->mad_agent);
+		mutex_destroy(&port->lock);
+		idr_destroy(&port->vport_idr);
+	}
+}
+
+/**
+ * vema_register -- Registers agent
+ * @cport: pointer to control port
+ *
+ * This function registers the handlers for the VEMA MADs
+ *
+ * Return: returns 0 on success. non zero otherwise
+ */
+static int vema_register(struct hfi_vnic_ctrl_port *cport)
+{
+	struct ib_mad_reg_req reg_req = {
+		.mgmt_class = HFI_MGMT_CLASS_INTEL_EMA,
+		.mgmt_class_version = OPA_MGMT_BASE_VERSION,
+		.oui = { INTEL_OUI_1, INTEL_OUI_2, INTEL_OUI_3 }
+	};
+	int i;
+
+	set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask);
+	set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask);
+
+	/* register mad agent for each port on dev */
+	for (i = 1; i <= cport->num_ports; i++) {
+		struct hfi_vnic_vema_port *port = vema_get_port(cport, i);
+
+		port->cport = cport;
+		port->port_num = i;
+		idr_init(&port->vport_idr);
+		mutex_init(&port->lock);
+		port->mad_agent = ib_register_mad_agent(cport->ibdev, i,
+							IB_QPT_GSI, &reg_req,
+							IB_MGMT_RMPP_VERSION,
+							vema_send, vema_recv,
+							port, 0);
+		if (IS_ERR(port->mad_agent)) {
+			int ret = PTR_ERR(port->mad_agent);
+
+			port->mad_agent = NULL;
+			mutex_destroy(&port->lock);
+			idr_destroy(&port->vport_idr);
+			vema_unregister(cport);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * hfi_vnic_vema_send_trap -- This function sends a trap to the EM
+ * @cport: pointer to vnic control port
+ * @data: pointer to trap data filled by calling function
+ * @lid:  issuers lid (encap_slid from vesw_port_info)
+ *
+ * This function is called from the VNIC driver to send a trap if there
+ * is somethng the EM should be notified about. These events currently
+ * are
+ * 1) UNICAST INTERFACE MACADDRESS changes
+ * 2) MULTICAST INTERFACE MACADDRESS changes
+ * 3) ETHERNET LINK STATUS changes
+ * While allocating the send mad the remote site qpn used is 1
+ * as this is the well known QP.
+ *
+ */
+void hfi_vnic_vema_send_trap(struct hfi_vnic_adapter *adapter,
+			     struct __hfi_veswport_trap *data, u32 lid)
+{
+	struct hfi_vnic_ctrl_port *cport = adapter->cport;
+	struct ib_mad_send_buf *send_buf;
+	struct hfi_vnic_vema_port *port;
+	struct ib_device *ibp;
+	struct hfi_vnic_vema_mad_trap *trap_mad;
+	struct hfi_class_port_info *class;
+	struct ib_ah_attr ah_attr;
+	struct ib_ah *ah;
+	struct hfi_veswport_trap *trap;
+	u32 trap_lid;
+	u16 pkey_idx;
+
+	if (!cport)
+		goto err_exit;
+	ibp = cport->ibdev;
+	port = vema_get_port(cport, data->hfiportnum);
+	if (!port || !port->mad_agent)
+		goto err_exit;
+
+	if (time_before(jiffies, adapter->trap_timeout)) {
+		if (adapter->trap_count == HFI_VNIC_TRAP_BURST_LIMIT) {
+			v_warn("Trap rate exceeded\n");
+			goto err_exit;
+		} else {
+			adapter->trap_count++;
+		}
+	} else {
+		adapter->trap_count = 0;
+	}
+
+	class = &port->class_port_info;
+	/* Set up address handle */
+	memset(&ah_attr, 0, sizeof(ah_attr));
+	ah_attr.sl = GET_TRAP_SL_FROM_CLASS_PORT_INFO(class->trap_sl_rsvd);
+	ah_attr.port_num = port->port_num;
+	trap_lid = be32_to_cpu(class->trap_lid);
+	/*
+	 * check for trap lid validity, must not be zero
+	 * The trap sink could change after we fashion the MAD but since traps
+	 * are not guaranteed we won't use a lock as anyway the change will take
+	 * place even with locking.
+	 */
+	if (!trap_lid) {
+		c_err("%s: Invalid dlid\n", __func__);
+		goto err_exit;
+	}
+
+	ah_attr.dlid = trap_lid;
+	ah = ib_create_ah(port->mad_agent->qp->pd, &ah_attr);
+	if (IS_ERR(ah)) {
+		c_err("%s:Couldn't create new AH = %p\n", __func__, ah);
+		c_err("%s:dlid = %d, sl = %d, port = %d\n", __func__,
+		      ah_attr.dlid, ah_attr.sl, ah_attr.port_num);
+		goto err_exit;
+	}
+
+	if (ib_find_pkey(ibp, data->hfiportnum, IB_DEFAULT_PKEY_FULL,
+			 &pkey_idx) < 0) {
+		c_err("%s:full key not found, defaulting to partial\n",
+		      __func__);
+		if (ib_find_pkey(ibp, data->hfiportnum, IB_DEFAULT_PKEY_PARTIAL,
+				 &pkey_idx) < 0)
+			pkey_idx = 1;
+	}
+
+	send_buf = ib_create_send_mad(port->mad_agent, 1, pkey_idx, 0,
+				      IB_MGMT_VENDOR_HDR, IB_MGMT_MAD_DATA,
+				      GFP_KERNEL, OPA_MGMT_BASE_VERSION);
+	if (IS_ERR(send_buf)) {
+		c_err("%s:Couldn't allocate send buf\n", __func__);
+		goto err_sndbuf;
+	}
+
+	send_buf->ah = ah;
+
+	/* Set up common MAD hdr */
+	trap_mad = send_buf->mad;
+	trap_mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION;
+	trap_mad->mad_hdr.mgmt_class = HFI_MGMT_CLASS_INTEL_EMA;
+	trap_mad->mad_hdr.class_version = HFI_EMA_CLASS_VERSION;
+	trap_mad->mad_hdr.method = IB_MGMT_METHOD_TRAP;
+	port->tid++;
+	trap_mad->mad_hdr.tid = cpu_to_be64(port->tid);
+	trap_mad->mad_hdr.attr_id = IB_SMP_ATTR_NOTICE;
+
+	/* Set up vendor OUI */
+	trap_mad->oui[0] = INTEL_OUI_1;
+	trap_mad->oui[1] = INTEL_OUI_2;
+	trap_mad->oui[2] = INTEL_OUI_3;
+
+	/* Setup notice attribute portion */
+	trap_mad->notice.gen_type = HFI_INTEL_EMA_NOTICE_TYPE_INFO << 1;
+	trap_mad->notice.oui_1 = INTEL_OUI_1;
+	trap_mad->notice.oui_2 = INTEL_OUI_2;
+	trap_mad->notice.oui_3 = INTEL_OUI_3;
+	trap_mad->notice.issuer_lid = cpu_to_be32(lid);
+
+	/* copy the actual trap data */
+	trap = (struct hfi_veswport_trap *)trap_mad->notice.raw_data;
+	trap->fabric_id = cpu_to_be16(data->fabric_id);
+	trap->veswid = cpu_to_be16(data->veswid);
+	trap->veswportnum = cpu_to_be32(data->veswportnum);
+	trap->hfiportnum = cpu_to_be16(data->hfiportnum);
+	trap->veswportindex = data->veswportindex;
+	trap->opcode = data->opcode;
+
+	/* If successful send set up rate limit timeout else bail */
+	if (ib_post_send_mad(send_buf, NULL)) {
+		ib_free_send_mad(send_buf);
+	} else {
+		if (adapter->trap_count)
+			return;
+		adapter->trap_timeout = jiffies +
+					usecs_to_jiffies(HFI_VNIC_TRAP_TIMEOUT);
+		return;
+	}
+
+err_sndbuf:
+	ib_destroy_ah(ah);
+err_exit:
+	v_err("Aborting trap\n");
+}
+
+/**
+ * hfi_vnic_vema_add_one -- Handle new ib device
+ * @device: ib device pointer
+ *
+ * Allocate the vnic control port and initialize it.
+ */
+static void hfi_vnic_vema_add_one(struct ib_device *device)
+{
+	struct hfi_vnic_ctrl_port *cport;
+	struct hfi_ibdev *hfidev;
+	int rc, size = sizeof(*cport);
+
+	if (!is_hfi_ibdev(device))
+		return;
+
+	size += device->phys_port_cnt * sizeof(struct hfi_vnic_vema_port);
+	cport = kzalloc(size, GFP_KERNEL);
+	if (!cport)
+		return;
+
+	hfidev = to_hfi_ibdev(device);
+	cport->num_ports = device->phys_port_cnt;
+	cport->ops = &hfidev->vnic_ctrl_ops;
+	cport->ibdev = device;
+
+	/* Initialize hfi vnic management agent (vema) */
+	rc = vema_register(cport);
+	if (!rc)
+		c_info("VNIC client initialized\n");
+
+	ib_set_client_data(device, &hfi_vnic_client, cport);
+}
+
+/**
+ * hfi_vnic_vema_rem_one -- Handle ib device removal
+ * @device: ib device pointer
+ * @client_data: ib client data
+ *
+ * Uninitialize and free the vnic control port.
+ */
+static void hfi_vnic_vema_rem_one(struct ib_device *device,
+				  void *client_data)
+{
+	struct hfi_vnic_ctrl_port *cport = client_data;
+
+	if (!cport)
+		return;
+
+	c_info("removing VNIC client\n");
+	vema_unregister(cport);
+	kfree(cport);
+}
+
+static int __init hfi_vnic_init(void)
+{
+	int rc;
+
+	pr_info("HFI Virtual Network Driver - v%s\n",
+		hfi_vnic_driver_version);
+
+	rc = ib_register_client(&hfi_vnic_client);
+	if (rc)
+		pr_err("VNIC driver register failed %d\n", rc);
+
+	return rc;
+}
+module_init(hfi_vnic_init);
+
+static void hfi_vnic_deinit(void)
+{
+	ib_unregister_client(&hfi_vnic_client);
+}
+module_exit(hfi_vnic_deinit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Intel HFI Virtual Network driver");
+MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema_iface.c b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema_iface.c
index f912171f..e669f80 100644
--- a/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema_iface.c
+++ b/drivers/infiniband/sw/intel/hfi_vnic/hfi_vnic_vema_iface.c
@@ -71,7 +71,7 @@ void hfi_vnic_vema_report_event(struct hfi_vnic_adapter *adapter, u8 event)
 	trap_data.veswportindex = vport->vport_num;
 	trap_data.opcode = event;
 
-	/* Need to send trap here */
+	hfi_vnic_vema_send_trap(adapter, &trap_data, info->vport.encap_slid);
 }
 
 /**
@@ -419,7 +419,7 @@ struct hfi_vnic_adapter *hfi_vnic_add_vport(struct hfi_vnic_ctrl_port *cport,
 }
 
 /**
- * hfi_vnic_rem_vport - Remove a new vnic port
+ * hfi_vnic_rem_vport - Remove the vnic port
  * @adapter: vnic adapter
  */
 void hfi_vnic_rem_vport(struct hfi_vnic_adapter *adapter)
-- 
1.8.3.1

^ permalink raw reply related

* [RFC v2 09/10] IB/hfi1: Virtual Network Interface Controller (VNIC) support
From: Vishwanathapura, Niranjana @ 2016-12-15  7:59 UTC (permalink / raw)
  To: dledford
  Cc: linux-rdma, netdev, dennis.dalessandro, ira.weiny,
	Niranjana Vishwanathapura, Andrzej Kacprowski
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura@intel.com>

HFI1 HW specific support for VNIC functionality. Add support to add
and remove VNIC ports. Also implement the operations to allocate
resources, transmit and receive of Omni-Path encapsulated Ethernet
packets.

Dynamically allocate a set of contexts for VNIC when the first vnic
port is instantiated. Allocate VNIC contexts from user contexts pool
and return them back to the same pool while freeing up. Set aside
enough MSI-X interrupts for VNIC contexts and assign them when the
contexts are allocated. On the receive side, use an RSM rule to
spread TCP/UDP streams among VNIC contexts.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Signed-off-by: Andrzej Kacprowski <andrzej.kacprowski@intel.com>
---
 drivers/infiniband/hw/hfi1/Makefile       |   2 +-
 drivers/infiniband/hw/hfi1/aspm.h         |  13 +-
 drivers/infiniband/hw/hfi1/chip.c         | 270 +++++++++++--
 drivers/infiniband/hw/hfi1/chip.h         |   2 +
 drivers/infiniband/hw/hfi1/debugfs.c      |   6 +-
 drivers/infiniband/hw/hfi1/driver.c       |  74 +++-
 drivers/infiniband/hw/hfi1/file_ops.c     |  25 +-
 drivers/infiniband/hw/hfi1/hfi.h          |  49 ++-
 drivers/infiniband/hw/hfi1/init.c         |  37 +-
 drivers/infiniband/hw/hfi1/mad.c          |   8 +-
 drivers/infiniband/hw/hfi1/pio.c          |  17 +
 drivers/infiniband/hw/hfi1/pio.h          |   6 +
 drivers/infiniband/hw/hfi1/sysfs.c        |   2 +-
 drivers/infiniband/hw/hfi1/user_exp_rcv.c |   6 +-
 drivers/infiniband/hw/hfi1/user_pages.c   |   3 +-
 drivers/infiniband/hw/hfi1/verbs.c        |   7 +
 drivers/infiniband/hw/hfi1/vnic.h         | 145 +++++++
 drivers/infiniband/hw/hfi1/vnic_main.c    | 614 ++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/vnic_sdma.c    |  60 +++
 include/rdma/opa_port_info.h              |   2 +-
 20 files changed, 1252 insertions(+), 96 deletions(-)
 create mode 100644 drivers/infiniband/hw/hfi1/vnic.h
 create mode 100644 drivers/infiniband/hw/hfi1/vnic_main.c
 create mode 100644 drivers/infiniband/hw/hfi1/vnic_sdma.c

diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index 0cf97a0..88085f6 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -12,7 +12,7 @@ hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
 	init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
 	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
 	uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
-	verbs_txreq.o
+	verbs_txreq.o vnic_main.o vnic_sdma.o
 hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
 
 CFLAGS_trace.o = -I$(src)
diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h
index 0d58fe3..3a01b69 100644
--- a/drivers/infiniband/hw/hfi1/aspm.h
+++ b/drivers/infiniband/hw/hfi1/aspm.h
@@ -229,14 +229,17 @@ static inline void aspm_ctx_timer_function(unsigned long data)
 	spin_unlock_irqrestore(&rcd->aspm_lock, flags);
 }
 
-/* Disable interrupt processing for verbs contexts when PSM contexts are open */
+/*
+ * Disable interrupt processing for verbs contexts when PSM or VNIC contexts
+ * are open.
+ */
 static inline void aspm_disable_all(struct hfi1_devdata *dd)
 {
 	struct hfi1_ctxtdata *rcd;
 	unsigned long flags;
 	unsigned i;
 
-	for (i = 0; i < dd->first_user_ctxt; i++) {
+	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
 		rcd = dd->rcd[i];
 		del_timer_sync(&rcd->aspm_timer);
 		spin_lock_irqsave(&rcd->aspm_lock, flags);
@@ -260,7 +263,7 @@ static inline void aspm_enable_all(struct hfi1_devdata *dd)
 	if (aspm_mode != ASPM_MODE_DYNAMIC)
 		return;
 
-	for (i = 0; i < dd->first_user_ctxt; i++) {
+	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
 		rcd = dd->rcd[i];
 		spin_lock_irqsave(&rcd->aspm_lock, flags);
 		rcd->aspm_intr_enable = true;
@@ -276,7 +279,7 @@ static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
 		    (unsigned long)rcd);
 	rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
 		aspm_mode == ASPM_MODE_DYNAMIC &&
-		rcd->ctxt < rcd->dd->first_user_ctxt;
+		rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt;
 }
 
 static inline void aspm_init(struct hfi1_devdata *dd)
@@ -286,7 +289,7 @@ static inline void aspm_init(struct hfi1_devdata *dd)
 	spin_lock_init(&dd->aspm_lock);
 	dd->aspm_supported = aspm_hw_l1_supported(dd);
 
-	for (i = 0; i < dd->first_user_ctxt; i++)
+	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++)
 		aspm_ctx_init(dd->rcd[i]);
 
 	/* Start with ASPM disabled */
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 9263984..472ce55 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -125,9 +125,16 @@ struct flag_table {
 #define DEFAULT_KRCVQS		  2
 #define MIN_KERNEL_KCTXTS         2
 #define FIRST_KERNEL_KCTXT        1
-/* sizes for both the QP and RSM map tables */
-#define NUM_MAP_ENTRIES		256
-#define NUM_MAP_REGS             32
+
+/*
+ * RSM instance allocation
+ *   0 - Verbs
+ *   1 - User Fecn Handling
+ *   2 - Vnic
+ */
+#define RSM_INS_VERBS             0
+#define RSM_INS_FECN              1
+#define RSM_INS_VNIC              2
 
 /* Bit offset into the GUID which carries HFI id information */
 #define GUID_HFI_INDEX_SHIFT     39
@@ -138,8 +145,7 @@ struct flag_table {
 #define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
 #define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
 
-/* RSM fields */
-
+/* RSM fields for Verbs */
 /* packet type */
 #define IB_PACKET_TYPE         2ull
 #define QW_SHIFT               6ull
@@ -169,6 +175,28 @@ struct flag_table {
 /* QPN[m+n:1] QW 1, OFFSET 1 */
 #define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
 
+/* RSM fields for Vnic */
+/* L2_TYPE: QW 0, OFFSET 61 - for match */
+#define L2_TYPE_QW             0ull
+#define L2_TYPE_BIT_OFFSET     61ull
+#define L2_TYPE_OFFSET(off)    ((L2_TYPE_QW << QW_SHIFT) | (off))
+#define L2_TYPE_MATCH_OFFSET   L2_TYPE_OFFSET(L2_TYPE_BIT_OFFSET)
+#define L2_TYPE_MASK           3ull
+#define L2_16B_VALUE           2ull
+
+/* L4_TYPE QW 1, OFFSET 0 - for match */
+#define L4_TYPE_QW              1ull
+#define L4_TYPE_BIT_OFFSET      0ull
+#define L4_TYPE_OFFSET(off)     ((L4_TYPE_QW << QW_SHIFT) | (off))
+#define L4_TYPE_MATCH_OFFSET    L4_TYPE_OFFSET(L4_TYPE_BIT_OFFSET)
+#define L4_16B_TYPE_MASK        0xFFull
+#define L4_16B_ETH_VALUE        0x78ull
+
+/* 16B VESWID - for select */
+#define L4_16B_HDR_VESWID_OFFSET  ((2 << QW_SHIFT) | (16ull))
+/* 16B ENTROPY - for select */
+#define L2_16B_ENTROPY_OFFSET     ((1 << QW_SHIFT) | (32ull))
+
 /* defines to build power on SC2VL table */
 #define SC2VL_VAL( \
 	num, \
@@ -1045,6 +1073,7 @@ static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
 			   unsigned int *np);
 static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd);
+static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index);
 
 /*
  * Error interrupt table entry.  This is used as input to the interrupt
@@ -6712,7 +6741,13 @@ static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
 	int i;
 
 	/* enable all kernel contexts */
-	for (i = 0; i < dd->n_krcv_queues; i++) {
+	for (i = 0; i < dd->num_rcv_contexts; i++) {
+		struct hfi1_ctxtdata *rcd = dd->rcd[i];
+
+		/* Ensure all non-user contexts(including vnic) are enabled */
+		if (!rcd || !rcd->sc || (rcd->sc->type == SC_USER))
+			continue;
+
 		rcvmask = HFI1_RCVCTRL_CTXT_ENB;
 		/* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
 		rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
@@ -8004,7 +8039,9 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
 	if (likely(source < dd->num_rcv_contexts)) {
 		rcd = dd->rcd[source];
 		if (rcd) {
-			if (source < dd->first_user_ctxt)
+			/* Check for non-user contexts, including vnic */
+			if ((source < dd->first_dyn_alloc_ctxt) ||
+			    (rcd->sc && (rcd->sc->type == SC_KERNEL)))
 				rcd->do_interrupt(rcd, 0);
 			else
 				handle_user_interrupt(rcd);
@@ -8032,7 +8069,8 @@ static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
 		rcd = dd->rcd[source];
 		if (rcd) {
 			/* only pay attention to user urgent interrupts */
-			if (source >= dd->first_user_ctxt)
+			if ((source >= dd->first_dyn_alloc_ctxt) &&
+			    (!rcd->sc || (rcd->sc->type == SC_USER)))
 				handle_user_interrupt(rcd);
 			return;	/* OK */
 		}
@@ -12736,7 +12774,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd)
 	first_sdma = last_general;
 	last_sdma = first_sdma + dd->num_sdma;
 	first_rx = last_sdma;
-	last_rx = first_rx + dd->n_krcv_queues;
+	last_rx = first_rx + dd->n_krcv_queues + HFI1_NUM_VNIC_CTXT;
 
 	/*
 	 * Sanity check - the code expects all SDMA chip source
@@ -12750,7 +12788,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd)
 		const char *err_info;
 		irq_handler_t handler;
 		irq_handler_t thread = NULL;
-		void *arg;
+		void *arg = NULL;
 		int idx;
 		struct hfi1_ctxtdata *rcd = NULL;
 		struct sdma_engine *sde = NULL;
@@ -12777,24 +12815,24 @@ static int request_msix_irqs(struct hfi1_devdata *dd)
 		} else if (first_rx <= i && i < last_rx) {
 			idx = i - first_rx;
 			rcd = dd->rcd[idx];
-			/* no interrupt if no rcd */
-			if (!rcd)
-				continue;
-			/*
-			 * Set the interrupt register and mask for this
-			 * context's interrupt.
-			 */
-			rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-			rcd->imask = ((u64)1) <<
-					((IS_RCVAVAIL_START + idx) % 64);
-			handler = receive_context_interrupt;
-			thread = receive_context_thread;
-			arg = rcd;
-			snprintf(me->name, sizeof(me->name),
-				 DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
-			err_info = "receive context";
-			remap_intr(dd, IS_RCVAVAIL_START + idx, i);
-			me->type = IRQ_RCVCTXT;
+			if (rcd) {
+				/*
+				 * Set the interrupt register and mask for this
+				 * context's interrupt.
+				 */
+				rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
+				rcd->imask = ((u64)1) <<
+					  ((IS_RCVAVAIL_START + idx) % 64);
+				handler = receive_context_interrupt;
+				thread = receive_context_thread;
+				arg = rcd;
+				snprintf(me->name, sizeof(me->name),
+					 DRIVER_NAME "_%d kctxt%d",
+					 dd->unit, idx);
+				err_info = "receive context";
+				remap_intr(dd, IS_RCVAVAIL_START + idx, i);
+				me->type = IRQ_RCVCTXT;
+			}
 		} else {
 			/* not in our expected range - complain, then
 			 * ignore it
@@ -12832,6 +12870,67 @@ static int request_msix_irqs(struct hfi1_devdata *dd)
 	return ret;
 }
 
+void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd)
+{
+	int idx = rcd->ctxt;
+	struct hfi1_devdata *dd = rcd->dd;
+	int i = 1 + dd->num_sdma + idx;
+	struct hfi1_msix_entry *me = &dd->msix_entries[i];
+
+	if (!me->arg) /* => no irq, no affinity */
+		return;
+
+	hfi1_put_irq_affinity(dd, me);
+	free_irq(me->msix.vector, me->arg);
+
+	me->arg = NULL;
+}
+
+void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd)
+{
+	int idx = rcd->ctxt;
+	void *arg = rcd;
+	int ret;
+	struct hfi1_devdata *dd = rcd->dd;
+	int i = 1 + dd->num_sdma + idx;
+	struct hfi1_msix_entry *me = &dd->msix_entries[i];
+
+	/*
+	 * Set the interrupt register and mask for this
+	 * context's interrupt.
+	 */
+	rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
+	rcd->imask = ((u64)1) <<
+		  ((IS_RCVAVAIL_START + idx) % 64);
+
+	snprintf(me->name, sizeof(me->name),
+		 DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
+	me->name[sizeof(me->name) - 1] = 0;
+	me->type = IRQ_RCVCTXT;
+
+	remap_intr(dd, IS_RCVAVAIL_START + idx, i);
+
+	ret = request_threaded_irq(me->msix.vector, receive_context_interrupt,
+				   receive_context_thread, 0, me->name, arg);
+	if (ret) {
+		dd_dev_err(dd, "vnic irq request (vector %d, idx %d) fail %d\n",
+			   me->msix.vector, idx, ret);
+		return;
+	}
+	/*
+	 * assign arg after request_irq call, so it will be
+	 * cleaned up
+	 */
+	me->arg = arg;
+
+	ret = hfi1_get_irq_affinity(dd, me);
+	if (ret) {
+		dd_dev_err(dd,
+			   "unable to pin IRQ %d\n", ret);
+		free_irq(me->msix.vector, me->arg);
+	}
+}
+
 /*
  * Set the general handler to accept all interrupts, remap all
  * chip interrupts back to MSI-X 0.
@@ -12863,7 +12962,7 @@ static int set_up_interrupts(struct hfi1_devdata *dd)
 	 *	N interrupts - one per used SDMA engine
 	 *	M interrupt - one per kernel receive context
 	 */
-	total = 1 + dd->num_sdma + dd->n_krcv_queues;
+	total = 1 + dd->num_sdma + dd->n_krcv_queues + HFI1_NUM_VNIC_CTXT;
 
 	entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
 	if (!entries) {
@@ -12928,7 +13027,8 @@ static int set_up_interrupts(struct hfi1_devdata *dd)
  *
  *	num_rcv_contexts - number of contexts being used
  *	n_krcv_queues - number of kernel contexts
- *	first_user_ctxt - first non-kernel context in array of contexts
+ *	first_dyn_alloc_ctxt - first dynamically allocated context
+ *                             in array of contexts
  *	freectxts  - number of free user contexts
  *	num_send_contexts - number of PIO send contexts being used
  */
@@ -13005,10 +13105,14 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 		total_contexts = num_kernel_contexts + num_user_contexts;
 	}
 
-	/* the first N are kernel contexts, the rest are user contexts */
+	/* Accommodate VNIC contexts */
+	if ((total_contexts + HFI1_NUM_VNIC_CTXT) <= dd->chip_rcv_contexts)
+		total_contexts += HFI1_NUM_VNIC_CTXT;
+
+	/* the first N are kernel contexts, the rest are user/vnic contexts */
 	dd->num_rcv_contexts = total_contexts;
 	dd->n_krcv_queues = num_kernel_contexts;
-	dd->first_user_ctxt = num_kernel_contexts;
+	dd->first_dyn_alloc_ctxt = num_kernel_contexts;
 	dd->num_user_contexts = num_user_contexts;
 	dd->freectxts = num_user_contexts;
 	dd_dev_info(dd,
@@ -13464,11 +13568,8 @@ static void reset_rxe_csrs(struct hfi1_devdata *dd)
 		write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
 	for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
 		write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
-	for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
-		write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
-		write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
-		write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
-	}
+	for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++)
+		clear_rsm_rule(dd, i);
 	for (i = 0; i < 32; i++)
 		write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
 
@@ -13827,6 +13928,16 @@ static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
 		  (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
 }
 
+/*
+ * Clear a receive side mapping rule.
+ */
+static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index)
+{
+	write_csr(dd, RCV_RSM_CFG + (8 * rule_index), 0);
+	write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), 0);
+	write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), 0);
+}
+
 /* return the number of RSM map table entries that will be used for QOS */
 static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
 			   unsigned int *np)
@@ -13942,7 +14053,7 @@ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
 	rrd.value2 = LRH_SC_VALUE;
 
 	/* add rule 0 */
-	add_rsm_rule(dd, 0, &rrd);
+	add_rsm_rule(dd, RSM_INS_VERBS, &rrd);
 
 	/* mark RSM map entries as used */
 	rmt->used += rmt_entries;
@@ -13972,7 +14083,7 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd,
 	/*
 	 * RSM will extract the destination context as an index into the
 	 * map table.  The destination contexts are a sequential block
-	 * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+	 * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive).
 	 * Map entries are accessed as offset + extracted value.  Adjust
 	 * the added offset so this sequence can be placed anywhere in
 	 * the table - as long as the entries themselves do not wrap.
@@ -13980,9 +14091,9 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd,
 	 * start with that to allow for a "negative" offset.
 	 */
 	offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
-						(int)dd->first_user_ctxt);
+						(int)dd->first_dyn_alloc_ctxt);
 
-	for (i = dd->first_user_ctxt, idx = rmt->used;
+	for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used;
 				i < dd->num_rcv_contexts; i++, idx++) {
 		/* replace with identity mapping */
 		regoff = (idx % 8) * 8;
@@ -14016,11 +14127,84 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd,
 	rrd.value2 = 1;
 
 	/* add rule 1 */
-	add_rsm_rule(dd, 1, &rrd);
+	add_rsm_rule(dd, RSM_INS_FECN, &rrd);
 
 	rmt->used += dd->num_user_contexts;
 }
 
+/* Initialize RSM for VNIC */
+void hfi1_init_vnic_rsm(struct hfi1_devdata *dd)
+{
+	u8 i, j;
+	u8 ctx_id = 0;
+	u64 reg;
+	u32 regoff;
+	struct rsm_rule_data rrd;
+
+	if (hfi1_vnic_is_rsm_full(dd, NUM_VNIC_MAP_ENTRIES)) {
+		dd_dev_err(dd, "Vnic RSM disabled, rmt entries used = %d\n",
+			   dd->vnic.rmt_start);
+		return;
+	}
+
+	dev_dbg(&(dd)->pcidev->dev, "Vnic rsm start = %d, end %d\n",
+		dd->vnic.rmt_start,
+		dd->vnic.rmt_start + NUM_VNIC_MAP_ENTRIES);
+
+	/* Update RSM mapping table, 32 regs, 256 entries - 1 ctx per byte */
+	regoff = RCV_RSM_MAP_TABLE + (dd->vnic.rmt_start / 8) * 8;
+	reg = read_csr(dd, regoff);
+	for (i = 0; i < NUM_VNIC_MAP_ENTRIES; i++) {
+		/* Update map register with vnic context */
+		j = (dd->vnic.rmt_start + i) % 8;
+		reg &= ~(0xffllu << (j * 8));
+		reg |= (u64)dd->vnic.ctxt[ctx_id++]->ctxt << (j * 8);
+		/* Wrap up vnic ctx index */
+		ctx_id %= dd->vnic.num_ctxt;
+		/* Write back map register */
+		if (j == 7 || ((i + 1) == NUM_VNIC_MAP_ENTRIES)) {
+			dev_dbg(&(dd)->pcidev->dev,
+				"Vnic rsm map reg[%d] =0x%llx\n",
+				regoff - RCV_RSM_MAP_TABLE, reg);
+
+			write_csr(dd, regoff, reg);
+			regoff += 8;
+			if (i < (NUM_VNIC_MAP_ENTRIES - 1))
+				reg = read_csr(dd, regoff);
+		}
+	}
+
+	/* Add rule for vnic */
+	rrd.offset = dd->vnic.rmt_start;
+	rrd.pkt_type = 4;
+	/* Match 16B packets */
+	rrd.field1_off = L2_TYPE_MATCH_OFFSET;
+	rrd.mask1 = L2_TYPE_MASK;
+	rrd.value1 = L2_16B_VALUE;
+	/* Match ETH L4 packets */
+	rrd.field2_off = L4_TYPE_MATCH_OFFSET;
+	rrd.mask2 = L4_16B_TYPE_MASK;
+	rrd.value2 = L4_16B_ETH_VALUE;
+	/* Calc context from veswid and entropy */
+	rrd.index1_off = L4_16B_HDR_VESWID_OFFSET;
+	rrd.index1_width = ilog2(NUM_VNIC_MAP_ENTRIES);
+	rrd.index2_off = L2_16B_ENTROPY_OFFSET;
+	rrd.index2_width = ilog2(NUM_VNIC_MAP_ENTRIES);
+	add_rsm_rule(dd, RSM_INS_VNIC, &rrd);
+
+	/* Enable RSM if not already enabled */
+	add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+}
+
+void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd)
+{
+	clear_rsm_rule(dd, RSM_INS_VNIC);
+
+	/* Disable RSM if used only by vnic */
+	if (dd->vnic.rmt_start == 0)
+		clear_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+}
+
 static void init_rxe(struct hfi1_devdata *dd)
 {
 	struct rsm_map_table *rmt;
@@ -14033,6 +14217,8 @@ static void init_rxe(struct hfi1_devdata *dd)
 	init_qos(dd, rmt);
 	init_user_fecn_handling(dd, rmt);
 	complete_rsm_map_table(dd, rmt);
+	/* record number of used rsm map entries for vnic */
+	dd->vnic.rmt_start = rmt->used;
 	kfree(rmt);
 
 	/*
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 9234525..1e177f5 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1355,6 +1355,8 @@ void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
 int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
 int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
 void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
+void hfi1_init_vnic_rsm(struct hfi1_devdata *dd);
+void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd);
 
 /*
  * Interrupt source table.
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
index 8725f4c..dc8bb86 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.c
+++ b/drivers/infiniband/hw/hfi1/debugfs.c
@@ -169,7 +169,7 @@ static int _opcode_stats_seq_show(struct seq_file *s, void *v)
 	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
 	struct hfi1_devdata *dd = dd_from_dev(ibd);
 
-	for (j = 0; j < dd->first_user_ctxt; j++) {
+	for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) {
 		if (!dd->rcd[j])
 			continue;
 		n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
@@ -195,7 +195,7 @@ static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
 
 	if (!*pos)
 		return SEQ_START_TOKEN;
-	if (*pos >= dd->first_user_ctxt)
+	if (*pos >= dd->first_dyn_alloc_ctxt)
 		return NULL;
 	return pos;
 }
@@ -209,7 +209,7 @@ static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
 		return pos;
 
 	++*pos;
-	if (*pos >= dd->first_user_ctxt)
+	if (*pos >= dd->first_dyn_alloc_ctxt)
 		return NULL;
 	return pos;
 }
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index e219c3b..fbe40b3 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -59,6 +59,7 @@
 #include "trace.h"
 #include "qp.h"
 #include "sdma.h"
+#include "vnic.h"
 
 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@ -860,20 +861,42 @@ int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread)
 	return last;
 }
 
-static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
+static inline void set_nodma_rtail(struct hfi1_devdata *dd, u8 ctxt)
 {
 	int i;
 
-	for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+	/*
+	 * For dynamically allocated kernel contexts (like vnic) switch
+	 * interrupt handler only for that context. Otherwise, switch
+	 * interrupt handler for all statically allocated kernel contexts.
+	 */
+	if (ctxt >= dd->first_dyn_alloc_ctxt) {
+		dd->rcd[ctxt]->do_interrupt =
+			&handle_receive_interrupt_nodma_rtail;
+		return;
+	}
+
+	for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++)
 		dd->rcd[i]->do_interrupt =
 			&handle_receive_interrupt_nodma_rtail;
 }
 
-static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
+static inline void set_dma_rtail(struct hfi1_devdata *dd, u8 ctxt)
 {
 	int i;
 
-	for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+	/*
+	 * For dynamically allocated kernel contexts (like vnic) switch
+	 * interrupt handler only for that context. Otherwise, switch
+	 * interrupt handler for all statically allocated kernel contexts.
+	 */
+	if (ctxt >= dd->first_dyn_alloc_ctxt) {
+		dd->rcd[ctxt]->do_interrupt =
+			&handle_receive_interrupt_dma_rtail;
+		return;
+	}
+
+	for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++)
 		dd->rcd[i]->do_interrupt =
 			&handle_receive_interrupt_dma_rtail;
 }
@@ -883,8 +906,13 @@ void set_all_slowpath(struct hfi1_devdata *dd)
 	int i;
 
 	/* HFI1_CTRL_CTXT must always use the slow path interrupt handler */
-	for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
-		dd->rcd[i]->do_interrupt = &handle_receive_interrupt;
+	for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) {
+		struct hfi1_ctxtdata *rcd = dd->rcd[i];
+
+		if ((i < dd->first_dyn_alloc_ctxt) ||
+		    (rcd && rcd->sc && (rcd->sc->type == SC_KERNEL)))
+			rcd->do_interrupt = &handle_receive_interrupt;
+	}
 }
 
 static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
@@ -994,7 +1022,7 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
 				last = RCV_PKT_DONE;
 			if (needset) {
 				dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
-				set_all_nodma_rtail(dd);
+				set_nodma_rtail(dd, rcd->ctxt);
 				needset = 0;
 			}
 		} else {
@@ -1016,7 +1044,7 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
 			if (needset) {
 				dd_dev_info(dd,
 					    "Switching to DMA_RTAIL\n");
-				set_all_dma_rtail(dd);
+				set_dma_rtail(dd, rcd->ctxt);
 				needset = 0;
 			}
 		}
@@ -1064,10 +1092,10 @@ void receive_interrupt_work(struct work_struct *work)
 	set_link_state(ppd, HLS_UP_ACTIVE);
 
 	/*
-	 * Interrupt all kernel contexts that could have had an
-	 * interrupt during auto activation.
+	 * Interrupt all statically allocated kernel contexts that could
+	 * have had an interrupt during auto activation.
 	 */
-	for (i = HFI1_CTRL_CTXT; i < dd->first_user_ctxt; i++)
+	for (i = HFI1_CTRL_CTXT; i < dd->first_dyn_alloc_ctxt; i++)
 		force_recv_intr(dd->rcd[i]);
 }
 
@@ -1281,7 +1309,8 @@ int hfi1_reset_device(int unit)
 
 	spin_lock_irqsave(&dd->uctxt_lock, flags);
 	if (dd->rcd)
-		for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+		for (i = dd->first_dyn_alloc_ctxt;
+		     i < dd->num_rcv_contexts; i++) {
 			if (!dd->rcd[i] || !dd->rcd[i]->cnt)
 				continue;
 			spin_unlock_irqrestore(&dd->uctxt_lock, flags);
@@ -1359,13 +1388,30 @@ int process_receive_ib(struct hfi1_packet *packet)
 	return RHF_RCV_CONTINUE;
 }
 
+static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet)
+{
+	/* Packet received in VNIC context via RSM */
+	if (packet->rcd->is_vnic)
+		return true;
+
+	if ((HFI1_GET_L2_TYPE(packet->ebuf) == HFI1_L2_TYPE_HDR_16B) &&
+	    (HFI1_GET_L4_TYPE(packet->ebuf) == HFI1_VNIC_L4_ETHR))
+		return true;
+
+	return false;
+}
+
 int process_receive_bypass(struct hfi1_packet *packet)
 {
-	if (unlikely(rhf_err_flags(packet->rhf)))
+	if (unlikely(rhf_err_flags(packet->rhf))) {
 		handle_eflags(packet);
+	} else if (hfi1_is_vnic_packet(packet)) {
+		hfi1_vnic_bypass_rcv(packet);
+		return RHF_RCV_CONTINUE;
+	}
 
 	dd_dev_err(packet->rcd->dd,
-		   "Bypass packets are not supported in normal operation. Dropping\n");
+		   "Unsupported bypass packet. Dropping\n");
 	incr_cntr64(&packet->rcd->dd->sw_rcv_bypass_packet_errors);
 	return RHF_RCV_CONTINUE;
 }
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 677efa0..863fbbb 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -576,8 +576,8 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 		 * knows where it's own bitmap is within the page.
 		 */
 		memaddr = (unsigned long)(dd->events +
-					  ((uctxt->ctxt - dd->first_user_ctxt) *
-					   HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
+				  ((uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
+				   HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
 		memlen = PAGE_SIZE;
 		/*
 		 * v3.7 removes VM_RESERVED but the effect is kept by
@@ -746,7 +746,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 	 * Clear any left over, unhandled events so the next process that
 	 * gets this context doesn't get confused.
 	 */
-	ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+	ev = dd->events + ((uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
 			   HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
 	*ev = 0;
 
@@ -895,12 +895,18 @@ static int find_shared_ctxt(struct file *fp,
 
 		if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
 			continue;
-		for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+		for (i = dd->first_dyn_alloc_ctxt;
+		     i < dd->num_rcv_contexts; i++) {
 			struct hfi1_ctxtdata *uctxt = dd->rcd[i];
 
 			/* Skip ctxts which are not yet open */
 			if (!uctxt || !uctxt->cnt)
 				continue;
+
+			/* Skip dynamically allocted kernel contexts */
+			if (uctxt->sc && (uctxt->sc->type == SC_KERNEL))
+				continue;
+
 			/* Skip ctxt if it doesn't match the requested one */
 			if (memcmp(uctxt->uuid, uinfo->uuid,
 				   sizeof(uctxt->uuid)) ||
@@ -946,7 +952,8 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
 		return -EIO;
 	}
 
-	for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
+	for (ctxt = dd->first_dyn_alloc_ctxt;
+	     ctxt < dd->num_rcv_contexts; ctxt++)
 		if (!dd->rcd[ctxt])
 			break;
 
@@ -1292,7 +1299,7 @@ static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
 	 */
 	binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
 					    fd->subctxt, 0);
-	offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) *
+	offset = offset_in_page((((uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
 		    HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
 		  sizeof(*dd->events));
 	binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
@@ -1386,12 +1393,12 @@ int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
 	}
 
 	spin_lock_irqsave(&dd->uctxt_lock, flags);
-	for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
+	for (ctxt = dd->first_dyn_alloc_ctxt; ctxt < dd->num_rcv_contexts;
 	     ctxt++) {
 		uctxt = dd->rcd[ctxt];
 		if (uctxt) {
 			unsigned long *evs = dd->events +
-				(uctxt->ctxt - dd->first_user_ctxt) *
+				(uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
 				HFI1_MAX_SHARED_CTXTS;
 			int i;
 			/*
@@ -1463,7 +1470,7 @@ static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
 	if (!dd->events)
 		return 0;
 
-	evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+	evs = dd->events + ((uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
 			    HFI1_MAX_SHARED_CTXTS) + subctxt;
 
 	for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 1fc5b68..78d1726 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -54,6 +54,7 @@
 #include <linux/list.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <linux/idr.h>
 #include <linux/io.h>
 #include <linux/fs.h>
 #include <linux/completion.h>
@@ -66,6 +67,7 @@
 #include <linux/i2c-algo-bit.h>
 #include <rdma/ib_hdrs.h>
 #include <linux/rhashtable.h>
+#include <linux/netdevice.h>
 #include <rdma/rdma_vt.h>
 
 #include "chip_registers.h"
@@ -337,6 +339,12 @@ struct hfi1_ctxtdata {
 	 * packets with the wrong interrupt handler.
 	 */
 	int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded);
+
+	/* Indicates that this is vnic context */
+	bool is_vnic;
+
+	/* vnic queue index this context is mapped to */
+	u8 vnic_q_idx;
 };
 
 /*
@@ -831,6 +839,30 @@ struct hfi1_asic_data {
 	struct hfi1_i2c_bus *i2c_bus1;
 };
 
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES	 256
+#define NUM_MAP_REGS      32
+
+/*
+ * Number of VNIC contexts used. Ensure it is less than or equal to
+ * max queues supported by VNIC (HFI_VNIC_MAX_QUEUE).
+ */
+#define HFI1_NUM_VNIC_CTXT   8
+
+/* Number of VNIC RSM entries */
+#define NUM_VNIC_MAP_ENTRIES     8
+
+/* Virtual NIC information */
+struct hfi1_vnic_data {
+	struct hfi1_ctxtdata *ctxt[HFI1_NUM_VNIC_CTXT];
+	u8 num_vports;
+	struct idr vesw_idr;
+	u8 rmt_start;
+	u8 num_ctxt;
+};
+
+struct hfi1_vnic_vport_info;
+
 /* device data struct now contains only "general per-device" info.
  * fields related to a physical IB port are in a hfi1_pportdata struct.
  */
@@ -1140,6 +1172,9 @@ struct hfi1_devdata {
 	send_routine process_dma_send;
 	void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
 				u64 pbc, const void *from, size_t count);
+	int (*process_vnic_dma_send)(struct hfi1_devdata *dd, u8 q_idx,
+				     struct hfi1_vnic_vport_info *vinfo,
+				     struct sk_buff *skb, u64 pbc, u8 plen);
 	/* hfi1_pportdata, points to array of (physical) port-specific
 	 * data structs, indexed by pidx (0..n-1)
 	 */
@@ -1151,8 +1186,8 @@ struct hfi1_devdata {
 	u16 flags;
 	/* Number of physical ports available */
 	u8 num_pports;
-	/* Lowest context number which can be used by user processes */
-	u8 first_user_ctxt;
+	/* Lowest context number which can be used by user processes or VNIC */
+	u8 first_dyn_alloc_ctxt;
 	/* adding a new field here would make it part of this cacheline */
 
 	/* seqlock for sc2vl */
@@ -1191,8 +1226,16 @@ struct hfi1_devdata {
 	struct rhashtable sdma_rht;
 
 	struct kobject kobj;
+
+	/* vnic data */
+	struct hfi1_vnic_data vnic;
 };
 
+static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare)
+{
+	return (dd->vnic.rmt_start + spare) > NUM_MAP_ENTRIES;
+}
+
 /* 8051 firmware version helper */
 #define dc8051_ver(a, b) ((a) << 8 | (b))
 #define dc8051_ver_maj(a) ((a & 0xff00) >> 8)
@@ -1258,6 +1301,8 @@ void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
 int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int);
 int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int);
 void set_all_slowpath(struct hfi1_devdata *dd);
+void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd);
+void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd);
 
 extern const struct pci_device_id hfi1_pci_tbl[];
 
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 13f6862..0b510d5 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -65,6 +65,7 @@
 #include "verbs.h"
 #include "aspm.h"
 #include "affinity.h"
+#include "vnic.h"
 
 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@ -139,7 +140,7 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd)
 		goto nomem;
 
 	/* create one or more kernel contexts */
-	for (i = 0; i < dd->first_user_ctxt; ++i) {
+	for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
 		struct hfi1_pportdata *ppd;
 		struct hfi1_ctxtdata *rcd;
 
@@ -213,9 +214,9 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
 	u32 base;
 
 	if (dd->rcv_entries.nctxt_extra >
-	    dd->num_rcv_contexts - dd->first_user_ctxt)
+	    dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
 		kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
-				 (dd->num_rcv_contexts - dd->first_user_ctxt));
+			 (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
 	rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
 	if (rcd) {
 		u32 rcvtids, max_entries;
@@ -237,10 +238,10 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
 		 * Calculate the context's RcvArray entry starting point.
 		 * We do this here because we have to take into account all
 		 * the RcvArray entries that previous context would have
-		 * taken and we have to account for any extra groups
-		 * assigned to the kernel or user contexts.
+		 * taken and we have to account for any extra groups assigned
+		 * to the static (kernel) or dynamic (vnic/user) contexts.
 		 */
-		if (ctxt < dd->first_user_ctxt) {
+		if (ctxt < dd->first_dyn_alloc_ctxt) {
 			if (ctxt < kctxt_ngroups) {
 				base = ctxt * (dd->rcv_entries.ngroups + 1);
 				rcd->rcv_array_groups++;
@@ -248,7 +249,7 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
 				base = kctxt_ngroups +
 					(ctxt * dd->rcv_entries.ngroups);
 		} else {
-			u16 ct = ctxt - dd->first_user_ctxt;
+			u16 ct = ctxt - dd->first_dyn_alloc_ctxt;
 
 			base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
 				kctxt_ngroups);
@@ -327,7 +328,8 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
 		}
 		rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
 
-		if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+		/* Applicable only for statically created kernel contexts */
+		if (ctxt < dd->first_dyn_alloc_ctxt) {
 			rcd->opstats = kzalloc(sizeof(*rcd->opstats),
 				GFP_KERNEL);
 			if (!rcd->opstats)
@@ -591,7 +593,7 @@ static void enable_chip(struct hfi1_devdata *dd)
 	 * Enable kernel ctxts' receive and receive interrupt.
 	 * Other ctxts done as user opens and initializes them.
 	 */
-	for (i = 0; i < dd->first_user_ctxt; ++i) {
+	for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
 		rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
 		rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
 			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
@@ -685,6 +687,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
 	dd->process_pio_send = hfi1_verbs_send_pio;
 	dd->process_dma_send = hfi1_verbs_send_dma;
 	dd->pio_inline_send = pio_copy;
+	dd->process_vnic_dma_send = hfi1_vnic_send_dma;
 
 	if (is_ax(dd)) {
 		atomic_set(&dd->drop_packet, DROP_PACKET_ON);
@@ -720,7 +723,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
 	}
 
 	/* dd->rcd can be NULL if early initialization failed */
-	for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
 		/*
 		 * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
 		 * re-init, the simplest way to handle this is to free
@@ -1489,6 +1492,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* do the generic initialization */
 	initfail = hfi1_init(dd, 0);
 
+	/* setup vnic */
+	hfi1_vnic_setup(dd);
+
 	ret = hfi1_register_ib_device(dd);
 
 	/*
@@ -1522,6 +1528,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			hfi1_device_remove(dd);
 		if (!ret)
 			hfi1_unregister_ib_device(dd);
+		hfi1_vnic_cleanup(dd);
 		postinit_cleanup(dd);
 		if (initfail)
 			ret = initfail;
@@ -1547,6 +1554,9 @@ static void remove_one(struct pci_dev *pdev)
 	/* unregister from IB core */
 	hfi1_unregister_ib_device(dd);
 
+	/* cleanup vnic */
+	hfi1_vnic_cleanup(dd);
+
 	/*
 	 * Disable the IB link, disable interrupts on the device,
 	 * clear dma engines, etc.
@@ -1588,8 +1598,11 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 		amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
 				 sizeof(u32));
 
-		gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
-			GFP_USER : GFP_KERNEL;
+		if ((rcd->ctxt < dd->first_dyn_alloc_ctxt) ||
+		    (rcd->sc && (rcd->sc->type == SC_KERNEL)))
+			gfp_flags = GFP_KERNEL;
+		else
+			gfp_flags = GFP_USER;
 		rcd->rcvhdrq = dma_zalloc_coherent(
 			&dd->pcidev->dev, amt, &rcd->rcvhdrq_dma,
 			gfp_flags | __GFP_COMP);
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
index ed8ae22..cca61e4 100644
--- a/drivers/infiniband/hw/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -53,6 +53,7 @@
 #include "mad.h"
 #include "trace.h"
 #include "qp.h"
+#include "vnic.h"
 
 /* the reset value from the FM is supposed to be 0xffff, handle both */
 #define OPA_LINK_WIDTH_RESET_OLD 0x0fff
@@ -650,9 +651,11 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 					OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
 
 	pi->port_packet_format.supported =
-		cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+		cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B |
+			    OPA_PORT_PACKET_FORMAT_16B);
 	pi->port_packet_format.enabled =
-		cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+		cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B |
+			    OPA_PORT_PACKET_FORMAT_16B);
 
 	/* flit_control.interleave is (OPA V1, version .76):
 	 * bits		use
@@ -678,6 +681,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 	pi->resptimevalue = 3;
 
 	pi->local_port_num = port;
+	pi->num_vesw_port_supported = HFI_MAX_NUM_VNICS;
 
 	/* buffer info for FM */
 	pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 64c9eeb..3c5466e 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -710,6 +710,7 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 {
 	struct send_context_info *sci;
 	struct send_context *sc = NULL;
+	int req_type = type;
 	dma_addr_t dma;
 	unsigned long flags;
 	u64 reg;
@@ -736,6 +737,13 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 		return NULL;
 	}
 
+	/*
+	 * VNIC contexts are dynamically allocated.
+	 * Hence, pick a user context for VNIC.
+	 */
+	if (type == SC_VNIC)
+		type = SC_USER;
+
 	spin_lock_irqsave(&dd->sc_lock, flags);
 	ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
 	if (ret) {
@@ -745,6 +753,15 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 		return NULL;
 	}
 
+	/*
+	 * VNIC contexts are used by kernel driver.
+	 * Hence, mark them as kernel contexts.
+	 */
+	if (req_type == SC_VNIC) {
+		dd->send_contexts[sw_index].type = SC_KERNEL;
+		type = SC_KERNEL;
+	}
+
 	sci = &dd->send_contexts[sw_index];
 	sci->sc = sc;
 
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
index 867e5ff..22e19d5 100644
--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -54,6 +54,12 @@
 #define SC_USER   3	/* must be the last one: it may take all left */
 #define SC_MAX    4	/* count of send context types */
 
+/*
+ * SC_VNIC types are allocated (dynamically) from the user context pool,
+ * (SC_USER) and used by kernel driver as kernel contexts (SC_KERNEL).
+ */
+#define SC_VNIC   SC_MAX
+
 /* invalid send context index */
 #define INVALID_SCI 0xff
 
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
index 06bff50..cefc916 100644
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -543,7 +543,7 @@ static ssize_t show_nctxts(struct device *device,
 	 * give a more accurate picture of total contexts available.
 	 */
 	return scnprintf(buf, PAGE_SIZE, "%u\n",
-			 min(dd->num_rcv_contexts - dd->first_user_ctxt,
+			 min(dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt,
 			     (u32)dd->sc_sizes[SC_USER].count));
 }
 
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index 64d2652..fdcd686 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -612,7 +612,7 @@ int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
 	struct hfi1_filedata *fd = fp->private_data;
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	unsigned long *ev = uctxt->dd->events +
-		(((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
+		(((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
 		  HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
 	u32 *array;
 	int ret = 0;
@@ -1016,8 +1016,8 @@ static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
 			 * process in question.
 			 */
 			ev = uctxt->dd->events +
-				(((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
-				  HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
+			  (((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
+			    HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
 			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 		}
 		fdata->invalid_tid_idx++;
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 20f4ddc..7238a34 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -73,7 +73,8 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
 {
 	unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
 		size = (cache_size * (1UL << 20)); /* convert to bytes */
-	unsigned usr_ctxts = dd->num_rcv_contexts - dd->first_user_ctxt;
+	unsigned int usr_ctxts =
+			dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
 	bool can_lock = capable(CAP_IPC_LOCK);
 
 	/*
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 9355c4c..32485bc 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -60,6 +60,7 @@
 #include "trace.h"
 #include "qp.h"
 #include "verbs_txreq.h"
+#include "vnic.h"
 
 static unsigned int hfi1_lkey_table_size = 16;
 module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
@@ -131,6 +132,11 @@
 MODULE_PARM_DESC(sge_copy_mode,
 		 "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
 
+static struct hfi_vnic_ctrl_ops hfi1_vnic_ctrl_ops = {
+	.add_vport = hfi1_vnic_add_vport,
+	.rem_vport = hfi1_vnic_rem_vport
+};
+
 static void verbs_sdma_complete(
 	struct sdma_txreq *cookie,
 	int status);
@@ -1870,6 +1876,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 			      i,
 			      ppd->pkeys);
 
+	dd->verbs_dev.hfidev.vnic_ctrl_ops = hfi1_vnic_ctrl_ops;
 	ret = rvt_register_device(rdi);
 	if (ret)
 		goto err_verbs_txreq;
diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h
new file mode 100644
index 0000000..047845e
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/vnic.h
@@ -0,0 +1,145 @@
+#ifndef _HFI1_VNIC_H
+#define _HFI1_VNIC_H
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/opa_hfi.h>
+#include "hfi.h"
+
+#define HFI1_VNIC_ICRC_LEN   4
+#define HFI1_VNIC_TAIL_LEN   1
+#define HFI1_VNIC_ICRC_TAIL_LEN  (HFI1_VNIC_ICRC_LEN + HFI1_VNIC_TAIL_LEN)
+
+#define HFI1_VNIC_MAX_TXQ     16
+#define HFI1_VNIC_MAX_PAD     12
+
+/* L2 header definitions */
+#define HFI1_L2_TYPE_OFFSET     0x7
+#define HFI1_L2_TYPE_SHFT       0x5
+#define HFI1_L2_TYPE_MASK       0x3
+#define HFI1_L2_TYPE_HDR_16B    0x2
+
+#define HFI1_GET_L2_TYPE(hdr)                                            \
+	((*((u8 *)(hdr) + HFI1_L2_TYPE_OFFSET) >> HFI1_L2_TYPE_SHFT) &   \
+	 HFI1_L2_TYPE_MASK)
+
+/* L4 type definitions */
+#define HFI1_L4_TYPE_OFFSET 8
+
+#define HFI1_GET_L4_TYPE(data)   \
+	(*((u8 *)(data) + HFI1_L4_TYPE_OFFSET))
+
+#define HFI1_VNIC_L4_ETHR  0x78
+
+/* L4 header definitions */
+#define HFI1_VNIC_L4_HDR_OFFSET  18
+
+#define HFI1_VNIC_GET_L4_HDR(data)   \
+	(*((u16 *)((u8 *)(data) + HFI1_VNIC_L4_HDR_OFFSET)))
+
+#define HFI1_VNIC_GET_VESWID(data)   \
+	(HFI1_VNIC_GET_L4_HDR(data) & 0xFF)
+
+/* Service class */
+#define HFI1_VNIC_SC_OFFSET_LOW 6
+#define HFI1_VNIC_SC_OFFSET_HI  7
+#define HFI1_VNIC_SC_SHIFT      4
+
+/**
+ * struct hfi1_vnic_notifier - VNIC notifer structure
+ * @cb - vnic callback function
+ */
+struct hfi1_vnic_notifier {
+	hfi_vnic_evt_cb_fn  cb;
+};
+
+/**
+ * struct hfi1_vnic_vport_info - HFI1 VNIC virtual port information
+ * @dd: device data pointer
+ * @notifier: vnic notifier
+ * @event_flags: event notification flags
+ * @vport: vnic port pointer
+ * @skbq: Array of queues for received socket buffers
+ */
+struct hfi1_vnic_vport_info {
+	struct hfi1_devdata *dd;
+
+	struct hfi1_vnic_notifier __rcu *notifier;
+	DECLARE_BITMAP(event_flags, HFI_VNIC_NUM_EVTS);
+	struct hfi_vnic_port *vport;
+
+	struct sk_buff_head skbq[HFI1_NUM_VNIC_CTXT];
+};
+
+static inline struct hfi1_devdata *vnic_dev2dd(struct hfi_vnic_port *vport)
+{
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+
+	return vinfo->dd;
+}
+
+/* setup the last plen bypes of pad */
+static inline void hfi1_vnic_update_pad(unsigned char *pad, u8 plen)
+{
+	pad[HFI1_VNIC_MAX_PAD - 1] = plen - HFI1_VNIC_ICRC_TAIL_LEN;
+}
+
+/* vnic hfi1 internal functions */
+void hfi1_vnic_setup(struct hfi1_devdata *dd);
+void hfi1_vnic_cleanup(struct hfi1_devdata *dd);
+
+void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet);
+
+/* vnic port operations */
+struct hfi_vnic_port *hfi1_vnic_add_vport(struct ib_device *device,
+					  u8 port_num, u8 vport_num);
+void hfi1_vnic_rem_vport(struct hfi_vnic_port *vport);
+int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx,
+		       struct hfi1_vnic_vport_info *vinfo,
+		       struct sk_buff *skb, u64 pbc, u8 plen);
+
+#endif /* _HFI1_VNIC_H */
diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
new file mode 100644
index 0000000..1e237f3
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains HFI1 support for VNIC functionality
+ */
+
+#include <linux/io.h>
+
+#include "vnic.h"
+
+#define HFI1_VNIC_RCV_Q_SIZE   1024
+
+static DEFINE_SPINLOCK(vport_cntr_lock);
+
+static inline u8 hfi1_vnic_get_sc5(u8 *hdr)
+{
+	return  (((*(hdr + HFI1_VNIC_SC_OFFSET_LOW)) >> HFI1_VNIC_SC_SHIFT) |
+		 (((*(hdr + HFI1_VNIC_SC_OFFSET_HI)) & 0x1) <<
+		  HFI1_VNIC_SC_SHIFT));
+}
+
+static int setup_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt)
+{
+	unsigned int rcvctrl_ops = 0;
+	int ret;
+
+	ret = hfi1_init_ctxt(uctxt->sc);
+	if (ret)
+		goto done;
+
+	uctxt->do_interrupt = &handle_receive_interrupt;
+
+	/* Now allocate the RcvHdr queue and eager buffers. */
+	ret = hfi1_create_rcvhdrq(dd, uctxt);
+	if (ret)
+		goto done;
+
+	ret = hfi1_setup_eagerbufs(uctxt);
+	if (ret)
+		goto done;
+
+	set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
+
+	if (uctxt->rcvhdrtail_kvaddr)
+		clear_rcvhdrtail(uctxt);
+
+	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+	rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_ENB;
+
+	if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+		rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+		rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+		rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+
+	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+
+	uctxt->is_vnic = true;
+done:
+	return ret;
+}
+
+static int allocate_vnic_ctxt(struct hfi1_devdata *dd,
+			      struct hfi1_ctxtdata **vnic_ctxt)
+{
+	struct hfi1_ctxtdata *uctxt;
+	unsigned int ctxt;
+	int ret;
+
+	if (dd->flags & HFI1_FROZEN)
+		return -EIO;
+
+	for (ctxt = dd->first_dyn_alloc_ctxt;
+	     ctxt < dd->num_rcv_contexts; ctxt++)
+		if (!dd->rcd[ctxt])
+			break;
+
+	if (ctxt == dd->num_rcv_contexts)
+		return -EBUSY;
+
+	uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, dd->node);
+	if (!uctxt) {
+		dd_dev_err(dd, "Unable to create ctxtdata, failing open\n");
+		return -ENOMEM;
+	}
+
+	uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
+			HFI1_CAP_KGET(NODROP_RHQ_FULL) |
+			HFI1_CAP_KGET(NODROP_EGR_FULL) |
+			HFI1_CAP_KGET(DMA_RTAIL);
+	uctxt->seq_cnt = 1;
+
+	/* Allocate and enable a PIO send context */
+	uctxt->sc = sc_alloc(dd, SC_VNIC, uctxt->rcvhdrqentsize,
+			     uctxt->numa_id);
+
+	ret = uctxt->sc ? 0 : -ENOMEM;
+	if (ret)
+		goto bail;
+
+	dd_dev_dbg(dd, "allocated vnic send context %u(%u)\n",
+		   uctxt->sc->sw_index, uctxt->sc->hw_context);
+	ret = sc_enable(uctxt->sc);
+	if (ret)
+		goto bail;
+
+	if (dd->num_msix_entries)
+		hfi1_set_vnic_msix_info(uctxt);
+
+	hfi1_stats.sps_ctxts++;
+	dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt);
+	*vnic_ctxt = uctxt;
+
+	return ret;
+bail:
+	/*
+	 * hfi1_free_ctxtdata() also releases send_context
+	 * structure if uctxt->sc is not null
+	 */
+	dd->rcd[uctxt->ctxt] = NULL;
+	hfi1_free_ctxtdata(dd, uctxt);
+	dd_dev_dbg(dd, "vnic allocation failed. rc %d\n", ret);
+	return ret;
+}
+
+static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
+				 struct hfi1_ctxtdata *uctxt)
+{
+	unsigned long flags;
+
+	dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt);
+	flush_wc();
+
+	if (dd->num_msix_entries)
+		hfi1_reset_vnic_msix_info(uctxt);
+
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	/*
+	 * Disable receive context and interrupt available, reset all
+	 * RcvCtxtCtrl bits to default values.
+	 */
+	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+		     HFI1_RCVCTRL_TIDFLOW_DIS |
+		     HFI1_RCVCTRL_INTRAVAIL_DIS |
+		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
+		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
+		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+	/*
+	 * VNIC contexts are allocated from user context pool.
+	 * Release them back to user context pool.
+	 *
+	 * Reset context integrity checks to default.
+	 * (writes to CSRs probably belong in chip.c)
+	 */
+	write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
+			hfi1_pkt_default_send_ctxt_mask(dd, SC_USER));
+	sc_disable(uctxt->sc);
+
+	dd->send_contexts[uctxt->sc->sw_index].type = SC_USER;
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+	dd->rcd[uctxt->ctxt] = NULL;
+	uctxt->event_flags = 0;
+
+	hfi1_clear_tids(uctxt);
+	hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
+
+	hfi1_stats.sps_ctxts--;
+	hfi1_free_ctxtdata(dd, uctxt);
+}
+
+void hfi1_vnic_setup(struct hfi1_devdata *dd)
+{
+	idr_init(&dd->vnic.vesw_idr);
+}
+
+void hfi1_vnic_cleanup(struct hfi1_devdata *dd)
+{
+	idr_destroy(&dd->vnic.vesw_idr);
+}
+
+static u64 create_bypass_pbc(u32 vl, u32 dw_len)
+{
+	u64 pbc;
+
+	pbc = ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
+		| PBC_INSERT_BYPASS_ICRC | PBC_CREDIT_RETURN
+		| PBC_PACKET_BYPASS
+		| ((vl & PBC_VL_MASK) << PBC_VL_SHIFT)
+		| (dw_len & PBC_LENGTH_DWS_MASK) << PBC_LENGTH_DWS_SHIFT;
+
+	return pbc;
+}
+
+static int hfi1_vnic_put_skb(struct hfi_vnic_port *vport,
+			     u8 q_idx, struct sk_buff *skb)
+{
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+	struct hfi1_devdata *dd = vinfo->dd;
+	u32 vl, pkt_len, total_len;
+	u8 sc5, pad_len;
+	int ret = 0;
+	u64 pbc;
+
+	if (q_idx >= vport->hfi_info.num_tx_q) {
+		dev_kfree_skb_any(skb);
+		return -EINVAL;
+	}
+
+	/* add tail padding (for 8 bytes size alignment) and icrc */
+	pad_len = -(skb->len + HFI1_VNIC_ICRC_TAIL_LEN) & 0x7;
+	pad_len += HFI1_VNIC_ICRC_TAIL_LEN;
+
+	/*
+	 * pkt_len is how much data we have to write, includes header and data.
+	 * total_len is length of the packet in Dwords plus the PBC should not
+	 * include the CRC.
+	 */
+	pkt_len = (skb->len + pad_len) >> 2;
+	total_len = pkt_len + 2; /* PBC + packet */
+
+	sc5 = hfi1_vnic_get_sc5(skb->data);
+	vl = sc_to_vlt(dd, sc5);
+	pbc = create_bypass_pbc(vl, total_len);
+
+	dd_dev_dbg(dd, "%d: pbc 0x%016llX len %d pad_len %d\n",
+		   vport->vport_num, pbc, skb->len, pad_len);
+
+	ret = dd->process_vnic_dma_send(dd, q_idx, vinfo, skb,
+					pbc, pad_len);
+
+	if (ret) {
+		if (ret == -ENOMEM)
+			vport->hfi_stats[q_idx].tx_fifo_errors++;
+		else if (ret != -EBUSY)
+			vport->hfi_stats[q_idx].tx_logic_errors++;
+	}
+
+	return ret;
+}
+
+static u8 hfi1_vnic_select_queue(struct hfi_vnic_port *vport, u8 vl, u8 entropy)
+{
+	return 0;
+}
+
+static bool hfi1_vnic_get_write_avail(struct hfi_vnic_port *vport, u8 q_idx)
+{
+	if (q_idx >= vport->hfi_info.num_tx_q)
+		return false;
+
+	return true;
+}
+
+void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_devdata *dd = packet->rcd->dd;
+	struct hfi1_vnic_vport_info *vinfo;
+	struct hfi_vnic_port *vport = NULL;
+	struct hfi1_vnic_notifier *notifier;
+	struct sk_buff *skb;
+	int l4_type, vesw_id = -1;
+	u8 q_idx;
+
+	rcu_read_lock();
+	l4_type = HFI1_GET_L4_TYPE(packet->ebuf);
+	if (l4_type == HFI1_VNIC_L4_ETHR) {
+		vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf);
+		vport = idr_find(&dd->vnic.vesw_idr, vesw_id);
+
+		/*
+		 * In case of invalid vesw id, update the rx_bad_veswid
+		 * error count of first available vport.
+		 */
+		if (unlikely(!vport)) {
+			struct hfi_vnic_port *vport_tmp;
+			int id_tmp = 0;
+
+			vport_tmp =  idr_get_next(&dd->vnic.vesw_idr, &id_tmp);
+			if (vport_tmp) {
+				spin_lock(&vport_cntr_lock);
+				vport_tmp->hfi_stats[0].rx_bad_veswid++;
+				spin_unlock(&vport_cntr_lock);
+			}
+		}
+	}
+
+	if (unlikely(!vport)) {
+		dd_dev_warn(dd, "vnic rcv err: l4 %d vesw id %d ctx %d\n",
+			    l4_type, vesw_id, packet->rcd->ctxt);
+		goto rcv_done;
+	}
+
+	vinfo = vport->hfi_priv;
+	q_idx = packet->rcd->vnic_q_idx;
+	notifier = rcu_dereference(vinfo->notifier);
+	if (!notifier || !notifier->cb) {
+		vport->hfi_stats[q_idx].rx_logic_errors++;
+		goto rcv_done;
+	}
+
+	if (skb_queue_len(&vinfo->skbq[q_idx]) > HFI1_VNIC_RCV_Q_SIZE) {
+		vport->hfi_stats[q_idx].rx_fifo_errors++;
+		goto rcv_done;
+	}
+
+	skb = netdev_alloc_skb(vport->netdev, packet->tlen);
+	if (!skb) {
+		vport->hfi_stats[q_idx].rx_missed_errors++;
+		goto rcv_done;
+	}
+	memcpy(skb->data, packet->ebuf, packet->tlen);
+	skb_put(skb, packet->tlen);
+
+	skb_queue_tail(&vinfo->skbq[q_idx], skb);
+	if (test_bit((HFI_VNIC_EVT_RX0 + q_idx), vinfo->event_flags))
+		notifier->cb(vport, HFI_VNIC_EVT_RX0 + q_idx);
+
+rcv_done:
+	rcu_read_unlock();
+}
+
+static u16 hfi1_vnic_get_read_avail(struct hfi_vnic_port *vport, u8 q_idx)
+{
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+
+	if (q_idx >= vport->hfi_info.num_rx_q)
+		return 0;
+
+	return skb_queue_len(&vinfo->skbq[q_idx]);
+}
+
+static struct sk_buff *hfi1_vnic_get_skb(struct hfi_vnic_port *vport, u8 q_idx)
+{
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+	unsigned char *pad_info;
+	struct sk_buff *skb;
+
+	if (q_idx >= vport->hfi_info.num_rx_q)
+		return NULL;
+
+	skb = skb_dequeue(&vinfo->skbq[q_idx]);
+	if (!skb)
+		return NULL;
+
+	/* remove tail padding and icrc */
+	pad_info = skb->data + skb->len - 1;
+	skb_trim(skb, (skb->len - HFI1_VNIC_ICRC_TAIL_LEN -
+		       ((*pad_info) & 0x7)));
+
+	return skb;
+}
+
+static void hfi1_vnic_config_notify(struct hfi_vnic_port *vport,
+				    u8 evt, bool enable)
+{
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+
+	if (enable)
+		set_bit(evt, vinfo->event_flags);
+	else
+		clear_bit(evt, vinfo->event_flags);
+}
+
+static int hfi1_vnic_open(struct hfi_vnic_port *vport, hfi_vnic_evt_cb_fn cb)
+{
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+	struct hfi1_devdata *dd = vinfo->dd;
+	struct hfi1_vnic_notifier *notifier;
+	int i, rc;
+
+	if (!cb)
+		return -EINVAL;
+
+	notifier = kmalloc(sizeof(*notifier), GFP_KERNEL);
+	if (!notifier)
+		return -ENOMEM;
+
+	notifier->cb = cb;
+
+	/* ensure virtual eth switch id is valid */
+	if (!vport->vesw_id) {
+		rc = -EINVAL;
+		goto open_fail;
+	}
+
+	rc = idr_alloc(&dd->vnic.vesw_idr, vport, vport->vesw_id,
+		       vport->vesw_id + 1, GFP_NOWAIT);
+	if (rc < 0)
+		goto open_fail;
+
+	for (i = 0; i < HFI1_NUM_VNIC_CTXT; i++)
+		skb_queue_head_init(&vinfo->skbq[i]);
+
+	/* Enable all events */
+	for (i = 0; i < HFI_VNIC_NUM_EVTS; i++)
+		set_bit(i, vinfo->event_flags);
+
+	rcu_assign_pointer(vinfo->notifier, notifier);
+	synchronize_rcu();
+	return 0;
+
+open_fail:
+	kfree(notifier);
+	return rc;
+}
+
+static void hfi1_vnic_close(struct hfi_vnic_port *vport)
+{
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+	struct hfi1_devdata *dd = vinfo->dd;
+	struct hfi1_vnic_notifier *notifier;
+	u8 i;
+
+	idr_remove(&dd->vnic.vesw_idr, vport->vesw_id);
+	notifier = rcu_access_pointer(vinfo->notifier);
+	rcu_assign_pointer(vinfo->notifier, NULL);
+	synchronize_rcu();
+	kfree(notifier);
+
+	/* remove unread skbs */
+	for (i = 0; i < HFI1_NUM_VNIC_CTXT; i++)
+		skb_queue_purge(&vinfo->skbq[i]);
+}
+
+static int hfi1_vnic_allot_ctxt(struct hfi1_devdata *dd,
+				struct hfi1_ctxtdata **vnic_ctxt)
+{
+	int rc;
+
+	rc = allocate_vnic_ctxt(dd, vnic_ctxt);
+	if (rc) {
+		dd_dev_err(dd, "vnic ctxt alloc failed %d\n", rc);
+		return rc;
+	}
+
+	rc = setup_vnic_ctxt(dd, *vnic_ctxt);
+	if (rc) {
+		dd_dev_err(dd, "vnic ctxt setup failed %d\n", rc);
+		deallocate_vnic_ctxt(dd, *vnic_ctxt);
+		*vnic_ctxt = NULL;
+	}
+
+	return rc;
+}
+
+static int hfi1_vnic_init(struct hfi_vnic_port *vport)
+{
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+	struct hfi1_devdata *dd = vinfo->dd;
+	int i, rc = 0;
+
+	mutex_lock(&hfi1_mutex);
+	for (i = dd->vnic.num_ctxt; i < vport->hfi_info.num_rx_q; i++) {
+		rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]);
+		if (rc)
+			break;
+		dd->vnic.ctxt[i]->vnic_q_idx = i;
+	}
+
+	if (i < vport->hfi_info.num_rx_q) {
+		/*
+		 * If required amount of contexts is not
+		 * allocated successfully then remaining contexts
+		 * are released.
+		 */
+		while (i-- > dd->vnic.num_ctxt) {
+			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
+			dd->vnic.ctxt[i] = NULL;
+		}
+		goto alloc_fail;
+	}
+
+	if (dd->vnic.num_ctxt != i) {
+		dd->vnic.num_ctxt = i;
+		hfi1_init_vnic_rsm(dd);
+	}
+
+	dd->vnic.num_vports++;
+	vinfo->vport = vport;
+alloc_fail:
+	mutex_unlock(&hfi1_mutex);
+	return rc;
+}
+
+static void hfi1_vnic_deinit(struct hfi_vnic_port *vport)
+{
+	struct hfi1_devdata *dd = vnic_dev2dd(vport);
+	int i;
+
+	mutex_lock(&hfi1_mutex);
+	if (--dd->vnic.num_vports == 0) {
+		for (i = 0; i < dd->vnic.num_ctxt; i++) {
+			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
+			dd->vnic.ctxt[i] = NULL;
+		}
+		hfi1_deinit_vnic_rsm(dd);
+		dd->vnic.num_ctxt = 0;
+	}
+	mutex_unlock(&hfi1_mutex);
+}
+
+/* vnic operations */
+static struct hfi_vnic_ops hfi1_vnic_ops = {
+	.open = hfi1_vnic_open,
+	.close = hfi1_vnic_close,
+	.put_skb = hfi1_vnic_put_skb,
+	.get_skb = hfi1_vnic_get_skb,
+	.get_read_avail = hfi1_vnic_get_read_avail,
+	.get_write_avail = hfi1_vnic_get_write_avail,
+	.select_queue = hfi1_vnic_select_queue,
+	.config_notify = hfi1_vnic_config_notify
+};
+
+/* hfi1_vnic_add_vport - Allocate and initialize a vnic port */
+struct hfi_vnic_port *hfi1_vnic_add_vport(struct ib_device *device,
+					  u8 port_num, u8 vport_num)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(device);
+	struct hfi1_vnic_vport_info *vinfo;
+	struct hfi_vnic_port *vport;
+	int rc;
+
+	if (!port_num || (port_num > dd->num_pports) ||
+	    (vport_num == HFI_MAX_NUM_VNICS))
+		return ERR_PTR(-EINVAL);
+
+	vport = kzalloc(sizeof(*vport), GFP_KERNEL);
+	if (!vport)
+		return ERR_PTR(-ENOMEM);
+
+	vinfo = kzalloc(sizeof(*vinfo), GFP_KERNEL);
+	if (!vinfo) {
+		rc = -ENOMEM;
+		goto vinfo_fail;
+	}
+
+	vinfo->dd = dd;
+	vport->hfi_info.num_tx_q = dd->chip_sdma_engines;
+	vport->hfi_info.num_rx_q = HFI1_NUM_VNIC_CTXT;
+	vport->hfi_info.cap = HFI_VNIC_CAP_SG;
+	vport->ops = &hfi1_vnic_ops;
+	vport->hfi_priv = vinfo;
+	vport->port_num = port_num;
+	vport->vport_num = vport_num;
+
+	rc = hfi1_vnic_init(vport);
+	if (rc)
+		goto init_fail;
+
+	dd_dev_info(dd, "added vnic port %d:%d\n", port_num, vport_num);
+	return vport;
+init_fail:
+	kfree(vinfo);
+vinfo_fail:
+	kfree(vport);
+	return ERR_PTR(rc);
+}
+
+/* hfi1_vnic_rem_vport - Uninitialize and free vnic port */
+void hfi1_vnic_rem_vport(struct hfi_vnic_port *vport)
+{
+	hfi1_vnic_deinit(vport);
+	kfree(vport->hfi_priv);
+	kfree(vport);
+}
diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c
new file mode 100644
index 0000000..66abad0
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains HFI1 support for VNIC SDMA functionality
+ */
+
+#include "sdma.h"
+#include "vnic.h"
+
+int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx,
+		       struct hfi1_vnic_vport_info *vinfo,
+		       struct sk_buff *skb, u64 pbc, u8 plen)
+{
+	return 0;
+}
diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h
index 9303e0e..84caa5b 100644
--- a/include/rdma/opa_port_info.h
+++ b/include/rdma/opa_port_info.h
@@ -410,7 +410,7 @@ struct opa_port_info {
 
 	u8     resptimevalue;		        /* 3 res, 5 bits */
 	u8     local_port_num;
-	u8     reserved12;
+	u8     num_vesw_port_supported;
 	u8     reserved13;                       /* was guid_cap */
 } __attribute__ ((packed));
 
-- 
1.8.3.1

^ permalink raw reply related

* [RFC v2 10/10] IB/hfi1: VNIC SDMA support
From: Vishwanathapura, Niranjana @ 2016-12-15  7:59 UTC (permalink / raw)
  To: dledford
  Cc: linux-rdma, netdev, dennis.dalessandro, ira.weiny,
	Niranjana Vishwanathapura
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura@intel.com>

HFI1 VNIC SDMA support enables transmission of VNIC packets over SDMA.
Map VNIC queues to SDMA engines and support halting and wakeup of the
VNIC queues.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
---
 drivers/infiniband/hw/hfi1/hfi.h       |   1 +
 drivers/infiniband/hw/hfi1/vnic.h      |  30 +++-
 drivers/infiniband/hw/hfi1/vnic_main.c |  21 ++-
 drivers/infiniband/hw/hfi1/vnic_sdma.c | 260 +++++++++++++++++++++++++++++++++
 4 files changed, 309 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 78d1726..8d5949f 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -855,6 +855,7 @@ struct hfi1_asic_data {
 /* Virtual NIC information */
 struct hfi1_vnic_data {
 	struct hfi1_ctxtdata *ctxt[HFI1_NUM_VNIC_CTXT];
+	struct kmem_cache *txreq_cache;
 	u8 num_vports;
 	struct idr vesw_idr;
 	u8 rmt_start;
diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h
index 047845e..2d4eb8f 100644
--- a/drivers/infiniband/hw/hfi1/vnic.h
+++ b/drivers/infiniband/hw/hfi1/vnic.h
@@ -49,6 +49,7 @@
 
 #include <rdma/opa_hfi.h>
 #include "hfi.h"
+#include "sdma.h"
 
 #define HFI1_VNIC_ICRC_LEN   4
 #define HFI1_VNIC_TAIL_LEN   1
@@ -90,6 +91,26 @@
 #define HFI1_VNIC_SC_SHIFT      4
 
 /**
+ * struct hfi1_vnic_sdma - VNIC per Tx ring SDMA information
+ * @dd - device data pointer
+ * @sde - sdma engine
+ * @vinfo - vnic info pointer
+ * @wait - iowait structure
+ * @stx - sdma tx request
+ * @state - vnic Tx ring SDMA state
+ * @q_idx - vnic Tx queue index
+ */
+struct hfi1_vnic_sdma {
+	struct hfi1_devdata *dd;
+	struct sdma_engine  *sde;
+	struct hfi1_vnic_vport_info *vinfo;
+	struct iowait wait;
+	struct sdma_txreq stx;
+	unsigned int state;
+	u8 q_idx;
+};
+
+/**
  * struct hfi1_vnic_notifier - VNIC notifer structure
  * @cb - vnic callback function
  */
@@ -104,6 +125,7 @@ struct hfi1_vnic_notifier {
  * @event_flags: event notification flags
  * @vport: vnic port pointer
  * @skbq: Array of queues for received socket buffers
+ * @sdma: VNIC SDMA structure per TXQ
  */
 struct hfi1_vnic_vport_info {
 	struct hfi1_devdata *dd;
@@ -112,7 +134,8 @@ struct hfi1_vnic_vport_info {
 	DECLARE_BITMAP(event_flags, HFI_VNIC_NUM_EVTS);
 	struct hfi_vnic_port *vport;
 
-	struct sk_buff_head skbq[HFI1_NUM_VNIC_CTXT];
+	struct sk_buff_head    skbq[HFI1_NUM_VNIC_CTXT];
+	struct hfi1_vnic_sdma  sdma[HFI1_VNIC_MAX_TXQ];
 };
 
 static inline struct hfi1_devdata *vnic_dev2dd(struct hfi_vnic_port *vport)
@@ -131,8 +154,13 @@ static inline void hfi1_vnic_update_pad(unsigned char *pad, u8 plen)
 /* vnic hfi1 internal functions */
 void hfi1_vnic_setup(struct hfi1_devdata *dd);
 void hfi1_vnic_cleanup(struct hfi1_devdata *dd);
+int hfi1_vnic_txreq_init(struct hfi1_devdata *dd);
+void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd);
 
 void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet);
+void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo);
+bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo,
+				u8 q_idx);
 
 /* vnic port operations */
 struct hfi_vnic_port *hfi1_vnic_add_vport(struct ib_device *device,
diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
index 1e237f3..19843a4 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -289,15 +289,21 @@ static int hfi1_vnic_put_skb(struct hfi_vnic_port *vport,
 
 static u8 hfi1_vnic_select_queue(struct hfi_vnic_port *vport, u8 vl, u8 entropy)
 {
-	return 0;
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+	struct sdma_engine *sde;
+
+	sde = sdma_select_engine_vl(vinfo->dd, entropy, vl);
+	return sde->this_idx;
 }
 
 static bool hfi1_vnic_get_write_avail(struct hfi_vnic_port *vport, u8 q_idx)
 {
+	struct hfi1_vnic_vport_info *vinfo = vport->hfi_priv;
+
 	if (q_idx >= vport->hfi_info.num_tx_q)
 		return false;
 
-	return true;
+	return hfi1_vnic_sdma_write_avail(vinfo, q_idx);
 }
 
 void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet)
@@ -499,6 +505,12 @@ static int hfi1_vnic_init(struct hfi_vnic_port *vport)
 	int i, rc = 0;
 
 	mutex_lock(&hfi1_mutex);
+	if (!dd->vnic.num_vports) {
+		rc = hfi1_vnic_txreq_init(dd);
+		if (rc)
+			goto txreq_fail;
+	}
+
 	for (i = dd->vnic.num_ctxt; i < vport->hfi_info.num_rx_q; i++) {
 		rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]);
 		if (rc)
@@ -526,7 +538,11 @@ static int hfi1_vnic_init(struct hfi_vnic_port *vport)
 
 	dd->vnic.num_vports++;
 	vinfo->vport = vport;
+	hfi1_vnic_sdma_init(vinfo);
 alloc_fail:
+	if (!dd->vnic.num_vports)
+		hfi1_vnic_txreq_deinit(dd);
+txreq_fail:
 	mutex_unlock(&hfi1_mutex);
 	return rc;
 }
@@ -544,6 +560,7 @@ static void hfi1_vnic_deinit(struct hfi_vnic_port *vport)
 		}
 		hfi1_deinit_vnic_rsm(dd);
 		dd->vnic.num_ctxt = 0;
+		hfi1_vnic_txreq_deinit(dd);
 	}
 	mutex_unlock(&hfi1_mutex);
 }
diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c
index 66abad0..cb1f9ce 100644
--- a/drivers/infiniband/hw/hfi1/vnic_sdma.c
+++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c
@@ -52,9 +52,269 @@
 #include "sdma.h"
 #include "vnic.h"
 
+#define HFI1_VNIC_SDMA_Q_ACTIVE   BIT(0)
+#define HFI1_VNIC_SDMA_Q_DEFERRED BIT(1)
+
+#define HFI1_VNIC_TXREQ_NAME_LEN   32
+#define HFI1_VNIC_SDMA_DESC_WTRMRK 64
+#define HFI1_VNIC_SDMA_RETRY_COUNT 1
+
+/*
+ * struct vnic_txreq - VNIC transmit descriptor
+ * @txreq: sdma transmit request
+ * @sdma: vnic sdma pointer
+ * @skb: skb to send
+ * @pad: pad buffer
+ * @plen: pad length
+ * @pbc_val: pbc value
+ * @retry_count: tx retry count
+ */
+struct vnic_txreq {
+	struct sdma_txreq       txreq;
+	struct hfi1_vnic_sdma   *sdma;
+
+	struct sk_buff         *skb;
+	unsigned char           pad[HFI1_VNIC_MAX_PAD];
+	u16                     plen;
+	__le64                  pbc_val;
+
+	u32                     retry_count;
+};
+
+static void vnic_sdma_complete(struct sdma_txreq *txreq,
+			       int status)
+{
+	struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq);
+	struct hfi1_vnic_sdma *vnic_sdma = tx->sdma;
+
+	sdma_txclean(vnic_sdma->dd, txreq);
+	dev_kfree_skb_any(tx->skb);
+	kmem_cache_free(vnic_sdma->dd->vnic.txreq_cache, tx);
+}
+
+static noinline int build_vnic_ulp_payload(struct sdma_engine *sde,
+					   struct vnic_txreq *tx)
+{
+	int i, ret = 0;
+
+	ret = sdma_txadd_kvaddr(
+		sde->dd,
+		&tx->txreq,
+		tx->skb->data,
+		skb_headlen(tx->skb));
+	if (ret)
+		goto bail_txadd;
+
+	for (i = 0; i < skb_shinfo(tx->skb)->nr_frags; i++) {
+		struct skb_frag_struct *frag = &skb_shinfo(tx->skb)->frags[i];
+
+		/* combine physically continuous fragments later? */
+		ret = sdma_txadd_page(sde->dd,
+				      &tx->txreq,
+				      skb_frag_page(frag),
+				      frag->page_offset,
+				      skb_frag_size(frag));
+		if (ret)
+			goto bail_txadd;
+	}
+
+	if (tx->plen)
+		ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq,
+					tx->pad + HFI1_VNIC_MAX_PAD - tx->plen,
+					tx->plen);
+
+bail_txadd:
+	return ret;
+}
+
+static int build_vnic_tx_desc(struct sdma_engine *sde,
+			      struct vnic_txreq *tx,
+			      u64 pbc)
+{
+	int ret = 0;
+	u16 hdrbytes = 2 << 2;  /* PBC */
+
+	ret = sdma_txinit_ahg(
+		&tx->txreq,
+		0,
+		hdrbytes + tx->skb->len + tx->plen,
+		0,
+		0,
+		NULL,
+		0,
+		vnic_sdma_complete);
+	if (ret)
+		goto bail_txadd;
+
+	/* add pbc */
+	tx->pbc_val = cpu_to_le64(pbc);
+	ret = sdma_txadd_kvaddr(
+		sde->dd,
+		&tx->txreq,
+		&tx->pbc_val,
+		hdrbytes);
+	if (ret)
+		goto bail_txadd;
+
+	/* add the ulp payload */
+	ret = build_vnic_ulp_payload(sde, tx);
+bail_txadd:
+	return ret;
+}
+
 int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx,
 		       struct hfi1_vnic_vport_info *vinfo,
 		       struct sk_buff *skb, u64 pbc, u8 plen)
 {
+	struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[q_idx];
+	struct sdma_engine *sde = vnic_sdma->sde;
+	struct vnic_txreq *tx;
+	int ret = -ECOMM;
+
+	if (READ_ONCE(vnic_sdma->state) != HFI1_VNIC_SDMA_Q_ACTIVE)
+		goto tx_err;
+
+	if (!sde || !sdma_running(sde))
+		goto tx_err;
+
+	tx = kmem_cache_alloc(dd->vnic.txreq_cache, GFP_ATOMIC);
+	if (!tx) {
+		ret = -ENOMEM;
+		goto tx_err;
+	}
+
+	tx->sdma = vnic_sdma;
+	tx->skb = skb;
+	hfi1_vnic_update_pad(tx->pad, plen);
+	tx->plen = plen;
+	ret = build_vnic_tx_desc(sde, tx, pbc);
+	if (unlikely(ret))
+		goto free_desc;
+	tx->retry_count = 0;
+
+	ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq);
+	/* When -ECOMM, sdma callback will be called with ABORT status */
+	if (ret && unlikely(ret != -ECOMM))
+		goto free_desc;
+
+	return ret;
+
+free_desc:
+	sdma_txclean(dd, &tx->txreq);
+	kmem_cache_free(dd->vnic.txreq_cache, tx);
+tx_err:
+	if (ret != -EBUSY)
+		dev_kfree_skb_any(skb);
+	return ret;
+}
+
+/*
+ * hfi1_vnic_sdma_sleep - vnic sdma sleep function
+ *
+ * This function gets called from sdma_send_txreq() when there are not enough
+ * sdma descriptors available to send the packet. It adds Tx queue's wait
+ * structure to sdma engine's dmawait list to be woken up when descriptors
+ * become available.
+ */
+static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
+				struct iowait *wait,
+				struct sdma_txreq *txreq,
+				unsigned int seq)
+{
+	struct hfi1_vnic_sdma *vnic_sdma =
+		container_of(wait, struct hfi1_vnic_sdma, wait);
+	struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev;
+	struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq);
+
+	if (sdma_progress(sde, seq, txreq))
+		if (tx->retry_count++ < HFI1_VNIC_SDMA_RETRY_COUNT)
+			return -EAGAIN;
+
+	vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
+	write_seqlock(&dev->iowait_lock);
+	if (list_empty(&vnic_sdma->wait.list))
+		list_add_tail(&vnic_sdma->wait.list, &sde->dmawait);
+	write_sequnlock(&dev->iowait_lock);
+	return -EBUSY;
+}
+
+/*
+ * hfi1_vnic_sdma_wakeup - vnic sdma wakeup function
+ *
+ * This function gets called when SDMA descriptors becomes available and Tx
+ * queue's wait structure was previously added to sdma engine's dmawait list.
+ * It notifies the upper driver about Tx queue wakeup.
+ */
+static void hfi1_vnic_sdma_wakeup(struct iowait *wait, int reason)
+{
+	struct hfi1_vnic_sdma *vnic_sdma =
+		container_of(wait, struct hfi1_vnic_sdma, wait);
+	struct hfi1_vnic_vport_info *vinfo = vnic_sdma->vinfo;
+	u8 evt = HFI_VNIC_EVT_TX0 + vnic_sdma->q_idx;
+	struct hfi1_vnic_notifier *notifier;
+
+	vnic_sdma->state = HFI1_VNIC_SDMA_Q_ACTIVE;
+	notifier = rcu_dereference(vinfo->notifier);
+	if (notifier && notifier->cb && test_bit(evt, vinfo->event_flags))
+		notifier->cb(vinfo->vport, evt);
+};
+
+inline bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo,
+				       u8 q_idx)
+{
+	struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[q_idx];
+
+	return (READ_ONCE(vnic_sdma->state) == HFI1_VNIC_SDMA_Q_ACTIVE);
+}
+
+void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo)
+{
+	int i;
+
+	for (i = 0; i < vinfo->vport->hfi_info.num_tx_q; i++) {
+		struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i];
+
+		iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep,
+			    hfi1_vnic_sdma_wakeup, NULL);
+		vnic_sdma->sde = &vinfo->dd->per_sdma[i];
+		vnic_sdma->dd = vinfo->dd;
+		vnic_sdma->vinfo = vinfo;
+		vnic_sdma->q_idx = i;
+		vnic_sdma->state = HFI1_VNIC_SDMA_Q_ACTIVE;
+
+		/* Add a free descriptor watermark for wakeups */
+		if (vnic_sdma->sde->descq_cnt >= HFI1_VNIC_SDMA_DESC_WTRMRK) {
+			INIT_LIST_HEAD(&vnic_sdma->stx.list);
+			vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK;
+			list_add_tail(&vnic_sdma->stx.list,
+				      &vnic_sdma->wait.tx_head);
+		}
+	}
+}
+
+static void hfi1_vnic_txreq_kmem_cache_ctor(void *obj)
+{
+	struct vnic_txreq *tx = (struct vnic_txreq *)obj;
+
+	memset(tx, 0, sizeof(*tx));
+}
+
+int hfi1_vnic_txreq_init(struct hfi1_devdata *dd)
+{
+	char buf[HFI1_VNIC_TXREQ_NAME_LEN];
+
+	snprintf(buf, sizeof(buf), "hfi1_%u_vnic_txreq_cache", dd->unit);
+	dd->vnic.txreq_cache = kmem_cache_create(buf,
+					  sizeof(struct vnic_txreq),
+					  0, SLAB_HWCACHE_ALIGN,
+					  hfi1_vnic_txreq_kmem_cache_ctor);
+	if (!dd->vnic.txreq_cache)
+		return -ENOMEM;
 	return 0;
 }
+
+void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd)
+{
+	kmem_cache_destroy(dd->vnic.txreq_cache);
+	dd->vnic.txreq_cache = NULL;
+}
-- 
1.8.3.1

^ permalink raw reply related

* Re: [kernel-hardening] Re: [PATCH v2 1/4] siphash: add cryptographically secure hashtable function
From: Daniel Micay @ 2016-12-15  8:15 UTC (permalink / raw)
  To: kernel-hardening, Jason A. Donenfeld
  Cc: hannes, netdev, linux-kernel, linux-crypto, jeanphilippe.aumasson,
	djb, torvalds, ebiggers3
In-Reply-To: <20161215075746.GA14699@gondor.apana.org.au>

[-- Attachment #1: Type: text/plain, Size: 768 bytes --]

On Thu, 2016-12-15 at 15:57 +0800, Herbert Xu wrote:
> Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > 
> > Siphash needs a random secret key, yes. The point is that the hash
> > function remains secure so long as the secret key is kept secret.
> > Other functions can't make the same guarantee, and so nervous
> > periodic
> > key rotation is necessary, but in most cases nothing is done, and so
> > things just leak over time.
> 
> Actually those users that use rhashtable now have a much more
> sophisticated defence against these attacks, dyanmic rehashing
> when bucket length exceeds a preset limit.
> 
> Cheers,

Key independent collisions won't be mitigated by picking a new secret.

A simple solution with clear security properties is ideal.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 866 bytes --]

^ permalink raw reply

* Re: wl1251 & mac address & calibration data
From: Kalle Valo @ 2016-12-15  8:18 UTC (permalink / raw)
  To: Pali Rohár
  Cc: Sebastian Reichel, Pavel Machek, Michal Kazior, Ivaylo Dimitrov,
	Aaro Koskinen, Tony Lindgren, linux-wireless, Network Development,
	linux-kernel, Luis R. Rodriguez
In-Reply-To: <201611241935.32796@pali>

(Adding Luis because he has been working on request_firmware() lately)

Pali Rohár <pali.rohar@gmail.com> writes:

>> > So no, there is no argument against... request_firmware() in
>> > fallback mode with userspace helper is by design blocking and
>> > waiting for userspace. But waiting for some change in DTS in
>> > kernel is just nonsense.
>> 
>> I would just mark the wlan device with status = "disabled" and
>> enable it in the overlay together with adding the NVS & MAC info.
>
> So if you think that this solution make sense, we can wait what net 
> wireless maintainers say about it...
>
> For me it looks like that solution can be:
>
> extending request_firmware() to use only userspace helper

I haven't followed the discussion very closely but this is my preference
what drivers should do:

1) First the driver should do try to get the calibration data and mac
   address from the device tree.

2) If they are not in DT the driver should retrieve the calibration data
   with request_firmware(). BUT with an option for user space to
   implement that with a helper script so that the data can be created
   dynamically, which I believe openwrt does with ath10k calibration
   data right now.

> and load mac address also via request_firmware() either by appending it 
> into NVS data or via separate call

I'm not really fan of the idea providing permanent mac address through
request_firmware(). For example, how to handle multiple devices on the
same host, would there be a need for some kind of bus ids encoded to the
filename? And what about devices with multiple mac addresses?

I wish there would be a better way than request_firmware() to provide
the permanent mac addresses from user space (if device tree is not
available), I just don't know what that could be :) But if we would
start to use request_firmware() for this at least there should be a
wider concensus about that and it should be properly documented, just
like the device tree bindings.

-- 
Kalle Valo

^ permalink raw reply

* Re: Designing a safe RX-zero-copy Memory Model for Networking
From: Jesper Dangaard Brouer @ 2016-12-15  8:28 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: John Fastabend, David Miller, Christoph Lameter, rppt, Netdev,
	linux-mm, willemdebruijn.kernel, Björn Töpel,
	magnus.karlsson, Mel Gorman, Tom Herbert, Brenden Blanco,
	Tariq Toukan, Saeed Mahameed, Brandeburg, Jesse, METH,
	Vlad Yasevich, brouer
In-Reply-To: <CAKgT0UfckuW-qPOr3WAgwKJFGu0Ot0g2Ha3uRpyU3rpdZeFVpA@mail.gmail.com>

On Wed, 14 Dec 2016 14:45:00 -0800
Alexander Duyck <alexander.duyck@gmail.com> wrote:

> On Wed, Dec 14, 2016 at 1:29 PM, Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> > On Wed, 14 Dec 2016 08:45:08 -0800
> > Alexander Duyck <alexander.duyck@gmail.com> wrote:
> >  
> >> I agree.  This is a no-go from the performance perspective as well.
> >> At a minimum you would have to be zeroing out the page between uses to
> >> avoid leaking data, and that assumes that the program we are sending
> >> the pages to is slightly well behaved.  If we think zeroing out an
> >> sk_buff is expensive wait until we are trying to do an entire 4K page.  
> >
> > Again, yes the page will be zero'ed out, but only when entering the
> > page_pool. Because they are recycled they are not cleared on every use.
> > Thus, performance does not suffer.  
> 
> So you are talking about recycling, but not clearing the page when it
> is recycled.  That right there is my problem with this.  It is fine if
> you assume the pages are used by the application only, but you are
> talking about using them for both the application and for the regular
> network path.  You can't do that.  If you are recycling you will have
> to clear the page every time you put it back onto the Rx ring,
> otherwise you can leak the recycled memory into user space and end up
> with a user space program being able to snoop data out of the skb.
> 
> > Besides clearing large mem area is not as bad as clearing small.
> > Clearing an entire page does cost something, as mentioned before 143
> > cycles, which is 28 bytes-per-cycle (4096/143).  And clearing 256 bytes
> > cost 36 cycles which is only 7 bytes-per-cycle (256/36).  
> 
> What I am saying is that you are going to be clearing the 4K blocks
> each time they are recycled.  You can't have the pages shared between
> user-space and the network stack unless you have true isolation.  If
> you are allowing network stack pages to be recycled back into the
> user-space application you open up all sorts of leaks where the
> application can snoop into data it shouldn't have access to.

See later, the "Read-only packet page" mode should provide a mode where
the netstack doesn't write into the page, and thus cannot leak kernel
data. (CAP_NET_ADMIN already give it access to other applications data.)


> >> I think we are stuck with having to use a HW filter to split off
> >> application traffic to a specific ring, and then having to share the
> >> memory between the application and the kernel on that ring only.  Any
> >> other approach just opens us up to all sorts of security concerns
> >> since it would be possible for the application to try to read and
> >> possibly write any data it wants into the buffers.  
> >
> > This is why I wrote a document[1], trying to outline how this is possible,
> > going through all the combinations, and asking the community to find
> > faults in my idea.  Inlining it again, as nobody really replied on the
> > content of the doc.
> >
> > -
> > Best regards,
> >   Jesper Dangaard Brouer
> >   MSc.CS, Principal Kernel Engineer at Red Hat
> >   LinkedIn: http://www.linkedin.com/in/brouer
> >
> > [1] https://prototype-kernel.readthedocs.io/en/latest/vm/page_pool/design/memory_model_nic.html
> >
> > ===========================
> > Memory Model for Networking
> > ===========================
> >
> > This design describes how the page_pool change the memory model for
> > networking in the NIC (Network Interface Card) drivers.
> >
> > .. Note:: The catch for driver developers is that, once an application
> >           request zero-copy RX, then the driver must use a specific
> >           SKB allocation mode and might have to reconfigure the
> >           RX-ring.
> >
> >
> > Design target
> > =============
> >
> > Allow the NIC to function as a normal Linux NIC and be shared in a
> > safe manor, between the kernel network stack and an accelerated
> > userspace application using RX zero-copy delivery.
> >
> > Target is to provide the basis for building RX zero-copy solutions in
> > a memory safe manor.  An efficient communication channel for userspace
> > delivery is out of scope for this document, but OOM considerations are
> > discussed below (`Userspace delivery and OOM`_).
> >
> > Background
> > ==========
> >
> > The SKB or ``struct sk_buff`` is the fundamental meta-data structure
> > for network packets in the Linux Kernel network stack.  It is a fairly
> > complex object and can be constructed in several ways.
> >
> > From a memory perspective there are two ways depending on
> > RX-buffer/page state:
> >
> > 1) Writable packet page
> > 2) Read-only packet page
> >
> > To take full potential of the page_pool, the drivers must actually
> > support handling both options depending on the configuration state of
> > the page_pool.
> >
> > Writable packet page
> > --------------------
> >
> > When the RX packet page is writable, the SKB setup is fairly straight
> > forward.  The SKB->data (and skb->head) can point directly to the page
> > data, adjusting the offset according to drivers headroom (for adding
> > headers) and setting the length according to the DMA descriptor info.
> >
> > The page/data need to be writable, because the network stack need to
> > adjust headers (like TimeToLive and checksum) or even add or remove
> > headers for encapsulation purposes.
> >
> > A subtle catch, which also requires a writable page, is that the SKB
> > also have an accompanying "shared info" data-structure ``struct
> > skb_shared_info``.  This "skb_shared_info" is written into the
> > skb->data memory area at the end (skb->end) of the (header) data.  The
> > skb_shared_info contains semi-sensitive information, like kernel
> > memory pointers to other pages (which might be pointers to more packet
> > data).  This would be bad from a zero-copy point of view to leak this
> > kind of information.  
> 
> This should be the default once we get things moved over to using the
> DMA_ATTR_SKIP_CPU_SYNC DMA attribute.  It will be a little while more
> before it gets fully into Linus's tree.  It looks like the swiotlb
> bits have been accepted, just waiting on the ability to map a page w/
> attributes and the remainder of the patches that are floating around
> in mmotm and linux-next.

I'm very happy that you are working on this.
 
> BTW, any ETA on when we might expect to start seeing code related to
> the page_pool?  It is much easier to review code versus these kind of
> blueprints.

I've implemented a prove-of-concept of page_pool, but only the first
stage, which is the ability to replace driver specific page-caches.  It
works, but is not upstream ready, as e.g. it assumes it can get a page
flag and cleanup-on-driver-unload code is missing.  Mel Gorman have
reviewed it, but with the changes he requested I lost quite some
performance, I'm still trying to figure out a way to regain that
performance lost.  The zero-copy part is not implemented.  


> > Read-only packet page
> > ---------------------
> >
> > When the RX packet page is read-only, the construction of the SKB is
> > significantly more complicated and even involves one more memory
> > allocation.
> >
> > 1) Allocate a new separate writable memory area, and point skb->data
> >    here.  This is needed due to (above described) skb_shared_info.
> >
> > 2) Memcpy packet headers into this (skb->data) area.
> >
> > 3) Clear part of skb_shared_info struct in writable-area.
> >
> > 4) Setup pointer to packet-data in the page (in skb_shared_info->frags)
> >    and adjust the page_offset to be past the headers just copied.
> >
> > It is useful (later) that the network stack have this notion that part
> > of the packet and a page can be read-only.  This implies that the
> > kernel will not "pollute" this memory with any sensitive information.
> > This is good from a zero-copy point of view, but bad from a
> > performance perspective.  
> 
> This will hopefully become a legacy approach.

Hopefully, but this mode will have to be supported forever, and is the
current default.
 

> > NIC RX Zero-Copy
> > ================
> >
> > Doing NIC RX zero-copy involves mapping RX pages into userspace.  This
> > involves costly mapping and unmapping operations in the address space
> > of the userspace process.  Plus for doing this safely, the page memory
> > need to be cleared before using it, to avoid leaking kernel
> > information to userspace, also a costly operation.  The page_pool base
> > "class" of optimization is moving these kind of operations out of the
> > fastpath, by recycling and lifetime control.
> >
> > Once a NIC RX-queue's page_pool have been configured for zero-copy
> > into userspace, then can packets still be allowed to travel the normal
> > stack?
> >
> > Yes, this should be possible, because the driver can use the
> > SKB-read-only mode, which avoids polluting the page data with
> > kernel-side sensitive data.  This implies, when a driver RX-queue
> > switch page_pool to RX-zero-copy mode it MUST also switch to
> > SKB-read-only mode (for normal stack delivery for this RXq).  
> 
> This is the part that is wrong.  Once userspace has access to the
> pages in an Rx ring that ring cannot be used for regular kernel-side
> networking.  If it is, then sensitive kernel data may be leaked
> because the application has full access to any page on the ring so it
> could read the data at any time regardless of where the data is meant
> to be delivered.

Are you sure. Can you give me an example of kernel code that writes
into the page when it is attached as a read-only page to the SKB? 

That would violate how we/drivers use the DMA API today (calling DMA
unmap when packets are in-flight).

 
> > XDP can be used for controlling which pages that gets RX zero-copied
> > to userspace.  The page is still writable for the XDP program, but
> > read-only for normal stack delivery.  
> 
> Making the page read-only doesn't get you anything.  You still have a
> conflict since user-space can read any packet directly out of the
> page.

Giving the application CAP_NAT_ADMIN already gave it "tcpdump" read access
to all other applications packet content from that NIC.


> > Kernel safety
> > -------------
> >
> > For the paranoid, how do we protect the kernel from a malicious
> > userspace program.  Sure there will be a communication interface
> > between kernel and userspace, that synchronize ownership of pages.
> > But a userspace program can violate this interface, given pages are
> > kept VMA mapped, the program can in principle access all the memory
> > pages in the given page_pool.  This opens up for a malicious (or
> > defect) program modifying memory pages concurrently with the kernel
> > and DMA engine using them.
> >
> > An easy way to get around userspace modifying page data contents is
> > simply to map pages read-only into userspace.
> >
> > .. Note:: The first implementation target is read-only zero-copy RX
> >           page to userspace and require driver to use SKB-read-only
> >           mode.  
> 
> This allows for Rx but what do we do about Tx?  

True, I've not covered Tx.  But I believe Tx is easier from a sharing
PoV, as we don't have the early demux sharing problem, because an
application/socket will be the starting point, and simply have
associated a page_pool for TX, solving the VMA mapping overhead.
Using the skb-read-only-page mode, this would in principle allow normal
socket zero-copy TX and packet steering.

For performance reasons, when you already know what NIC you want to TX
on, you could extend this to allocate a separate queue for TX.  Which
makes it look a lot like RDMA.


> It sounds like Christoph's RDMA approach might be the way to go.

I'm getting more and more fond of Christoph's RDMA approach.  I do
think we will end-up with something close to that approach.  I just
wanted to get review on my idea first.

IMHO the major blocker for the RDMA approach is not HW filters
themselves, but a common API that applications can call to register
what goes into the HW queues in the driver.  I suspect it will be a
long project agreeing between vendors.  And agreeing on semantics.


> > Advanced: Allowing userspace write access?
> > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >
> > What if userspace need write access? Flipping the page permissions per
> > transfer will likely kill performance (as this likely affects the
> > TLB-cache).
> >
> > I will argue that giving userspace write access is still possible,
> > without risking a kernel crash.  This is related to the SKB-read-only
> > mode that copies the packet headers (in to another memory area,
> > inaccessible to userspace).  The attack angle is to modify packet
> > headers after they passed some kernel network stack validation step
> > (as once headers are copied they are out of "reach").
> >
> > Situation classes where memory page can be modified concurrently:
> >
> > 1) When DMA engine owns the page.  Not a problem, as DMA engine will
> >    simply overwrite data.
> >
> > 2) Just after DMA engine finish writing.  Not a problem, the packet
> >    will go through netstack validation and be rejected.
> >
> > 3) While XDP reads data. This can lead to XDP/eBPF program goes into a
> >    wrong code branch, but the eBPF virtual machine should not be able
> >    to crash the kernel. The worst outcome is a wrong or invalid XDP
> >    return code.
> >
> > 4) Before SKB with read-only page is constructed. Not a problem, the
> >    packet will go through netstack validation and be rejected.
> >
> > 5) After SKB with read-only page has been constructed.  Remember the
> >    packet headers were copied into a separate memory area, and the
> >    page data is pointed to with an offset passed the copied headers.
> >    Thus, userspace cannot modify the headers used for netstack
> >    validation.  It can only modify packet data contents, which is less
> >    critical as it cannot crash the kernel, and eventually this will be
> >    caught by packet checksum validation.
> >
> > 6) After netstack delivered packet to another userspace process. Not a
> >    problem, as it cannot crash the kernel.  It might corrupt
> >    packet-data being read by another userspace process, which one
> >    argument for requiring elevated privileges to get write access
> >    (like NET_CAP_ADMIN).  
> 
> If userspace has access to a ring we shouldn't be using SKBs on it
> really anyway.  We should probably expect XDP to be handling all the
> packaging so items 4-6 can probably be dropped.
> 
> >
> > Userspace delivery and OOM
> > --------------------------
> >
> > These RX pages are likely mapped to userspace via mmap(), so-far so
> > good.  It is key to performance to get an efficient way of signaling
> > between kernel and userspace, e.g what page are ready for consumption,
> > and when userspace are done with the page.
> >
> > It is outside the scope of page_pool to provide such a queuing
> > structure, but the page_pool can offer some means of protecting the
> > system resource usage.  It is a classical problem that resources
> > (e.g. the page) must be returned in a timely manor, else the system,
> > in this case, will run out of memory.  Any system/design with
> > unbounded memory allocation can lead to Out-Of-Memory (OOM)
> > situations.
> >
> > Communication between kernel and userspace is likely going to be some
> > kind of queue.  Given transferring packets individually will have too
> > much scheduling overhead.  A queue can implicitly function as a
> > bulking interface, and offers a natural way to split the workload
> > across CPU cores.
> >
> > This essentially boils down-to a two queue system, with the RX-ring
> > queue and the userspace delivery queue.
> >
> > Two bad situations exists for the userspace queue:
> >
> > 1) Userspace is not consuming objects fast-enough. This should simply
> >    result in packets getting dropped when enqueueing to a full
> >    userspace queue (as queue *must* implement some limit). Open
> >    question is; should this be reported or communicated to userspace.
> >
> > 2) Userspace is consuming objects fast, but not returning them in a
> >    timely manor.  This is a bad situation, because it threatens the
> >    system stability as it can lead to OOM.
> >
> > The page_pool should somehow protect the system in case 2.  The
> > page_pool can detect the situation as it is able to track the number
> > of outstanding pages, due to the recycle feedback loop.  Thus, the
> > page_pool can have some configurable limit of allowed outstanding
> > pages, which can protect the system against OOM.
> >
> > Note, the `Fbufs paper`_ propose to solve case 2 by allowing these
> > pages to be "pageable", i.e. swap-able, but that is not an option for
> > the page_pool as these pages are DMA mapped.
> >
> > .. _`Fbufs paper`:
> >    http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.52.9688
> >
> > Effect of blocking allocation
> > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >
> > The effect of page_pool, in case 2, that denies more allocations
> > essentially result-in the RX-ring queue cannot be refilled and HW
> > starts dropping packets due to "out-of-buffers".  For NICs with
> > several HW RX-queues, this can be limited to a subset of queues (and
> > admin can control which RX queue with HW filters).
> >
> > The question is if the page_pool can do something smarter in this
> > case, to signal the consumers of these pages, before the maximum limit
> > is hit (of allowed outstanding packets).  The MM-subsystem already
> > have a concept of emergency PFMEMALLOC reserves and associate
> > page-flags (e.g. page_is_pfmemalloc).  And the network stack already
> > handle and react to this.  Could the same PFMEMALLOC system be used
> > for marking pages when limit is close?
> >
> > This requires further analysis. One can imagine; this could be used at
> > RX by XDP to mitigate the situation by dropping less-important frames.
> > Given XDP choose which pages are being send to userspace it might have
> > appropriate knowledge of what it relevant to drop(?).
> >
> > .. Note:: An alternative idea is using a data-structure that blocks
> >           userspace from getting new pages before returning some.
> >           (out of scope for the page_pool)
> >  



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 3/3] Bluetooth: btusb: Configure Marvel to use one of the pins for oob wakeup
From: Gregory CLEMENT @ 2016-12-15  8:29 UTC (permalink / raw)
  To: Rajat Jain
  Cc: Rob Herring, Mark Rutland, Marcel Holtmann, Gustavo Padovan,
	Johan Hedberg, Amitkumar Karwar, Wei-Ning Huang, Xinming Hu,
	netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA, Brian Norris,
	rajatxjain-Re5JQEeQqe8AvxtiuMwx3w
In-Reply-To: <1481742779-15105-3-git-send-email-rajatja-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Hi Rajat,
 
 On mer., déc. 14 2016, Rajat Jain <rajatja-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:

In your title unless you speak about the comic books you should do a
s/Marvel/Marvell/ :)

Gregory

> The Marvell devices may have many gpio pins, and hence for wakeup
> on these out-of-band pins, the chip needs to be told which pin is
> to be used for wakeup, using an hci command.
>
> Thus, we read the pin number etc from the device tree node and send
> a command to the chip.
>
> Signed-off-by: Rajat Jain <rajatja-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> ---
> Note that while I would have liked to name the compatible string as more
> like "marvell, usb8997-bt", the devicetrees/bindings/usb/usb-device.txt
> requires the compatible property to be of the form "usbVID,PID".
>
>  .../{marvell-bt-sd8xxx.txt => marvell-bt-8xxx.txt} | 25 ++++++++-
>  drivers/bluetooth/btusb.c                          | 59 ++++++++++++++++++++++
>  2 files changed, 82 insertions(+), 2 deletions(-)
>  rename Documentation/devicetree/bindings/net/{marvell-bt-sd8xxx.txt => marvell-bt-8xxx.txt} (76%)
>
> diff --git a/Documentation/devicetree/bindings/net/marvell-bt-sd8xxx.txt b/Documentation/devicetree/bindings/net/marvell-bt-8xxx.txt
> similarity index 76%
> rename from Documentation/devicetree/bindings/net/marvell-bt-sd8xxx.txt
> rename to Documentation/devicetree/bindings/net/marvell-bt-8xxx.txt
> index 6a9a63c..471bef8 100644
> --- a/Documentation/devicetree/bindings/net/marvell-bt-sd8xxx.txt
> +++ b/Documentation/devicetree/bindings/net/marvell-bt-8xxx.txt
> @@ -1,4 +1,4 @@
> -Marvell 8897/8997 (sd8897/sd8997) bluetooth SDIO devices
> +Marvell 8897/8997 (sd8897/sd8997) bluetooth devices (SDIO or USB based)
>  ------
>  
>  Required properties:
> @@ -6,11 +6,13 @@ Required properties:
>    - compatible : should be one of the following:
>  	* "marvell,sd8897-bt"
>  	* "marvell,sd8997-bt"
> +	* "usb1286,204e"
>  
>  Optional properties:
>  
>    - marvell,cal-data: Calibration data downloaded to the device during
>  		      initialization. This is an array of 28 values(u8).
> +		      This is only applicable to SDIO devices.
>  
>    - marvell,wakeup-pin: It represents wakeup pin number of the bluetooth chip.
>  		        firmware will use the pin to wakeup host system (u16).
> @@ -29,7 +31,9 @@ Example:
>  IRQ pin 119 is used as system wakeup source interrupt.
>  wakeup pin 13 and gap 100ms are configured so that firmware can wakeup host
>  using this device side pin and wakeup latency.
> -calibration data is also available in below example.
> +
> +Example for SDIO device follows (calibration data is also available in
> +below example).
>  
>  &mmc3 {
>  	status = "okay";
> @@ -54,3 +58,20 @@ calibration data is also available in below example.
>  		marvell,wakeup-gap-ms = /bits/ 16 <0x64>;
>  	};
>  };
> +
> +Example for USB device:
> +
> +&usb_host1_ohci {
> +    status = "okay";
> +    #address-cells = <1>;
> +    #size-cells = <0>;
> +
> +    mvl_bt1: bt@1 {
> +	compatible = "usb1286,204e";
> +	reg = <1>;
> +	interrupt-parent = <&gpio0>;
> +	interrupts = <119 IRQ_TYPE_LEVEL_LOW>;
> +	marvell,wakeup-pin = /bits/ 16 <0x0d>;
> +	marvell,wakeup-gap-ms = /bits/ 16 <0x64>;
> +    };
> +};
> diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
> index 32a6f22..99d7f6d 100644
> --- a/drivers/bluetooth/btusb.c
> +++ b/drivers/bluetooth/btusb.c
> @@ -2343,6 +2343,58 @@ static int btusb_shutdown_intel(struct hci_dev *hdev)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_PM
> +static const struct of_device_id mvl_oob_wake_match_table[] = {
> +	{ .compatible = "usb1286,204e" },
> +	{ }
> +};
> +MODULE_DEVICE_TABLE(of, mvl_oob_wake_match_table);
> +
> +/* Configure an out-of-band gpio as wake-up pin, if specified in device tree */
> +static int marvell_config_oob_wake(struct hci_dev *hdev)
> +{
> +	struct sk_buff *skb;
> +	struct btusb_data *data = hci_get_drvdata(hdev);
> +	struct device *dev = &data->udev->dev;
> +	u16 pin, gap, opcode;
> +	int ret;
> +	u8 cmd[5];
> +
> +	if (!of_match_device(mvl_oob_wake_match_table, dev))
> +		return 0;
> +
> +	if (of_property_read_u16(dev->of_node, "marvell,wakeup-pin", &pin) ||
> +	    of_property_read_u16(dev->of_node, "marvell,wakeup-gap-ms", &gap))
> +		return -EINVAL;
> +
> +	/* Vendor specific command to configure a GPIO as wake-up pin */
> +	opcode = hci_opcode_pack(0x3F, 0x59);
> +	cmd[0] = opcode & 0xFF;
> +	cmd[1] = opcode >> 8;
> +	cmd[2] = 2; /* length of parameters that follow */
> +	cmd[3] = pin;
> +	cmd[4] = gap; /* time in ms, for which wakeup pin should be asserted */
> +
> +	skb = bt_skb_alloc(sizeof(cmd), GFP_KERNEL);
> +	if (!skb) {
> +		bt_dev_err(hdev, "%s: No memory\n", __func__);
> +		return -ENOMEM;
> +	}
> +
> +	memcpy(skb_put(skb, sizeof(cmd)), cmd, sizeof(cmd));
> +	hci_skb_pkt_type(skb) = HCI_COMMAND_PKT;
> +
> +	ret = btusb_send_frame(hdev, skb);
> +	if (ret) {
> +		bt_dev_err(hdev, "%s: configuration failed\n", __func__);
> +		kfree_skb(skb);
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +#endif
> +
>  static int btusb_set_bdaddr_marvell(struct hci_dev *hdev,
>  				    const bdaddr_t *bdaddr)
>  {
> @@ -2917,6 +2969,13 @@ static int btusb_probe(struct usb_interface *intf,
>  	err = btusb_config_oob_wake(hdev);
>  	if (err)
>  		goto out_free_dev;
> +
> +	/* Marvel devices may need a specific chip configuration */
> +	if (id->driver_info & BTUSB_MARVELL && data->oob_wake_irq) {
> +		err = marvell_config_oob_wake(hdev);
> +		if (err)
> +			goto out_free_dev;
> +	}
>  #endif
>  	if (id->driver_info & BTUSB_CW6622)
>  		set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks);
> -- 
> 2.8.0.rc3.226.g39d4020
>

-- 
Gregory Clement, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH v2 1/4] siphash: add cryptographically secure hashtable function
From: Hannes Frederic Sowa @ 2016-12-15  8:31 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: David Laight, Netdev, kernel-hardening, Jean-Philippe Aumasson,
	LKML, Linux Crypto Mailing List, Daniel J . Bernstein,
	Linus Torvalds, Eric Biggers
In-Reply-To: <CAHmME9r8-JMqkOdz4PLgF13qYGk8XAmEjGep40k3GVYfUS_sPw@mail.gmail.com>

On 15.12.2016 00:29, Jason A. Donenfeld wrote:
> Hi Hannes,
> 
> On Wed, Dec 14, 2016 at 11:03 PM, Hannes Frederic Sowa
> <hannes@stressinduktion.org> wrote:
>> I fear that the alignment requirement will be a source of bugs on 32 bit
>> machines, where you cannot even simply take a well aligned struct on a
>> stack and put it into the normal siphash(aligned) function without
>> adding alignment annotations everywhere. Even blocks returned from
>> kmalloc on 32 bit are not aligned to 64 bit.
> 
> That's what the "__aligned(SIPHASH24_ALIGNMENT)" attribute is for. The
> aligned siphash function will be for structs explicitly made for
> siphash consumption. For everything else there's siphash_unaligned.

So in case you have a pointer from somewhere on 32 bit you can
essentially only guarantee it has natural alignment or max. native
alignment (based on the arch). gcc only fulfills your request for
alignment when you allocate on the stack (minus gcc bugs).

Let's say you get a pointer from somewhere, maybe embedded in a struct,
which came from kmalloc. kmalloc doesn't care about aligned attribute,
it will align according to architecture description. That said, if you
want to hash that, you would need manually align the memory returned
from kmalloc or make sure the the data is more than naturally aligned on
that architecture.

>> Can we do this a runtime check and just have one function (siphash)
>> dealing with that?
> 
> Seems like the runtime branching on the aligned function would be bad
> for performance, when we likely know at compile time if it's going to
> be aligned or not. I suppose we could add that check just to the
> unaligned version, and rename it to "maybe_unaligned"? Is this what
> you have in mind?

I argue that you mostly don't know at compile time if it is correctly
aligned if the alignment requirements are larger than the natural ones.

Also, we don't even have that for memcpy, even we use it probably much
more than hashing, so I think this is overkill.

Bye,
Hannes

^ permalink raw reply

* Re: [Query] Delayed vxlan socket creation?
From: Du, Fan @ 2016-12-15  8:41 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev@vger.kernel.org, mrjana@gmail.com
In-Reply-To: <20161214102931.307efe94@griffin>



在 2016年12月14日 17:29, Jiri Benc 写道:
> On Wed, 14 Dec 2016 07:49:24 +0000, Du, Fan wrote:
>> I'm interested to one Docker issue[1] which looks like related to kernel vxlan socket creation
>> as described in the thread. From my limited knowledge here, socket creation is synchronous ,
>> and after the *socket* syscall, the sock handle will be valid and ready to linkup.
>>
>> Somehow I'm not sure the detailed scenario here, and which/how possible commit fix?
> baf606d9c9b1^..56ef9c909b40
>
>   Jiri

Thanks a lot Jiri!

^ permalink raw reply

* Re: [RFC v2 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) interface
From: Vishwanathapura, Niranjana @ 2016-12-15  8:53 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	ira.weiny-ral2JQCrhuEAvxtiuMwx3w
In-Reply-To: <1481788782-89964-3-git-send-email-niranjana.vishwanathapura-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

On Wed, Dec 14, 2016 at 11:59:34PM -0800, Vishwanathapura, Niranjana wrote:
>+
>+static inline bool is_hfi_ibdev(struct ib_device *ibdev)
>+{
>+	return !memcmp(ibdev->name, "hfi", 3);
>+}

I am thinking of adding a device capability flag to indicate HFI VNIC capabilty 
instead of relying on the device name as above to identify a hfi ib deice.
Any comments? Probably it can be addressed by a separate patch later.

Niranjana

>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [Query] Delayed vxlan socket creation?
From: Du, Fan @ 2016-12-15  8:43 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev@vger.kernel.org, mrjana@gmail.com
In-Reply-To: <CAM_iQpXujwj_GcN5KOT4vMbPj-H+B+=MdPjpRWZZrc+piooBYg@mail.gmail.com>



在 2016年12月15日 01:24, Cong Wang 写道:
> On Tue, Dec 13, 2016 at 11:49 PM, Du, Fan <fan.du@intel.com> wrote:
>> Hi
>>
>> I'm interested to one Docker issue[1] which looks like related to kernel vxlan socket creation
>> as described in the thread. From my limited knowledge here, socket creation is synchronous ,
>> and after the *socket* syscall, the sock handle will be valid and ready to linkup.
> You need to read the code. vxlan tunnel is a UDP tunnel, it needs a kernel
> socket (and a port) to setup UDP communication, unlike GRE tunnel etc.
I check the fix is merged in 4.0, my code base is pretty new,
so somehow I failed to see the work queue stuff in drver/net/vxlan.c
>> Somehow I'm not sure the detailed scenario here, and which/how possible commit fix?
>> Thanks!
>>
>> Quoted analysis:
>> --------------------------------------------------------------------------
>> (Found in kernel 3.13)
>> The issue happens because in older kernels when a vxlan interface is created,
>> the socket creation is queued up in a worker thread which actually creates
>> the socket. But this needs to happen before we bring up the link on the vxlan interface.
>> If for some chance, the worker thread hasn't completed the creation of the socket
>> before we did link up then when we do link up the kernel checks if the socket was
>> created and if not it will return ENOTCONN. This was a bug in the kernel which got fixed
>> in later kernels. That is why retrying with a timer fixes the issue.
>
> This was introduced by commit 1c51a9159ddefa5119724a4c7da3fd3ef44b68d5
> and later fixed by commit 56ef9c909b40483d2c8cb63fcbf83865f162d5ec.
信聪哥，得永生。
Thanks for the offending commit id！

^ permalink raw reply

* Re: [RFC v2 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) interface
From: Christoph Hellwig @ 2016-12-15  8:56 UTC (permalink / raw)
  To: Vishwanathapura, Niranjana
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	ira.weiny-ral2JQCrhuEAvxtiuMwx3w
In-Reply-To: <20161215085349.GA90068-wPcXA7LoDC+1XWohqUldA0EOCMrvLtNR@public.gmane.org>

On Thu, Dec 15, 2016 at 12:53:49AM -0800, Vishwanathapura, Niranjana wrote:
> On Wed, Dec 14, 2016 at 11:59:34PM -0800, Vishwanathapura, Niranjana wrote:
> > +
> > +static inline bool is_hfi_ibdev(struct ib_device *ibdev)
> > +{
> > +	return !memcmp(ibdev->name, "hfi", 3);
> > +}
> 
> I am thinking of adding a device capability flag to indicate HFI VNIC
> capabilty instead of relying on the device name as above to identify a hfi
> ib deice.

Absolutely.

> Any comments? Probably it can be addressed by a separate patch later.

no, comparing device names is always wrong, please do it ASAP.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] net: sfc: use new api ethtool_{get|set}_link_ksettings
From: Bert Kenward @ 2016-12-15  8:57 UTC (permalink / raw)
  To: Philippe Reynes, linux-net-drivers, ecree; +Cc: netdev, linux-kernel
In-Reply-To: <1481757173-16000-1-git-send-email-tremyfr@gmail.com>

n 14/12/16 23:12, Philippe Reynes wrote:
> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> Signed-off-by: Philippe Reynes <tremyfr@gmail.com>

Thanks Philippe. We'll get some testing done on this.

Bert.

^ permalink raw reply

* [PATCH] net: ipv4: tcp_offload: check segs for NULL
From: shakya.das @ 2016-12-15  8:47 UTC (permalink / raw)
  To: davem; +Cc: netdev, shakya89.das, vidushi.koul

From: Shakya Sundar Das <shakya.das@samsung.com>

This patch will check segs for being NULL in tcp_gso_segment()
before calling skb_shinfo(segs) from skb_is_gso(segs), otherwise
kernel can run into a NULL-pointer dereference.

Signed-off-by: Shakya Sundar Das <shakya.das@samsung.com>
---
 net/ipv4/tcp_offload.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index bc68da3..93feefd 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -96,7 +96,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 	skb->ooo_okay = 0;
 
 	segs = skb_segment(skb, features);
-	if (IS_ERR(segs))
+	if (IS_ERR_OR_NULL(segs))
 		goto out;
 
 	/* Only first segment might have ooo_okay set */
-- 
1.7.9.5

^ permalink raw reply related

* Re: sanity checking iov_iter patches
From: Jesper Dangaard Brouer @ 2016-12-15  9:00 UTC (permalink / raw)
  To: Al Viro; +Cc: brouer, netdev, David Miller
In-Reply-To: <20161215062305.GR1555@ZenIV.linux.org.uk>

On Thu, 15 Dec 2016 06:23:05 +0000
Al Viro <viro@ZenIV.linux.org.uk> wrote:

> 	Some of the vfs.git#work.iov_iter stuff touches net/*; basically,
> there are several missing primitives (copy_from_iter_full(), etc.) for
> "try to copy, tell whether it has copied the full amount requested and
> advance the iterator only in case of success".  Most of the callers were
> actually doing just that (see e.g. skb_add_data() and friends) and while
> nothing in the current kernel cares whether we advance ->msg_iter on
> failure, it's much more consistent semantics.
> 
> 	If anybody has objections to that stuff (in linux-next, or in
> git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git#work.iov_iter),
> or thinks that some of that should go via net-next.git, yell and I'll
> drop the bits in question.  If not, to Linus it all goes...

Just some links to make it quicker for people see the three patches:
 http://git.kernel.org/cgit/linux/kernel/git/viro/vfs.git/log/?h=work.iov_iter

Patches:
 http://git.kernel.org/cgit/linux/kernel/git/viro/vfs.git/commit/?h=work.iov_iter&id=cbbd26b8b1a
 http://git.kernel.org/cgit/linux/kernel/git/viro/vfs.git/commit/?h=work.iov_iter&id=15e6cb46c9b
 http://git.kernel.org/cgit/linux/kernel/git/viro/vfs.git/commit/?h=work.iov_iter&id=0b62fca2623

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH 5/8] linux: drop __bitwise__ everywhere
From: Stefan Schmidt @ 2016-12-15  9:04 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Kukjin Kim, Krzysztof Kozlowski, Javier Martinez Canillas,
	Russell King, Alasdair Kergon, Mike Snitzer,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, Shaohua Li, Johannes Berg,
	Emmanuel Grumbach, Luca Coelho, Intel Linux Wireless, Kalle Valo,
	Greg Kroah-Hartman, Jiri Slaby, Lee Duncan, Chris Leech,
	James E.J. Bottomley, Martin K. Petersen, Nicholas A. Bellinger,
	Jason Wang, Alexander Aring, David S. Miller
In-Reply-To: <1481778865-27667-6-git-send-email-mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Hello.

On 15/12/16 06:15, Michael S. Tsirkin wrote:
> __bitwise__ used to mean "yes, please enable sparse checks
> unconditionally", but now that we dropped __CHECK_ENDIAN__
> __bitwise is exactly the same.
> There aren't many users, replace it by __bitwise everywhere.
>
> Signed-off-by: Michael S. Tsirkin <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> ---
>  arch/arm/plat-samsung/include/plat/gpio-cfg.h    | 2 +-
>  drivers/md/dm-cache-block-types.h                | 6 +++---
>  drivers/net/ethernet/sun/sunhme.h                | 2 +-
>  drivers/net/wireless/intel/iwlwifi/iwl-fw-file.h | 4 ++--
>  include/linux/mmzone.h                           | 2 +-
>  include/linux/serial_core.h                      | 4 ++--
>  include/linux/types.h                            | 4 ++--
>  include/scsi/iscsi_proto.h                       | 2 +-
>  include/target/target_core_base.h                | 2 +-
>  include/uapi/linux/virtio_types.h                | 6 +++---
>  net/ieee802154/6lowpan/6lowpan_i.h               | 2 +-
>  net/mac80211/ieee80211_i.h                       | 4 ++--
>  12 files changed, 20 insertions(+), 20 deletions(-)
>
> diff --git a/arch/arm/plat-samsung/include/plat/gpio-cfg.h b/arch/arm/plat-samsung/include/plat/gpio-cfg.h
> index 21391fa..e55d1f5 100644
> --- a/arch/arm/plat-samsung/include/plat/gpio-cfg.h
> +++ b/arch/arm/plat-samsung/include/plat/gpio-cfg.h
> @@ -26,7 +26,7 @@
>
>  #include <linux/types.h>
>
> -typedef unsigned int __bitwise__ samsung_gpio_pull_t;
> +typedef unsigned int __bitwise samsung_gpio_pull_t;
>
>  /* forward declaration if gpio-core.h hasn't been included */
>  struct samsung_gpio_chip;
> diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
> index bed4ad4..389c9e8 100644
> --- a/drivers/md/dm-cache-block-types.h
> +++ b/drivers/md/dm-cache-block-types.h
> @@ -17,9 +17,9 @@
>   * discard bitset.
>   */
>
> -typedef dm_block_t __bitwise__ dm_oblock_t;
> -typedef uint32_t __bitwise__ dm_cblock_t;
> -typedef dm_block_t __bitwise__ dm_dblock_t;
> +typedef dm_block_t __bitwise dm_oblock_t;
> +typedef uint32_t __bitwise dm_cblock_t;
> +typedef dm_block_t __bitwise dm_dblock_t;
>
>  static inline dm_oblock_t to_oblock(dm_block_t b)
>  {
> diff --git a/drivers/net/ethernet/sun/sunhme.h b/drivers/net/ethernet/sun/sunhme.h
> index f430765..4a8d5b1 100644
> --- a/drivers/net/ethernet/sun/sunhme.h
> +++ b/drivers/net/ethernet/sun/sunhme.h
> @@ -302,7 +302,7 @@
>   * Always write the address first before setting the ownership
>   * bits to avoid races with the hardware scanning the ring.
>   */
> -typedef u32 __bitwise__ hme32;
> +typedef u32 __bitwise hme32;
>
>  struct happy_meal_rxd {
>  	hme32 rx_flags;
> diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-fw-file.h b/drivers/net/wireless/intel/iwlwifi/iwl-fw-file.h
> index 1ad0ec1..84813b5 100644
> --- a/drivers/net/wireless/intel/iwlwifi/iwl-fw-file.h
> +++ b/drivers/net/wireless/intel/iwlwifi/iwl-fw-file.h
> @@ -228,7 +228,7 @@ enum iwl_ucode_tlv_flag {
>  	IWL_UCODE_TLV_FLAGS_BCAST_FILTERING	= BIT(29),
>  };
>
> -typedef unsigned int __bitwise__ iwl_ucode_tlv_api_t;
> +typedef unsigned int __bitwise iwl_ucode_tlv_api_t;
>
>  /**
>   * enum iwl_ucode_tlv_api - ucode api
> @@ -258,7 +258,7 @@ enum iwl_ucode_tlv_api {
>  #endif
>  };
>
> -typedef unsigned int __bitwise__ iwl_ucode_tlv_capa_t;
> +typedef unsigned int __bitwise iwl_ucode_tlv_capa_t;
>
>  /**
>   * enum iwl_ucode_tlv_capa - ucode capabilities
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 0f088f3..36d9896 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -246,7 +246,7 @@ struct lruvec {
>  #define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)
>
>  /* LRU Isolation modes. */
> -typedef unsigned __bitwise__ isolate_mode_t;
> +typedef unsigned __bitwise isolate_mode_t;
>
>  enum zone_watermarks {
>  	WMARK_MIN,
> diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
> index 5d49488..5def8e8 100644
> --- a/include/linux/serial_core.h
> +++ b/include/linux/serial_core.h
> @@ -111,8 +111,8 @@ struct uart_icount {
>  	__u32	buf_overrun;
>  };
>
> -typedef unsigned int __bitwise__ upf_t;
> -typedef unsigned int __bitwise__ upstat_t;
> +typedef unsigned int __bitwise upf_t;
> +typedef unsigned int __bitwise upstat_t;
>
>  struct uart_port {
>  	spinlock_t		lock;			/* port lock */
> diff --git a/include/linux/types.h b/include/linux/types.h
> index baf7183..d501ad3 100644
> --- a/include/linux/types.h
> +++ b/include/linux/types.h
> @@ -154,8 +154,8 @@ typedef u64 dma_addr_t;
>  typedef u32 dma_addr_t;
>  #endif
>
> -typedef unsigned __bitwise__ gfp_t;
> -typedef unsigned __bitwise__ fmode_t;
> +typedef unsigned __bitwise gfp_t;
> +typedef unsigned __bitwise fmode_t;
>
>  #ifdef CONFIG_PHYS_ADDR_T_64BIT
>  typedef u64 phys_addr_t;
> diff --git a/include/scsi/iscsi_proto.h b/include/scsi/iscsi_proto.h
> index c1260d8..df156f1 100644
> --- a/include/scsi/iscsi_proto.h
> +++ b/include/scsi/iscsi_proto.h
> @@ -74,7 +74,7 @@ static inline int iscsi_sna_gte(u32 n1, u32 n2)
>  #define zero_data(p) {p[0]=0;p[1]=0;p[2]=0;}
>
>  /* initiator tags; opaque for target */
> -typedef uint32_t __bitwise__ itt_t;
> +typedef uint32_t __bitwise itt_t;
>  /* below makes sense only for initiator that created this tag */
>  #define build_itt(itt, age) ((__force itt_t)\
>  	((itt) | ((age) << ISCSI_AGE_SHIFT)))
> diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
> index c211900..0055828 100644
> --- a/include/target/target_core_base.h
> +++ b/include/target/target_core_base.h
> @@ -149,7 +149,7 @@ enum se_cmd_flags_table {
>   * Used by transport_send_check_condition_and_sense()
>   * to signal which ASC/ASCQ sense payload should be built.
>   */
> -typedef unsigned __bitwise__ sense_reason_t;
> +typedef unsigned __bitwise sense_reason_t;
>
>  enum tcm_sense_reason_table {
>  #define R(x)	(__force sense_reason_t )(x)
> diff --git a/include/uapi/linux/virtio_types.h b/include/uapi/linux/virtio_types.h
> index e845e8c..55c3b73 100644
> --- a/include/uapi/linux/virtio_types.h
> +++ b/include/uapi/linux/virtio_types.h
> @@ -39,8 +39,8 @@
>   * - __le{16,32,64} for standard-compliant virtio devices
>   */
>
> -typedef __u16 __bitwise__ __virtio16;
> -typedef __u32 __bitwise__ __virtio32;
> -typedef __u64 __bitwise__ __virtio64;
> +typedef __u16 __bitwise __virtio16;
> +typedef __u32 __bitwise __virtio32;
> +typedef __u64 __bitwise __virtio64;
>
>  #endif /* _UAPI_LINUX_VIRTIO_TYPES_H */
> diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
> index 5ac7789..ac7c96b 100644
> --- a/net/ieee802154/6lowpan/6lowpan_i.h
> +++ b/net/ieee802154/6lowpan/6lowpan_i.h
> @@ -7,7 +7,7 @@
>  #include <net/inet_frag.h>
>  #include <net/6lowpan.h>
>
> -typedef unsigned __bitwise__ lowpan_rx_result;
> +typedef unsigned __bitwise lowpan_rx_result;
>  #define RX_CONTINUE		((__force lowpan_rx_result) 0u)
>  #define RX_DROP_UNUSABLE	((__force lowpan_rx_result) 1u)
>  #define RX_DROP			((__force lowpan_rx_result) 2u)
> diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
> index d37a577..b2069fb 100644
> --- a/net/mac80211/ieee80211_i.h
> +++ b/net/mac80211/ieee80211_i.h
> @@ -159,7 +159,7 @@ enum ieee80211_bss_valid_data_flags {
>  	IEEE80211_BSS_VALID_ERP			= BIT(3)
>  };
>
> -typedef unsigned __bitwise__ ieee80211_tx_result;
> +typedef unsigned __bitwise ieee80211_tx_result;
>  #define TX_CONTINUE	((__force ieee80211_tx_result) 0u)
>  #define TX_DROP		((__force ieee80211_tx_result) 1u)
>  #define TX_QUEUED	((__force ieee80211_tx_result) 2u)
> @@ -180,7 +180,7 @@ struct ieee80211_tx_data {
>  };
>
>
> -typedef unsigned __bitwise__ ieee80211_rx_result;
> +typedef unsigned __bitwise ieee80211_rx_result;
>  #define RX_CONTINUE		((__force ieee80211_rx_result) 0u)
>  #define RX_DROP_UNUSABLE	((__force ieee80211_rx_result) 1u)
>  #define RX_DROP_MONITOR		((__force ieee80211_rx_result) 2u)
>

For net/ieee802154/6lowpan/6lowpan_i.h

Acked-by: Stefan Schmidt <stefan-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>

regards
Stefan Schmidt

-- 
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: [RFC v2 00/10] HFI Virtual Network Interface Controller (VNIC)
From: Leon Romanovsky @ 2016-12-15  9:12 UTC (permalink / raw)
  To: Vishwanathapura, Niranjana
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	ira.weiny-ral2JQCrhuEAvxtiuMwx3w
In-Reply-To: <1481788782-89964-1-git-send-email-niranjana.vishwanathapura-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 856 bytes --]

On Wed, Dec 14, 2016 at 11:59:32PM -0800, Vishwanathapura, Niranjana wrote:
> Thanks Jason for the valuable feedback.
> Here is the revised HFI VNIC patch series.
>
> ChangeLog:
> =========
> v1 => v2:
> a) Removed hfi_vnic bus, instead make hfi_vnic driver an 'ib client',
>    as per feedback from Jason Gunthorpe.
> b) Interface changes, data structure changes and variable name changes
>    associated with (a).
> c) Add hfi_ibdev abstraction to provide VNIC control operations to
>    hfi_vnic client.
> d) Minor fixes
> e) Moved hfi_vnic driver from .../sw/intel/vnic/hfi_vnic to
>    .../sw/intel/hfi_vnic.

To put it into proportion, Jason asked you to do different thing.
http://marc.info/?l=linux-rdma&m=147977108302151&w=2
http://marc.info/?l=linux-rdma&m=148000415401842&w=2

And Christoph,
http://marc.info/?l=linux-rdma&m=147985587425861&w=2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net] vxlan: fix unused variable warning
From: Jiri Benc @ 2016-12-15  9:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: davem, netdev
In-Reply-To: <20161214124355.7da92ff3@xeon-e3>

On Wed, 14 Dec 2016 12:43:55 -0800, Stephen Hemminger wrote:
> Fixes commit 4528520d315ac1 ("vxlan: add ipv6 proxy support")

Wrong hash, it was commit f564f45c4518. And that commit actually did
use saddr, the actual commit that is being fixed is 4b29dba9c085
("vxlan: fix nonfunctional neigh_reduce()"). Also, please use the
standard Fixes: line when resubmitting this.

The patch itself looks good.

Thanks,

 Jiri

^ permalink raw reply

* Re: [PATCH v2 2/2] net: ethernet: stmmac: remove private tx queue lock
From: Pavel Machek @ 2016-12-15  9:45 UTC (permalink / raw)
  To: Lino Sanfilippo
  Cc: bh74.an, ks.giri, vipul.pandya, peppe.cavallaro, alexandre.torgue,
	romieu, davem, linux-kernel, netdev
In-Reply-To: <1481241343-18062-3-git-send-email-LinoSanfilippo@gmx.de>

[-- Attachment #1: Type: text/plain, Size: 2273 bytes --]

Hi!

> The driver uses a private lock for synchronization of the xmit function and
> the xmit completion handler, but since the NETIF_F_LLTX flag is not set,
> the xmit function is also called with the xmit_lock held.
> 
> On the other hand the completion handler uses the reverse locking order by
> first taking the private lock and (in case that the tx queue had been
> stopped) then the xmit_lock.
> 
> Improve the locking by removing the private lock and using only the
> xmit_lock for synchronization instead.

Do you have stmmac hardware to test on?

I believe something is very wrong with the locking there. In
particular... scheduling the stmmac_tx_timer() function to run often
should not do anything bad if locking is correct... but it breaks the
driver rather quickly. [Example patch below, needs applying to two
places in net-next.]

(Other possibility is that hardware races with the driver.)

Giuseppe, is there documentation available for the chip? Driver says

  Documentation available at:
          http://www.stlinux.com

but that page does not work for me...

404 Not Found

Code: NoSuchBucket
Message: The specified bucket does not exist
BucketName: www.stlinux.com
RequestId: 1C8A20CB99AE7F75
HostId:
ljPnqbEpyD8exct5MUgcDXSW8n+I67Yw0aejNhLuBQ0pqN0UCfiRBa3ztlOMngiXoSN+COX+VSw=

Best regards,
									Pavel

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ffbcd03..8040370 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1973,8 +1973,9 @@ static void stmmac_xmit_common(struct sk_buff *skb, struct net_device *dev, int
 	 */
 	priv->tx_count_frames += nfrags + 1;
 	if (likely(priv->tx_coal_frames > priv->tx_count_frames)) {
-		mod_timer(&priv->txtimer,
-			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
+		if (priv->tx_count_frames == nfrags + 1)
+			mod_timer(&priv->txtimer,
+				  STMMAC_COAL_TIMER(priv->tx_coal_timer));
 	} else {
 		priv->tx_count_frames = 0;
 		priv->hw->desc->set_tx_ic(desc);


-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply related

* [PATCH] bpf: cgroup: annotate pointers in struct cgroup_bpf with __rcu
From: Daniel Mack @ 2016-12-15  9:53 UTC (permalink / raw)
  To: ast; +Cc: daniel, netdev, davem, Daniel Mack

The member 'effective' in 'struct cgroup_bpf' is protected by RCU.
Annotate it accordingly to squelch a sparse warning.

Signed-off-by: Daniel Mack <daniel@zonque.org>
---
 include/linux/bpf-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7b6e5d1..92bc89a 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -20,7 +20,7 @@ struct cgroup_bpf {
 	 * when this cgroup is accessed.
 	 */
 	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
-	struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
+	struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
 };
 
 void cgroup_bpf_put(struct cgroup *cgrp);
-- 
2.9.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox