Netdev List
 help / color / mirror / Atom feed
* [patch net-next v4 16/21] rocker: implement rocker ofdpa flow table manipulation
From: Jiri Pirko @ 2014-11-27 10:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse, pshelar,
	azhou, ben, stephen, jeffrey.t.kirsher, vyasevic, xiyou.wangcong,
	john.r.fastabend, edumazet, jhs, sfeldma, f.fainelli, roopa,
	linville, jasowang, ebiederm, nicolas.dichtel, ryazanov.s.a,
	buytenh, aviadr, nbd, alexei.starovoitov, Neil.Jerram, ronye,
	simon.horman, alexander.h.duyck, john.ronciak, mleitner, shrijeet,
	gospo, bcrl, hemal
In-Reply-To: <1417084826-9875-1-git-send-email-jiri@resnulli.us>

From: Scott Feldman <sfeldma@gmail.com>

The rocker driver maintains 4 hash tables: flows, groups, FDB, and VLANs.

Flow and group tables track the entries installed to OF-DPA tables,
per the OF-DPA spec.  See OF-DPA spec for full description of fields
in each flow and group table.  New table entries are pushed to the
device with ADD cmd.  Updated entries are pushed to the device with
MOD cmd.  For flow table entries, a crc32 key is made from the fields
of the particular flow.  For group table entries, the group_id is used
as the key.

The FDB table tracks fdb entries learned by the device or manually
pushed to the bridge by the user.  A crc32 key is made from the
port/mac/vlan tuple for the fdb entry.

The VLAN table tracks the ifindex-to-internal-vlan mapping for
untagged pkts.  On ingress, an untagged pkt is inserted with an
internal VLAN ID based on the input port's current internal VLAN ID.
The input port's internal VLAN will either be referenced by the port's
ifindex, if not bridged, or the containing bridge's ifindex, if
bridged.  Since the ifindex space isn't within a fixed range, the driver
uses a hash table (with ifindex as key) to track the internal VLAN ID
for a given ifindex.  The internal VLAN ID range is fixed and currently
uses the upper 255 VLAN IDs, starting at 0xf00.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
v1->v2->v3->v4:
-no change
---
 drivers/net/ethernet/rocker/rocker.c | 1469 +++++++++++++++++++++++++++++++++-
 1 file changed, 1467 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index a53011c..6345f60 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/spinlock.h>
+#include <linux/hashtable.h>
 #include <linux/crc32.h>
 #include <linux/sort.h>
 #include <linux/random.h>
@@ -27,6 +28,7 @@
 #include <linux/ethtool.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
+#include <linux/bitops.h>
 #include <net/switchdev.h>
 #include <net/rtnetlink.h>
 #include <asm-generic/io-64-nonatomic-lo-hi.h>
@@ -41,6 +43,123 @@ static const struct pci_device_id rocker_pci_id_table[] = {
 	{0, }
 };
 
+struct rocker_flow_tbl_key { /* hashed and memcmp'd as raw bytes */
+	u32 priority;
+	enum rocker_of_dpa_table_id tbl_id; /* picks the union member */
+	union {
+		struct {
+			u32 in_lport;
+			u32 in_lport_mask;
+			enum rocker_of_dpa_table_id goto_tbl;
+		} ig_port; /* ROCKER_OF_DPA_TABLE_ID_INGRESS_PORT */
+		struct {
+			u32 in_lport;
+			__be16 vlan_id;
+			__be16 vlan_id_mask;
+			enum rocker_of_dpa_table_id goto_tbl;
+			bool untagged; /* gates the NEW_VLAN_ID TLV */
+			__be16 new_vlan_id;
+		} vlan; /* ROCKER_OF_DPA_TABLE_ID_VLAN */
+		struct {
+			u32 in_lport;
+			u32 in_lport_mask;
+			__be16 eth_type;
+			u8 eth_dst[ETH_ALEN];
+			u8 eth_dst_mask[ETH_ALEN];
+			__be16 vlan_id;
+			__be16 vlan_id_mask;
+			enum rocker_of_dpa_table_id goto_tbl;
+			bool copy_to_cpu;
+		} term_mac; /* ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC */
+		struct {
+			__be16 eth_type;
+			__be32 dst4;
+			__be32 dst4_mask;
+			enum rocker_of_dpa_table_id goto_tbl;
+			u32 group_id;
+		} ucast_routing; /* ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING */
+		struct {
+			u8 eth_dst[ETH_ALEN];
+			u8 eth_dst_mask[ETH_ALEN];
+			int has_eth_dst; /* gates the DST_MAC TLV */
+			int has_eth_dst_mask; /* gates the DST_MAC_MASK TLV */
+			__be16 vlan_id;
+			u32 tunnel_id;
+			enum rocker_of_dpa_table_id goto_tbl;
+			u32 group_id;
+			bool copy_to_cpu;
+		} bridge; /* ROCKER_OF_DPA_TABLE_ID_BRIDGING */
+		struct {
+			u32 in_lport;
+			u32 in_lport_mask;
+			u8 eth_src[ETH_ALEN];
+			u8 eth_src_mask[ETH_ALEN];
+			u8 eth_dst[ETH_ALEN];
+			u8 eth_dst_mask[ETH_ALEN];
+			__be16 eth_type;
+			__be16 vlan_id;
+			__be16 vlan_id_mask;
+			u8 ip_proto;
+			u8 ip_proto_mask;
+			u8 ip_tos;
+			u8 ip_tos_mask;
+			u32 group_id; /* ROCKER_GROUP_NONE: TLV omitted */
+		} acl; /* ROCKER_OF_DPA_TABLE_ID_ACL_POLICY */
+	};
+};
+
+struct rocker_flow_tbl_entry {
+	struct hlist_node entry;
+	u32 ref_count; /* entry is unhashed and freed at zero */
+	u64 cookie; /* assigned on insert; identifies flow in DEL cmd */
+	struct rocker_flow_tbl_key key;
+	u32 key_crc32; /* hash key: crc32 over 'key' */
+};
+
+struct rocker_group_tbl_entry {
+	struct hlist_node entry;
+	u32 cmd; /* value sent as ROCKER_TLV_CMD_TYPE */
+	u32 group_id; /* key */
+	u16 group_count; /* number of elements in group_ids */
+	u32 *group_ids;
+	union {
+		struct {
+			u8 pop_vlan;
+		} l2_interface; /* ROCKER_OF_DPA_GROUP_TYPE_L2_INTERFACE */
+		struct {
+			u8 eth_src[ETH_ALEN];
+			u8 eth_dst[ETH_ALEN];
+			__be16 vlan_id;
+			u32 group_id;
+		} l2_rewrite; /* ROCKER_OF_DPA_GROUP_TYPE_L2_REWRITE */
+		struct {
+			u8 eth_src[ETH_ALEN];
+			u8 eth_dst[ETH_ALEN];
+			__be16 vlan_id;
+			bool ttl_check;
+			u32 group_id;
+		} l3_unicast; /* ROCKER_OF_DPA_GROUP_TYPE_L3_UCAST */
+	};
+};
+
+struct rocker_fdb_tbl_entry {
+	struct hlist_node entry;
+	u32 key_crc32; /* key */
+	bool learned; /* presumably: learned by device -- TODO confirm */
+	struct rocker_fdb_tbl_key {
+		u32 lport;
+		u8 addr[ETH_ALEN];
+		__be16 vlan_id;
+	} key; /* port/mac/vlan tuple that key_crc32 is computed from */
+};
+
+struct rocker_internal_vlan_tbl_entry {
+	struct hlist_node entry;
+	int ifindex; /* key: port ifindex, or its bridge's if bridged */
+	u32 ref_count;
+	__be16 vlan_id; /* internal VLAN ID assigned to this ifindex */
+};
+
 struct rocker_desc_info {
 	char *data; /* mapped */
 	size_t data_size;
@@ -61,11 +180,28 @@ struct rocker_dma_ring_info {
 
 struct rocker;
 
+enum {
+	ROCKER_CTRL_LINK_LOCAL_MCAST,
+	ROCKER_CTRL_LOCAL_ARP,
+	ROCKER_CTRL_IPV4_MCAST,
+	ROCKER_CTRL_IPV6_MCAST,
+	ROCKER_CTRL_DFLT_BRIDGING,
+	ROCKER_CTRL_MAX, /* count, not a control: sizes ctrls[] */
+};
+
+#define ROCKER_INTERNAL_VLAN_ID_BASE	0x0f00	/* first internal VID */
+#define ROCKER_N_INTERNAL_VLANS		255	/* VIDs 0x0f00..0x0ffe */
+#define ROCKER_VLAN_BITMAP_LEN		BITS_TO_LONGS(VLAN_N_VID)
+#define ROCKER_INTERNAL_VLAN_BITMAP_LEN	BITS_TO_LONGS(ROCKER_N_INTERNAL_VLANS)
+
 struct rocker_port {
 	struct net_device *dev;
 	struct rocker *rocker;
 	unsigned int port_number;
 	u32 lport;
+	__be16 internal_vlan_id;
+	bool ctrls[ROCKER_CTRL_MAX];
+	unsigned long vlan_bitmap[ROCKER_VLAN_BITMAP_LEN];
 	struct napi_struct napi_tx;
 	struct napi_struct napi_rx;
 	struct rocker_dma_ring_info tx_ring;
@@ -84,8 +220,76 @@ struct rocker {
 	spinlock_t cmd_ring_lock;
 	struct rocker_dma_ring_info cmd_ring;
 	struct rocker_dma_ring_info event_ring;
+	DECLARE_HASHTABLE(flow_tbl, 16);
+	spinlock_t flow_tbl_lock;
+	u64 flow_tbl_next_cookie;
+	DECLARE_HASHTABLE(group_tbl, 16);
+	spinlock_t group_tbl_lock;
+	DECLARE_HASHTABLE(fdb_tbl, 16);
+	spinlock_t fdb_tbl_lock;
+	unsigned long internal_vlan_bitmap[ROCKER_INTERNAL_VLAN_BITMAP_LEN];
+	DECLARE_HASHTABLE(internal_vlan_tbl, 8);
+	spinlock_t internal_vlan_tbl_lock;
+};
+
+static const u8 zero_mac[ETH_ALEN]   = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+static const u8 ff_mac[ETH_ALEN]     = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+static const u8 ll_mac[ETH_ALEN]     = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
+static const u8 ll_mask[ETH_ALEN]    = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xf0 };
+static const u8 mcast_mac[ETH_ALEN]  = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 };
+static const u8 ipv4_mcast[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };
+static const u8 ipv4_mask[ETH_ALEN]  = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 };
+static const u8 ipv6_mcast[ETH_ALEN] = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 };
+static const u8 ipv6_mask[ETH_ALEN]  = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 };
+
+/* Rocker priority levels for flow table entries.  A higher-priority match
+ * takes precedence over a lower one.  Values repeat between name groups.
+ */
+
+enum {
+	ROCKER_PRIORITY_UNKNOWN = 0,
+	ROCKER_PRIORITY_IG_PORT = 1,
+	ROCKER_PRIORITY_VLAN = 1,
+	ROCKER_PRIORITY_TERM_MAC_UCAST = 0,
+	ROCKER_PRIORITY_TERM_MAC_MCAST = 1,
+	ROCKER_PRIORITY_UNICAST_ROUTING = 1,
+	ROCKER_PRIORITY_BRIDGING_VLAN_DFLT_EXACT = 1,
+	ROCKER_PRIORITY_BRIDGING_VLAN_DFLT_WILD = 2,
+	ROCKER_PRIORITY_BRIDGING_VLAN = 3,
+	ROCKER_PRIORITY_BRIDGING_TENANT_DFLT_EXACT = 1,
+	ROCKER_PRIORITY_BRIDGING_TENANT_DFLT_WILD = 2,
+	ROCKER_PRIORITY_BRIDGING_TENANT = 3,
+	ROCKER_PRIORITY_ACL_CTRL = 3,
+	ROCKER_PRIORITY_ACL_NORMAL = 2,
+	ROCKER_PRIORITY_ACL_DFLT = 1,
 };
 
+static bool rocker_vlan_id_is_internal(__be16 vlan_id)
+{
+	const u16 vid = ntohs(vlan_id);	/* host order for range compare */
+	const u16 first = ROCKER_INTERNAL_VLAN_ID_BASE;
+	const u16 last = 0xffe;		/* inclusive upper bound */
+
+	return !(vid < first || vid > last);
+}
+
+static __be16 rocker_port_vid_to_vlan(struct rocker_port *rocker_port,
+				      u16 vid, bool *pop_vlan)
+{
+	/* VID 0 is mapped onto the port's internal VLAN ID, and that fact
+	 * is reported through *pop_vlan; pop_vlan may be NULL.
+	 */
+	const bool untagged = !vid;
+
+	if (pop_vlan)
+		*pop_vlan = untagged;
+
+	if (untagged)
+		return rocker_port->internal_vlan_id;
+
+	return htons(vid);
+}
+
 struct rocker_wait {
 	wait_queue_head_t wait;
 	bool done;
@@ -1094,6 +1298,10 @@ static int rocker_event_link_change(struct rocker *rocker,
 	return 0;
 }
 
+#define ROCKER_OP_FLAG_REMOVE		BIT(0)	/* delete rather than add */
+#define ROCKER_OP_FLAG_NOWAIT		BIT(1)	/* GFP_ATOMIC, nowait exec */
+#define ROCKER_OP_FLAG_LEARNED		BIT(2)
+
 static int rocker_event_process(struct rocker *rocker,
 				struct rocker_desc_info *desc_info)
 {
@@ -1399,6 +1607,1240 @@ static int rocker_cmd_set_port_settings_macaddr(struct rocker_port *rocker_port,
 			       macaddr, NULL, NULL, false);
 }
 
+static int rocker_cmd_flow_tbl_add_ig_port(struct rocker_desc_info *desc_info,
+					   struct rocker_flow_tbl_entry *entry)
+{
+	/* Puts run in order; the first failure short-circuits the rest,
+	 * so nothing is appended after a failed put.
+	 */
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_LPORT,
+			       entry->key.ig_port.in_lport) ||
+	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_LPORT_MASK,
+			       entry->key.ig_port.in_lport_mask) ||
+	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
+			       entry->key.ig_port.goto_tbl))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int rocker_cmd_flow_tbl_add_vlan(struct rocker_desc_info *desc_info,
+					struct rocker_flow_tbl_entry *entry)
+{
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_LPORT,
+			       entry->key.vlan.in_lport))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
+			       entry->key.vlan.vlan_id))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID_MASK,
+			       entry->key.vlan.vlan_id_mask))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
+			       entry->key.vlan.goto_tbl))
+		return -EMSGSIZE;
+	if (entry->key.vlan.untagged && /* NEW_VLAN_ID only if untagged */
+	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_NEW_VLAN_ID,
+			       entry->key.vlan.new_vlan_id))
+		return -EMSGSIZE;
+
+	return 0; /* a failed put returns -EMSGSIZE above */
+}
+
+static int rocker_cmd_flow_tbl_add_term_mac(struct rocker_desc_info *desc_info,
+					    struct rocker_flow_tbl_entry *entry)
+{
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_LPORT,
+			       entry->key.term_mac.in_lport))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_LPORT_MASK,
+			       entry->key.term_mac.in_lport_mask))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_ETHERTYPE,
+			       entry->key.term_mac.eth_type))
+		return -EMSGSIZE;
+	if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
+			   ETH_ALEN, entry->key.term_mac.eth_dst))
+		return -EMSGSIZE;
+	if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC_MASK,
+			   ETH_ALEN, entry->key.term_mac.eth_dst_mask))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
+			       entry->key.term_mac.vlan_id))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID_MASK,
+			       entry->key.term_mac.vlan_id_mask))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
+			       entry->key.term_mac.goto_tbl))
+		return -EMSGSIZE;
+	if (entry->key.term_mac.copy_to_cpu && /* TLV omitted when false */
+	    rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_COPY_CPU_ACTION,
+			      entry->key.term_mac.copy_to_cpu))
+		return -EMSGSIZE;
+
+	return 0; /* a failed put returns -EMSGSIZE above */
+}
+
+static int
+rocker_cmd_flow_tbl_add_ucast_routing(struct rocker_desc_info *desc_info,
+				      struct rocker_flow_tbl_entry *entry)
+{
+	/* Emit the unicast-routing TLVs from key.ucast_routing.  The puts
+	 * are chained with ||, so the first failing put stops the sequence
+	 * and the function reports -EMSGSIZE.
+	 */
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_ETHERTYPE,
+			       entry->key.ucast_routing.eth_type) ||
+	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_DST_IP,
+			       entry->key.ucast_routing.dst4) ||
+	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_DST_IP_MASK,
+			       entry->key.ucast_routing.dst4_mask) ||
+	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
+			       entry->key.ucast_routing.goto_tbl) ||
+	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
+			       entry->key.ucast_routing.group_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int rocker_cmd_flow_tbl_add_bridge(struct rocker_desc_info *desc_info,
+					  struct rocker_flow_tbl_entry *entry)
+{
+	if (entry->key.bridge.has_eth_dst && /* optional TLV */
+	    rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
+			   ETH_ALEN, entry->key.bridge.eth_dst))
+		return -EMSGSIZE;
+	if (entry->key.bridge.has_eth_dst_mask && /* optional TLV */
+	    rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC_MASK,
+			   ETH_ALEN, entry->key.bridge.eth_dst_mask))
+		return -EMSGSIZE;
+	if (entry->key.bridge.vlan_id && /* 0: TLV omitted */
+	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
+			       entry->key.bridge.vlan_id))
+		return -EMSGSIZE;
+	if (entry->key.bridge.tunnel_id && /* 0: TLV omitted */
+	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_TUNNEL_ID,
+			       entry->key.bridge.tunnel_id))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GOTO_TABLE_ID,
+			       entry->key.bridge.goto_tbl))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
+			       entry->key.bridge.group_id))
+		return -EMSGSIZE;
+	if (entry->key.bridge.copy_to_cpu && /* TLV omitted when false */
+	    rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_COPY_CPU_ACTION,
+			      entry->key.bridge.copy_to_cpu))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int rocker_cmd_flow_tbl_add_acl(struct rocker_desc_info *desc_info,
+				       struct rocker_flow_tbl_entry *entry)
+{
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_LPORT,
+			       entry->key.acl.in_lport))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_IN_LPORT_MASK,
+			       entry->key.acl.in_lport_mask))
+		return -EMSGSIZE;
+	if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_SRC_MAC,
+			   ETH_ALEN, entry->key.acl.eth_src))
+		return -EMSGSIZE;
+	if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_SRC_MAC_MASK,
+			   ETH_ALEN, entry->key.acl.eth_src_mask))
+		return -EMSGSIZE;
+	if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
+			   ETH_ALEN, entry->key.acl.eth_dst))
+		return -EMSGSIZE;
+	if (rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC_MASK,
+			   ETH_ALEN, entry->key.acl.eth_dst_mask))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_ETHERTYPE,
+			       entry->key.acl.eth_type))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
+			       entry->key.acl.vlan_id))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID_MASK,
+			       entry->key.acl.vlan_id_mask))
+		return -EMSGSIZE;
+
+	switch (ntohs(entry->key.acl.eth_type)) { /* IP TLVs: IPv4/IPv6 only */
+	case ETH_P_IP:
+	case ETH_P_IPV6:
+		if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_IP_PROTO,
+				      entry->key.acl.ip_proto))
+			return -EMSGSIZE;
+		if (rocker_tlv_put_u8(desc_info,
+				      ROCKER_TLV_OF_DPA_IP_PROTO_MASK,
+				      entry->key.acl.ip_proto_mask))
+			return -EMSGSIZE;
+		if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_IP_DSCP,
+				      entry->key.acl.ip_tos & 0x3f))
+			return -EMSGSIZE;
+		if (rocker_tlv_put_u8(desc_info,
+				      ROCKER_TLV_OF_DPA_IP_DSCP_MASK,
+				      entry->key.acl.ip_tos_mask & 0x3f))
+			return -EMSGSIZE;
+		if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_IP_ECN,
+				      (entry->key.acl.ip_tos & 0xc0) >> 6))
+			return -EMSGSIZE;
+		if (rocker_tlv_put_u8(desc_info,
+				      ROCKER_TLV_OF_DPA_IP_ECN_MASK,
+				      (entry->key.acl.ip_tos_mask & 0xc0) >> 6))
+			return -EMSGSIZE;
+		break; /* NOTE(review): DSCP=tos&0x3f, ECN=tos>>6; confirm */
+	}
+
+	if (entry->key.acl.group_id != ROCKER_GROUP_NONE && /* optional */
+	    rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
+			       entry->key.acl.group_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int rocker_cmd_flow_tbl_add(struct rocker *rocker,
+				   struct rocker_port *rocker_port,
+				   struct rocker_desc_info *desc_info,
+				   void *priv)
+{
+	struct rocker_flow_tbl_entry *entry = priv; /* flow to encode */
+	struct rocker_tlv *cmd_info;
+	int err = 0;
+
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE,
+			       ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_ADD))
+		return -EMSGSIZE;
+	cmd_info = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
+	if (!cmd_info)
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_TABLE_ID,
+			       entry->key.tbl_id))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_PRIORITY,
+			       entry->key.priority))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_HARDTIME, 0))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u64(desc_info, ROCKER_TLV_OF_DPA_COOKIE,
+			       entry->cookie))
+		return -EMSGSIZE;
+
+	switch (entry->key.tbl_id) { /* append the per-table TLVs */
+	case ROCKER_OF_DPA_TABLE_ID_INGRESS_PORT:
+		err = rocker_cmd_flow_tbl_add_ig_port(desc_info, entry);
+		break;
+	case ROCKER_OF_DPA_TABLE_ID_VLAN:
+		err = rocker_cmd_flow_tbl_add_vlan(desc_info, entry);
+		break;
+	case ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC:
+		err = rocker_cmd_flow_tbl_add_term_mac(desc_info, entry);
+		break;
+	case ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING:
+		err = rocker_cmd_flow_tbl_add_ucast_routing(desc_info, entry);
+		break;
+	case ROCKER_OF_DPA_TABLE_ID_BRIDGING:
+		err = rocker_cmd_flow_tbl_add_bridge(desc_info, entry);
+		break;
+	case ROCKER_OF_DPA_TABLE_ID_ACL_POLICY:
+		err = rocker_cmd_flow_tbl_add_acl(desc_info, entry);
+		break;
+	default: /* any other table ID is rejected */
+		err = -ENOTSUPP;
+		break;
+	}
+
+	if (err) /* returns with the CMD_INFO nest still open */
+		return err;
+
+	rocker_tlv_nest_end(desc_info, cmd_info);
+
+	return 0;
+}
+
+static int rocker_cmd_flow_tbl_del(struct rocker *rocker,
+				   struct rocker_port *rocker_port,
+				   struct rocker_desc_info *desc_info,
+				   void *priv)
+{
+	const struct rocker_flow_tbl_entry *entry = priv; /* cookie only */
+	struct rocker_tlv *cmd_info;
+
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE,
+			       ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_DEL))
+		return -EMSGSIZE;
+	cmd_info = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
+	if (!cmd_info)
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u64(desc_info, ROCKER_TLV_OF_DPA_COOKIE,
+			       entry->cookie))
+		return -EMSGSIZE;
+	rocker_tlv_nest_end(desc_info, cmd_info);
+
+	return 0;
+}
+
+static int
+rocker_cmd_group_tbl_add_l2_interface(struct rocker_desc_info *desc_info,
+				      struct rocker_group_tbl_entry *entry)
+{
+	u32 port = ROCKER_GROUP_PORT_GET(entry->group_id); /* encoded in ID */
+
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_OUT_LPORT,
+			       port) ||
+	    rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_POP_VLAN,
+			      entry->l2_interface.pop_vlan))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int
+rocker_cmd_group_tbl_add_l2_rewrite(struct rocker_desc_info *desc_info,
+				    struct rocker_group_tbl_entry *entry)
+{
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID_LOWER,
+			       entry->l2_rewrite.group_id))
+		return -EMSGSIZE;
+	if (!is_zero_ether_addr(entry->l2_rewrite.eth_src) && /* optional */
+	    rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_SRC_MAC,
+			   ETH_ALEN, entry->l2_rewrite.eth_src))
+		return -EMSGSIZE;
+	if (!is_zero_ether_addr(entry->l2_rewrite.eth_dst) && /* optional */
+	    rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
+			   ETH_ALEN, entry->l2_rewrite.eth_dst))
+		return -EMSGSIZE;
+	if (entry->l2_rewrite.vlan_id && /* 0: TLV omitted */
+	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
+			       entry->l2_rewrite.vlan_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+rocker_cmd_group_tbl_add_group_ids(struct rocker_desc_info *desc_info,
+				   struct rocker_group_tbl_entry *entry)
+{
+	int i;
+	struct rocker_tlv *group_ids; /* handle of the GROUP_IDS nest */
+
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_GROUP_COUNT,
+			       entry->group_count))
+		return -EMSGSIZE;
+
+	group_ids = rocker_tlv_nest_start(desc_info,
+					  ROCKER_TLV_OF_DPA_GROUP_IDS);
+	if (!group_ids)
+		return -EMSGSIZE;
+
+	for (i = 0; i < entry->group_count; i++)
+		/* TLV types in the nest are 1-based: element i uses i + 1 */
+		if (rocker_tlv_put_u32(desc_info, i + 1, entry->group_ids[i]))
+			return -EMSGSIZE;
+
+	rocker_tlv_nest_end(desc_info, group_ids);
+
+	return 0;
+}
+
+static int
+rocker_cmd_group_tbl_add_l3_unicast(struct rocker_desc_info *desc_info,
+				    struct rocker_group_tbl_entry *entry)
+{
+	if (!is_zero_ether_addr(entry->l3_unicast.eth_src) && /* optional */
+	    rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_SRC_MAC,
+			   ETH_ALEN, entry->l3_unicast.eth_src))
+		return -EMSGSIZE;
+	if (!is_zero_ether_addr(entry->l3_unicast.eth_dst) && /* optional */
+	    rocker_tlv_put(desc_info, ROCKER_TLV_OF_DPA_DST_MAC,
+			   ETH_ALEN, entry->l3_unicast.eth_dst))
+		return -EMSGSIZE;
+	if (entry->l3_unicast.vlan_id && /* 0: TLV omitted */
+	    rocker_tlv_put_u16(desc_info, ROCKER_TLV_OF_DPA_VLAN_ID,
+			       entry->l3_unicast.vlan_id))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u8(desc_info, ROCKER_TLV_OF_DPA_TTL_CHECK,
+			      entry->l3_unicast.ttl_check))
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID_LOWER,
+			       entry->l3_unicast.group_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int rocker_cmd_group_tbl_add(struct rocker *rocker,
+				    struct rocker_port *rocker_port,
+				    struct rocker_desc_info *desc_info,
+				    void *priv)
+{
+	struct rocker_group_tbl_entry *entry = priv; /* group to encode */
+	struct rocker_tlv *cmd_info;
+	int err = 0;
+
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, entry->cmd))
+		return -EMSGSIZE;
+	cmd_info = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
+	if (!cmd_info)
+		return -EMSGSIZE;
+
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
+			       entry->group_id))
+		return -EMSGSIZE;
+
+	switch (ROCKER_GROUP_TYPE_GET(entry->group_id)) { /* type from ID */
+	case ROCKER_OF_DPA_GROUP_TYPE_L2_INTERFACE:
+		err = rocker_cmd_group_tbl_add_l2_interface(desc_info, entry);
+		break;
+	case ROCKER_OF_DPA_GROUP_TYPE_L2_REWRITE:
+		err = rocker_cmd_group_tbl_add_l2_rewrite(desc_info, entry);
+		break;
+	case ROCKER_OF_DPA_GROUP_TYPE_L2_FLOOD:
+	case ROCKER_OF_DPA_GROUP_TYPE_L2_MCAST: /* same TLVs as flood */
+		err = rocker_cmd_group_tbl_add_group_ids(desc_info, entry);
+		break;
+	case ROCKER_OF_DPA_GROUP_TYPE_L3_UCAST:
+		err = rocker_cmd_group_tbl_add_l3_unicast(desc_info, entry);
+		break;
+	default: /* any other group type is rejected */
+		err = -ENOTSUPP;
+		break;
+	}
+
+	if (err) /* returns with the CMD_INFO nest still open */
+		return err;
+
+	rocker_tlv_nest_end(desc_info, cmd_info);
+
+	return 0;
+}
+
+static int rocker_cmd_group_tbl_del(struct rocker *rocker,
+				    struct rocker_port *rocker_port,
+				    struct rocker_desc_info *desc_info,
+				    void *priv)
+{
+	const struct rocker_group_tbl_entry *entry = priv; /* cmd, group_id */
+	struct rocker_tlv *cmd_info;
+
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, entry->cmd))
+		return -EMSGSIZE;
+	cmd_info = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
+	if (!cmd_info)
+		return -EMSGSIZE;
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_OF_DPA_GROUP_ID,
+			       entry->group_id))
+		return -EMSGSIZE;
+	rocker_tlv_nest_end(desc_info, cmd_info);
+
+	return 0;
+}
+
+/*****************************************
+ * Flow, group, FDB, internal VLAN tables
+ *****************************************/
+
+static int rocker_init_tbls(struct rocker *rocker)
+{
+	/* One spinlock per table; set the locks up, then the tables. */
+	spin_lock_init(&rocker->flow_tbl_lock);
+	spin_lock_init(&rocker->group_tbl_lock);
+	spin_lock_init(&rocker->fdb_tbl_lock);
+	spin_lock_init(&rocker->internal_vlan_tbl_lock);
+
+	hash_init(rocker->flow_tbl);
+	hash_init(rocker->group_tbl);
+	hash_init(rocker->fdb_tbl);
+	hash_init(rocker->internal_vlan_tbl);
+
+	/* Nothing above reports an error, so this always returns 0. */
+	return 0;
+}
+
+static void rocker_free_tbls(struct rocker *rocker)
+{
+	unsigned long flags;
+	struct rocker_flow_tbl_entry *flow_entry;
+	struct rocker_group_tbl_entry *group_entry;
+	struct rocker_fdb_tbl_entry *fdb_entry;
+	struct rocker_internal_vlan_tbl_entry *internal_vlan_entry;
+	struct hlist_node *tmp; /* lookahead for the _safe iterators */
+	int bkt; /* NOTE(review): entries are unlinked, never kfree()d */
+
+	spin_lock_irqsave(&rocker->flow_tbl_lock, flags);
+	hash_for_each_safe(rocker->flow_tbl, bkt, tmp, flow_entry, entry)
+		hash_del(&flow_entry->entry);
+	spin_unlock_irqrestore(&rocker->flow_tbl_lock, flags);
+
+	spin_lock_irqsave(&rocker->group_tbl_lock, flags);
+	hash_for_each_safe(rocker->group_tbl, bkt, tmp, group_entry, entry)
+		hash_del(&group_entry->entry);
+	spin_unlock_irqrestore(&rocker->group_tbl_lock, flags);
+
+	spin_lock_irqsave(&rocker->fdb_tbl_lock, flags);
+	hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, fdb_entry, entry)
+		hash_del(&fdb_entry->entry);
+	spin_unlock_irqrestore(&rocker->fdb_tbl_lock, flags);
+
+	spin_lock_irqsave(&rocker->internal_vlan_tbl_lock, flags);
+	hash_for_each_safe(rocker->internal_vlan_tbl, bkt,
+			   tmp, internal_vlan_entry, entry)
+		hash_del(&internal_vlan_entry->entry);
+	spin_unlock_irqrestore(&rocker->internal_vlan_tbl_lock, flags);
+}
+
+static struct rocker_flow_tbl_entry *
+rocker_flow_tbl_find(struct rocker *rocker, struct rocker_flow_tbl_entry *match)
+{
+	struct rocker_flow_tbl_entry *cur;
+
+	/* A bucket can hold colliding crc32 keys; compare the full key. */
+	hash_for_each_possible(rocker->flow_tbl, cur, entry, match->key_crc32) {
+		if (!memcmp(&cur->key, &match->key, sizeof(cur->key)))
+			return cur;
+	}
+
+	return NULL;
+}
+
+static int rocker_flow_tbl_add(struct rocker_port *rocker_port,
+			       struct rocker_flow_tbl_entry *match,
+			       bool nowait)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_flow_tbl_entry *found;
+	unsigned long flags;
+	bool add_to_hw = false;
+	int err = 0;
+
+	match->key_crc32 = crc32(~0, &match->key, sizeof(match->key));
+
+	spin_lock_irqsave(&rocker->flow_tbl_lock, flags);
+
+	found = rocker_flow_tbl_find(rocker, match);
+
+	if (found) {
+		kfree(match); /* duplicate key: reuse the hashed entry */
+	} else {
+		found = match; /* new key: the table now owns match */
+		found->cookie = rocker->flow_tbl_next_cookie++;
+		hash_add(rocker->flow_tbl, &found->entry, found->key_crc32);
+		add_to_hw = true;
+	}
+
+	found->ref_count++;
+
+	spin_unlock_irqrestore(&rocker->flow_tbl_lock, flags);
+
+	if (add_to_hw) { /* cmd is issued after the lock is dropped */
+		err = rocker_cmd_exec(rocker, rocker_port,
+				      rocker_cmd_flow_tbl_add,
+				      found, NULL, NULL, nowait);
+		if (err) {
+			spin_lock_irqsave(&rocker->flow_tbl_lock, flags);
+			hash_del(&found->entry);
+			spin_unlock_irqrestore(&rocker->flow_tbl_lock, flags);
+			kfree(found); /* NOTE(review): ref_count unchecked */
+		}
+	}
+
+	return err;
+}
+
+static int rocker_flow_tbl_del(struct rocker_port *rocker_port,
+			       struct rocker_flow_tbl_entry *match,
+			       bool nowait)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_flow_tbl_entry *found;
+	unsigned long flags;
+	bool del_from_hw = false;
+	int err = 0; /* stays 0 when no matching entry is hashed */
+
+	match->key_crc32 = crc32(~0, &match->key, sizeof(match->key));
+
+	spin_lock_irqsave(&rocker->flow_tbl_lock, flags);
+
+	found = rocker_flow_tbl_find(rocker, match);
+
+	if (found) {
+		found->ref_count--;
+		if (found->ref_count == 0) { /* last user: unhash + del */
+			hash_del(&found->entry);
+			del_from_hw = true;
+		}
+	}
+
+	spin_unlock_irqrestore(&rocker->flow_tbl_lock, flags);
+
+	kfree(match); /* match was only the lookup template */
+
+	if (del_from_hw) {
+		err = rocker_cmd_exec(rocker, rocker_port,
+				      rocker_cmd_flow_tbl_del,
+				      found, NULL, NULL, nowait);
+		kfree(found); /* freed whether or not the cmd succeeded */
+	}
+
+	return err;
+}
+
+static gfp_t rocker_op_flags_gfp(int flags)
+{
+	return !(flags & ROCKER_OP_FLAG_NOWAIT) ? GFP_KERNEL : GFP_ATOMIC;
+}
+
+static int rocker_flow_tbl_do(struct rocker_port *rocker_port,
+			      int flags, struct rocker_flow_tbl_entry *entry)
+{
+	const bool remove = flags & ROCKER_OP_FLAG_REMOVE;
+	const bool nowait = flags & ROCKER_OP_FLAG_NOWAIT;
+
+	/* Either path takes over entry: it ends up hashed or freed. */
+	return remove ? rocker_flow_tbl_del(rocker_port, entry, nowait)
+		      : rocker_flow_tbl_add(rocker_port, entry, nowait);
+}
+
+static int rocker_flow_tbl_ig_port(struct rocker_port *rocker_port,
+				   int flags, u32 in_lport, u32 in_lport_mask,
+				   enum rocker_of_dpa_table_id goto_tbl)
+{
+	struct rocker_flow_tbl_entry *entry; /* zeroed; handed to tbl_do */
+
+	entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags));
+	if (!entry)
+		return -ENOMEM;
+
+	entry->key.priority = ROCKER_PRIORITY_IG_PORT;
+	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_INGRESS_PORT;
+	entry->key.ig_port.in_lport = in_lport;
+	entry->key.ig_port.in_lport_mask = in_lport_mask;
+	entry->key.ig_port.goto_tbl = goto_tbl;
+
+	return rocker_flow_tbl_do(rocker_port, flags, entry);
+}
+
+static int rocker_flow_tbl_vlan(struct rocker_port *rocker_port,
+				int flags, u32 in_lport,
+				__be16 vlan_id, __be16 vlan_id_mask,
+				enum rocker_of_dpa_table_id goto_tbl,
+				bool untagged, __be16 new_vlan_id)
+{
+	struct rocker_flow_tbl_entry *entry; /* zeroed; handed to tbl_do */
+
+	entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags));
+	if (!entry)
+		return -ENOMEM;
+
+	entry->key.priority = ROCKER_PRIORITY_VLAN;
+	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_VLAN;
+	entry->key.vlan.in_lport = in_lport;
+	entry->key.vlan.vlan_id = vlan_id;
+	entry->key.vlan.vlan_id_mask = vlan_id_mask;
+	entry->key.vlan.goto_tbl = goto_tbl;
+
+	entry->key.vlan.untagged = untagged; /* new_vlan_id sent if set */
+	entry->key.vlan.new_vlan_id = new_vlan_id;
+
+	return rocker_flow_tbl_do(rocker_port, flags, entry);
+}
+
+static int rocker_flow_tbl_term_mac(struct rocker_port *rocker_port,
+				    u32 in_lport, u32 in_lport_mask,
+				    __be16 eth_type, const u8 *eth_dst,
+				    const u8 *eth_dst_mask, __be16 vlan_id,
+				    __be16 vlan_id_mask, bool copy_to_cpu,
+				    int flags)
+{
+	struct rocker_flow_tbl_entry *entry; /* zeroed; handed to tbl_do */
+
+	entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags));
+	if (!entry)
+		return -ENOMEM;
+
+	if (is_multicast_ether_addr(eth_dst)) { /* picks prio + next table */
+		entry->key.priority = ROCKER_PRIORITY_TERM_MAC_MCAST;
+		entry->key.term_mac.goto_tbl =
+			 ROCKER_OF_DPA_TABLE_ID_MULTICAST_ROUTING;
+	} else {
+		entry->key.priority = ROCKER_PRIORITY_TERM_MAC_UCAST;
+		entry->key.term_mac.goto_tbl =
+			 ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING;
+	}
+
+	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC;
+	entry->key.term_mac.in_lport = in_lport;
+	entry->key.term_mac.in_lport_mask = in_lport_mask;
+	entry->key.term_mac.eth_type = eth_type;
+	ether_addr_copy(entry->key.term_mac.eth_dst, eth_dst);
+	ether_addr_copy(entry->key.term_mac.eth_dst_mask, eth_dst_mask);
+	entry->key.term_mac.vlan_id = vlan_id;
+	entry->key.term_mac.vlan_id_mask = vlan_id_mask;
+	entry->key.term_mac.copy_to_cpu = copy_to_cpu;
+
+	return rocker_flow_tbl_do(rocker_port, flags, entry);
+}
+
+static int rocker_flow_tbl_bridge(struct rocker_port *rocker_port,
+				  int flags,
+				  const u8 *eth_dst, const u8 *eth_dst_mask,
+				  __be16 vlan_id, u32 tunnel_id,
+				  enum rocker_of_dpa_table_id goto_tbl,
+				  u32 group_id, bool copy_to_cpu)
+{
+	struct rocker_flow_tbl_entry *entry; /* zeroed; handed to tbl_do */
+	u32 priority;
+	bool vlan_bridging = !!vlan_id;	/* else TENANT priorities apply */
+	bool dflt = !eth_dst || eth_dst_mask;	/* no dst, or a masked dst */
+	bool wild = false;		/* mask given and not all-ones */
+
+	entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags));
+	if (!entry)
+		return -ENOMEM;
+
+	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_BRIDGING;
+
+	if (eth_dst) { /* NULL leaves eth_dst zeroed and its TLV unsent */
+		entry->key.bridge.has_eth_dst = 1;
+		ether_addr_copy(entry->key.bridge.eth_dst, eth_dst);
+	}
+	if (eth_dst_mask) {
+		entry->key.bridge.has_eth_dst_mask = 1;
+		ether_addr_copy(entry->key.bridge.eth_dst_mask, eth_dst_mask);
+		if (memcmp(eth_dst_mask, ff_mac, ETH_ALEN))
+			wild = true;
+	}
+
+	priority = ROCKER_PRIORITY_UNKNOWN; /* chain below is exhaustive */
+	if (vlan_bridging && dflt && wild)
+		priority = ROCKER_PRIORITY_BRIDGING_VLAN_DFLT_WILD;
+	else if (vlan_bridging && dflt && !wild)
+		priority = ROCKER_PRIORITY_BRIDGING_VLAN_DFLT_EXACT;
+	else if (vlan_bridging && !dflt)
+		priority = ROCKER_PRIORITY_BRIDGING_VLAN;
+	else if (!vlan_bridging && dflt && wild)
+		priority = ROCKER_PRIORITY_BRIDGING_TENANT_DFLT_WILD;
+	else if (!vlan_bridging && dflt && !wild)
+		priority = ROCKER_PRIORITY_BRIDGING_TENANT_DFLT_EXACT;
+	else if (!vlan_bridging && !dflt)
+		priority = ROCKER_PRIORITY_BRIDGING_TENANT;
+
+	entry->key.priority = priority;
+	entry->key.bridge.vlan_id = vlan_id;
+	entry->key.bridge.tunnel_id = tunnel_id;
+	entry->key.bridge.goto_tbl = goto_tbl;
+	entry->key.bridge.group_id = group_id;
+	entry->key.bridge.copy_to_cpu = copy_to_cpu;
+
+	return rocker_flow_tbl_do(rocker_port, flags, entry);
+}
+
+/* Build an ACL-policy-table flow entry and pass it to
+ * rocker_flow_tbl_do().
+ *
+ * Priority defaults to ROCKER_PRIORITY_ACL_NORMAL.  When both eth_dst
+ * and eth_dst_mask are given, a mask equal to mcast_mac selects the
+ * DFLT priority; otherwise a link-local eth_dst selects the CTRL
+ * priority.  Any of the four MAC pointers may be NULL, in which case
+ * the corresponding key field is left zeroed.
+ *
+ * Returns -ENOMEM if the entry cannot be allocated, otherwise the
+ * result of rocker_flow_tbl_do().
+ */
+static int rocker_flow_tbl_acl(struct rocker_port *rocker_port,
+			       int flags, u32 in_lport,
+			       u32 in_lport_mask,
+			       const u8 *eth_src, const u8 *eth_src_mask,
+			       const u8 *eth_dst, const u8 *eth_dst_mask,
+			       __be16 eth_type,
+			       __be16 vlan_id, __be16 vlan_id_mask,
+			       u8 ip_proto, u8 ip_proto_mask,
+			       u8 ip_tos, u8 ip_tos_mask,
+			       u32 group_id)
+{
+	struct rocker_flow_tbl_entry *entry;
+	u32 priority = ROCKER_PRIORITY_ACL_NORMAL;
+
+	entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags));
+	if (!entry)
+		return -ENOMEM;
+
+	if (eth_dst && eth_dst_mask) {
+		if (!memcmp(eth_dst_mask, mcast_mac, ETH_ALEN))
+			priority = ROCKER_PRIORITY_ACL_DFLT;
+		else if (is_link_local_ether_addr(eth_dst))
+			priority = ROCKER_PRIORITY_ACL_CTRL;
+	}
+
+	entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_ACL_POLICY;
+	entry->key.priority = priority;
+
+	entry->key.acl.in_lport = in_lport;
+	entry->key.acl.in_lport_mask = in_lport_mask;
+	entry->key.acl.eth_type = eth_type;
+	entry->key.acl.vlan_id = vlan_id;
+	entry->key.acl.vlan_id_mask = vlan_id_mask;
+	entry->key.acl.ip_proto = ip_proto;
+	entry->key.acl.ip_proto_mask = ip_proto_mask;
+	entry->key.acl.ip_tos = ip_tos;
+	entry->key.acl.ip_tos_mask = ip_tos_mask;
+	entry->key.acl.group_id = group_id;
+
+	/* optional MAC matches; absent ones stay all-zero from kzalloc */
+	if (eth_src)
+		ether_addr_copy(entry->key.acl.eth_src, eth_src);
+	if (eth_src_mask)
+		ether_addr_copy(entry->key.acl.eth_src_mask, eth_src_mask);
+	if (eth_dst)
+		ether_addr_copy(entry->key.acl.eth_dst, eth_dst);
+	if (eth_dst_mask)
+		ether_addr_copy(entry->key.acl.eth_dst_mask, eth_dst_mask);
+
+	return rocker_flow_tbl_do(rocker_port, flags, entry);
+}
+
+/* Look up the group table entry whose group_id equals match->group_id.
+ * Returns the entry, or NULL if the table holds no such group.
+ */
+static struct rocker_group_tbl_entry *
+rocker_group_tbl_find(struct rocker *rocker,
+		      struct rocker_group_tbl_entry *match)
+{
+	struct rocker_group_tbl_entry *cur;
+	u32 wanted = match->group_id;
+
+	hash_for_each_possible(rocker->group_tbl, cur, entry, wanted)
+		if (cur->group_id == wanted)
+			return cur;
+
+	return NULL;
+}
+
+/* Free a group table entry.  L2 flood and L2 mcast groups also own a
+ * separately allocated group_ids array, which is released first.
+ */
+static void rocker_group_tbl_entry_free(struct rocker_group_tbl_entry *entry)
+{
+	u32 type = ROCKER_GROUP_TYPE_GET(entry->group_id);
+
+	if (type == ROCKER_OF_DPA_GROUP_TYPE_L2_FLOOD ||
+	    type == ROCKER_OF_DPA_GROUP_TYPE_L2_MCAST)
+		kfree(entry->group_ids);
+
+	kfree(entry);
+}
+
+/* Insert @match into the group table and push it to the device.
+ *
+ * If a group with the same group_id already exists, the old entry is
+ * unlinked and freed and @match is sent with the GROUP_MOD command;
+ * otherwise @match is sent with GROUP_ADD.  In both cases @match itself
+ * becomes the table entry.
+ *
+ * Returns 0 or the result of rocker_cmd_exec().
+ */
+static int rocker_group_tbl_add(struct rocker_port *rocker_port,
+				struct rocker_group_tbl_entry *match,
+				bool nowait)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_group_tbl_entry *old;
+	unsigned long lock_flags;
+	bool replacing;
+
+	spin_lock_irqsave(&rocker->group_tbl_lock, lock_flags);
+
+	old = rocker_group_tbl_find(rocker, match);
+	replacing = old != NULL;
+	if (replacing) {
+		hash_del(&old->entry);
+		rocker_group_tbl_entry_free(old);
+	}
+
+	match->cmd = replacing ? ROCKER_TLV_CMD_TYPE_OF_DPA_GROUP_MOD :
+				 ROCKER_TLV_CMD_TYPE_OF_DPA_GROUP_ADD;
+	hash_add(rocker->group_tbl, &match->entry, match->group_id);
+
+	spin_unlock_irqrestore(&rocker->group_tbl_lock, lock_flags);
+
+	if (!match->cmd)
+		return 0;
+
+	return rocker_cmd_exec(rocker, rocker_port,
+			       rocker_cmd_group_tbl_add,
+			       match, NULL, NULL, nowait);
+}
+
+/* Remove the group identified by @match from the group table and from
+ * the device.
+ *
+ * @match is only a lookup key and is always freed here.  If a matching
+ * entry exists it is unlinked under the table lock, sent to the device
+ * with the GROUP_DEL command, and then freed.
+ *
+ * Returns 0 if no such group exists, otherwise the result of
+ * rocker_cmd_exec().
+ */
+static int rocker_group_tbl_del(struct rocker_port *rocker_port,
+				struct rocker_group_tbl_entry *match,
+				bool nowait)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_group_tbl_entry *victim;
+	unsigned long lock_flags;
+	int err;
+
+	spin_lock_irqsave(&rocker->group_tbl_lock, lock_flags);
+
+	victim = rocker_group_tbl_find(rocker, match);
+	if (victim) {
+		hash_del(&victim->entry);
+		victim->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_GROUP_DEL;
+	}
+
+	spin_unlock_irqrestore(&rocker->group_tbl_lock, lock_flags);
+
+	rocker_group_tbl_entry_free(match);
+
+	if (!victim)
+		return 0;
+
+	err = rocker_cmd_exec(rocker, rocker_port,
+			      rocker_cmd_group_tbl_del,
+			      victim, NULL, NULL, nowait);
+	rocker_group_tbl_entry_free(victim);
+
+	return err;
+}
+
+/* Dispatch a group table operation: delete when ROCKER_OP_FLAG_REMOVE
+ * is set in @flags, add (or modify) otherwise.  ROCKER_OP_FLAG_NOWAIT
+ * is forwarded as the nowait argument.
+ */
+static int rocker_group_tbl_do(struct rocker_port *rocker_port,
+			       int flags, struct rocker_group_tbl_entry *entry)
+{
+	bool nowait = flags & ROCKER_OP_FLAG_NOWAIT;
+	bool removing = flags & ROCKER_OP_FLAG_REMOVE;
+
+	if (!removing)
+		return rocker_group_tbl_add(rocker_port, entry, nowait);
+
+	return rocker_group_tbl_del(rocker_port, entry, nowait);
+}
+
+/* Allocate an L2 interface group entry for (@vlan_id, @out_lport) with
+ * the given pop_vlan setting and pass it to rocker_group_tbl_do().
+ * Returns -ENOMEM on allocation failure.
+ */
+static int rocker_group_l2_interface(struct rocker_port *rocker_port,
+				     int flags, __be16 vlan_id,
+				     u32 out_lport, int pop_vlan)
+{
+	struct rocker_group_tbl_entry *grp;
+
+	grp = kzalloc(sizeof(*grp), rocker_op_flags_gfp(flags));
+	if (!grp)
+		return -ENOMEM;
+
+	grp->l2_interface.pop_vlan = pop_vlan;
+	grp->group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, out_lport);
+
+	return rocker_group_tbl_do(rocker_port, flags, grp);
+}
+
+/* Allocate a fan-out group entry (@group_id) that references
+ * @group_count member groups and pass it to rocker_group_tbl_do().
+ * The caller's @group_ids array is copied, so it need not outlive
+ * this call.  Returns -ENOMEM on allocation failure.
+ */
+static int rocker_group_l2_fan_out(struct rocker_port *rocker_port,
+				   int flags, u8 group_count,
+				   u32 *group_ids, u32 group_id)
+{
+	gfp_t gfp = rocker_op_flags_gfp(flags);
+	struct rocker_group_tbl_entry *grp;
+	u32 *members;
+
+	grp = kzalloc(sizeof(*grp), gfp);
+	if (!grp)
+		return -ENOMEM;
+
+	members = kcalloc(group_count, sizeof(u32), gfp);
+	if (!members) {
+		kfree(grp);
+		return -ENOMEM;
+	}
+	memcpy(members, group_ids, group_count * sizeof(u32));
+
+	grp->group_id = group_id;
+	grp->group_count = group_count;
+	grp->group_ids = members;
+
+	return rocker_group_tbl_do(rocker_port, flags, grp);
+}
+
+/* Install or remove an L2 flood group.  This is a thin wrapper around
+ * rocker_group_l2_fan_out(); @vlan_id is accepted but not used here.
+ */
+static int rocker_group_l2_flood(struct rocker_port *rocker_port,
+				 int flags, __be16 vlan_id,
+				 u8 group_count, u32 *group_ids,
+				 u32 group_id)
+{
+	int err;
+
+	err = rocker_group_l2_fan_out(rocker_port, flags, group_count,
+				      group_ids, group_id);
+
+	return err;
+}
+
+/* Control traffic classes a port can trap or flood, indexed by
+ * ROCKER_CTRL_*.  Each class names the destination MAC/mask and
+ * ethertype to match and exactly one of acl/bridge/term to say which
+ * table the rule is installed in.
+ *
+ * eth_type is kept in network byte order (it is initialised with
+ * htons() and passed unchanged to the __be16 eth_type parameters of
+ * the flow table helpers), hence the __be16 type.
+ */
+static struct rocker_ctrl {
+	const u8 *eth_dst;
+	const u8 *eth_dst_mask;
+	__be16 eth_type;
+	bool acl;
+	bool bridge;
+	bool term;
+	bool copy_to_cpu;
+} rocker_ctrls[] = {
+	[ROCKER_CTRL_LINK_LOCAL_MCAST] = {
+		/* pass link local multicast pkts up to CPU for filtering */
+		.eth_dst = ll_mac,
+		.eth_dst_mask = ll_mask,
+		.acl = true,
+	},
+	[ROCKER_CTRL_LOCAL_ARP] = {
+		/* pass local ARP pkts up to CPU */
+		.eth_dst = zero_mac,
+		.eth_dst_mask = zero_mac,
+		.eth_type = htons(ETH_P_ARP),
+		.acl = true,
+	},
+	[ROCKER_CTRL_IPV4_MCAST] = {
+		/* pass IPv4 mcast pkts up to CPU, RFC 1112 */
+		.eth_dst = ipv4_mcast,
+		.eth_dst_mask = ipv4_mask,
+		.eth_type = htons(ETH_P_IP),
+		.term  = true,
+		.copy_to_cpu = true,
+	},
+	[ROCKER_CTRL_IPV6_MCAST] = {
+		/* pass IPv6 mcast pkts up to CPU, RFC 2464 */
+		.eth_dst = ipv6_mcast,
+		.eth_dst_mask = ipv6_mask,
+		.eth_type = htons(ETH_P_IPV6),
+		.term  = true,
+		.copy_to_cpu = true,
+	},
+	[ROCKER_CTRL_DFLT_BRIDGING] = {
+		/* flood any pkts on vlan */
+		.bridge = true,
+		.copy_to_cpu = true,
+	},
+};
+
+/* Install or remove the ACL rule for control class @ctrl on @vlan_id.
+ *
+ * The rule matches this port exactly (full lport mask), the class's
+ * destination MAC/mask and ethertype, and the exact VLAN; source MAC,
+ * IP protocol and TOS are not matched.  The rule points at the L2
+ * interface group for (@vlan_id, lport 0).
+ */
+static int rocker_port_ctrl_vlan_acl(struct rocker_port *rocker_port,
+				     int flags, struct rocker_ctrl *ctrl,
+				     __be16 vlan_id)
+{
+	const u32 any_port_bits = 0xffffffff;
+	const __be16 exact_vlan = htons(0xffff);
+	u32 group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, 0);
+	int err;
+
+	err = rocker_flow_tbl_acl(rocker_port, flags,
+				  rocker_port->lport, any_port_bits,
+				  NULL, NULL,	/* eth_src, eth_src_mask */
+				  ctrl->eth_dst, ctrl->eth_dst_mask,
+				  ctrl->eth_type,
+				  vlan_id, exact_vlan,
+				  0, 0,		/* ip_proto, ip_proto_mask */
+				  0, 0,		/* ip_tos, ip_tos_mask */
+				  group_id);
+	if (!err)
+		return 0;
+
+	netdev_err(rocker_port->dev, "Error (%d) ctrl ACL\n", err);
+	return err;
+}
+
+/* Install or remove the termination-MAC rule for control class @ctrl.
+ * A @vlan_id of zero is replaced by the port's internal VLAN ID.  The
+ * rule matches this port and the VLAN exactly.
+ */
+static int rocker_port_ctrl_vlan_term(struct rocker_port *rocker_port,
+				      int flags, struct rocker_ctrl *ctrl,
+				      __be16 vlan_id)
+{
+	const u32 exact_port = 0xffffffff;
+	const __be16 exact_vlan = htons(0xffff);
+	__be16 match_vlan = vlan_id;
+	int err;
+
+	if (!ntohs(match_vlan))
+		match_vlan = rocker_port->internal_vlan_id;
+
+	err = rocker_flow_tbl_term_mac(rocker_port,
+				       rocker_port->lport, exact_port,
+				       ctrl->eth_type, ctrl->eth_dst,
+				       ctrl->eth_dst_mask, match_vlan,
+				       exact_vlan, ctrl->copy_to_cpu,
+				       flags);
+	if (!err)
+		return 0;
+
+	netdev_err(rocker_port->dev, "Error (%d) ctrl term\n", err);
+	return err;
+}
+
+/* Apply control class @ctrl on @vlan_id via the table the class
+ * selects: ACL first, then termination MAC.  Returns -EOPNOTSUPP when
+ * the class selects neither.
+ */
+static int rocker_port_ctrl_vlan(struct rocker_port *rocker_port, int flags,
+				 struct rocker_ctrl *ctrl, __be16 vlan_id)
+{
+	int err = -EOPNOTSUPP;
+
+	if (ctrl->acl)
+		err = rocker_port_ctrl_vlan_acl(rocker_port, flags,
+						ctrl, vlan_id);
+	else if (ctrl->term)
+		err = rocker_port_ctrl_vlan_term(rocker_port, flags,
+						 ctrl, vlan_id);
+
+	return err;
+}
+
+/* Apply every control class currently enabled on the port
+ * (rocker_port->ctrls[]) to @vlan_id.  Stops at, and returns, the
+ * first error.
+ */
+static int rocker_port_ctrl_vlan_add(struct rocker_port *rocker_port,
+				     int flags, __be16 vlan_id)
+{
+	int idx;
+
+	for (idx = 0; idx < ROCKER_CTRL_MAX; idx++) {
+		int err;
+
+		if (!rocker_port->ctrls[idx])
+			continue;
+
+		err = rocker_port_ctrl_vlan(rocker_port, flags,
+					    &rocker_ctrls[idx], vlan_id);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/* Apply control class @ctrl to every VLAN set in the port's
+ * vlan_bitmap (VIDs 1 .. VLAN_N_VID - 1).  Stops at, and returns, the
+ * first error.
+ */
+static int rocker_port_ctrl(struct rocker_port *rocker_port, int flags,
+			    struct rocker_ctrl *ctrl)
+{
+	u16 vid;
+
+	for (vid = 1; vid < VLAN_N_VID; vid++) {
+		int err;
+
+		if (test_bit(vid, rocker_port->vlan_bitmap)) {
+			err = rocker_port_ctrl_vlan(rocker_port, flags,
+						    ctrl, htons(vid));
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* Install or remove the ingress port table entry for normal Ethernet
+ * frames: matches pkts from any local physical port and sends them on
+ * to the VLAN table.
+ */
+static int rocker_port_ig_tbl(struct rocker_port *rocker_port, int flags)
+{
+	const u32 in_lport = 0;
+	const u32 in_lport_mask = 0xffff0000;
+	int err;
+
+	err = rocker_flow_tbl_ig_port(rocker_port, flags,
+				      in_lport, in_lport_mask,
+				      ROCKER_OF_DPA_TABLE_ID_VLAN);
+	if (!err)
+		return 0;
+
+	netdev_err(rocker_port->dev,
+		   "Error (%d) ingress port table entry\n", err);
+	return err;
+}
+
+/* Install or remove termination-MAC entries for the port's own MAC
+ * address, one for IPv4 and one for IPv6.  A @vlan_id of zero is
+ * replaced by the port's internal VLAN ID.  The IPv6 entry is only
+ * attempted if the IPv4 one succeeded.
+ */
+static int rocker_port_router_mac(struct rocker_port *rocker_port,
+				  int flags, __be16 vlan_id)
+{
+	const u32 exact_port = 0xffffffff;
+	const __be16 exact_vlan = htons(0xffff);
+	int err;
+
+	if (!ntohs(vlan_id))
+		vlan_id = rocker_port->internal_vlan_id;
+
+	err = rocker_flow_tbl_term_mac(rocker_port,
+				       rocker_port->lport, exact_port,
+				       htons(ETH_P_IP),
+				       rocker_port->dev->dev_addr,
+				       ff_mac, vlan_id, exact_vlan,
+				       false, flags);
+	if (!err)
+		err = rocker_flow_tbl_term_mac(rocker_port,
+					       rocker_port->lport, exact_port,
+					       htons(ETH_P_IPV6),
+					       rocker_port->dev->dev_addr,
+					       ff_mac, vlan_id, exact_vlan,
+					       false, flags);
+
+	return err;
+}
+
+/* Look up the internal VLAN table entry for @ifindex.
+ * Returns the entry, or NULL if @ifindex has none.
+ */
+static struct rocker_internal_vlan_tbl_entry *
+rocker_internal_vlan_tbl_find(struct rocker *rocker, int ifindex)
+{
+	struct rocker_internal_vlan_tbl_entry *cur;
+
+	hash_for_each_possible(rocker->internal_vlan_tbl, cur, entry, ifindex)
+		if (cur->ifindex == ifindex)
+			return cur;
+
+	return NULL;
+}
+
+/* Take a reference on the internal VLAN ID associated with @ifindex,
+ * allocating a new ID from the internal VLAN bitmap on first use.
+ *
+ * Returns the VLAN ID in network byte order, or 0 if memory or the
+ * pool of internal VLAN IDs is exhausted.  On failure no table entry
+ * and no reference are left behind, so a later put for the same
+ * ifindex cannot compute a bitmap index from a zero VLAN ID.
+ */
+static __be16 rocker_port_internal_vlan_id_get(struct rocker_port *rocker_port,
+					       int ifindex)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_internal_vlan_tbl_entry *entry;
+	struct rocker_internal_vlan_tbl_entry *found;
+	unsigned long lock_flags;
+	__be16 vlan_id;
+	int i;
+
+	/* allocate up front: GFP_KERNEL may sleep, the lock below may not */
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return 0;
+
+	entry->ifindex = ifindex;
+
+	spin_lock_irqsave(&rocker->internal_vlan_tbl_lock, lock_flags);
+
+	found = rocker_internal_vlan_tbl_find(rocker, ifindex);
+	if (found) {
+		kfree(entry);
+		goto found;
+	}
+
+	for (i = 0; i < ROCKER_N_INTERNAL_VLANS; i++) {
+		if (test_and_set_bit(i, rocker->internal_vlan_bitmap))
+			continue;
+		/* only publish the entry once it owns a valid ID */
+		entry->vlan_id = htons(ROCKER_INTERNAL_VLAN_ID_BASE + i);
+		hash_add(rocker->internal_vlan_tbl, &entry->entry,
+			 entry->ifindex);
+		found = entry;
+		goto found;
+	}
+
+	spin_unlock_irqrestore(&rocker->internal_vlan_tbl_lock, lock_flags);
+
+	netdev_err(rocker_port->dev, "Out of internal VLAN IDs\n");
+	kfree(entry);
+
+	return 0;
+
+found:
+	found->ref_count++;
+	/* read under the lock; the entry may be freed once it is dropped */
+	vlan_id = found->vlan_id;
+	spin_unlock_irqrestore(&rocker->internal_vlan_tbl_lock, lock_flags);
+
+	return vlan_id;
+}
+
+/* Drop one reference on the internal VLAN ID associated with
+ * @ifindex.  When the last reference goes away the ID's bit is
+ * cleared in the internal VLAN bitmap and the table entry is unlinked
+ * and freed.  An unknown @ifindex is logged and otherwise ignored.
+ */
+static void rocker_port_internal_vlan_id_put(struct rocker_port *rocker_port,
+					     int ifindex)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_internal_vlan_tbl_entry *ent;
+	unsigned long lock_flags;
+	unsigned long bit;
+
+	spin_lock_irqsave(&rocker->internal_vlan_tbl_lock, lock_flags);
+
+	ent = rocker_internal_vlan_tbl_find(rocker, ifindex);
+	if (ent) {
+		ent->ref_count--;
+		if (ent->ref_count <= 0) {
+			bit = ntohs(ent->vlan_id) -
+			      ROCKER_INTERNAL_VLAN_ID_BASE;
+			clear_bit(bit, rocker->internal_vlan_bitmap);
+			hash_del(&ent->entry);
+			kfree(ent);
+		}
+	} else {
+		netdev_err(rocker_port->dev,
+			   "ifindex (%d) not found in internal VLAN tbl\n",
+			   ifindex);
+	}
+
+	spin_unlock_irqrestore(&rocker->internal_vlan_tbl_lock, lock_flags);
+}
+
 /*****************
  * Net device ops
  *****************/
@@ -1768,10 +3210,14 @@ static void rocker_carrier_init(struct rocker_port *rocker_port)
 
 static void rocker_remove_ports(struct rocker *rocker)
 {
+	struct rocker_port *rocker_port;
 	int i;
 
-	for (i = 0; i < rocker->port_count; i++)
-		unregister_netdev(rocker->ports[i]->dev);
+	for (i = 0; i < rocker->port_count; i++) {
+		rocker_port = rocker->ports[i];
+		rocker_port_ig_tbl(rocker_port, ROCKER_OP_FLAG_REMOVE);
+		unregister_netdev(rocker_port->dev);
+	}
 	kfree(rocker->ports);
 }
 
@@ -1823,8 +3269,18 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
 	}
 	rocker->ports[port_number] = rocker_port;
 
+	rocker_port->internal_vlan_id =
+		rocker_port_internal_vlan_id_get(rocker_port, dev->ifindex);
+	err = rocker_port_ig_tbl(rocker_port, 0);
+	if (err) {
+		dev_err(&pdev->dev, "install ig port table failed\n");
+		goto err_port_ig_tbl;
+	}
+
 	return 0;
 
+err_port_ig_tbl:
+	unregister_netdev(dev);
 err_register_netdev:
 	free_netdev(dev);
 	return err;
@@ -1981,6 +3437,12 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
 
+	err = rocker_init_tbls(rocker);
+	if (err) {
+		dev_err(&pdev->dev, "cannot init rocker tables\n");
+		goto err_init_tbls;
+	}
+
 	err = rocker_probe_ports(rocker);
 	if (err) {
 		dev_err(&pdev->dev, "failed to probe ports\n");
@@ -1992,6 +3454,8 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_probe_ports:
+	rocker_free_tbls(rocker);
+err_init_tbls:
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
 err_request_event_irq:
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
@@ -2017,6 +3481,7 @@ static void rocker_remove(struct pci_dev *pdev)
 {
 	struct rocker *rocker = pci_get_drvdata(pdev);
 
+	rocker_free_tbls(rocker);
 	rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
 	rocker_remove_ports(rocker);
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
-- 
1.9.3

^ permalink raw reply related

* [patch net-next v4 17/21] rocker: implement L2 bridge offloading
From: Jiri Pirko @ 2014-11-27 10:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse, pshelar,
	azhou, ben, stephen, jeffrey.t.kirsher, vyasevic, xiyou.wangcong,
	john.r.fastabend, edumazet, jhs, sfeldma, f.fainelli, roopa,
	linville, jasowang, ebiederm, nicolas.dichtel, ryazanov.s.a,
	buytenh, aviadr, nbd, alexei.starovoitov, Neil.Jerram, ronye,
	simon.horman, alexander.h.duyck, john.ronciak, mleitner, shrijeet,
	gospo, bcrl, hemal
In-Reply-To: <1417084826-9875-1-git-send-email-jiri@resnulli.us>

From: Scott Feldman <sfeldma@gmail.com>

Add L2 bridge offloading support to rocker driver.  Here, the Linux bridge
driver is used to collect swdev ports into a tagged (or untagged) VLAN
bridge.  The switchdev will offload from the bridge driver the following L2
bridging functions:

- Learning of neighbor MAC addresses on VLAN X.  Learned mac/vlan is
installed in bridge FDB.  (And removed when device unlearns mac/vlan).
Learning must be turned off on each bridge port to disable the feature in
the bridge driver.

- Flooding of multicast/broadcast and unknown unicast pkts to (STP)
active ports in bridge.  The bridge driver is unaware of the flooding happening
at the device level.  Flooding must be turned off on each bridge port to
disable the feature on the bridge driver.

- STP port state is pushed down to driver/device.  The bridge still processes
STP BPDUs and maintains port STP state (for all VLANs in bridge), but
the driver/device must be notified of port STP state change to program
the device.

Multiple (VLAN) bridges are supported.  The device (implemented per
the OF-DPA spec) must use a portion of the VLAN namespace for
internal VLANs.  Right now, the upper 255 VLANs (0xf00 to 0xffe) are
used as internal VLAN IDs for untagged traffic and are not available
as port VLANs.

The driver uses the following interfaces:

1. To track VLAN add/del on ports in bridge:

.ndo_vlan_rx_add_vid
.ndo_vlan_rx_kill_vid

2. To track port add/del membership in bridge:

NETDEV_CHANGEUPPER netdevice notifier

3. To catch static FDB entries installed on bridge/vlan by user using netlink:

.ndo_fdb_add
.ndo_fdb_del

4. To be notified on port STP state change:

.ndo_switch_port_stp_update

5. To notify bridge driver on learned/forgotten mac/vlans on bridge port:

br_fdb_external_learn_add
br_fdb_external_learn_del

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
v3->v4:
-no change
v2->v3:
-use renamed br_fdb learn functions
-use ndo_fdb_add/del ops instead of new ones
-pass rocker_port struct to rocker_port_stp_update instead of net_device
-allow device to refresh existing learned FDB entries to keep aging
 timers active
v1->v2:
-no change
---
 drivers/net/ethernet/rocker/rocker.c | 670 ++++++++++++++++++++++++++++++++++-
 1 file changed, 669 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 6345f60..6c15aa1 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -28,6 +28,7 @@
 #include <linux/ethtool.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
+#include <linux/if_bridge.h>
 #include <linux/bitops.h>
 #include <net/switchdev.h>
 #include <net/rtnetlink.h>
@@ -196,10 +197,12 @@ enum {
 
 struct rocker_port {
 	struct net_device *dev;
+	struct net_device *bridge_dev;
 	struct rocker *rocker;
 	unsigned int port_number;
 	u32 lport;
 	__be16 internal_vlan_id;
+	int stp_state;
 	bool ctrls[ROCKER_CTRL_MAX];
 	unsigned long vlan_bitmap[ROCKER_VLAN_BITMAP_LEN];
 	struct napi_struct napi_tx;
@@ -290,6 +293,20 @@ static __be16 rocker_port_vid_to_vlan(struct rocker_port *rocker_port,
 	return vlan_id;
 }
 
+/* Convert a device VLAN ID (network byte order) to a host-order VID.
+ * Internal VLAN IDs map to VID 0.
+ */
+static u16 rocker_port_vlan_to_vid(struct rocker_port *rocker_port,
+				   __be16 vlan_id)
+{
+	return rocker_vlan_id_is_internal(vlan_id) ? 0 : ntohs(vlan_id);
+}
+
+/* A port counts as bridged when its bridge_dev pointer is set. */
+static bool rocker_port_is_bridged(struct rocker_port *rocker_port)
+{
+	return rocker_port->bridge_dev != NULL;
+}
+
 struct rocker_wait {
 	wait_queue_head_t wait;
 	bool done;
@@ -1301,6 +1318,43 @@ static int rocker_event_link_change(struct rocker *rocker,
 #define ROCKER_OP_FLAG_REMOVE		BIT(0)
 #define ROCKER_OP_FLAG_NOWAIT		BIT(1)
 #define ROCKER_OP_FLAG_LEARNED		BIT(2)
+#define ROCKER_OP_FLAG_REFRESH		BIT(3)
+
+static int rocker_port_fdb(struct rocker_port *rocker_port,
+			   const unsigned char *addr,
+			   __be16 vlan_id, int flags);
+
+/* Handle a MAC_VLAN_SEEN event from the device.
+ *
+ * Parses the lport, MAC and VLAN ID attributes from @info and, if the
+ * port is in STP state LEARNING or FORWARDING, records the address as
+ * a learned FDB entry without waiting for the device.
+ *
+ * Returns -EIO if an attribute is missing, -EINVAL if the lport does
+ * not map to a known port, 0 if the port is not learning, otherwise
+ * the result of rocker_port_fdb().
+ */
+static int rocker_event_mac_vlan_seen(struct rocker *rocker,
+				      const struct rocker_tlv *info)
+{
+	struct rocker_tlv *attrs[ROCKER_TLV_EVENT_MAC_VLAN_MAX + 1];
+	struct rocker_port *rocker_port;
+	unsigned int port_number;
+	unsigned char *addr;
+	__be16 vlan_id;
+	bool learning;
+
+	rocker_tlv_parse_nested(attrs, ROCKER_TLV_EVENT_MAC_VLAN_MAX, info);
+	if (!attrs[ROCKER_TLV_EVENT_MAC_VLAN_LPORT])
+		return -EIO;
+	if (!attrs[ROCKER_TLV_EVENT_MAC_VLAN_MAC])
+		return -EIO;
+	if (!attrs[ROCKER_TLV_EVENT_MAC_VLAN_VLAN_ID])
+		return -EIO;
+
+	/* lports are numbered from 1, the ports[] array from 0 */
+	port_number =
+		rocker_tlv_get_u32(attrs[ROCKER_TLV_EVENT_MAC_VLAN_LPORT]) - 1;
+	if (port_number >= rocker->port_count)
+		return -EINVAL;
+
+	addr = rocker_tlv_data(attrs[ROCKER_TLV_EVENT_MAC_VLAN_MAC]);
+	vlan_id = rocker_tlv_get_u16(attrs[ROCKER_TLV_EVENT_MAC_VLAN_VLAN_ID]);
+
+	rocker_port = rocker->ports[port_number];
+
+	learning = rocker_port->stp_state == BR_STATE_LEARNING ||
+		   rocker_port->stp_state == BR_STATE_FORWARDING;
+	if (!learning)
+		return 0;
+
+	return rocker_port_fdb(rocker_port, addr, vlan_id,
+			       ROCKER_OP_FLAG_NOWAIT | ROCKER_OP_FLAG_LEARNED);
+}
 
 static int rocker_event_process(struct rocker *rocker,
 				struct rocker_desc_info *desc_info)
@@ -1320,6 +1374,8 @@ static int rocker_event_process(struct rocker *rocker,
 	switch (type) {
 	case ROCKER_TLV_EVENT_TYPE_LINK_CHANGED:
 		return rocker_event_link_change(rocker, info);
+	case ROCKER_TLV_EVENT_TYPE_MAC_VLAN_SEEN:
+		return rocker_event_mac_vlan_seen(rocker, info);
 	}
 
 	return -EOPNOTSUPP;
@@ -2547,6 +2603,104 @@ static int rocker_group_l2_flood(struct rocker_port *rocker_port,
 				       group_id);
 }
 
+/* Adjust the flood group for @vlan_id.  The flood group references an
+ * L2 interface group for each bridged port that is a member of this
+ * VLAN.
+ *
+ * The scratch array of member group IDs is heap allocated rather than
+ * a variable-length array on the stack: its size depends on the
+ * device's port count, which is not bounded here.
+ *
+ * Returns 0 if no bridged port is in the VLAN, -ENOMEM if the scratch
+ * array cannot be allocated, otherwise the result of
+ * rocker_group_l2_flood().
+ */
+static int rocker_port_vlan_flood_group(struct rocker_port *rocker_port,
+					int flags, __be16 vlan_id)
+{
+	struct rocker_port *p;
+	struct rocker *rocker = rocker_port->rocker;
+	u32 group_id = ROCKER_GROUP_L2_FLOOD(vlan_id, 0);
+	u32 *group_ids;
+	u8 group_count = 0;
+	int err = 0;
+	int i;
+
+	group_ids = kcalloc(rocker->port_count, sizeof(u32),
+			    rocker_op_flags_gfp(flags));
+	if (!group_ids)
+		return -ENOMEM;
+
+	for (i = 0; i < rocker->port_count; i++) {
+		p = rocker->ports[i];
+		if (!rocker_port_is_bridged(p))
+			continue;
+		if (test_bit(ntohs(vlan_id), p->vlan_bitmap)) {
+			group_ids[group_count++] =
+				ROCKER_GROUP_L2_INTERFACE(vlan_id,
+							  p->lport);
+		}
+	}
+
+	/* If there are no bridged ports in this VLAN, we're done */
+	if (group_count == 0)
+		goto out;
+
+	/* the group helper copies group_ids, so it can be freed below */
+	err = rocker_group_l2_flood(rocker_port, flags, vlan_id,
+				    group_count, group_ids,
+				    group_id);
+	if (err)
+		netdev_err(rocker_port->dev,
+			   "Error (%d) port VLAN l2 flood group\n", err);
+
+out:
+	kfree(group_ids);
+	return err;
+}
+
+/* Maintain the L2 interface groups a port needs for @vlan_id.
+ *
+ * Two groups are involved:
+ *  - one for this port in this VLAN, touched only while the port's
+ *    STP state is LEARNING or FORWARDING;
+ *  - one for the CPU port (lport 0) in this VLAN, added when the first
+ *    port joins the VLAN and removed when the last port leaves it.
+ */
+static int rocker_port_vlan_l2_groups(struct rocker_port *rocker_port,
+				      int flags, __be16 vlan_id,
+				      bool pop_vlan)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	bool adding = !(flags & ROCKER_OP_FLAG_REMOVE);
+	bool fwd_enabled = rocker_port->stp_state == BR_STATE_LEARNING ||
+			   rocker_port->stp_state == BR_STATE_FORWARDING;
+	int members = 0;
+	int err;
+	int i;
+
+	if (fwd_enabled) {
+		u32 out_lport = rocker_port->lport;
+
+		err = rocker_group_l2_interface(rocker_port, flags,
+						vlan_id, out_lport,
+						pop_vlan);
+		if (err) {
+			netdev_err(rocker_port->dev,
+				   "Error (%d) port VLAN l2 group for lport %d\n",
+				   err, out_lport);
+			return err;
+		}
+	}
+
+	/* count the ports that currently have this VLAN set */
+	for (i = 0; i < rocker->port_count; i++)
+		if (test_bit(ntohs(vlan_id), rocker->ports[i]->vlan_bitmap))
+			members++;
+
+	/* only the first joiner adds, and only the last leaver removes */
+	if (adding ? members != 1 : members != 0)
+		return 0;
+
+	err = rocker_group_l2_interface(rocker_port, flags,
+					vlan_id, 0, pop_vlan);
+	if (err)
+		netdev_err(rocker_port->dev,
+			   "Error (%d) port VLAN l2 group for CPU port\n", err);
+
+	return err;
+}
+
 static struct rocker_ctrl {
 	const u8 *eth_dst;
 	const u8 *eth_dst_mask;
@@ -2625,6 +2779,30 @@ static int rocker_port_ctrl_vlan_acl(struct rocker_port *rocker_port,
 	return err;
 }
 
+/* Install or remove the bridging-table rule for control class @ctrl on
+ * @vlan_id.  The rule sends matching pkts to the VLAN's L2 flood group
+ * and continues to the ACL policy table.  Does nothing (returns 0) for
+ * a port that is not bridged.
+ */
+static int rocker_port_ctrl_vlan_bridge(struct rocker_port *rocker_port,
+					int flags, struct rocker_ctrl *ctrl,
+					__be16 vlan_id)
+{
+	u32 flood_group;
+	int err;
+
+	if (!rocker_port_is_bridged(rocker_port))
+		return 0;
+
+	flood_group = ROCKER_GROUP_L2_FLOOD(vlan_id, 0);
+
+	err = rocker_flow_tbl_bridge(rocker_port, flags,
+				     ctrl->eth_dst, ctrl->eth_dst_mask,
+				     vlan_id, 0 /* tunnel_id */,
+				     ROCKER_OF_DPA_TABLE_ID_ACL_POLICY,
+				     flood_group, ctrl->copy_to_cpu);
+	if (!err)
+		return 0;
+
+	netdev_err(rocker_port->dev, "Error (%d) ctrl FLOOD\n", err);
+	return err;
+}
+
 static int rocker_port_ctrl_vlan_term(struct rocker_port *rocker_port,
 				      int flags, struct rocker_ctrl *ctrl,
 				      __be16 vlan_id)
@@ -2655,6 +2833,9 @@ static int rocker_port_ctrl_vlan(struct rocker_port *rocker_port, int flags,
 	if (ctrl->acl)
 		return rocker_port_ctrl_vlan_acl(rocker_port, flags,
 						 ctrl, vlan_id);
+	if (ctrl->bridge)
+		return rocker_port_ctrl_vlan_bridge(rocker_port, flags,
+						    ctrl, vlan_id);
 
 	if (ctrl->term)
 		return rocker_port_ctrl_vlan_term(rocker_port, flags,
@@ -2699,6 +2880,64 @@ static int rocker_port_ctrl(struct rocker_port *rocker_port, int flags,
 	return err;
 }
 
+/* Add @vid to, or remove it from, a port (ROCKER_OP_FLAG_REMOVE in
+ * @flags selects removal).
+ *
+ * The port's vlan_bitmap is updated first and a request that would not
+ * change it returns 0 immediately.  Otherwise, in order: enabled
+ * control classes are applied to the VLAN (add only), the L2 interface
+ * groups and the flood group are adjusted, and the VLAN table entry is
+ * installed or removed.  The first failing step's error is returned.
+ */
+static int rocker_port_vlan(struct rocker_port *rocker_port, int flags,
+			    u16 vid)
+{
+	bool adding = !(flags & ROCKER_OP_FLAG_REMOVE);
+	__be16 internal_vlan_id;
+	bool untagged;
+	int bit;
+	int err;
+
+	internal_vlan_id = rocker_port_vid_to_vlan(rocker_port, vid, &untagged);
+	bit = ntohs(internal_vlan_id);
+
+	if (adding) {
+		if (test_and_set_bit(bit, rocker_port->vlan_bitmap))
+			return 0; /* already added */
+	} else {
+		if (!test_and_clear_bit(bit, rocker_port->vlan_bitmap))
+			return 0; /* already removed */
+	}
+
+	if (adding) {
+		err = rocker_port_ctrl_vlan_add(rocker_port, flags,
+						internal_vlan_id);
+		if (err) {
+			netdev_err(rocker_port->dev,
+				   "Error (%d) port ctrl vlan add\n", err);
+			return err;
+		}
+	}
+
+	err = rocker_port_vlan_l2_groups(rocker_port, flags,
+					 internal_vlan_id, untagged);
+	if (err) {
+		netdev_err(rocker_port->dev,
+			   "Error (%d) port VLAN l2 groups\n", err);
+		return err;
+	}
+
+	err = rocker_port_vlan_flood_group(rocker_port, flags,
+					   internal_vlan_id);
+	if (err) {
+		netdev_err(rocker_port->dev,
+			   "Error (%d) port VLAN l2 flood group\n", err);
+		return err;
+	}
+
+	err = rocker_flow_tbl_vlan(rocker_port, flags,
+				   rocker_port->lport,
+				   htons(vid), htons(0xffff),
+				   ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC,
+				   untagged, internal_vlan_id);
+	if (err)
+		netdev_err(rocker_port->dev,
+			   "Error (%d) port VLAN table\n", err);
+
+	return err;
+}
+
 static int rocker_port_ig_tbl(struct rocker_port *rocker_port, int flags)
 {
 	enum rocker_of_dpa_table_id goto_tbl;
@@ -2724,6 +2963,163 @@ static int rocker_port_ig_tbl(struct rocker_port *rocker_port, int flags)
 	return err;
 }
 
+/* Deferred notification to the bridge about one learned or forgotten
+ * FDB entry.  Filled in by rocker_port_fdb_learn() and consumed by
+ * rocker_port_fdb_learn_work(), which recovers this structure from
+ * the embedded work item with container_of().
+ */
+struct rocker_fdb_learn_work {
+	struct work_struct work;	/* embedded work item */
+	struct net_device *dev;		/* port netdev the entry belongs to */
+	int flags;			/* ROCKER_OP_FLAG_* of the operation */
+	u8 addr[ETH_ALEN];		/* MAC address of the entry */
+	u16 vid;			/* host-order VID of the entry */
+};
+
+/* Work handler: tell the bridge about a learned FDB entry that was
+ * added or removed, then free the work item.  Entries without
+ * ROCKER_OP_FLAG_LEARNED are not reported.
+ */
+static void rocker_port_fdb_learn_work(struct work_struct *work)
+{
+	struct rocker_fdb_learn_work *lw =
+		container_of(work, struct rocker_fdb_learn_work, work);
+	bool removing = (lw->flags & ROCKER_OP_FLAG_REMOVE);
+	bool learned = (lw->flags & ROCKER_OP_FLAG_LEARNED);
+
+	if (learned && removing)
+		br_fdb_external_learn_del(lw->dev, lw->addr, lw->vid);
+	else if (learned && !removing)
+		br_fdb_external_learn_add(lw->dev, lw->addr, lw->vid);
+
+	/* free the containing allocation, not the embedded member, so
+	 * this stays correct wherever 'work' sits in the structure
+	 */
+	kfree(lw);
+}
+
+/* Program one FDB entry into the bridging table and, for bridged
+ * ports, queue work to notify the bridge.
+ *
+ * With ROCKER_OP_FLAG_REFRESH set the bridging-table step is skipped
+ * and only the notification is queued.  For a port that is not
+ * bridged no notification is queued and the flow uses
+ * ROCKER_GROUP_NONE.
+ *
+ * Returns 0, the bridging-table error, or -ENOMEM if the work item
+ * cannot be allocated.
+ */
+static int rocker_port_fdb_learn(struct rocker_port *rocker_port,
+				 int flags, const u8 *addr, __be16 vlan_id)
+{
+	bool bridged = rocker_port_is_bridged(rocker_port);
+	bool refresh = flags & ROCKER_OP_FLAG_REFRESH;
+	struct rocker_fdb_learn_work *lw;
+	u32 group_id = ROCKER_GROUP_NONE;
+	int err;
+
+	if (bridged)
+		group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id,
+						     rocker_port->lport);
+
+	if (!refresh) {
+		err = rocker_flow_tbl_bridge(rocker_port, flags, addr, NULL,
+					     vlan_id, 0 /* tunnel_id */,
+					     ROCKER_OF_DPA_TABLE_ID_ACL_POLICY,
+					     group_id, false);
+		if (err)
+			return err;
+	}
+
+	if (!bridged)
+		return 0;
+
+	lw = kmalloc(sizeof(*lw), rocker_op_flags_gfp(flags));
+	if (!lw)
+		return -ENOMEM;
+
+	INIT_WORK(&lw->work, rocker_port_fdb_learn_work);
+
+	lw->dev = rocker_port->dev;
+	lw->flags = flags;
+	lw->vid = rocker_port_vlan_to_vid(rocker_port, vlan_id);
+	ether_addr_copy(lw->addr, addr);
+
+	schedule_work(&lw->work);
+
+	return 0;
+}
+
+/* Look up the FDB table entry whose key is byte-for-byte equal to
+ * match->key, searching the bucket selected by match->key_crc32.
+ * Returns the entry, or NULL if there is none.
+ */
+static struct rocker_fdb_tbl_entry *
+rocker_fdb_tbl_find(struct rocker *rocker, struct rocker_fdb_tbl_entry *match)
+{
+	struct rocker_fdb_tbl_entry *cur;
+
+	hash_for_each_possible(rocker->fdb_tbl, cur, entry, match->key_crc32) {
+		if (!memcmp(&cur->key, &match->key, sizeof(cur->key)))
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Add, refresh or remove the FDB entry (port, @addr, @vlan_id).
+ *
+ * A lookup key is built and searched under the FDB table lock:
+ *  - add, not present: the new entry is inserted and programmed;
+ *  - add, present: nothing is inserted and the operation continues as
+ *    a refresh (ROCKER_OP_FLAG_REFRESH);
+ *  - remove, present: the entry is unlinked, freed and unprogrammed;
+ *  - remove, not present: nothing to do, returns 0.
+ *
+ * Returns -ENOMEM if the key cannot be allocated, otherwise 0 or the
+ * result of rocker_port_fdb_learn().
+ */
+static int rocker_port_fdb(struct rocker_port *rocker_port,
+			   const unsigned char *addr,
+			   __be16 vlan_id, int flags)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_fdb_tbl_entry *fdb;
+	struct rocker_fdb_tbl_entry *found;
+	bool removing = (flags & ROCKER_OP_FLAG_REMOVE);
+	unsigned long lock_flags;
+
+	fdb = kzalloc(sizeof(*fdb), rocker_op_flags_gfp(flags));
+	if (!fdb)
+		return -ENOMEM;
+
+	fdb->learned = (flags & ROCKER_OP_FLAG_LEARNED);
+	fdb->key.lport = rocker_port->lport;
+	ether_addr_copy(fdb->key.addr, addr);
+	fdb->key.vlan_id = vlan_id;
+	fdb->key_crc32 = crc32(~0, &fdb->key, sizeof(fdb->key));
+
+	spin_lock_irqsave(&rocker->fdb_tbl_lock, lock_flags);
+
+	found = rocker_fdb_tbl_find(rocker, fdb);
+
+	if (removing && found)
+		hash_del(&found->entry);
+	else if (!removing && !found)
+		hash_add(rocker->fdb_tbl, &fdb->entry, fdb->key_crc32);
+
+	spin_unlock_irqrestore(&rocker->fdb_tbl_lock, lock_flags);
+
+	if (removing) {
+		/* fdb was only ever a lookup key on the remove path */
+		kfree(fdb);
+		if (!found)
+			return 0;
+		/* unlinked above, so nothing else can reach it: free it
+		 * here instead of leaking it
+		 */
+		kfree(found);
+	} else if (found) {
+		/* Refreshing existing to update aging timers */
+		kfree(fdb);
+		flags |= ROCKER_OP_FLAG_REFRESH;
+	}
+
+	return rocker_port_fdb_learn(rocker_port, flags, addr, vlan_id);
+}
+
+/* Remove every learned FDB entry belonging to this port.
+ *
+ * Does nothing while the port's STP state is LEARNING or FORWARDING.
+ * Otherwise each learned entry of the port is unprogrammed through
+ * rocker_port_fdb_learn() (with NOWAIT | REMOVE), unlinked from the
+ * FDB table and freed.  Stops at, and returns, the first error; the
+ * failing entry is left in the table.
+ */
+static int rocker_port_fdb_flush(struct rocker_port *rocker_port)
+{
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_fdb_tbl_entry *found;
+	unsigned long lock_flags;
+	int flags = ROCKER_OP_FLAG_NOWAIT | ROCKER_OP_FLAG_REMOVE;
+	struct hlist_node *tmp;
+	int bkt;
+	int err = 0;
+
+	if (rocker_port->stp_state == BR_STATE_LEARNING ||
+	    rocker_port->stp_state == BR_STATE_FORWARDING)
+		return 0;
+
+	spin_lock_irqsave(&rocker->fdb_tbl_lock, lock_flags);
+
+	hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, found, entry) {
+		if (found->key.lport != rocker_port->lport)
+			continue;
+		if (!found->learned)
+			continue;
+		err = rocker_port_fdb_learn(rocker_port, flags,
+					    found->key.addr,
+					    found->key.vlan_id);
+		if (err)
+			goto err_out;
+		hash_del(&found->entry);
+		/* the entry is unreachable once unlinked; free it so a
+		 * flush does not leak one allocation per entry
+		 */
+		kfree(found);
+	}
+
+err_out:
+	spin_unlock_irqrestore(&rocker->fdb_tbl_lock, lock_flags);
+
+	return err;
+}
+
 static int rocker_port_router_mac(struct rocker_port *rocker_port,
 				  int flags, __be16 vlan_id)
 {
@@ -2756,6 +3152,97 @@ static int rocker_port_router_mac(struct rocker_port *rocker_port,
 	return err;
 }
 
+/* Add or remove the L2 interface group for every VLAN set in the port's
+ * VLAN bitmap, depending on the port's STP state.
+ *
+ * The port is forwarding-enabled when its STP state is LEARNING or
+ * FORWARDING; in any other state the groups are removed.  Traffic from
+ * the CPU can still egress regardless of STP state, so the existence of
+ * the L2 interface group is what toggles port forwarding.
+ *
+ * Returns 0 on success or the first error from
+ * rocker_group_l2_interface().
+ */
+static int rocker_port_fwding(struct rocker_port *rocker_port)
+{
+	const u32 lport = rocker_port->lport;
+	int op_flags = ROCKER_OP_FLAG_NOWAIT;
+	bool fwd_enabled;
+	__be16 be_vid;
+	u16 vid;
+	int rc;
+
+	fwd_enabled = rocker_port->stp_state == BR_STATE_LEARNING ||
+		      rocker_port->stp_state == BR_STATE_FORWARDING;
+	if (!fwd_enabled)
+		op_flags |= ROCKER_OP_FLAG_REMOVE;
+
+	for (vid = 1; vid < VLAN_N_VID; vid++) {
+		if (test_bit(vid, rocker_port->vlan_bitmap)) {
+			be_vid = htons(vid);
+			/* Internal VLAN IDs are popped on egress */
+			rc = rocker_group_l2_interface(rocker_port, op_flags,
+					be_vid, lport,
+					rocker_vlan_id_is_internal(be_vid));
+			if (rc) {
+				netdev_err(rocker_port->dev,
+					   "Error (%d) port VLAN l2 group for lport %d\n",
+					   rc, lport);
+				return rc;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Apply a new STP @state to the port.
+ *
+ * Does nothing if @state equals the current state.  Otherwise the state
+ * is recorded, the set of control traffic classes wanted for that state
+ * is computed, every class whose wanted status differs from the port's
+ * current one is added or removed via rocker_port_ctrl(), learned FDB
+ * entries are flushed and the forwarding groups are refreshed.
+ *
+ * Returns 0 on success or the first error encountered; the new state
+ * stays recorded even when a later step fails.
+ */
+static int rocker_port_stp_update(struct rocker_port *rocker_port, u8 state)
+{
+	bool wanted[ROCKER_CTRL_MAX] = { 0, };
+	bool forwarding;
+	int op_flags;
+	int ctrl;
+	int rc;
+
+	if (state == rocker_port->stp_state)
+		return 0;
+
+	rocker_port->stp_state = state;
+
+	forwarding = state == BR_STATE_LEARNING ||
+		     state == BR_STATE_FORWARDING;
+
+	/* BR_STATE_DISABLED wants nothing: port is completely disabled */
+	if (forwarding || state == BR_STATE_LISTENING ||
+	    state == BR_STATE_BLOCKING)
+		wanted[ROCKER_CTRL_LINK_LOCAL_MCAST] = true;
+
+	if (forwarding) {
+		wanted[ROCKER_CTRL_IPV4_MCAST] = true;
+		wanted[ROCKER_CTRL_IPV6_MCAST] = true;
+		if (rocker_port_is_bridged(rocker_port))
+			wanted[ROCKER_CTRL_DFLT_BRIDGING] = true;
+		else
+			wanted[ROCKER_CTRL_LOCAL_ARP] = true;
+	}
+
+	for (ctrl = 0; ctrl < ROCKER_CTRL_MAX; ctrl++) {
+		if (wanted[ctrl] == rocker_port->ctrls[ctrl])
+			continue;
+
+		op_flags = ROCKER_OP_FLAG_NOWAIT;
+		if (!wanted[ctrl])
+			op_flags |= ROCKER_OP_FLAG_REMOVE;
+
+		rc = rocker_port_ctrl(rocker_port, op_flags,
+				      &rocker_ctrls[ctrl]);
+		if (rc)
+			return rc;
+		rocker_port->ctrls[ctrl] = wanted[ctrl];
+	}
+
+	rc = rocker_port_fdb_flush(rocker_port);
+	if (rc)
+		return rc;
+
+	return rocker_port_fwding(rocker_port);
+}
+
 static struct rocker_internal_vlan_tbl_entry *
 rocker_internal_vlan_tbl_find(struct rocker *rocker, int ifindex)
 {
@@ -2848,6 +3335,8 @@ not_found:
 static int rocker_port_open(struct net_device *dev)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
+	u8 stp_state = rocker_port_is_bridged(rocker_port) ?
+		BR_STATE_BLOCKING : BR_STATE_FORWARDING;
 	int err;
 
 	err = rocker_port_dma_rings_init(rocker_port);
@@ -2870,12 +3359,18 @@ static int rocker_port_open(struct net_device *dev)
 		goto err_request_rx_irq;
 	}
 
+	err = rocker_port_stp_update(rocker_port, stp_state);
+	if (err)
+		goto err_stp_update;
+
 	napi_enable(&rocker_port->napi_tx);
 	napi_enable(&rocker_port->napi_rx);
 	rocker_port_set_enable(rocker_port, true);
 	netif_start_queue(dev);
 	return 0;
 
+err_stp_update:
+	free_irq(rocker_msix_rx_vector(rocker_port), rocker_port);
 err_request_rx_irq:
 	free_irq(rocker_msix_tx_vector(rocker_port), rocker_port);
 err_request_tx_irq:
@@ -2891,6 +3386,7 @@ static int rocker_port_stop(struct net_device *dev)
 	rocker_port_set_enable(rocker_port, false);
 	napi_disable(&rocker_port->napi_rx);
 	napi_disable(&rocker_port->napi_tx);
+	rocker_port_stp_update(rocker_port, BR_STATE_DISABLED);
 	free_irq(rocker_msix_rx_vector(rocker_port), rocker_port);
 	free_irq(rocker_msix_tx_vector(rocker_port), rocker_port);
 	rocker_port_dma_rings_fini(rocker_port);
@@ -3035,6 +3531,62 @@ static int rocker_port_set_mac_address(struct net_device *dev, void *p)
 	return 0;
 }
 
+/* ndo_vlan_rx_add_vid handler: add @vid to the port, then install the
+ * router MAC for that VLAN.  @proto is not used.
+ *
+ * Returns 0 on success or the error of the first step that failed.
+ */
+static int rocker_port_vlan_rx_add_vid(struct net_device *dev,
+				       __be16 proto, u16 vid)
+{
+	struct rocker_port *port = netdev_priv(dev);
+	int rc = rocker_port_vlan(port, 0, vid);
+
+	return rc ? rc : rocker_port_router_mac(port, 0, htons(vid));
+}
+
+/* ndo_vlan_rx_kill_vid handler: remove the router MAC for @vid, then
+ * remove the VLAN itself from the port.  @proto is not used.
+ *
+ * Returns 0 on success or the error of the first step that failed.
+ */
+static int rocker_port_vlan_rx_kill_vid(struct net_device *dev,
+					__be16 proto, u16 vid)
+{
+	struct rocker_port *port = netdev_priv(dev);
+	int rc = rocker_port_router_mac(port, ROCKER_OP_FLAG_REMOVE,
+					htons(vid));
+
+	return rc ? rc : rocker_port_vlan(port, ROCKER_OP_FLAG_REMOVE, vid);
+}
+
+/* ndo_fdb_add handler: pass @addr and the VLAN derived from @vid to
+ * rocker_port_fdb() with no operation flags set (i.e. without
+ * ROCKER_OP_FLAG_REMOVE).
+ *
+ * @ndm, @tb and @nlm_flags are not used.  Returns -EINVAL if the port is
+ * not bridged, otherwise the result of rocker_port_fdb().
+ */
+static int rocker_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			       struct net_device *dev,
+			       const unsigned char *addr, u16 vid,
+			       u16 nlm_flags)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	__be16 vlan_id = rocker_port_vid_to_vlan(rocker_port, vid, NULL);
+	int flags = 0;
+
+	if (!rocker_port_is_bridged(rocker_port))
+		return -EINVAL;
+
+	return rocker_port_fdb(rocker_port, addr, vlan_id, flags);
+}
+
+/* ndo_fdb_del handler: pass @addr and the VLAN derived from @vid to
+ * rocker_port_fdb() with ROCKER_OP_FLAG_REMOVE set.
+ *
+ * @ndm and @tb are not used.  Returns -EINVAL if the port is not
+ * bridged, otherwise the result of rocker_port_fdb().
+ */
+static int rocker_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+			       struct net_device *dev,
+			       const unsigned char *addr, u16 vid)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	__be16 vlan_id = rocker_port_vid_to_vlan(rocker_port, vid, NULL);
+	int flags = ROCKER_OP_FLAG_REMOVE;
+
+	if (!rocker_port_is_bridged(rocker_port))
+		return -EINVAL;
+
+	return rocker_port_fdb(rocker_port, addr, vlan_id, flags);
+}
+
 static int rocker_port_switch_parent_id_get(struct net_device *dev,
 					    struct netdev_phys_item_id *psid)
 {
@@ -3046,12 +3598,24 @@ static int rocker_port_switch_parent_id_get(struct net_device *dev,
 	return 0;
 }
 
+/* ndo_switch_port_stp_update handler: forward the new STP @state to
+ * rocker_port_stp_update() for the port behind @dev.
+ */
+static int rocker_port_switch_port_stp_update(struct net_device *dev, u8 state)
+{
+	return rocker_port_stp_update(netdev_priv(dev), state);
+}
+
 static const struct net_device_ops rocker_port_netdev_ops = {
 	.ndo_open			= rocker_port_open,
 	.ndo_stop			= rocker_port_stop,
 	.ndo_start_xmit			= rocker_port_xmit,
 	.ndo_set_mac_address		= rocker_port_set_mac_address,
+	.ndo_vlan_rx_add_vid		= rocker_port_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid		= rocker_port_vlan_rx_kill_vid,
+	.ndo_fdb_add			= rocker_port_fdb_add,
+	.ndo_fdb_del			= rocker_port_fdb_del,
 	.ndo_switch_parent_id_get	= rocker_port_switch_parent_id_get,
+	.ndo_switch_port_stp_update	= rocker_port_switch_port_stp_update,
 };
 
 /********************
@@ -3501,17 +4065,121 @@ static struct pci_driver rocker_pci_driver = {
 	.remove		= rocker_remove,
 };
 
+/************************************
+ * Net device notifier event handler
+ ************************************/
+
+/* Return true when @dev uses the rocker port netdev ops, i.e. when it is
+ * a rocker port.
+ */
+static bool rocker_port_dev_check(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	return ops == &rocker_port_netdev_ops;
+}
+
+/* Attach the port to @bridge.
+ *
+ * The internal VLAN ID keyed by the port's own ifindex is released,
+ * @bridge is recorded as the port's bridge device, and the untagged
+ * VLAN (vid 0) is removed and re-added so that it uses the internal
+ * VLAN ID keyed by the bridge's ifindex.
+ *
+ * Returns 0 on success or the error from rocker_port_vlan().
+ */
+static int rocker_port_bridge_join(struct rocker_port *rocker_port,
+				   struct net_device *bridge)
+{
+	int rc;
+
+	rocker_port_internal_vlan_id_put(rocker_port,
+					 rocker_port->dev->ifindex);
+	rocker_port->bridge_dev = bridge;
+
+	/* Drop the untagged VLAN while it still maps to the old ID */
+	rc = rocker_port_vlan(rocker_port, ROCKER_OP_FLAG_REMOVE, 0);
+	if (rc)
+		return rc;
+
+	/* Use bridge internal VLAN ID for untagged pkts */
+	rocker_port->internal_vlan_id =
+		rocker_port_internal_vlan_id_get(rocker_port,
+						 bridge->ifindex);
+
+	return rocker_port_vlan(rocker_port, 0, 0);
+}
+
+/* Detach the port from its bridge.
+ *
+ * The internal VLAN ID keyed by the bridge's ifindex is released, the
+ * port's bridge device is cleared, and the untagged VLAN (vid 0) is
+ * removed and re-added so that it uses the internal VLAN ID keyed by the
+ * port's own ifindex.  The port's bridge_dev must be set on entry: it is
+ * dereferenced unconditionally.
+ *
+ * Returns 0 on success or the error from rocker_port_vlan().
+ */
+static int rocker_port_bridge_leave(struct rocker_port *rocker_port)
+{
+	int rc;
+
+	rocker_port_internal_vlan_id_put(rocker_port,
+					 rocker_port->bridge_dev->ifindex);
+	rocker_port->bridge_dev = NULL;
+
+	/* Drop the untagged VLAN while it still maps to the old ID */
+	rc = rocker_port_vlan(rocker_port, ROCKER_OP_FLAG_REMOVE, 0);
+	if (rc)
+		return rc;
+
+	/* Use port internal VLAN ID for untagged pkts */
+	rocker_port->internal_vlan_id =
+		rocker_port_internal_vlan_id_get(rocker_port,
+						 rocker_port->dev->ifindex);
+
+	return rocker_port_vlan(rocker_port, 0, 0);
+}
+
+/* React to a change in the upper devices of the rocker port @dev.
+ *
+ * If the port's master is a bridge it has not joined yet, the port joins
+ * it.  If the port is recorded as bridged but its master is no longer a
+ * bridge, the port leaves.  Any other change is ignored.
+ *
+ * Returns 0 on success or when nothing had to be done, otherwise the
+ * error from the join/leave helper.
+ */
+static int rocker_port_master_changed(struct net_device *dev)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct net_device *master = netdev_master_upper_dev_get(dev);
+
+	if (master && master->rtnl_link_ops &&
+	    !strcmp(master->rtnl_link_ops->kind, "bridge")) {
+		/* The event fires for any upper-device change.  Joining is
+		 * not idempotent (it releases the port's own internal VLAN
+		 * ID and takes one for the bridge), so do not repeat it for
+		 * a bridge the port is already attached to.
+		 */
+		if (rocker_port->bridge_dev == master)
+			return 0;
+		return rocker_port_bridge_join(rocker_port, master);
+	}
+
+	/* rocker_port_bridge_leave() dereferences bridge_dev, so it must
+	 * only run for a port that actually is in a bridge (e.g. not when
+	 * a never-bridged port gets or loses some other kind of master).
+	 */
+	if (rocker_port->bridge_dev)
+		return rocker_port_bridge_leave(rocker_port);
+
+	return 0;
+}
+
+/* Netdevice notifier callback.
+ *
+ * Only NETDEV_CHANGEUPPER on rocker ports is acted upon: the master
+ * change is reflected into the port and a warning is logged on failure.
+ * Always returns NOTIFY_DONE.
+ */
+static int rocker_netdevice_event(struct notifier_block *unused,
+				  unsigned long event, void *ptr)
+{
+	struct net_device *netdev;
+	int rc;
+
+	if (event != NETDEV_CHANGEUPPER)
+		return NOTIFY_DONE;
+
+	netdev = netdev_notifier_info_to_dev(ptr);
+	if (!rocker_port_dev_check(netdev))
+		return NOTIFY_DONE;
+
+	rc = rocker_port_master_changed(netdev);
+	if (rc)
+		netdev_warn(netdev,
+			    "failed to reflect master change (err %d)\n",
+			    rc);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that routes netdevice events to rocker_netdevice_event() */
+static struct notifier_block rocker_netdevice_nb __read_mostly = {
+	.notifier_call = rocker_netdevice_event,
+};
+
 /***********************
  * Module init and exit
  ***********************/
 
 static int __init rocker_module_init(void)
 {
-	return pci_register_driver(&rocker_pci_driver);
+	int err;
+
+	register_netdevice_notifier(&rocker_netdevice_nb);
+	err = pci_register_driver(&rocker_pci_driver);
+	if (err)
+		goto err_pci_register_driver;
+	return 0;
+
+err_pci_register_driver:
+	unregister_netdevice_notifier(&rocker_netdevice_nb);
+	return err;
 }
 
+/* Module unload: drop the netdevice notifier, then the PCI driver */
 static void __exit rocker_module_exit(void)
 {
+	unregister_netdevice_notifier(&rocker_netdevice_nb);
 	pci_unregister_driver(&rocker_pci_driver);
 }
 
-- 
1.9.3

^ permalink raw reply related

* [patch net-next v4 18/21] rocker: implement ndo_fdb_dump
From: Jiri Pirko @ 2014-11-27 10:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse, pshelar,
	azhou, ben, stephen, jeffrey.t.kirsher, vyasevic, xiyou.wangcong,
	john.r.fastabend, edumazet, jhs, sfeldma, f.fainelli, roopa,
	linville, jasowang, ebiederm, nicolas.dichtel, ryazanov.s.a,
	buytenh, aviadr, nbd, alexei.starovoitov, Neil.Jerram, ronye,
	simon.horman, alexander.h.duyck, john.ronciak, mleitner, shrijeet,
	gospo, bcrl, hemal
In-Reply-To: <1417084826-9875-1-git-send-email-jiri@resnulli.us>

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
v3->v4:
-ndm->ndm_flags set to NTF_SELF as pointed out by Scott
-use err to store retval of rocker_fdb_fill_info and break only in case < 0
new in v3
---
 drivers/net/ethernet/rocker/rocker.c | 73 ++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 6c15aa1..fea49e8 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -3587,6 +3587,78 @@ static int rocker_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 	return rocker_port_fdb(rocker_port, addr, vlan_id, flags);
 }
 
+/* Append one neighbour message describing an FDB entry to @skb.
+ *
+ * The message carries an AF_BRIDGE ndmsg flagged NTF_SELF with state
+ * NUD_REACHABLE for the port's ifindex, the MAC address @addr and, when
+ * @vid is non-zero, the VLAN ID.
+ *
+ * Returns the result of nlmsg_end() on success, or -EMSGSIZE when the
+ * header or an attribute does not fit; a partially built message is
+ * cancelled.
+ */
+static int rocker_fdb_fill_info(struct sk_buff *skb,
+				struct rocker_port *rocker_port,
+				const unsigned char *addr, u16 vid,
+				u32 portid, u32 seq, int type,
+				unsigned int flags)
+{
+	struct nlmsghdr *hdr;
+	struct ndmsg *neigh;
+
+	hdr = nlmsg_put(skb, portid, seq, type, sizeof(*neigh), flags);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	neigh = nlmsg_data(hdr);
+	neigh->ndm_family = AF_BRIDGE;
+	neigh->ndm_pad1 = 0;
+	neigh->ndm_pad2 = 0;
+	neigh->ndm_ifindex = rocker_port->dev->ifindex;
+	neigh->ndm_state = NUD_REACHABLE;
+	neigh->ndm_flags = NTF_SELF;
+	neigh->ndm_type = 0;
+
+	/* The VLAN attribute is only emitted for a non-zero vid */
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr) ||
+	    (vid && nla_put_u16(skb, NDA_VLAN, vid))) {
+		nlmsg_cancel(skb, hdr);
+		return -EMSGSIZE;
+	}
+
+	return nlmsg_end(skb, hdr);
+}
+
+/* ndo_fdb_dump handler: dump the FDB entries of the port behind @dev.
+ *
+ * Walks the shared FDB table under its lock and, for each entry owned by
+ * this port, emits an RTM_NEWNEIGH message unless the running index is
+ * still below cb->args[0].  The walk stops at the first entry that
+ * cannot be added to @skb.  @filter_dev is not used.
+ *
+ * Returns @idx advanced by the number of entries skipped or dumped.
+ */
+static int rocker_port_fdb_dump(struct sk_buff *skb,
+				struct netlink_callback *cb,
+				struct net_device *dev,
+				struct net_device *filter_dev,
+				int idx)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct rocker *rocker = rocker_port->rocker;
+	struct rocker_fdb_tbl_entry *fdb;
+	struct hlist_node *next;
+	unsigned long irq_flags;
+	int bucket;
+	int rc;
+
+	spin_lock_irqsave(&rocker->fdb_tbl_lock, irq_flags);
+	hash_for_each_safe(rocker->fdb_tbl, bucket, next, fdb, entry) {
+		if (fdb->key.lport != rocker_port->lport)
+			continue;
+		if (idx >= cb->args[0]) {
+			rc = rocker_fdb_fill_info(skb, rocker_port,
+					fdb->key.addr,
+					rocker_port_vlan_to_vid(rocker_port,
+							fdb->key.vlan_id),
+					NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq,
+					RTM_NEWNEIGH, NLM_F_MULTI);
+			/* Entry did not fit: leave idx pointing at it */
+			if (rc < 0)
+				break;
+		}
+		++idx;
+	}
+	spin_unlock_irqrestore(&rocker->fdb_tbl_lock, irq_flags);
+	return idx;
+}
+
 static int rocker_port_switch_parent_id_get(struct net_device *dev,
 					    struct netdev_phys_item_id *psid)
 {
@@ -3614,6 +3686,7 @@ static const struct net_device_ops rocker_port_netdev_ops = {
 	.ndo_vlan_rx_kill_vid		= rocker_port_vlan_rx_kill_vid,
 	.ndo_fdb_add			= rocker_port_fdb_add,
 	.ndo_fdb_del			= rocker_port_fdb_del,
+	.ndo_fdb_dump			= rocker_port_fdb_dump,
 	.ndo_switch_parent_id_get	= rocker_port_switch_parent_id_get,
 	.ndo_switch_port_stp_update	= rocker_port_switch_port_stp_update,
 };
-- 
1.9.3

^ permalink raw reply related

* [patch net-next v4 19/21] rocker: add ndo_bridge_setlink/getlink support for learning policy
From: Jiri Pirko @ 2014-11-27 10:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse, pshelar,
	azhou, ben, stephen, jeffrey.t.kirsher, vyasevic, xiyou.wangcong,
	john.r.fastabend, edumazet, jhs, sfeldma, f.fainelli, roopa,
	linville, jasowang, ebiederm, nicolas.dichtel, ryazanov.s.a,
	buytenh, aviadr, nbd, alexei.starovoitov, Neil.Jerram, ronye,
	simon.horman, alexander.h.duyck, john.ronciak, mleitner, shrijeet,
	gospo, bcrl, hemal
In-Reply-To: <1417084826-9875-1-git-send-email-jiri@resnulli.us>

From: Scott Feldman <sfeldma@gmail.com>

Rocker ports will use new "swdev" hwmode for bridge port offload policy.
Current supported policy settings are BR_LEARNING and BR_LEARNING_SYNC.
User can turn on/off device port FDB learning and syncing to bridge.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
v3->v4:
-no change
new in v3
---
 drivers/net/ethernet/rocker/rocker.c | 99 ++++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/rocker/rocker.h |  1 +
 2 files changed, 100 insertions(+)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index fea49e8..61cfdbf 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -203,6 +203,7 @@ struct rocker_port {
 	u32 lport;
 	__be16 internal_vlan_id;
 	int stp_state;
+	u32 brport_flags;
 	bool ctrls[ROCKER_CTRL_MAX];
 	unsigned long vlan_bitmap[ROCKER_VLAN_BITMAP_LEN];
 	struct napi_struct napi_tx;
@@ -1629,6 +1630,30 @@ rocker_cmd_set_port_settings_macaddr_prep(struct rocker *rocker,
 	return 0;
 }
 
+/* Build a SET_PORT_SETTINGS command that sets the port's learning flag.
+ *
+ * The command info nest carries the port's lport and a u8 that is 1 when
+ * BR_LEARNING is set in the port's brport_flags and 0 otherwise.
+ * @rocker and @priv are not used.
+ *
+ * Returns 0 on success or -EMSGSIZE when a TLV does not fit.
+ */
+static int
+rocker_cmd_set_port_learning_prep(struct rocker *rocker,
+				  struct rocker_port *rocker_port,
+				  struct rocker_desc_info *desc_info,
+				  void *priv)
+{
+	const u8 learning = !!(rocker_port->brport_flags & BR_LEARNING);
+	struct rocker_tlv *nest;
+
+	if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE,
+			       ROCKER_TLV_CMD_TYPE_SET_PORT_SETTINGS))
+		return -EMSGSIZE;
+
+	nest = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (rocker_tlv_put_u32(desc_info, ROCKER_TLV_CMD_PORT_SETTINGS_LPORT,
+			       rocker_port->lport) ||
+	    rocker_tlv_put_u8(desc_info, ROCKER_TLV_CMD_PORT_SETTINGS_LEARNING,
+			      learning))
+		return -EMSGSIZE;
+
+	rocker_tlv_nest_end(desc_info, nest);
+	return 0;
+}
+
 static int rocker_cmd_get_port_settings_ethtool(struct rocker_port *rocker_port,
 						struct ethtool_cmd *ecmd)
 {
@@ -1663,6 +1688,13 @@ static int rocker_cmd_set_port_settings_macaddr(struct rocker_port *rocker_port,
 			       macaddr, NULL, NULL, false);
 }
 
+/* Push the port's current BR_LEARNING setting to the device by executing
+ * the command built by rocker_cmd_set_port_learning_prep().
+ *
+ * Returns the result of rocker_cmd_exec().
+ */
+static int rocker_port_set_learning(struct rocker_port *rocker_port)
+{
+	struct rocker *rocker = rocker_port->rocker;
+
+	return rocker_cmd_exec(rocker, rocker_port,
+			       rocker_cmd_set_port_learning_prep,
+			       NULL, NULL, NULL, false);
+}
+
 static int rocker_cmd_flow_tbl_add_ig_port(struct rocker_desc_info *desc_info,
 					   struct rocker_flow_tbl_entry *entry)
 {
@@ -2995,6 +3027,7 @@ static int rocker_port_fdb_learn(struct rocker_port *rocker_port,
 	u32 out_lport = rocker_port->lport;
 	u32 tunnel_id = 0;
 	u32 group_id = ROCKER_GROUP_NONE;
+	bool syncing = !!(rocker_port->brport_flags & BR_LEARNING_SYNC);
 	bool copy_to_cpu = false;
 	int err;
 
@@ -3009,6 +3042,9 @@ static int rocker_port_fdb_learn(struct rocker_port *rocker_port,
 			return err;
 	}
 
+	if (!syncing)
+		return 0;
+
 	if (!rocker_port_is_bridged(rocker_port))
 		return 0;
 
@@ -3659,6 +3695,64 @@ skip:
 	return idx;
 }
 
+/* ndo_bridge_setlink handler: apply bridge port settings from @nlh.
+ *
+ * If IFLA_AF_SPEC carries IFLA_BRIDGE_MODE, the mode must be
+ * BRIDGE_MODE_SWDEV, otherwise -EINVAL is returned.  If IFLA_PROTINFO
+ * carries IFLA_BRPORT_LEARNING, BR_LEARNING is set or cleared in the
+ * port's brport_flags and the new setting is pushed to the device; an
+ * error from that push is returned, with the flag left as just changed.
+ * If it carries IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC is set or
+ * cleared in brport_flags only.
+ *
+ * Returns 0 on success.
+ */
+static int rocker_port_bridge_setlink(struct net_device *dev,
+				      struct nlmsghdr *nlh)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct nlattr *protinfo;
+	struct nlattr *afspec;
+	struct nlattr *attr;
+	u16 mode;
+	int err;
+
+	protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
+				   IFLA_PROTINFO);
+	afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+
+	if (afspec) {
+		attr = nla_find_nested(afspec, IFLA_BRIDGE_MODE);
+		if (attr) {
+			mode = nla_get_u16(attr);
+			if (mode != BRIDGE_MODE_SWDEV)
+				return -EINVAL;
+		}
+	}
+
+	if (protinfo) {
+		attr = nla_find_nested(protinfo, IFLA_BRPORT_LEARNING);
+		if (attr) {
+			if (nla_get_u8(attr))
+				rocker_port->brport_flags |= BR_LEARNING;
+			else
+				rocker_port->brport_flags &= ~BR_LEARNING;
+			err = rocker_port_set_learning(rocker_port);
+			if (err)
+				return err;
+		}
+		attr = nla_find_nested(protinfo, IFLA_BRPORT_LEARNING_SYNC);
+		if (attr) {
+			if (nla_get_u8(attr))
+				rocker_port->brport_flags |= BR_LEARNING_SYNC;
+			else
+				rocker_port->brport_flags &= ~BR_LEARNING_SYNC;
+		}
+	}
+
+	return 0;
+}
+
+/* ndo_bridge_getlink handler: report the port as BRIDGE_MODE_SWDEV
+ * together with its brport_flags, restricted to the BR_LEARNING and
+ * BR_LEARNING_SYNC bits, through ndo_dflt_bridge_getlink().
+ * @filter_mask is not used.
+ */
+static int rocker_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+				      struct net_device *dev,
+				      u32 filter_mask)
+{
+	struct rocker_port *port = netdev_priv(dev);
+
+	return ndo_dflt_bridge_getlink(skb, pid, seq, dev,
+				       BRIDGE_MODE_SWDEV,
+				       port->brport_flags,
+				       BR_LEARNING | BR_LEARNING_SYNC);
+}
+
 static int rocker_port_switch_parent_id_get(struct net_device *dev,
 					    struct netdev_phys_item_id *psid)
 {
@@ -3687,6 +3781,8 @@ static const struct net_device_ops rocker_port_netdev_ops = {
 	.ndo_fdb_add			= rocker_port_fdb_add,
 	.ndo_fdb_del			= rocker_port_fdb_del,
 	.ndo_fdb_dump			= rocker_port_fdb_dump,
+	.ndo_bridge_setlink		= rocker_port_bridge_setlink,
+	.ndo_bridge_getlink		= rocker_port_bridge_getlink,
 	.ndo_switch_parent_id_get	= rocker_port_switch_parent_id_get,
 	.ndo_switch_port_stp_update	= rocker_port_switch_port_stp_update,
 };
@@ -3887,6 +3983,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
 	rocker_port->rocker = rocker;
 	rocker_port->port_number = port_number;
 	rocker_port->lport = port_number + 1;
+	rocker_port->brport_flags = BR_LEARNING | BR_LEARNING_SYNC;
 
 	rocker_port_dev_addr_init(rocker, rocker_port);
 	dev->netdev_ops = &rocker_port_netdev_ops;
@@ -3906,6 +4003,8 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
 	}
 	rocker->ports[port_number] = rocker_port;
 
+	rocker_port_set_learning(rocker_port);
+
 	rocker_port->internal_vlan_id =
 		rocker_port_internal_vlan_id_get(rocker_port, dev->ifindex);
 	err = rocker_port_ig_tbl(rocker_port, 0);
diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h
index 5251cf8..8d2865b 100644
--- a/drivers/net/ethernet/rocker/rocker.h
+++ b/drivers/net/ethernet/rocker/rocker.h
@@ -139,6 +139,7 @@ enum {
 	ROCKER_TLV_CMD_PORT_SETTINGS_AUTONEG,		/* u8 */
 	ROCKER_TLV_CMD_PORT_SETTINGS_MACADDR,		/* binary */
 	ROCKER_TLV_CMD_PORT_SETTINGS_MODE,		/* u8 */
+	ROCKER_TLV_CMD_PORT_SETTINGS_LEARNING,		/* u8 */
 
 	__ROCKER_TLV_CMD_PORT_SETTINGS_MAX,
 	ROCKER_TLV_CMD_PORT_SETTINGS_MAX =
-- 
1.9.3

^ permalink raw reply related

* [patch net-next v4 20/21] rocker: Add proper validation of Netlink attributes
From: Jiri Pirko @ 2014-11-27 10:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse, pshelar,
	azhou, ben, stephen, jeffrey.t.kirsher, vyasevic, xiyou.wangcong,
	john.r.fastabend, edumazet, jhs, sfeldma, f.fainelli, roopa,
	linville, jasowang, ebiederm, nicolas.dichtel, ryazanov.s.a,
	buytenh, aviadr, nbd, alexei.starovoitov, Neil.Jerram, ronye,
	simon.horman, alexander.h.duyck, john.ronciak, mleitner, shrijeet,
	gospo, bcrl, hemal
In-Reply-To: <1417084826-9875-1-git-send-email-jiri@resnulli.us>

From: Thomas Graf <tgraf@suug.ch>

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
new in v4
---
 drivers/net/ethernet/rocker/rocker.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 61cfdbf..30687bf 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -3712,6 +3712,9 @@ static int rocker_port_bridge_setlink(struct net_device *dev,
 	if (afspec) {
 		attr = nla_find_nested(afspec, IFLA_BRIDGE_MODE);
 		if (attr) {
+			if (nla_len(attr) < sizeof(mode))
+				return -EINVAL;
+
 			mode = nla_get_u16(attr);
 			if (mode != BRIDGE_MODE_SWDEV)
 				return -EINVAL;
@@ -3721,6 +3724,9 @@ static int rocker_port_bridge_setlink(struct net_device *dev,
 	if (protinfo) {
 		attr = nla_find_nested(protinfo, IFLA_BRPORT_LEARNING);
 		if (attr) {
+			if (nla_len(attr) < sizeof(u8))
+				return -EINVAL;
+
 			if (nla_get_u8(attr))
 				rocker_port->brport_flags |= BR_LEARNING;
 			else
@@ -3731,6 +3737,9 @@ static int rocker_port_bridge_setlink(struct net_device *dev,
 		}
 		attr = nla_find_nested(protinfo, IFLA_BRPORT_LEARNING_SYNC);
 		if (attr) {
+			if (nla_len(attr) < sizeof(u8))
+				return -EINVAL;
+
 			if (nla_get_u8(attr))
 				rocker_port->brport_flags |= BR_LEARNING_SYNC;
 			else
-- 
1.9.3

^ permalink raw reply related

* [patch net-next v4 21/21] rocker: Use logical operators on booleans
From: Jiri Pirko @ 2014-11-27 10:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, nhorman, andy, tgraf, dborkman, ogerlitz, jesse, pshelar,
	azhou, ben, stephen, jeffrey.t.kirsher, vyasevic, xiyou.wangcong,
	john.r.fastabend, edumazet, jhs, sfeldma, f.fainelli, roopa,
	linville, jasowang, ebiederm, nicolas.dichtel, ryazanov.s.a,
	buytenh, aviadr, nbd, alexei.starovoitov, Neil.Jerram, ronye,
	simon.horman, alexander.h.duyck, john.ronciak, mleitner, shrijeet,
	gospo, bcrl, hemal
In-Reply-To: <1417084826-9875-1-git-send-email-jiri@resnulli.us>

From: Thomas Graf <tgraf@suug.ch>

Silences various sparse warnings

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
new in v4
---
 drivers/net/ethernet/rocker/rocker.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 30687bf..fded127 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -2404,17 +2404,17 @@ static int rocker_flow_tbl_bridge(struct rocker_port *rocker_port,
 	}
 
 	priority = ROCKER_PRIORITY_UNKNOWN;
-	if (vlan_bridging & dflt & wild)
+	if (vlan_bridging && dflt && wild)
 		priority = ROCKER_PRIORITY_BRIDGING_VLAN_DFLT_WILD;
-	else if (vlan_bridging & dflt & !wild)
+	else if (vlan_bridging && dflt && !wild)
 		priority = ROCKER_PRIORITY_BRIDGING_VLAN_DFLT_EXACT;
-	else if (vlan_bridging & !dflt)
+	else if (vlan_bridging && !dflt)
 		priority = ROCKER_PRIORITY_BRIDGING_VLAN;
-	else if (!vlan_bridging & dflt & wild)
+	else if (!vlan_bridging && dflt && wild)
 		priority = ROCKER_PRIORITY_BRIDGING_TENANT_DFLT_WILD;
-	else if (!vlan_bridging & dflt & !wild)
+	else if (!vlan_bridging && dflt && !wild)
 		priority = ROCKER_PRIORITY_BRIDGING_TENANT_DFLT_EXACT;
-	else if (!vlan_bridging & !dflt)
+	else if (!vlan_bridging && !dflt)
 		priority = ROCKER_PRIORITY_BRIDGING_TENANT;
 
 	entry->key.priority = priority;
@@ -3010,9 +3010,9 @@ static void rocker_port_fdb_learn_work(struct work_struct *work)
 	bool removing = (lw->flags & ROCKER_OP_FLAG_REMOVE);
 	bool learned = (lw->flags & ROCKER_OP_FLAG_LEARNED);
 
-	if (learned & removing)
+	if (learned && removing)
 		br_fdb_external_learn_del(lw->dev, lw->addr, lw->vid);
-	else if (learned & !removing)
+	else if (learned && !removing)
 		br_fdb_external_learn_add(lw->dev, lw->addr, lw->vid);
 
 	kfree(work);
-- 
1.9.3

^ permalink raw reply related

* Re: [PATCH rfc] packet: zerocopy packet_snd
From: Michael S. Tsirkin @ 2014-11-27 10:44 UTC (permalink / raw)
  To: Jason Wang
  Cc: Willem de Bruijn, Network Development, David Miller, Eric Dumazet,
	Daniel Borkmann
In-Reply-To: <1417079412.18179.3@smtp.corp.redhat.com>

On Thu, Nov 27, 2014 at 09:18:12AM +0008, Jason Wang wrote:
> 
> 
> On Thu, Nov 27, 2014 at 5:17 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >On Wed, Nov 26, 2014 at 02:59:34PM -0500, Willem de Bruijn wrote:
> >> > The main problem with zero copy ATM is with queueing disciplines
> >> > which might keep the socket around essentially forever.
> >> > The case was described here:
> >> > https://lkml.org/lkml/2014/1/17/105
> >> > and of course this will make it more serious now that
> >> > more applications will be able to do this, so
> >> > chances that an administrator enables this
> >> > are higher.
> >> The denial of service issue raised there, that a single queue can
> >> block an entire virtio-net device, is less problematic in the case of
> >> packet sockets. A socket can run out of sk_wmem_alloc, but a prudent
> >> application can increase the limit or use separate sockets for
> >> separate flows.
> >
> >Socket per flow? Maybe just use TCP then?  increasing the limit
> >sounds like a wrong solution, it hurts security.
> >
> >> > One possible solution is some kind of timer orphaning frags
> >> > for skbs that have been around for too long.
> >>   Perhaps this can be approximated without an explicit timer by calling
> >> skb_copy_ubufs on enqueue whenever qlen exceeds a threshold value?
> >
> >Hard to say. Will have to see that patch to judge how robust this is.
> 
> This could not work, consider if the threshold is greater than vring size
> or vhost_net pending limit, transmission may still be blocked.

Well, application can e.g. just switch to non zero copy after
reaching a specific number of requests.
I think the real problem isn't reaching the queue full
condition, it's the fact a specific buffer might never
get freed. This API isn't half as useful as it could be
if applications had a way to force the memory
to be reclaimed.


And actually, I see a way for applications to reclaim the memory:
application could invoke something like MADV_SOFT_OFFLINE on the memory
submitted for zero copy transmit, to invalidate PTEs, and make next
access fault new pages in.
If dedicated memory is used for packets, you could even use
MADV_DONTNEED - but this doesn't work in many cases, certainly
not for virtualization type workloads.

Playing with PTEs needs to invalidate the TLB so it is not fast,
but it does not need to be: we are talking about ability to close the
socket, which should be rare.

For example, an application/hypervisor can detect a timeout when a
packet is not transmitted within a predefined time period, and trigger
such reclaim.
Making this period shorter than network watchdog timer of the VM
will ensure that watchdog does not trigger within VM.
Alternatively, VM network watchdog could trigger this reclaim
in order to recover packet memory.

With this idea, if application merely reads memory, we incur a lot of
overhead with pagefaults. So maybe a new call to enable COW for a range
of pages would be a good idea.


We'd have to make sure whatever's used for reclaim works for
a wide range of memory types: mmap-ed file, hugetlbfs, anonymous memory.


Thoughts?

-- 
MST

^ permalink raw reply

* Re: [PATCH v2] sh_eth: Fix skb alloc size and alignment adjust rule.
From: Yoshihiro Kaneko @ 2014-11-27 11:26 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Simon Horman, Magnus Damm, Linux-sh list
In-Reply-To: <20141121.150544.2240809721107766779.davem@davemloft.net>

Hello David,

I'm very sorry for the late response.

2014-11-22 5:05 GMT+09:00 David Miller <davem@davemloft.net>:
> From: Yoshihiro Kaneko <ykaneko0929@gmail.com>
> Date: Thu, 20 Nov 2014 19:35:21 +0900
>
>> From: Mitsuhiro Kimura <mitsuhiro.kimura.kc@renesas.com>
>>
>> In the current driver, allocation size of skb does not care the alignment
>> adjust after allocation.
>> And also, in the current implementation, buffer alignment method by
>> sh_eth_set_receive_align function has a bug that this function displace
>> buffer start address forcedly when the alignment is corrected.
>> In the result, tail of the skb will exceed allocated area and kernel panic
>> will be occurred.
>> This patch fix this issue.
>>
>> Signed-off-by: Mitsuhiro Kimura <mitsuhiro.kimura.kc@renesas.com>
>> Signed-off-by: Yoshihiro Kaneko <ykaneko0929@gmail.com>
>> ---
>>
>> The previous version of this patch was a part of the patch series as follows:
>> [PATCH 2/3] sh_eth: Fix skb alloc size and alignment adjust rule.
>>
>> This series is based on net tree.
>>
>> v2 [Yoshihiro Kaneko]
>> * Update as suggested by Sergei Shtylyov
>>   - Fixed the coding style
>>   - Corrected the comment
>>   - Removed {SH2_SH3|SH4}_SKB_RX_ALIGN
>
> Please compile test your changes on 64-bit platforms:
>
> drivers/net/ethernet/renesas/sh_eth.c: In function ‘sh_eth_set_receive_align’:
> drivers/net/ethernet/renesas/sh_eth.c:922:16: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]

I had not seen that warning message because I had only compile-tested
for the ARM platform.
I saw the warning once I compile-tested for x86_64.
I'll post the new version of this patch.

Thanks,
Kaneko

^ permalink raw reply

* [PATCH v3] sh_eth: Fix skb alloc size and alignment adjust rule.
From: Yoshihiro Kaneko @ 2014-11-27 11:34 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Simon Horman, Magnus Damm, linux-sh

From: Mitsuhiro Kimura <mitsuhiro.kimura.kc@renesas.com>

In the current driver, the skb allocation size does not account for the
alignment adjustment made after allocation.
In addition, the buffer alignment done by the current
sh_eth_set_receive_align() has a bug: the function forcibly displaces
the buffer start address when it corrects the alignment.
As a result, the tail of the skb can exceed the allocated area and a
kernel panic occurs.
This patch fixes the issue.

Signed-off-by: Mitsuhiro Kimura <mitsuhiro.kimura.kc@renesas.com>
Signed-off-by: Yoshihiro Kaneko <ykaneko0929@gmail.com>
---

The first version of this patch was a part of the patch series as follows:   
[PATCH 2/3] sh_eth: Fix skb alloc size and alignment adjust rule.

This patch is based on net tree.

v3 [Yoshihiro Kaneko]
* use uintptr_t instead of u32 in sh_eth_set_receive_align()

v2 [Yoshihiro Kaneko]
* Update as suggested by Sergei Shtylyov
  - Fixed the coding style
  - Corrected the comment
  - Removed {SH2_SH3|SH4}_SKB_RX_ALIGN

 drivers/net/ethernet/renesas/sh_eth.c | 32 +++++++++++++-------------------
 drivers/net/ethernet/renesas/sh_eth.h |  4 ++--
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 60e9c2c..f9e30b8 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -917,21 +917,13 @@ static int sh_eth_reset(struct net_device *ndev)
 	return ret;
 }
 
-#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE)
 static void sh_eth_set_receive_align(struct sk_buff *skb)
 {
-	int reserve;
+	uintptr_t reserve = (uintptr_t)skb->data & (SH_ETH_RX_ALIGN - 1);
 
-	reserve = SH4_SKB_RX_ALIGN - ((u32)skb->data & (SH4_SKB_RX_ALIGN - 1));
 	if (reserve)
-		skb_reserve(skb, reserve);
+		skb_reserve(skb, SH_ETH_RX_ALIGN - reserve);
 }
-#else
-static void sh_eth_set_receive_align(struct sk_buff *skb)
-{
-	skb_reserve(skb, SH2_SH3_SKB_RX_ALIGN);
-}
-#endif
 
 
 /* CPU <-> EDMAC endian convert */
@@ -1119,6 +1111,7 @@ static void sh_eth_ring_format(struct net_device *ndev)
 	struct sh_eth_txdesc *txdesc = NULL;
 	int rx_ringsize = sizeof(*rxdesc) * mdp->num_rx_ring;
 	int tx_ringsize = sizeof(*txdesc) * mdp->num_tx_ring;
+	int skbuff_size = mdp->rx_buf_sz + SH_ETH_RX_ALIGN - 1;
 
 	mdp->cur_rx = 0;
 	mdp->cur_tx = 0;
@@ -1131,21 +1124,21 @@ static void sh_eth_ring_format(struct net_device *ndev)
 	for (i = 0; i < mdp->num_rx_ring; i++) {
 		/* skb */
 		mdp->rx_skbuff[i] = NULL;
-		skb = netdev_alloc_skb(ndev, mdp->rx_buf_sz);
+		skb = netdev_alloc_skb(ndev, skbuff_size);
 		mdp->rx_skbuff[i] = skb;
 		if (skb == NULL)
 			break;
-		dma_map_single(&ndev->dev, skb->data, mdp->rx_buf_sz,
-			       DMA_FROM_DEVICE);
 		sh_eth_set_receive_align(skb);
 
 		/* RX descriptor */
 		rxdesc = &mdp->rx_ring[i];
+		/* The size of the buffer is a multiple of 16 bytes. */
+		rxdesc->buffer_length = ALIGN(mdp->rx_buf_sz, 16);
+		dma_map_single(&ndev->dev, skb->data, rxdesc->buffer_length,
+			       DMA_FROM_DEVICE);
 		rxdesc->addr = virt_to_phys(PTR_ALIGN(skb->data, 4));
 		rxdesc->status = cpu_to_edmac(mdp, RD_RACT | RD_RFP);
 
-		/* The size of the buffer is 16 byte boundary. */
-		rxdesc->buffer_length = ALIGN(mdp->rx_buf_sz, 16);
 		/* Rx descriptor address set */
 		if (i == 0) {
 			sh_eth_write(ndev, mdp->rx_desc_dma, RDLAR);
@@ -1397,6 +1390,7 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota)
 	struct sk_buff *skb;
 	u16 pkt_len = 0;
 	u32 desc_status;
+	int skbuff_size = mdp->rx_buf_sz + SH_ETH_RX_ALIGN - 1;
 
 	rxdesc = &mdp->rx_ring[entry];
 	while (!(rxdesc->status & cpu_to_edmac(mdp, RD_RACT))) {
@@ -1448,7 +1442,7 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota)
 			if (mdp->cd->rpadir)
 				skb_reserve(skb, NET_IP_ALIGN);
 			dma_sync_single_for_cpu(&ndev->dev, rxdesc->addr,
-						mdp->rx_buf_sz,
+						ALIGN(mdp->rx_buf_sz, 16),
 						DMA_FROM_DEVICE);
 			skb_put(skb, pkt_len);
 			skb->protocol = eth_type_trans(skb, ndev);
@@ -1468,13 +1462,13 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota)
 		rxdesc->buffer_length = ALIGN(mdp->rx_buf_sz, 16);
 
 		if (mdp->rx_skbuff[entry] == NULL) {
-			skb = netdev_alloc_skb(ndev, mdp->rx_buf_sz);
+			skb = netdev_alloc_skb(ndev, skbuff_size);
 			mdp->rx_skbuff[entry] = skb;
 			if (skb == NULL)
 				break;	/* Better luck next round. */
-			dma_map_single(&ndev->dev, skb->data, mdp->rx_buf_sz,
-				       DMA_FROM_DEVICE);
 			sh_eth_set_receive_align(skb);
+			dma_map_single(&ndev->dev, skb->data,
+				       rxdesc->buffer_length, DMA_FROM_DEVICE);
 
 			skb_checksum_none_assert(skb);
 			rxdesc->addr = virt_to_phys(PTR_ALIGN(skb->data, 4));
diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index b37c427..9fa9332 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -162,9 +162,9 @@ enum {
 
 /* Driver's parameters */
 #if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE)
-#define SH4_SKB_RX_ALIGN	32
+#define SH_ETH_RX_ALIGN		32
 #else
-#define SH2_SH3_SKB_RX_ALIGN	2
+#define SH_ETH_RX_ALIGN		2
 #endif
 
 /* Register's bits
-- 
1.9.1


^ permalink raw reply related

* Re: [RFC PATCH 0/3] net: Alloc NAPI page frags from their own pool
From: Jesper Dangaard Brouer @ 2014-11-27 12:00 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: netdev, davem, jeffrey.t.kirsher, eric.dumazet, ast, brouer
In-Reply-To: <20141126235900.1617.10008.stgit@ahduyck-vm-fedora20>

On Wed, 26 Nov 2014 16:05:50 -0800
Alexander Duyck <alexander.h.duyck@redhat.com> wrote:

> This patch series implements a means of allocating page fragments without
> the need for the local_irq_save/restore in __netdev_alloc_frag.  By doing
> this I am able to decrease packet processing time by 11ns per packet in my
> test environment.

This is really good work!

I've tested the patchset (detail see below).  Two different packet
sizes 64bytes and 272bytes, due to "copy-break" point in driver.

Notice, these tests are single flow, resulting in single CPU getting
activated on receiver.

If I drop packets very early in the iptables "raw" table, I see an
improvement of 10.51 ns to 13.22 ns (for 272bytes, between 9.64 ns and
11.97 ns), which corresponds with Alex's observations.

A little surprisingly, when doing full forwarding (IP-routing), I see a
much larger "nanosec" improvement: for 64bytes, between 47.64ns and
58.15ns (for 272bytes, between 29.08ns and 30.14ns).  This improvement is
larger than I expected.  One pitfall is that with full forwarding we can
only forward approx 1Mpps (single CPU), and the results vary more
between test runs.

Setup
-----
Generator: ixgbe, pktgen (3x CPUs), sending 10G wirespeed
 - Single flow pktgen, resulting in single CPU activation on target
 - pkt@64bytes:  tx:14900856 pps (wirespeed)
 - pkt@272bytes: tx: 4228696 pps (wirespeed)

Ethernet wirespeed:
 * (1/((64+20)*8))*(10*10^9)  = 14880952
 * (1/((272+20)*8))*(10*10^9) =  4280822

Receiver CPU E5-2695 running state-c0@2.8GHz

baseline
--------

Baseline: Full forwarding (no-netfilter):

 * pkt@64bytes: tx:977414 pps
 * pkt@64bytes: tx:974404 pps
 * test-variation@64bytes: 3010pps (1/977414*10^9)-(1/974404*10^9) = -3.16ns

 * pkt@272bytes: tx:911657 pps
 * pkt@272bytes: tx:906229 pps
 * test-variation@272bytes: 5428pps -6.57ns

Baseline: Drop in iptables RAW:

 * pkt@64bytes: rx:2801058 pps
 * pkt@64bytes: rx:2785579 pps
 * test-variation@64bytes: 15479pps -1.98 ns

 * pkt@272bytes: rx:2559718 pps
 * pkt@272bytes: rx:2544577 pps
 * test-variation@272bytes: 15141pps -2.32ns

With patch: alex'es napi_alloc_skb
----------------------------------

Full forwarding (no-netfilter) (pkt@64bytes):

 * pkt@64bytes: tx:1025150 pps
 * pkt@64bytes: tx:1032930 pps
 * test-variation@64bytes: -7780pps 7.34ns
 * Patchset improvements@64-fwd:
 - 977414 -> 1025150 = 47736pps -> 47.64ns
 - 974404 -> 1032930 = 58526pps -> 58.15ns

 * pkt@272bytes: tx:937416 pps
 * pkt@272bytes: tx:930761 pps
 * test-variation@272bytes: 6655pps -7.62ns
 * Patchset improvements@272-fwd:
  - 911657 -> 937416 = 25759pps -> 30.14ns
  - 906229 -> 930761 = 24532pps -> 29.08ns

Drop in iptables RAW (pkt@64bytes):

 * pkt@64bytes: rx:2885820 pps
 * pkt@64bytes: rx:2892050 pps
 * test-variation@64bytes diff: 6230pps 0.746ns
 * Patchset improvements@64-drop:
  - 2800896 -> 2885820 =  84924pps -> 10.51 ns
  - 2785579 -> 2892050 = 106471pps -> 13.22 ns

 * pkt@272bytes: rx:2624484 pps
 * pkt@272bytes: rx:2624492 pps
 * test-variation: pkt@272bytes diff: 8pps 0ns
 * Patchset improvements@272-drop:
  - 2559718 -> 2624484 = 64766 pps ->  9.64 ns
  - 2544577 -> 2624492 = 79915 pps -> 11.97 ns


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [patch net-next v3 02/17] net: make vid as a parameter for ndo_fdb_add/ndo_fdb_del
From: Jamal Hadi Salim @ 2014-11-27 12:14 UTC (permalink / raw)
  To: Scott Feldman
  Cc: John Fastabend, Jiri Pirko, Netdev, David S. Miller,
	nhorman@tuxdriver.com, Andy Gospodarek, Thomas Graf,
	dborkman@redhat.com, ogerlitz@mellanox.com, jesse@nicira.com,
	pshelar@nicira.com, azhou@nicira.com, ben@decadent.org.uk,
	stephen@networkplumber.org, Kirsher, Jeffrey T,
	vyasevic@redhat.com, Cong Wang, Eric Dumazet, Florian Fainelli,
	Roopa Prabhu, John Linville
In-Reply-To: <CAE4R7bAM+orLXdEn0qCu2rKj+W5_681RHmvgW4CY5H7jZ0ijfQ@mail.gmail.com>

On 11/27/14 01:50, Scott Feldman wrote:

[..]

>
> It's there: IFLA_BRPORT_LEARNING_SYNC.  From iproute2:
>
> $ bridge -d link show dev swp1
> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
> master br0 state forwarding priority 32 cost 2
>      hairpin off guard off root_block off fastleave off learning off flood off
> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master br0
>      learning on learning_sync on hwmode swdev
>
> Turn it off:
>
> $ bridge link set dev swp1 hwmode swdev learning_sync off
>
> And now:
>
> $ bridge -d link show dev swp1
> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
> master br0 state forwarding priority 32 cost 2
>      hairpin off guard off root_block off fastleave off learning off flood off
> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master br0
>      learning on learning_sync off hwmode swdev
>
>

Yes, this is the nice control portion.
From reviewing the patches, I didn't see how the core to the driver was
using the learning_sync. IOW, how do I turn off the driver's sync
from being activated? Maybe you are doing this in the rocker patches
which I didn't review? I think this needs to be core infrastructure, i.e.
if you are doing this in a timer (as opposed to interrupt driven), then
the core sync timer would kick in and call some driver ops.
In any case, these are details that can be ironed out later.

cheers,
jamal

^ permalink raw reply

* RE: [PATCH] x86: bpf_jit_comp: simplify trivial boolean return
From: David Laight @ 2014-11-27 12:25 UTC (permalink / raw)
  To: 'Joe Perches', Alexei Starovoitov
  Cc: Quentin Lambert, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, x86@kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <1417032059.16355.4.camel@perches.com>

From: Joe Perches
> On Wed, 2014-11-26 at 10:34 -0800, Alexei Starovoitov wrote:
> > On Wed, Nov 26, 2014 at 10:02 AM, Joe Perches <joe@perches.com> wrote:
> > > On Wed, 2014-11-26 at 09:23 -0800, Alexei Starovoitov wrote:
> > >> On Wed, Nov 26, 2014 at 8:58 AM, Joe Perches <joe@perches.com> wrote:
> > >
> > >> > Is there any value in reordering these tests for frequency
> > >> > or maybe using | instead of || to avoid multiple jumps?
> > >>
> > >> probably not. It's not a critical path.
> > >> compiler may fuse conditions depending on values anyway.
> > >> If it was a critical path, we could have used
> > >> (1 << reg) & mask trick.
> > >> I picked explicit 'return true' else 'return false' here,
> > >> because it felt easier to read. Just a matter of taste.
> > >
> > > There is a size difference though: (allyesconfig)
> > >
> > > $ size arch/x86/net/built-in.o*
> > >    text    data     bss     dec     hex filename
> > >   12999    1012    4336   18347    47ab arch/x86/net/built-in.o.new
> > >   13177    1076    4592   18845    499d arch/x86/net/built-in.o.old
> >
> > interesting. Compiler obviously thinks that 178 byte increase
> > with -O2 is the right trade off. Which I agree with :)
> >
> > If I think dropping 'inline' and using -Os will give bigger savings...
> 
> This was allyesconfig which already uses -Os
> 
> Using -O2, there is no difference using inline
> or not, but the size delta with the bitmask is
> much larger
> 
> $ size arch/x86/net/built-in.o* (allyesconfig, but not -Os)
>    text	   data	    bss	    dec	    hex	filename
>   13410	    820	   3624	  17854	   45be	arch/x86/net/built-in.o.new
>   16130	    884	   4200	  21214	   52de	arch/x86/net/built-in.o.old
>   16130	    884	   4200	  21214	   52de	arch/x86/net/built-in.o.static

That is quite a big % change in the code size.
Why the change in data?

	David

^ permalink raw reply

* Re: [PATCH rfc 1/4] net-timestamp: pull headers for SOCK_STREAM
From: Richard Cochran @ 2014-11-27 12:30 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Willem de Bruijn, David Miller, Network Development
In-Reply-To: <CALCETrW+2Q-R8ekVW=u=3Xb5YvYK=4RnuuWCvAG8pyc9eG6DQg@mail.gmail.com>

On Wed, Nov 26, 2014 at 04:36:39PM -0800, Andy Lutomirski wrote:
> Is there any reason to believe that unconditionally dropping the
> headers would break anything?  I find it a bit hard to believe that
> anyone has actually implemented logic to figure out *what* L2 header
> type should be decoded and decode it.

Documentation/networking/timestamping/timestamping.c

				else if (!memcmp(sync, data + res - sizeof(sync),
							sizeof(sync)))
					printf(" => GOT OUR DATA BACK (HURRAY!)");

The example program looks from the end of the buffer, ignoring the lower headers.

Thanks,
Richard

^ permalink raw reply

* [PATCH v5 14/45] virtio_net: v1.0 endianness
From: Michael S. Tsirkin @ 2014-11-27 12:31 UTC (permalink / raw)
  To: linux-kernel
  Cc: thuth, rusty, netdev, virtualization, dahi, linux-api, pbonzini,
	David Miller
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

Based on patches by Rusty Russell, Cornelia Huck.
Note: more code changes are needed for 1.0 support
(due to different header size).
So we don't advertise support for 1.0 yet.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_net.h | 15 ++++++++-------
 drivers/net/virtio_net.c        | 33 ++++++++++++++++++++-------------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 172a7f0..b5f1677 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
+#include <linux/virtio_types.h>
 #include <linux/if_ether.h>
 
 /* The feature bitmap for virtio net */
@@ -84,17 +85,17 @@ struct virtio_net_hdr {
 #define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
 #define VIRTIO_NET_HDR_GSO_ECN		0x80	// TCP has ECN set
 	__u8 gso_type;
-	__u16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
-	__u16 gso_size;		/* Bytes to append to hdr_len per frame */
-	__u16 csum_start;	/* Position to start checksumming from */
-	__u16 csum_offset;	/* Offset after that to place checksum */
+	__virtio16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
+	__virtio16 gso_size;		/* Bytes to append to hdr_len per frame */
+	__virtio16 csum_start;	/* Position to start checksumming from */
+	__virtio16 csum_offset;	/* Offset after that to place checksum */
 };
 
 /* This is the version of the header to use when the MRG_RXBUF
  * feature has been negotiated. */
 struct virtio_net_hdr_mrg_rxbuf {
 	struct virtio_net_hdr hdr;
-	__u16 num_buffers;	/* Number of merged rx buffers */
+	__virtio16 num_buffers;	/* Number of merged rx buffers */
 };
 
 /*
@@ -149,7 +150,7 @@ typedef __u8 virtio_net_ctrl_ack;
  * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
  */
 struct virtio_net_ctrl_mac {
-	__u32 entries;
+	__virtio32 entries;
 	__u8 macs[][ETH_ALEN];
 } __attribute__((packed));
 
@@ -193,7 +194,7 @@ struct virtio_net_ctrl_mac {
  * specified.
  */
 struct virtio_net_ctrl_mq {
-	__u16 virtqueue_pairs;
+	__virtio16 virtqueue_pairs;
 };
 
 #define VIRTIO_NET_CTRL_MQ   4
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b0bc8ea..c07e030 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -347,13 +347,14 @@ err:
 }
 
 static struct sk_buff *receive_mergeable(struct net_device *dev,
+					 struct virtnet_info *vi,
 					 struct receive_queue *rq,
 					 unsigned long ctx,
 					 unsigned int len)
 {
 	void *buf = mergeable_ctx_to_buf_address(ctx);
 	struct skb_vnet_hdr *hdr = buf;
-	int num_buf = hdr->mhdr.num_buffers;
+	u16 num_buf = virtio16_to_cpu(rq->vq->vdev, hdr->mhdr.num_buffers);
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
 	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
@@ -369,7 +370,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
 		if (unlikely(!ctx)) {
 			pr_debug("%s: rx error: %d buffers out of %d missing\n",
-				 dev->name, num_buf, hdr->mhdr.num_buffers);
+				 dev->name, num_buf,
+				 virtio16_to_cpu(rq->vq->vdev,
+						 hdr->mhdr.num_buffers));
 			dev->stats.rx_length_errors++;
 			goto err_buf;
 		}
@@ -454,7 +457,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	}
 
 	if (vi->mergeable_rx_bufs)
-		skb = receive_mergeable(dev, rq, (unsigned long)buf, len);
+		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
 	else if (vi->big_packets)
 		skb = receive_big(dev, rq, buf, len);
 	else
@@ -473,8 +476,8 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
 		pr_debug("Needs csum!\n");
 		if (!skb_partial_csum_set(skb,
-					  hdr->hdr.csum_start,
-					  hdr->hdr.csum_offset))
+			  virtio16_to_cpu(vi->vdev, hdr->hdr.csum_start),
+			  virtio16_to_cpu(vi->vdev, hdr->hdr.csum_offset)))
 			goto frame_err;
 	} else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -514,7 +517,8 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 		if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
 			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 
-		skb_shinfo(skb)->gso_size = hdr->hdr.gso_size;
+		skb_shinfo(skb)->gso_size = virtio16_to_cpu(vi->vdev,
+							    hdr->hdr.gso_size);
 		if (skb_shinfo(skb)->gso_size == 0) {
 			net_warn_ratelimited("%s: zero gso size.\n", dev->name);
 			goto frame_err;
@@ -876,16 +880,19 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-		hdr->hdr.csum_start = skb_checksum_start_offset(skb);
-		hdr->hdr.csum_offset = skb->csum_offset;
+		hdr->hdr.csum_start = cpu_to_virtio16(vi->vdev,
+						skb_checksum_start_offset(skb));
+		hdr->hdr.csum_offset = cpu_to_virtio16(vi->vdev,
+							 skb->csum_offset);
 	} else {
 		hdr->hdr.flags = 0;
 		hdr->hdr.csum_offset = hdr->hdr.csum_start = 0;
 	}
 
 	if (skb_is_gso(skb)) {
-		hdr->hdr.hdr_len = skb_headlen(skb);
-		hdr->hdr.gso_size = skb_shinfo(skb)->gso_size;
+		hdr->hdr.hdr_len = cpu_to_virtio16(vi->vdev, skb_headlen(skb));
+		hdr->hdr.gso_size = cpu_to_virtio16(vi->vdev,
+						    skb_shinfo(skb)->gso_size);
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
 			hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
 		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
@@ -1112,7 +1119,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
 		return 0;
 
-	s.virtqueue_pairs = queue_pairs;
+	s.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
 	sg_init_one(&sg, &s, sizeof(s));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
@@ -1189,7 +1196,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 	sg_init_table(sg, 2);
 
 	/* Store the unicast list and count in the front of the buffer */
-	mac_data->entries = uc_count;
+	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
 	i = 0;
 	netdev_for_each_uc_addr(ha, dev)
 		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
@@ -1200,7 +1207,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 	/* multicast list and count fill the end */
 	mac_data = (void *)&mac_data->macs[uc_count][0];
 
-	mac_data->entries = mc_count;
+	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
 	i = 0;
 	netdev_for_each_mc_addr(ha, dev)
 		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
-- 
MST

^ permalink raw reply related

* [PATCH v5 22/45] virtio_net: pass vi around
From: Michael S. Tsirkin @ 2014-11-27 12:31 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, thuth, dahi,
	Rusty Russell, virtualization, netdev
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

Too many places poke at [rs]q->vq->vdev->priv just to get
the vi structure.  Let's just pass the pointer around: seems
cleaner, and might even be faster.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/net/virtio_net.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c07e030..1630c21 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -241,11 +241,11 @@ static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
 }
 
 /* Called from bottom half context */
-static struct sk_buff *page_to_skb(struct receive_queue *rq,
+static struct sk_buff *page_to_skb(struct virtnet_info *vi,
+				   struct receive_queue *rq,
 				   struct page *page, unsigned int offset,
 				   unsigned int len, unsigned int truesize)
 {
-	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	unsigned int copy, hdr_len, hdr_padded_len;
@@ -328,12 +328,13 @@ static struct sk_buff *receive_small(void *buf, unsigned int len)
 }
 
 static struct sk_buff *receive_big(struct net_device *dev,
+				   struct virtnet_info *vi,
 				   struct receive_queue *rq,
 				   void *buf,
 				   unsigned int len)
 {
 	struct page *page = buf;
-	struct sk_buff *skb = page_to_skb(rq, page, 0, len, PAGE_SIZE);
+	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 
 	if (unlikely(!skb))
 		goto err;
@@ -359,7 +360,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	int offset = buf - page_address(page);
 	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
 
-	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
+	struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
+					       truesize);
 	struct sk_buff *curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
@@ -433,9 +435,9 @@ err_buf:
 	return NULL;
 }
 
-static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
+static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
+			void *buf, unsigned int len)
 {
-	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct net_device *dev = vi->dev;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 	struct sk_buff *skb;
@@ -459,7 +461,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	if (vi->mergeable_rx_bufs)
 		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
 	else if (vi->big_packets)
-		skb = receive_big(dev, rq, buf, len);
+		skb = receive_big(dev, vi, rq, buf, len);
 	else
 		skb = receive_small(buf, len);
 
@@ -539,9 +541,9 @@ frame_err:
 	dev_kfree_skb(skb);
 }
 
-static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
+static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
+			     gfp_t gfp)
 {
-	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	int err;
@@ -664,9 +666,9 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
  * before we're receiving packets, or from refill_work which is
  * careful to disable receiving (using napi_disable).
  */
-static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
+static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
+			  gfp_t gfp)
 {
-	struct virtnet_info *vi = rq->vq->vdev->priv;
 	int err;
 	bool oom;
 
@@ -677,7 +679,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 		else if (vi->big_packets)
 			err = add_recvbuf_big(rq, gfp);
 		else
-			err = add_recvbuf_small(rq, gfp);
+			err = add_recvbuf_small(vi, rq, gfp);
 
 		oom = err == -ENOMEM;
 		if (err)
@@ -726,7 +728,7 @@ static void refill_work(struct work_struct *work)
 		struct receive_queue *rq = &vi->rq[i];
 
 		napi_disable(&rq->napi);
-		still_empty = !try_fill_recv(rq, GFP_KERNEL);
+		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
 		virtnet_napi_enable(rq);
 
 		/* In theory, this can happen: if we don't get any buffers in
@@ -745,12 +747,12 @@ static int virtnet_receive(struct receive_queue *rq, int budget)
 
 	while (received < budget &&
 	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
-		receive_buf(rq, buf, len);
+		receive_buf(vi, rq, buf, len);
 		received++;
 	}
 
 	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
-		if (!try_fill_recv(rq, GFP_ATOMIC))
+		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
 			schedule_delayed_work(&vi->refill, 0);
 	}
 
@@ -826,7 +828,7 @@ static int virtnet_open(struct net_device *dev)
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		if (i < vi->curr_queue_pairs)
 			/* Make sure we have some buffers: if oom use wq. */
-			if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
 				schedule_delayed_work(&vi->refill, 0);
 		virtnet_napi_enable(&vi->rq[i]);
 	}
@@ -1851,7 +1853,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	/* Last of all, set up some receive buffers. */
 	for (i = 0; i < vi->curr_queue_pairs; i++) {
-		try_fill_recv(&vi->rq[i], GFP_KERNEL);
+		try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
 
 		/* If we didn't even get one input buffer, we're useless. */
 		if (vi->rq[i].vq->num_free ==
@@ -1971,7 +1973,7 @@ static int virtnet_restore(struct virtio_device *vdev)
 
 	if (netif_running(vi->dev)) {
 		for (i = 0; i < vi->curr_queue_pairs; i++)
-			if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
 				schedule_delayed_work(&vi->refill, 0);
 
 		for (i = 0; i < vi->max_queue_pairs; i++)
-- 
MST

^ permalink raw reply related

* [PATCH v5 23/45] virtio_net: get rid of virtio_net_hdr/skb_vnet_hdr
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, thuth, dahi,
	Rusty Russell, virtualization, netdev
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

virtio 1.0 doesn't use virtio_net_hdr anymore, and in fact, it's not
really useful since virtio_net_hdr_mrg_rxbuf includes that as the first
field anyway.

Let's drop it, precalculate header len and store within vi instead.

This way we can also remove struct skb_vnet_hdr.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/net/virtio_net.c | 90 ++++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 49 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 1630c21..516f2cb 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -123,6 +123,9 @@ struct virtnet_info {
 	/* Host can handle any s/g split between our header and packet data */
 	bool any_header_sg;
 
+	/* Packet virtio header size */
+	u8 hdr_len;
+
 	/* Active statistics */
 	struct virtnet_stats __percpu *stats;
 
@@ -139,21 +142,14 @@ struct virtnet_info {
 	struct notifier_block nb;
 };
 
-struct skb_vnet_hdr {
-	union {
-		struct virtio_net_hdr hdr;
-		struct virtio_net_hdr_mrg_rxbuf mhdr;
-	};
-};
-
 struct padded_vnet_hdr {
-	struct virtio_net_hdr hdr;
+	struct virtio_net_hdr_mrg_rxbuf hdr;
 	/*
-	 * virtio_net_hdr should be in a separated sg buffer because of a
-	 * QEMU bug, and data sg buffer shares same page with this header sg.
-	 * This padding makes next sg 16 byte aligned after virtio_net_hdr.
+	 * hdr is in a separate sg buffer, and data sg buffer shares same page
+	 * with this header sg. This padding makes next sg 16 byte aligned
+	 * after the header.
 	 */
-	char padding[6];
+	char padding[4];
 };
 
 /* Converting between virtqueue no. and kernel tx/rx queue no.
@@ -179,9 +175,9 @@ static int rxq2vq(int rxq)
 	return rxq * 2;
 }
 
-static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
+static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
 {
-	return (struct skb_vnet_hdr *)skb->cb;
+	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
 }
 
 /*
@@ -247,7 +243,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 				   unsigned int len, unsigned int truesize)
 {
 	struct sk_buff *skb;
-	struct skb_vnet_hdr *hdr;
+	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	unsigned int copy, hdr_len, hdr_padded_len;
 	char *p;
 
@@ -260,13 +256,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 
 	hdr = skb_vnet_hdr(skb);
 
-	if (vi->mergeable_rx_bufs) {
-		hdr_len = sizeof hdr->mhdr;
-		hdr_padded_len = sizeof hdr->mhdr;
-	} else {
-		hdr_len = sizeof hdr->hdr;
+	hdr_len = vi->hdr_len;
+	if (vi->mergeable_rx_bufs)
+		hdr_padded_len = sizeof *hdr;
+	else
 		hdr_padded_len = sizeof(struct padded_vnet_hdr);
-	}
 
 	memcpy(hdr, p, hdr_len);
 
@@ -317,11 +311,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
-static struct sk_buff *receive_small(void *buf, unsigned int len)
+static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
 {
 	struct sk_buff * skb = buf;
 
-	len -= sizeof(struct virtio_net_hdr);
+	len -= vi->hdr_len;
 	skb_trim(skb, len);
 
 	return skb;
@@ -354,8 +348,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 					 unsigned int len)
 {
 	void *buf = mergeable_ctx_to_buf_address(ctx);
-	struct skb_vnet_hdr *hdr = buf;
-	u16 num_buf = virtio16_to_cpu(rq->vq->vdev, hdr->mhdr.num_buffers);
+	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
+	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
 	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
@@ -373,8 +367,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		if (unlikely(!ctx)) {
 			pr_debug("%s: rx error: %d buffers out of %d missing\n",
 				 dev->name, num_buf,
-				 virtio16_to_cpu(rq->vq->vdev,
-						 hdr->mhdr.num_buffers));
+				 virtio16_to_cpu(vi->vdev,
+						 hdr->num_buffers));
 			dev->stats.rx_length_errors++;
 			goto err_buf;
 		}
@@ -441,7 +435,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	struct net_device *dev = vi->dev;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 	struct sk_buff *skb;
-	struct skb_vnet_hdr *hdr;
+	struct virtio_net_hdr_mrg_rxbuf *hdr;
 
 	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
 		pr_debug("%s: short packet %i\n", dev->name, len);
@@ -463,7 +457,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	else if (vi->big_packets)
 		skb = receive_big(dev, vi, rq, buf, len);
 	else
-		skb = receive_small(buf, len);
+		skb = receive_small(vi, buf, len);
 
 	if (unlikely(!skb))
 		return;
@@ -545,7 +539,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 			     gfp_t gfp)
 {
 	struct sk_buff *skb;
-	struct skb_vnet_hdr *hdr;
+	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	int err;
 
 	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
@@ -556,7 +550,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 
 	hdr = skb_vnet_hdr(skb);
 	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);
-	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
+	sg_set_buf(rq->sg, hdr, vi->hdr_len);
 	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
 
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
@@ -566,7 +560,8 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 	return err;
 }
 
-static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
+static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
+			   gfp_t gfp)
 {
 	struct page *first, *list = NULL;
 	char *p;
@@ -597,8 +592,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 	p = page_address(first);
 
 	/* rq->sg[0], rq->sg[1] share the same page */
-	/* a separated rq->sg[0] for virtio_net_hdr only due to QEMU bug */
-	sg_set_buf(&rq->sg[0], p, sizeof(struct virtio_net_hdr));
+	/* a separated rq->sg[0] for header - required in case !any_header_sg */
+	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
 
 	/* rq->sg[1] for data packet, from offset */
 	offset = sizeof(struct padded_vnet_hdr);
@@ -677,7 +672,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
 		if (vi->mergeable_rx_bufs)
 			err = add_recvbuf_mergeable(rq, gfp);
 		else if (vi->big_packets)
-			err = add_recvbuf_big(rq, gfp);
+			err = add_recvbuf_big(vi, rq, gfp);
 		else
 			err = add_recvbuf_small(vi, rq, gfp);
 
@@ -857,18 +852,14 @@ static void free_old_xmit_skbs(struct send_queue *sq)
 
 static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 {
-	struct skb_vnet_hdr *hdr;
+	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
 	struct virtnet_info *vi = sq->vq->vdev->priv;
 	unsigned num_sg;
-	unsigned hdr_len;
+	unsigned hdr_len = vi->hdr_len;
 	bool can_push;
 
 	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
-	if (vi->mergeable_rx_bufs)
-		hdr_len = sizeof hdr->mhdr;
-	else
-		hdr_len = sizeof hdr->hdr;
 
 	can_push = vi->any_header_sg &&
 		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
@@ -876,7 +867,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 	/* Even if we can, don't push here yet as this would skew
 	 * csum_start offset below. */
 	if (can_push)
-		hdr = (struct skb_vnet_hdr *)(skb->data - hdr_len);
+		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
 	else
 		hdr = skb_vnet_hdr(skb);
 
@@ -909,7 +900,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 	}
 
 	if (vi->mergeable_rx_bufs)
-		hdr->mhdr.num_buffers = 0;
+		hdr->num_buffers = 0;
 
 	sg_init_table(sq->sg, MAX_SKB_FRAGS + 2);
 	if (can_push) {
@@ -1814,18 +1805,19 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
+		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	else
+		vi->hdr_len = sizeof(struct virtio_net_hdr);
+
 	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT))
 		vi->any_header_sg = true;
 
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
 		vi->has_cvq = true;
 
-	if (vi->any_header_sg) {
-		if (vi->mergeable_rx_bufs)
-			dev->needed_headroom = sizeof(struct virtio_net_hdr_mrg_rxbuf);
-		else
-			dev->needed_headroom = sizeof(struct virtio_net_hdr);
-	}
+	if (vi->any_header_sg)
+		dev->needed_headroom = vi->hdr_len;
 
 	/* Use single tx/rx queue pair as default */
 	vi->curr_queue_pairs = 1;
-- 
MST

^ permalink raw reply related

* [PATCH v5 24/45] virtio_net: stricter short buffer length checks
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: thuth, rusty, netdev, virtualization, dahi, pbonzini,
	David Miller
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

Our buffer length check is not strict enough for mergeable
buffers: buffer can still be shorter than header + address
by 2 bytes.

Fix that up.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 516f2cb..098f443 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -437,7 +437,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	struct sk_buff *skb;
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 
-	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
+	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
 		if (vi->mergeable_rx_bufs) {
-- 
MST

^ permalink raw reply related

* [PATCH v5 25/45] virtio_net: bigger header when VERSION_1 is set
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, thuth, dahi,
	Rusty Russell, virtualization, netdev
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

With VERSION_1 virtio_net uses same header size
whether mergeable buffers are enabled or not.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/net/virtio_net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 098f443..a0e64cf 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1805,7 +1805,8 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
-	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
+	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
 		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	else
 		vi->hdr_len = sizeof(struct virtio_net_hdr);
-- 
MST

^ permalink raw reply related

* [PATCH v5 26/45] virtio_net: enable v1.0 support
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: thuth, rusty, netdev, virtualization, dahi, pbonzini,
	David Miller
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

Now that we have completed 1.0 support, enable it in our driver.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/net/virtio_net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a0e64cf..c6a72d3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2003,6 +2003,7 @@ static unsigned int features[] = {
 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
 	VIRTIO_NET_F_CTRL_MAC_ADDR,
 	VIRTIO_F_ANY_LAYOUT,
+	VIRTIO_F_VERSION_1,
 };
 
 static struct virtio_driver virtio_net_driver = {
-- 
MST

^ permalink raw reply related

* [PATCH v5 27/45] vhost: make features 64 bit
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: thuth, kvm, rusty, netdev, virtualization, dahi, pbonzini,
	David Miller
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

We need to use bit 32 (VIRTIO_F_VERSION_1) for virtio 1.0, so the
feature mask must be 64 bits wide.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 3eda654..c624b09 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
 	/* Protected by virtqueue mutex. */
 	struct vhost_memory *memory;
 	void *private_data;
-	unsigned acked_features;
+	u64 acked_features;
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
@@ -174,6 +174,6 @@ enum {
 
 static inline int vhost_has_feature(struct vhost_virtqueue *vq, int bit)
 {
-	return vq->acked_features & (1 << bit);
+	return vq->acked_features & (1ULL << bit);
 }
 #endif
-- 
MST

^ permalink raw reply related

* [PATCH v5 28/45] vhost: add memory access wrappers
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, thuth, dahi,
	kvm, virtualization, netdev
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

Add guest memory access wrappers to handle virtio endianness
conversions.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index c624b09..1f321fd 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -176,4 +176,35 @@ static inline int vhost_has_feature(struct vhost_virtqueue *vq, int bit)
 {
 	return vq->acked_features & (1ULL << bit);
 }
+
+/* Memory accessors */
+static inline u16 vhost16_to_cpu(struct vhost_virtqueue *vq, __virtio16 val)
+{
+	return __virtio16_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), val);
+}
+
+static inline __virtio16 cpu_to_vhost16(struct vhost_virtqueue *vq, u16 val)
+{
+	return __cpu_to_virtio16(vhost_has_feature(vq, VIRTIO_F_VERSION_1), val);
+}
+
+static inline u32 vhost32_to_cpu(struct vhost_virtqueue *vq, __virtio32 val)
+{
+	return __virtio32_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), val);
+}
+
+static inline __virtio32 cpu_to_vhost32(struct vhost_virtqueue *vq, u32 val)
+{
+	return __cpu_to_virtio32(vhost_has_feature(vq, VIRTIO_F_VERSION_1), val);
+}
+
+static inline u64 vhost64_to_cpu(struct vhost_virtqueue *vq, __virtio64 val)
+{
+	return __virtio64_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), val);
+}
+
+static inline __virtio64 cpu_to_vhost64(struct vhost_virtqueue *vq, u64 val)
+{
+	return __cpu_to_virtio64(vhost_has_feature(vq, VIRTIO_F_VERSION_1), val);
+}
 #endif
-- 
MST

^ permalink raw reply related

* [PATCH v5 29/45] vhost/net: force len for TX to host endian
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: thuth, kvm, rusty, netdev, virtualization, dahi, pbonzini,
	David Miller
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

vhost/net keeps a copy of some used ring entries but (ab)uses the length
field for internal house-keeping. This works because
for tx the used length is always 0.
Suppress sparse errors: we use native endian-ness internally but never
expose it to guest.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/net.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8dae2f7..dce5c58 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -48,15 +48,15 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
  * status internally; used for zerocopy tx only.
  */
 /* Lower device DMA failed */
-#define VHOST_DMA_FAILED_LEN	3
+#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
 /* Lower device DMA done */
-#define VHOST_DMA_DONE_LEN	2
+#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
 /* Lower device DMA in progress */
-#define VHOST_DMA_IN_PROGRESS	1
+#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
 /* Buffer unused */
-#define VHOST_DMA_CLEAR_LEN	0
+#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)
 
-#define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)
+#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
 
 enum {
 	VHOST_NET_FEATURES = VHOST_FEATURES |
-- 
MST

^ permalink raw reply related

* [PATCH v5 30/45] vhost: virtio 1.0 endian-ness support
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, thuth, dahi,
	kvm, virtualization, netdev
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.c | 93 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 56 insertions(+), 37 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c90f437..4d379ed 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -33,8 +33,8 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };
 
-#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
-#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
+#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
+#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
 
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
@@ -1001,7 +1001,7 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
 	void __user *used;
-	if (__put_user(vq->used_flags, &vq->used->flags) < 0)
+	if (__put_user(cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags) < 0)
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		/* Make sure the flag is seen before log. */
@@ -1019,7 +1019,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
-	if (__put_user(vq->avail_idx, vhost_avail_event(vq)))
+	if (__put_user(cpu_to_vhost16(vq, vq->avail_idx), vhost_avail_event(vq)))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		void __user *used;
@@ -1038,6 +1038,7 @@ static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 
 int vhost_init_used(struct vhost_virtqueue *vq)
 {
+	__virtio16 last_used_idx;
 	int r;
 	if (!vq->private_data)
 		return 0;
@@ -1046,7 +1047,13 @@ int vhost_init_used(struct vhost_virtqueue *vq)
 	if (r)
 		return r;
 	vq->signalled_used_valid = false;
-	return get_user(vq->last_used_idx, &vq->used->idx);
+	if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx))
+		return -EFAULT;
+	r = __get_user(last_used_idx, &vq->used->idx);
+	if (r)
+		return r;
+	vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_init_used);
 
@@ -1087,16 +1094,16 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 /* Each buffer in the virtqueues is actually a chain of descriptors.  This
  * function returns the next descriptor in the chain,
  * or -1U if we're at the end. */
-static unsigned next_desc(struct vring_desc *desc)
+static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
 {
 	unsigned int next;
 
 	/* If this descriptor says it doesn't chain, we're done. */
-	if (!(desc->flags & VRING_DESC_F_NEXT))
+	if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
 		return -1U;
 
 	/* Check they're not leading us off end of descriptors. */
-	next = desc->next;
+	next = vhost16_to_cpu(vq, desc->next);
 	/* Make sure compiler knows to grab that: we don't want it changing! */
 	/* We will use the result as an index in an array, so most
 	 * architectures only need a compiler barrier here. */
@@ -1113,18 +1120,19 @@ static int get_indirect(struct vhost_virtqueue *vq,
 {
 	struct vring_desc desc;
 	unsigned int i = 0, count, found = 0;
+	u32 len = vhost32_to_cpu(vq, indirect->len);
 	int ret;
 
 	/* Sanity check */
-	if (unlikely(indirect->len % sizeof desc)) {
+	if (unlikely(len % sizeof desc)) {
 		vq_err(vq, "Invalid length in indirect descriptor: "
 		       "len 0x%llx not multiple of 0x%zx\n",
-		       (unsigned long long)indirect->len,
+		       (unsigned long long)vhost32_to_cpu(vq, indirect->len),
 		       sizeof desc);
 		return -EINVAL;
 	}
 
-	ret = translate_desc(vq, indirect->addr, indirect->len, vq->indirect,
+	ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
 			     UIO_MAXIOV);
 	if (unlikely(ret < 0)) {
 		vq_err(vq, "Translation failure %d in indirect.\n", ret);
@@ -1135,7 +1143,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
 	 * architectures only need a compiler barrier here. */
 	read_barrier_depends();
 
-	count = indirect->len / sizeof desc;
+	count = len / sizeof desc;
 	/* Buffers are chained via a 16 bit next field, so
 	 * we can have at most 2^16 of these. */
 	if (unlikely(count > USHRT_MAX + 1)) {
@@ -1155,16 +1163,17 @@ static int get_indirect(struct vhost_virtqueue *vq,
 		if (unlikely(memcpy_fromiovec((unsigned char *)&desc,
 					      vq->indirect, sizeof desc))) {
 			vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
-			       i, (size_t)indirect->addr + i * sizeof desc);
+			       i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
 			return -EINVAL;
 		}
-		if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
+		if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
 			vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
-			       i, (size_t)indirect->addr + i * sizeof desc);
+			       i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
 			return -EINVAL;
 		}
 
-		ret = translate_desc(vq, desc.addr, desc.len, iov + iov_count,
+		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
 				     iov_size - iov_count);
 		if (unlikely(ret < 0)) {
 			vq_err(vq, "Translation failure %d indirect idx %d\n",
@@ -1172,11 +1181,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
 			return ret;
 		}
 		/* If this is an input descriptor, increment that count. */
-		if (desc.flags & VRING_DESC_F_WRITE) {
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
 			*in_num += ret;
 			if (unlikely(log)) {
-				log[*log_num].addr = desc.addr;
-				log[*log_num].len = desc.len;
+				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
+				log[*log_num].len = vhost32_to_cpu(vq, desc.len);
 				++*log_num;
 			}
 		} else {
@@ -1189,7 +1198,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
 			}
 			*out_num += ret;
 		}
-	} while ((i = next_desc(&desc)) != -1);
+	} while ((i = next_desc(vq, &desc)) != -1);
 	return 0;
 }
 
@@ -1209,15 +1218,18 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	struct vring_desc desc;
 	unsigned int i, head, found = 0;
 	u16 last_avail_idx;
+	__virtio16 avail_idx;
+	__virtio16 ring_head;
 	int ret;
 
 	/* Check it isn't doing very strange things with descriptor numbers. */
 	last_avail_idx = vq->last_avail_idx;
-	if (unlikely(__get_user(vq->avail_idx, &vq->avail->idx))) {
+	if (unlikely(__get_user(avail_idx, &vq->avail->idx))) {
 		vq_err(vq, "Failed to access avail idx at %p\n",
 		       &vq->avail->idx);
 		return -EFAULT;
 	}
+	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
 	if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
 		vq_err(vq, "Guest moved used index from %u to %u",
@@ -1234,7 +1246,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
 	/* Grab the next descriptor number they're advertising, and increment
 	 * the index we've seen. */
-	if (unlikely(__get_user(head,
+	if (unlikely(__get_user(ring_head,
 				&vq->avail->ring[last_avail_idx % vq->num]))) {
 		vq_err(vq, "Failed to read head: idx %d address %p\n",
 		       last_avail_idx,
@@ -1242,6 +1254,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
+	head = vhost16_to_cpu(vq, ring_head);
+
 	/* If their number is silly, that's an error. */
 	if (unlikely(head >= vq->num)) {
 		vq_err(vq, "Guest says index %u > %u is available",
@@ -1274,7 +1288,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			       i, vq->desc + i);
 			return -EFAULT;
 		}
-		if (desc.flags & VRING_DESC_F_INDIRECT) {
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
 			ret = get_indirect(vq, iov, iov_size,
 					   out_num, in_num,
 					   log, log_num, &desc);
@@ -1286,20 +1300,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			continue;
 		}
 
-		ret = translate_desc(vq, desc.addr, desc.len, iov + iov_count,
+		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
 				     iov_size - iov_count);
 		if (unlikely(ret < 0)) {
 			vq_err(vq, "Translation failure %d descriptor idx %d\n",
 			       ret, i);
 			return ret;
 		}
-		if (desc.flags & VRING_DESC_F_WRITE) {
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
 			/* If this is an input descriptor,
 			 * increment that count. */
 			*in_num += ret;
 			if (unlikely(log)) {
-				log[*log_num].addr = desc.addr;
-				log[*log_num].len = desc.len;
+				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
+				log[*log_num].len = vhost32_to_cpu(vq, desc.len);
 				++*log_num;
 			}
 		} else {
@@ -1312,7 +1327,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			}
 			*out_num += ret;
 		}
-	} while ((i = next_desc(&desc)) != -1);
+	} while ((i = next_desc(vq, &desc)) != -1);
 
 	/* On success, increment avail index. */
 	vq->last_avail_idx++;
@@ -1335,7 +1350,10 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
  * want to notify the guest, using eventfd. */
 int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
 {
-	struct vring_used_elem heads = { head, len };
+	struct vring_used_elem heads = {
+		cpu_to_vhost32(vq, head),
+		cpu_to_vhost32(vq, len)
+	};
 
 	return vhost_add_used_n(vq, &heads, 1);
 }
@@ -1404,7 +1422,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
 
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
-	if (put_user(vq->last_used_idx, &vq->used->idx)) {
+	if (__put_user(cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
@@ -1422,7 +1440,8 @@ EXPORT_SYMBOL_GPL(vhost_add_used_n);
 
 static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-	__u16 old, new, event;
+	__u16 old, new;
+	__virtio16 event;
 	bool v;
 	/* Flush out used index updates. This is paired
 	 * with the barrier that the Guest executes when enabling
@@ -1434,12 +1453,12 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 		return true;
 
 	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
-		__u16 flags;
+		__virtio16 flags;
 		if (__get_user(flags, &vq->avail->flags)) {
 			vq_err(vq, "Failed to get flags");
 			return true;
 		}
-		return !(flags & VRING_AVAIL_F_NO_INTERRUPT);
+		return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
 	}
 	old = vq->signalled_used;
 	v = vq->signalled_used_valid;
@@ -1449,11 +1468,11 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	if (unlikely(!v))
 		return true;
 
-	if (get_user(event, vhost_used_event(vq))) {
+	if (__get_user(event, vhost_used_event(vq))) {
 		vq_err(vq, "Failed to get used event idx");
 		return true;
 	}
-	return vring_need_event(event, new, old);
+	return vring_need_event(vhost16_to_cpu(vq, event), new, old);
 }
 
 /* This actually signals the guest, using eventfd. */
@@ -1488,7 +1507,7 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-	u16 avail_idx;
+	__virtio16 avail_idx;
 	int r;
 
 	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
@@ -1519,7 +1538,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 		return false;
 	}
 
-	return avail_idx != vq->avail_idx;
+	return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
 }
 EXPORT_SYMBOL_GPL(vhost_enable_notify);
 
-- 
MST

^ permalink raw reply related

* [PATCH v5 31/45] vhost/net: virtio 1.0 byte swap
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, thuth, dahi,
	kvm, virtualization, netdev
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/net.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index dce5c58..cae22f9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -416,7 +416,7 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].id = head;
+			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
 			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
@@ -500,6 +500,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 	int headcount = 0;
 	unsigned d;
 	int r, nlogs = 0;
+	u32 len;
 
 	while (datalen > 0 && headcount < quota) {
 		if (unlikely(seg >= UIO_MAXIOV)) {
@@ -527,13 +528,14 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 			nlogs += *log_num;
 			log += *log_num;
 		}
-		heads[headcount].id = d;
-		heads[headcount].len = iov_length(vq->iov + seg, in);
-		datalen -= heads[headcount].len;
+		heads[headcount].id = cpu_to_vhost32(vq, d);
+		len = iov_length(vq->iov + seg, in);
+		heads[headcount].len = cpu_to_vhost32(vq, len);
+		datalen -= len;
 		++headcount;
 		seg += in;
 	}
-	heads[headcount - 1].len += datalen;
+	heads[headcount - 1].len = cpu_to_vhost32(vq, len - datalen);
 	*iovcount = seg;
 	if (unlikely(log))
 		*log_num = nlogs;
-- 
MST

^ permalink raw reply related

* [PATCH v5 32/45] vhost/net: larger header for virtio 1.0
From: Michael S. Tsirkin @ 2014-11-27 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, thuth, dahi,
	kvm, virtualization, netdev
In-Reply-To: <1417091078-24611-1-git-send-email-mst@redhat.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index cae22f9..1ac58d0 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1027,7 +1027,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
 	size_t vhost_hlen, sock_hlen, hdr_len;
 	int i;
 
-	hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
+	hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+			       (1ULL << VIRTIO_F_VERSION_1))) ?
 			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
 			sizeof(struct virtio_net_hdr);
 	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
-- 
MST

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox