Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next PATCH v1 09/11] net: rocker: add cookie to group acls and use flow_id to set cookie
From: John Fastabend @ 2014-12-31 19:49 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

Rocker uses a cookie value to identify flows however the flow API
already has a unique id for each flow. To help the translation
add support to set the cookie value through the internal rocker
flow API and then use the unique id in the cases where it is
available.

This patch extends the internal code paths to support the new
cookie value.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/rocker/rocker.c |   64 ++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 997beb9..4d2d292 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -120,6 +120,7 @@ struct rocker_flow_tbl_entry {
 
 struct rocker_group_tbl_entry {
 	struct hlist_node entry;
+	u64 cookie;
 	u32 cmd;
 	u32 group_id; /* key */
 	u16 group_count;
@@ -2216,7 +2217,8 @@ static int rocker_flow_tbl_add(struct rocker_port *rocker_port,
 		kfree(match);
 	} else {
 		found = match;
-		found->cookie = rocker->flow_tbl_next_cookie++;
+		if (!found->cookie)
+			found->cookie = rocker->flow_tbl_next_cookie++;
 		hash_add(rocker->flow_tbl, &found->entry, found->key_crc32);
 		add_to_hw = true;
 	}
@@ -2294,7 +2296,7 @@ static int rocker_flow_tbl_do(struct rocker_port *rocker_port,
 		return rocker_flow_tbl_add(rocker_port, entry, nowait);
 }
 
-static int rocker_flow_tbl_ig_port(struct rocker_port *rocker_port,
+static int rocker_flow_tbl_ig_port(struct rocker_port *rocker_port, u64 flow_id,
 				   int flags, u32 in_lport, u32 in_lport_mask,
 				   enum rocker_of_dpa_table_id goto_tbl)
 {
@@ -2310,11 +2312,14 @@ static int rocker_flow_tbl_ig_port(struct rocker_port *rocker_port,
 	entry->key.ig_port.in_lport_mask = in_lport_mask;
 	entry->key.ig_port.goto_tbl = goto_tbl;
 
+	if (flow_id)
+		entry->cookie = flow_id;
+
 	return rocker_flow_tbl_do(rocker_port, flags, entry);
 }
 
 static int rocker_flow_tbl_vlan(struct rocker_port *rocker_port,
-				int flags, u32 in_lport,
+				int flags, u64 flow_id, u32 in_lport,
 				__be16 vlan_id, __be16 vlan_id_mask,
 				enum rocker_of_dpa_table_id goto_tbl,
 				bool untagged, __be16 new_vlan_id)
@@ -2335,10 +2340,14 @@ static int rocker_flow_tbl_vlan(struct rocker_port *rocker_port,
 	entry->key.vlan.untagged = untagged;
 	entry->key.vlan.new_vlan_id = new_vlan_id;
 
+	if (flow_id)
+		entry->cookie = flow_id;
+
 	return rocker_flow_tbl_do(rocker_port, flags, entry);
 }
 
 static int rocker_flow_tbl_term_mac(struct rocker_port *rocker_port,
+				    u64 flow_id,
 				    u32 in_lport, u32 in_lport_mask,
 				    __be16 eth_type, const u8 *eth_dst,
 				    const u8 *eth_dst_mask, __be16 vlan_id,
@@ -2371,11 +2380,14 @@ static int rocker_flow_tbl_term_mac(struct rocker_port *rocker_port,
 	entry->key.term_mac.vlan_id_mask = vlan_id_mask;
 	entry->key.term_mac.copy_to_cpu = copy_to_cpu;
 
+	if (flow_id)
+		entry->cookie = flow_id;
+
 	return rocker_flow_tbl_do(rocker_port, flags, entry);
 }
 
 static int rocker_flow_tbl_bridge(struct rocker_port *rocker_port,
-				  int flags,
+				  int flags, u64 flow_id,
 				  const u8 *eth_dst, const u8 *eth_dst_mask,
 				  __be16 vlan_id, u32 tunnel_id,
 				  enum rocker_of_dpa_table_id goto_tbl,
@@ -2425,11 +2437,14 @@ static int rocker_flow_tbl_bridge(struct rocker_port *rocker_port,
 	entry->key.bridge.group_id = group_id;
 	entry->key.bridge.copy_to_cpu = copy_to_cpu;
 
+	if (flow_id)
+		entry->cookie = flow_id;
+
 	return rocker_flow_tbl_do(rocker_port, flags, entry);
 }
 
 static int rocker_flow_tbl_acl(struct rocker_port *rocker_port,
-			       int flags, u32 in_lport,
+			       int flags, u64 flow_id, u32 in_lport,
 			       u32 in_lport_mask,
 			       const u8 *eth_src, const u8 *eth_src_mask,
 			       const u8 *eth_dst, const u8 *eth_dst_mask,
@@ -2477,6 +2492,9 @@ static int rocker_flow_tbl_acl(struct rocker_port *rocker_port,
 	entry->key.acl.ip_tos_mask = ip_tos_mask;
 	entry->key.acl.group_id = group_id;
 
+	if (flow_id)
+		entry->cookie = flow_id;
+
 	return rocker_flow_tbl_do(rocker_port, flags, entry);
 }
 
@@ -2587,7 +2605,7 @@ static int rocker_group_tbl_do(struct rocker_port *rocker_port,
 }
 
 static int rocker_group_l2_interface(struct rocker_port *rocker_port,
-				     int flags, __be16 vlan_id,
+				     int flags, int flow_id, __be16 vlan_id,
 				     u32 out_lport, int pop_vlan)
 {
 	struct rocker_group_tbl_entry *entry;
@@ -2598,6 +2616,7 @@ static int rocker_group_l2_interface(struct rocker_port *rocker_port,
 
 	entry->group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, out_lport);
 	entry->l2_interface.pop_vlan = pop_vlan;
+	entry->cookie = flow_id;
 
 	return rocker_group_tbl_do(rocker_port, flags, entry);
 }
@@ -2696,7 +2715,7 @@ static int rocker_port_vlan_l2_groups(struct rocker_port *rocker_port,
 	if (rocker_port->stp_state == BR_STATE_LEARNING ||
 	    rocker_port->stp_state == BR_STATE_FORWARDING) {
 		out_lport = rocker_port->lport;
-		err = rocker_group_l2_interface(rocker_port, flags,
+		err = rocker_group_l2_interface(rocker_port, flags, 0,
 						vlan_id, out_lport,
 						pop_vlan);
 		if (err) {
@@ -2722,7 +2741,7 @@ static int rocker_port_vlan_l2_groups(struct rocker_port *rocker_port,
 		return 0;
 
 	out_lport = 0;
-	err = rocker_group_l2_interface(rocker_port, flags,
+	err = rocker_group_l2_interface(rocker_port, flags, 0,
 					vlan_id, out_lport,
 					pop_vlan);
 	if (err) {
@@ -2796,7 +2815,7 @@ static int rocker_port_ctrl_vlan_acl(struct rocker_port *rocker_port,
 	u32 group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, out_lport);
 	int err;
 
-	err = rocker_flow_tbl_acl(rocker_port, flags,
+	err = rocker_flow_tbl_acl(rocker_port, flags, 0,
 				  in_lport, in_lport_mask,
 				  eth_src, eth_src_mask,
 				  ctrl->eth_dst, ctrl->eth_dst_mask,
@@ -2825,7 +2844,7 @@ static int rocker_port_ctrl_vlan_bridge(struct rocker_port *rocker_port,
 	if (!rocker_port_is_bridged(rocker_port))
 		return 0;
 
-	err = rocker_flow_tbl_bridge(rocker_port, flags,
+	err = rocker_flow_tbl_bridge(rocker_port, flags, 0,
 				     ctrl->eth_dst, ctrl->eth_dst_mask,
 				     vlan_id, tunnel_id,
 				     goto_tbl, group_id, ctrl->copy_to_cpu);
@@ -2847,7 +2866,7 @@ static int rocker_port_ctrl_vlan_term(struct rocker_port *rocker_port,
 	if (ntohs(vlan_id) == 0)
 		vlan_id = rocker_port->internal_vlan_id;
 
-	err = rocker_flow_tbl_term_mac(rocker_port,
+	err = rocker_flow_tbl_term_mac(rocker_port, 0,
 				       rocker_port->lport, in_lport_mask,
 				       ctrl->eth_type, ctrl->eth_dst,
 				       ctrl->eth_dst_mask, vlan_id,
@@ -2961,7 +2980,7 @@ static int rocker_port_vlan(struct rocker_port *rocker_port, int flags,
 		return err;
 	}
 
-	err = rocker_flow_tbl_vlan(rocker_port, flags,
+	err = rocker_flow_tbl_vlan(rocker_port, flags, 0,
 				   in_lport, vlan_id, vlan_id_mask,
 				   goto_tbl, untagged, internal_vlan_id);
 	if (err)
@@ -2986,7 +3005,7 @@ static int rocker_port_ig_tbl(struct rocker_port *rocker_port, int flags)
 	in_lport_mask = 0xffff0000;
 	goto_tbl = ROCKER_OF_DPA_TABLE_ID_VLAN;
 
-	err = rocker_flow_tbl_ig_port(rocker_port, flags,
+	err = rocker_flow_tbl_ig_port(rocker_port, flags, 0,
 				      in_lport, in_lport_mask,
 				      goto_tbl);
 	if (err)
@@ -3036,7 +3055,7 @@ static int rocker_port_fdb_learn(struct rocker_port *rocker_port,
 		group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, out_lport);
 
 	if (!(flags & ROCKER_OP_FLAG_REFRESH)) {
-		err = rocker_flow_tbl_bridge(rocker_port, flags, addr, NULL,
+		err = rocker_flow_tbl_bridge(rocker_port, flags, 0, addr, NULL,
 					     vlan_id, tunnel_id, goto_tbl,
 					     group_id, copy_to_cpu);
 		if (err)
@@ -3171,7 +3190,7 @@ static int rocker_port_router_mac(struct rocker_port *rocker_port,
 		vlan_id = rocker_port->internal_vlan_id;
 
 	eth_type = htons(ETH_P_IP);
-	err = rocker_flow_tbl_term_mac(rocker_port,
+	err = rocker_flow_tbl_term_mac(rocker_port, 0,
 				       rocker_port->lport, in_lport_mask,
 				       eth_type, rocker_port->dev->dev_addr,
 				       dst_mac_mask, vlan_id, vlan_id_mask,
@@ -3180,7 +3199,7 @@ static int rocker_port_router_mac(struct rocker_port *rocker_port,
 		return err;
 
 	eth_type = htons(ETH_P_IPV6);
-	err = rocker_flow_tbl_term_mac(rocker_port,
+	err = rocker_flow_tbl_term_mac(rocker_port, 0,
 				       rocker_port->lport, in_lport_mask,
 				       eth_type, rocker_port->dev->dev_addr,
 				       dst_mac_mask, vlan_id, vlan_id_mask,
@@ -3215,7 +3234,7 @@ static int rocker_port_fwding(struct rocker_port *rocker_port)
 			continue;
 		vlan_id = htons(vid);
 		pop_vlan = rocker_vlan_id_is_internal(vlan_id);
-		err = rocker_group_l2_interface(rocker_port, flags,
+		err = rocker_group_l2_interface(rocker_port, flags, 0,
 						vlan_id, out_lport,
 						pop_vlan);
 		if (err) {
@@ -3919,7 +3938,7 @@ static int rocker_flow_set_ig_port(struct net_device *dev,
 	in_lport_mask = flow->matches[0].mask_u32;
 	goto_tbl = rocker_goto_value(flow->actions[0].args[0].value_u16);
 
-	err = rocker_flow_tbl_ig_port(rocker_port, flags,
+	err = rocker_flow_tbl_ig_port(rocker_port, flags, 0,
 				      in_lport, in_lport_mask,
 				      goto_tbl);
 	return err;
@@ -3981,7 +4000,7 @@ static int rocker_flow_set_vlan(struct net_device *dev,
 	if (!have_in_lport)
 		return -EINVAL;
 
-	err = rocker_flow_tbl_vlan(rocker_port, flags, in_lport,
+	err = rocker_flow_tbl_vlan(rocker_port, flags, 0, in_lport,
 				   vlan_id, vlan_id_mask, goto_tbl,
 				   untagged, new_vlan_id);
 	return err;
@@ -4063,7 +4082,8 @@ static int rocker_flow_set_term_mac(struct net_device *dev,
 		}
 	}
 
-	err = rocker_flow_tbl_term_mac(rocker_port, in_lport, in_lport_mask,
+	err = rocker_flow_tbl_term_mac(rocker_port, 0,
+				       in_lport, in_lport_mask,
 				       ethtype, eth_dst, eth_dst_mask,
 				       vlan_id, vlan_id_mask,
 				       copy_to_cpu, flags);
@@ -4162,7 +4182,7 @@ static int rocker_flow_set_bridge(struct net_device *dev,
 	}
 
 	/* Ignoring eth_dst_mask it seems to cause a EINVAL return code */
-	err = rocker_flow_tbl_bridge(rocker_port, flags,
+	err = rocker_flow_tbl_bridge(rocker_port, flags, 0,
 				     eth_dst, eth_dst_mask,
 				     vlan_id, tunnel_id,
 				     goto_tbl, group_id, copy_to_cpu);
@@ -4269,7 +4289,7 @@ static int rocker_flow_set_acl(struct net_device *dev,
 		}
 	}
 
-	err = rocker_flow_tbl_acl(rocker_port, flags,
+	err = rocker_flow_tbl_acl(rocker_port, flags, 0,
 				  in_lport, in_lport_mask,
 				  eth_src, eth_src_mask,
 				  eth_dst, eth_dst_mask, ethtype,

^ permalink raw reply related

* [net-next PATCH v1 08/11] net: rocker: add get flow API operation
From: John Fastabend @ 2014-12-31 19:48 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

Add operations to get flows. I wouldn't mind cleaning this code
up a bit but my first attempt to do this used macros which shortered
the code up but when I was done I decided it just made the code
unreadable and unmaintainable.

I might think about it a bit more but this implementation albeit
a bit long and repeatative is easier to understand IMO.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/rocker/rocker.c |  819 ++++++++++++++++++++++++++++++++++
 1 file changed, 819 insertions(+)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 8ce9933..997beb9 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -3884,6 +3884,12 @@ static u32 rocker_goto_value(u32 id)
 		return ROCKER_OF_DPA_TABLE_ID_BRIDGING;
 	case ROCKER_FLOW_TABLE_ID_ACL_POLICY:
 		return ROCKER_OF_DPA_TABLE_ID_ACL_POLICY;
+	case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L3_UNICAST:
+		return ROCKER_OF_DPA_GROUP_TYPE_L3_UCAST;
+	case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2_REWRITE:
+		return ROCKER_OF_DPA_GROUP_TYPE_L2_REWRITE;
+	case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2:
+		return ROCKER_OF_DPA_GROUP_TYPE_L2_INTERFACE;
 	default:
 		return 0;
 	}
@@ -4492,6 +4498,818 @@ static int rocker_del_flows(struct net_device *dev,
 {
 	return -EOPNOTSUPP;
 }
+
+static int rocker_ig_port_to_flow(struct rocker_flow_tbl_key *key,
+				  struct net_flow_flow *flow)
+{
+	flow->matches = kcalloc(2, sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	flow->matches[0].instance = HEADER_INSTANCE_IN_LPORT;
+	flow->matches[0].header = HEADER_METADATA;
+	flow->matches[0].field = HEADER_METADATA_IN_LPORT;
+	flow->matches[0].mask_type = NET_FLOW_MASK_TYPE_LPM;
+	flow->matches[0].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U32;
+	flow->matches[0].value_u32 = key->ig_port.in_lport;
+	flow->matches[0].mask_u32 = key->ig_port.in_lport_mask;
+	memset(&flow->matches[1], 0, sizeof(flow->matches[1]));
+	return 0;
+}
+
+static int rocker_vlan_to_flow(struct rocker_flow_tbl_key *key,
+			       struct net_flow_flow *flow)
+{
+	int cnt = 0;
+
+	if (key->vlan.in_lport)
+		cnt++;
+	if (key->vlan.vlan_id)
+		cnt++;
+
+	flow->matches = kcalloc((cnt + 1),
+				sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	cnt = 0;
+	if (key->vlan.in_lport) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_IN_LPORT;
+		flow->matches[cnt].header = HEADER_METADATA;
+		flow->matches[cnt].field = HEADER_METADATA_IN_LPORT;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U32;
+		flow->matches[cnt].value_u32 = key->vlan.in_lport;
+		cnt++;
+	}
+
+	if (key->vlan.vlan_id) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_VLAN_OUTER;
+		flow->matches[cnt].header = HEADER_VLAN;
+		flow->matches[cnt].field = HEADER_VLAN_VID;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16;
+		flow->matches[cnt].value_u16 = ntohs(key->vlan.vlan_id);
+		flow->matches[cnt].mask_u16 = ntohs(key->vlan.vlan_id_mask);
+		cnt++;
+	}
+	memset(&flow->matches[cnt], 0, sizeof(flow->matches[cnt]));
+
+	flow->actions = kcalloc(2,
+				sizeof(struct net_flow_action),
+				GFP_KERNEL);
+	if (!flow->actions) {
+		kfree(flow->matches);
+		return -ENOMEM;
+	}
+
+	flow->actions[0].args = kcalloc(2, sizeof(struct net_flow_action_arg),
+					GFP_KERNEL);
+	if (!flow->actions[0].args) {
+		kfree(flow->matches);
+		kfree(flow->actions);
+		return -ENOMEM;
+	}
+
+	flow->actions[0].uid = ACTION_SET_VLAN_ID;
+	flow->actions[0].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U16;
+	flow->actions[0].args[0].value_u16 = ntohs(key->vlan.new_vlan_id);
+
+	memset(&flow->actions[1], 0, sizeof(flow->actions[1]));
+	memset(&flow->actions[0].args[1], 0,
+	       sizeof(struct net_flow_action_arg));
+
+	return 0;
+}
+
+static int rocker_term_to_flow(struct rocker_flow_tbl_key *key,
+			       struct net_flow_flow *flow)
+{
+	int cnt = 0;
+
+	if (key->term_mac.in_lport)
+		cnt++;
+	if (key->term_mac.eth_type)
+		cnt++;
+	if (key->term_mac.eth_dst)
+		cnt++;
+	if (key->term_mac.vlan_id)
+		cnt++;
+
+	flow->matches = kcalloc((cnt + 1), sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	cnt = 0;
+	if (key->term_mac.in_lport) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_IN_LPORT;
+		flow->matches[cnt].header = HEADER_METADATA;
+		flow->matches[cnt].field = HEADER_METADATA_IN_LPORT;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U32;
+		flow->matches[cnt].value_u32 = key->term_mac.in_lport;
+		flow->matches[cnt].mask_u32 = key->term_mac.in_lport;
+		cnt++;
+	}
+
+	if (key->term_mac.eth_type) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_ETHERNET;
+		flow->matches[cnt].header = HEADER_ETHERNET;
+		flow->matches[cnt].field = HEADER_ETHERNET_ETHERTYPE;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16;
+		flow->matches[cnt].value_u16 = ntohs(key->term_mac.eth_type);
+		cnt++;
+	}
+
+	if (key->term_mac.eth_dst) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_ETHERNET;
+		flow->matches[cnt].header = HEADER_ETHERNET;
+		flow->matches[cnt].field = HEADER_ETHERNET_DST_MAC;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U64;
+		memcpy(&flow->matches[cnt].value_u64,
+		       key->term_mac.eth_dst, ETH_ALEN);
+		memcpy(&flow->matches[cnt].mask_u64,
+		       key->term_mac.eth_dst_mask, ETH_ALEN);
+		cnt++;
+	}
+
+	if (key->term_mac.vlan_id) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_VLAN_OUTER;
+		flow->matches[cnt].header = HEADER_VLAN;
+		flow->matches[cnt].field = HEADER_VLAN_VID;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16;
+		flow->matches[cnt].value_u16 = ntohs(key->term_mac.vlan_id);
+		flow->matches[cnt].mask_u16 = ntohs(key->term_mac.vlan_id_mask);
+		cnt++;
+	}
+
+	memset(&flow->matches[cnt], 0, sizeof(flow->matches[cnt]));
+
+	flow->actions = kmalloc(2 * sizeof(struct net_flow_action), GFP_KERNEL);
+	if (!flow->actions) {
+		kfree(flow->matches);
+		return -ENOMEM;
+	}
+
+	flow->actions[0].args = NULL;
+	flow->actions[0].uid = ACTION_COPY_TO_CPU;
+	memset(&flow->actions[1], 0, sizeof(flow->actions[1]));
+
+	return 0;
+}
+
+static int rocker_ucast_to_flow(struct rocker_flow_tbl_key *key,
+				struct net_flow_flow *flow)
+{
+	int cnt = 0;
+
+	if (key->ucast_routing.eth_type)
+		cnt++;
+	if (key->ucast_routing.dst4)
+		cnt++;
+
+	flow->matches = kcalloc((cnt + 1), sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	cnt = 0;
+
+	if (key->ucast_routing.eth_type) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_ETHERNET;
+		flow->matches[cnt].header = HEADER_ETHERNET;
+		flow->matches[cnt].field = HEADER_ETHERNET_ETHERTYPE;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16;
+		flow->matches[cnt].value_u16 =
+				ntohs(key->ucast_routing.eth_type);
+		cnt++;
+	}
+
+	if (key->ucast_routing.dst4) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_IPV4;
+		flow->matches[cnt].header = HEADER_IPV4;
+		flow->matches[cnt].field = HEADER_IPV4_DST_IP;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U32;
+		flow->matches[cnt].value_u32 = key->ucast_routing.dst4;
+		flow->matches[cnt].mask_u32 = key->ucast_routing.dst4_mask;
+		cnt++;
+	}
+
+	memset(&flow->matches[cnt], 0, sizeof(flow->matches[cnt]));
+
+	flow->actions = kmalloc(2 * sizeof(struct net_flow_action), GFP_KERNEL);
+	if (!flow->actions) {
+		kfree(flow->matches);
+		return -ENOMEM;
+	}
+
+	flow->actions[0].args = kcalloc(2, sizeof(struct net_flow_action_arg),
+					GFP_KERNEL);
+	if (!flow->actions[0].args) {
+		kfree(flow->matches);
+		kfree(flow->actions);
+		return -ENOMEM;
+	}
+
+	flow->actions[0].uid = ACTION_SET_L3_UNICAST_GROUP_ID;
+	flow->actions[0].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U32;
+	flow->actions[0].args[0].value_u32 = key->ucast_routing.group_id;
+
+	memset(&flow->actions[1], 0, sizeof(flow->actions[1]));
+	memset(&flow->actions[0].args[1], 0,
+	       sizeof(struct net_flow_action_arg));
+
+	return 0;
+}
+
+static int rocker_bridge_to_flow(struct rocker_flow_tbl_key *key,
+				 struct net_flow_flow *flow)
+{
+	int cnt = 0;
+
+	if (key->bridge.eth_dst)
+		cnt++;
+	if (key->bridge.vlan_id)
+		cnt++;
+
+	flow->matches = kcalloc((cnt + 1), sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	cnt = 0;
+
+	if (key->bridge.eth_dst) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_ETHERNET;
+		flow->matches[cnt].header = HEADER_ETHERNET;
+		flow->matches[cnt].field = HEADER_ETHERNET_DST_MAC;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U64;
+		memcpy(&flow->matches[cnt].value_u64,
+		       key->bridge.eth_dst, ETH_ALEN);
+		memcpy(&flow->matches[cnt].mask_u64,
+		       key->bridge.eth_dst_mask, ETH_ALEN);
+		cnt++;
+	}
+
+	if (key->bridge.vlan_id) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_VLAN_OUTER;
+		flow->matches[cnt].header = HEADER_VLAN;
+		flow->matches[cnt].field = HEADER_VLAN_VID;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16;
+		flow->matches[cnt].value_u16 = ntohs(key->bridge.vlan_id);
+		cnt++;
+	}
+
+	memset(&flow->matches[cnt], 0, sizeof(flow->matches[cnt]));
+
+	cnt = 0;
+	if (key->bridge.group_id)
+		cnt++;
+	if (key->bridge.copy_to_cpu)
+		cnt++;
+
+	flow->actions = kcalloc((cnt + 1), sizeof(struct net_flow_action),
+				GFP_KERNEL);
+	if (!flow->actions) {
+		kfree(flow->matches);
+		return -ENOMEM;
+	}
+
+	cnt = 0;
+	if (key->bridge.group_id) {
+		flow->actions[cnt].args =
+				kcalloc(2,
+					sizeof(struct net_flow_action_arg),
+					GFP_KERNEL);
+		if (!flow->actions[cnt].args) {
+			kfree(flow->matches);
+			kfree(flow->actions);
+			return -ENOMEM;
+		}
+
+		flow->actions[cnt].uid = ACTION_SET_L3_UNICAST_GROUP_ID;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U32;
+		flow->actions[cnt].args[0].value_u32 = key->bridge.group_id;
+		cnt++;
+	}
+
+	if (key->bridge.copy_to_cpu) {
+		flow->actions[cnt].uid = ACTION_COPY_TO_CPU;
+		flow->actions[cnt].args = NULL;
+		cnt++;
+	}
+
+	memset(&flow->actions[cnt], 0, sizeof(flow->actions[1]));
+	return 0;
+}
+
+static int rocker_acl_to_flow(struct rocker_flow_tbl_key *key,
+			      struct net_flow_flow *flow)
+{
+	int cnt = 0;
+
+	if (key->acl.in_lport)
+		cnt++;
+	if (key->acl.eth_src)
+		cnt++;
+	if (key->acl.eth_dst)
+		cnt++;
+	if (key->acl.eth_type)
+		cnt++;
+	if (key->acl.vlan_id)
+		cnt++;
+	if (key->acl.ip_proto)
+		cnt++;
+	if (key->acl.ip_tos)
+		cnt++;
+
+	flow->matches = kcalloc((cnt + 1), sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	cnt = 0;
+
+	if (key->acl.in_lport) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_IN_LPORT;
+		flow->matches[cnt].header = HEADER_METADATA;
+		flow->matches[cnt].field = HEADER_METADATA_IN_LPORT;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U32;
+		flow->matches[cnt].value_u32 = key->acl.in_lport;
+		flow->matches[cnt].mask_u32 = key->acl.in_lport_mask;
+		cnt++;
+	}
+
+	if (key->acl.eth_src) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_ETHERNET;
+		flow->matches[cnt].header = HEADER_ETHERNET;
+		flow->matches[cnt].field = HEADER_ETHERNET_SRC_MAC;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U64;
+		flow->matches[cnt].value_u64 = *key->acl.eth_src;
+		flow->matches[cnt].mask_u64 = *key->acl.eth_src_mask;
+		cnt++;
+	}
+
+	if (key->acl.eth_dst) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_ETHERNET;
+		flow->matches[cnt].header = HEADER_ETHERNET;
+		flow->matches[cnt].field = HEADER_ETHERNET_DST_MAC;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U64;
+		memcpy(&flow->matches[cnt].value_u64,
+		       key->acl.eth_dst, ETH_ALEN);
+		memcpy(&flow->matches[cnt].mask_u64,
+		       key->acl.eth_dst_mask, ETH_ALEN);
+		cnt++;
+	}
+
+	if (key->acl.eth_type) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_ETHERNET;
+		flow->matches[cnt].header = HEADER_ETHERNET;
+		flow->matches[cnt].field = HEADER_ETHERNET_ETHERTYPE;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16;
+		flow->matches[cnt].value_u16 = ntohs(key->acl.eth_type);
+		cnt++;
+	}
+
+	if (key->acl.vlan_id) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_VLAN_OUTER;
+		flow->matches[cnt].header = HEADER_VLAN;
+		flow->matches[cnt].field = HEADER_VLAN_VID;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16;
+		flow->matches[cnt].value_u16 = ntohs(key->acl.vlan_id);
+		cnt++;
+	}
+
+	if (key->acl.ip_proto) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_IPV4;
+		flow->matches[cnt].header = HEADER_IPV4;
+		flow->matches[cnt].field = HEADER_IPV4_PROTOCOL;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U8;
+		flow->matches[cnt].value_u8 = key->acl.ip_proto;
+		flow->matches[cnt].mask_u8 = key->acl.ip_proto_mask;
+		cnt++;
+	}
+
+	if (key->acl.ip_tos) {
+		flow->matches[cnt].instance = HEADER_INSTANCE_IPV4;
+		flow->matches[cnt].header = HEADER_IPV4;
+		flow->matches[cnt].field = HEADER_IPV4_DSCP;
+		flow->matches[cnt].mask_type = NET_FLOW_MASK_TYPE_LPM;
+		flow->matches[cnt].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U8;
+		flow->matches[cnt].value_u8 = key->acl.ip_tos;
+		flow->matches[cnt].mask_u8 = key->acl.ip_tos_mask;
+		cnt++;
+	}
+
+	memset(&flow->matches[cnt], 0, sizeof(flow->matches[cnt]));
+
+	flow->actions = kcalloc(2,
+				sizeof(struct net_flow_action),
+				GFP_KERNEL);
+	if (!flow->actions) {
+		kfree(flow->matches);
+		return -ENOMEM;
+	}
+
+	flow->actions[0].args = kcalloc(2,
+					sizeof(struct net_flow_action_arg),
+					GFP_KERNEL);
+	if (!flow->actions[0].args) {
+		kfree(flow->matches);
+		kfree(flow->actions);
+		return -ENOMEM;
+	}
+
+	flow->actions[0].uid = ACTION_SET_L3_UNICAST_GROUP_ID;
+	flow->actions[0].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U32;
+	flow->actions[0].args[0].value_u32 = key->acl.group_id;
+
+	memset(&flow->actions[0].args[1], 0,
+	       sizeof(struct net_flow_action_arg));
+	memset(&flow->actions[1], 0, sizeof(flow->actions[1]));
+	return 0;
+}
+
+static int rocker_l3_unicast_to_flow(struct rocker_group_tbl_entry *entry,
+				     struct net_flow_flow *flow)
+{
+	int cnt = 0;
+
+	flow->matches = kcalloc(2, sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	flow->matches[0].instance = HEADER_INSTANCE_L3_UNICAST_GROUP_ID;
+	flow->matches[0].header = HEADER_METADATA;
+	flow->matches[0].field = HEADER_METADATA_L3_UNICAST_GROUP_ID;
+	flow->matches[0].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+	flow->matches[0].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U32;
+	flow->matches[0].value_u32 = ~ROCKER_GROUP_TYPE_MASK & entry->group_id;
+
+	memset(&flow->matches[1], 0, sizeof(flow->matches[cnt]));
+
+	if (entry->l3_unicast.eth_src)
+		cnt++;
+	if (entry->l3_unicast.eth_dst)
+		cnt++;
+	if (entry->l3_unicast.vlan_id)
+		cnt++;
+	if (entry->l3_unicast.ttl_check)
+		cnt++;
+	if (entry->l3_unicast.group_id)
+		cnt++;
+
+	flow->actions = kcalloc(cnt, sizeof(struct net_flow_action),
+				GFP_KERNEL);
+	if (!flow->actions) {
+		kfree(flow->matches);
+		return -ENOMEM;
+	}
+
+	cnt = 0;
+
+	if (entry->l3_unicast.eth_src) {
+		flow->actions[cnt].args =
+				kcalloc(2,
+					sizeof(struct net_flow_action_arg),
+					GFP_KERNEL);
+
+		if (!flow->actions[cnt].args)
+			goto unwind_args;
+
+		flow->actions[cnt].uid = ACTION_SET_ETH_SRC;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U64;
+		ether_addr_copy(flow->actions[cnt].args[0].value_u64,
+				entry->l3_unicast.eth_src);
+		memset(&flow->actions[0].args[1], 0,
+		       sizeof(struct net_flow_action_arg));
+		cnt++;
+	}
+
+	if (entry->l3_unicast.eth_dst) {
+		flow->actions[cnt].args =
+			kcalloc(2,
+				sizeof(struct net_flow_action_arg),
+				GFP_KERNEL);
+
+		if (!flow->actions[cnt].args)
+			goto unwind_args;
+
+		flow->actions[cnt].uid = ACTION_SET_ETH_DST;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U64;
+		ether_addr_copy(&flow->actions[cnt].args[0].value_u64,
+				entry->l3_unicast.eth_dst);
+		memset(&flow->actions[0].args[1], 0,
+		       sizeof(struct net_flow_action_arg));
+		cnt++;
+	}
+
+	if (entry->l3_unicast.vlan_id) {
+		flow->actions[cnt].args =
+				kcalloc(2,
+					sizeof(struct net_flow_action_arg),
+					GFP_KERNEL);
+
+		if (!flow->actions[cnt].args)
+			goto unwind_args;
+
+		flow->actions[cnt].uid = ACTION_SET_VLAN_ID;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U16;
+		flow->actions[cnt].args[0].value_u16 =
+					ntohs(entry->l3_unicast.vlan_id);
+		memset(&flow->actions[0].args[1], 0,
+		       sizeof(struct net_flow_action_arg));
+		cnt++;
+	}
+
+	if (entry->l3_unicast.ttl_check) {
+		flow->actions[cnt].uid = ACTION_CHECK_TTL_DROP;
+		flow->actions[cnt].args = NULL;
+		cnt++;
+	}
+
+	if (entry->l3_unicast.group_id) {
+		flow->actions[cnt].args =
+				kcalloc(2,
+					sizeof(struct net_flow_action_arg),
+					GFP_KERNEL);
+
+		if (!flow->actions[cnt].args)
+			goto unwind_args;
+
+		flow->actions[cnt].uid = ACTION_SET_L2_GROUP_ID;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U32;
+		flow->actions[cnt].args[0].value_u32 =
+						entry->l3_unicast.group_id;
+		memset(&flow->actions[0].args[1], 0,
+		       sizeof(struct net_flow_action_arg));
+		cnt++;
+	}
+
+	memset(&flow->actions[cnt], 0, sizeof(flow->actions[cnt]));
+	return 0;
+unwind_args:
+	kfree(flow->matches);
+	for (cnt--; cnt >= 0; cnt--)
+		kfree(flow->actions[cnt].args);
+	kfree(flow->actions);
+	return -ENOMEM;
+}
+
+static int rocker_l2_rewrite_to_flow(struct rocker_group_tbl_entry *entry,
+				     struct net_flow_flow *flow)
+{
+	int cnt = 0;
+
+	flow->matches = kcalloc(2, sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	flow->matches[0].instance = HEADER_INSTANCE_L2_REWRITE_GROUP_ID;
+	flow->matches[0].header = HEADER_METADATA;
+	flow->matches[0].field = HEADER_METADATA_L2_REWRITE_GROUP_ID;
+	flow->matches[0].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+	flow->matches[0].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U32;
+	flow->matches[0].value_u32 = ~ROCKER_GROUP_TYPE_MASK & entry->group_id;
+
+	memset(&flow->matches[1], 0, sizeof(flow->matches[cnt]));
+
+	if (entry->l2_rewrite.eth_src)
+		cnt++;
+	if (entry->l2_rewrite.eth_dst)
+		cnt++;
+	if (entry->l2_rewrite.vlan_id)
+		cnt++;
+	if (entry->l2_rewrite.group_id)
+		cnt++;
+
+	flow->actions = kcalloc(cnt, sizeof(struct net_flow_action),
+				GFP_KERNEL);
+	if (!flow->actions) {
+		kfree(flow->matches);
+		return -ENOMEM;
+	}
+
+	cnt = 0;
+
+	if (entry->l2_rewrite.eth_src) {
+		flow->actions[cnt].args =
+			kmalloc(2 * sizeof(struct net_flow_action_arg),
+				GFP_KERNEL);
+
+		if (!flow->actions[cnt].args)
+			goto unwind_args;
+
+		flow->actions[cnt].uid = ACTION_SET_ETH_SRC;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U64;
+		ether_addr_copy(flow->actions[cnt].args[0].value_u64,
+				entry->l2_rewrite.eth_src);
+		memset(&flow->actions[0].args[1], 0,
+		       sizeof(struct net_flow_action_arg));
+		cnt++;
+	}
+
+	if (entry->l2_rewrite.eth_dst) {
+		flow->actions[cnt].args =
+			kmalloc(2 * sizeof(struct net_flow_action_arg),
+				GFP_KERNEL);
+
+		if (!flow->actions[cnt].args)
+			goto unwind_args;
+
+		flow->actions[cnt].uid = ACTION_SET_ETH_DST;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U64;
+		ether_addr_copy(&flow->actions[cnt].args[0].value_u64,
+				entry->l2_rewrite.eth_dst);
+		memset(&flow->actions[0].args[1], 0,
+		       sizeof(struct net_flow_action_arg));
+		cnt++;
+	}
+
+	if (entry->l2_rewrite.vlan_id) {
+		flow->actions[cnt].args =
+			kmalloc(2 * sizeof(struct net_flow_action_arg),
+				GFP_KERNEL);
+
+		if (!flow->actions[cnt].args)
+			goto unwind_args;
+
+		flow->actions[cnt].uid = ACTION_SET_VLAN_ID;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U16;
+		flow->actions[cnt].args[0].value_u16 =
+					ntohs(entry->l2_rewrite.vlan_id);
+		memset(&flow->actions[0].args[1], 0,
+		       sizeof(struct net_flow_action_arg));
+		cnt++;
+	}
+
+	if (entry->l2_rewrite.group_id) {
+		flow->actions[cnt].args =
+			kmalloc(2 * sizeof(struct net_flow_action_arg),
+				GFP_KERNEL);
+
+		if (!flow->actions[cnt].args)
+			goto unwind_args;
+
+		flow->actions[cnt].uid = ACTION_SET_L2_GROUP_ID;
+		flow->actions[cnt].args[0].type = NET_FLOW_ACTION_ARG_TYPE_U32;
+		flow->actions[cnt].args[0].value_u32 =
+			entry->l2_rewrite.group_id;
+		memset(&flow->actions[0].args[1], 0,
+		       sizeof(struct net_flow_action_arg));
+		cnt++;
+	}
+
+	memset(&flow->actions[cnt], 0, sizeof(flow->actions[cnt]));
+	return 0;
+unwind_args:
+	kfree(flow->matches);
+	for (cnt--; cnt >= 0; cnt--)
+		kfree(flow->actions[cnt].args);
+	kfree(flow->actions);
+	return -ENOMEM;
+}
+
+static int rocker_l2_interface_to_flow(struct rocker_group_tbl_entry *entry,
+				       struct net_flow_flow *flow)
+{
+	flow->matches = kmalloc(2 * sizeof(struct net_flow_field_ref),
+				GFP_KERNEL);
+	if (!flow->matches)
+		return -ENOMEM;
+
+	flow->matches[0].instance = HEADER_INSTANCE_L2_GROUP_ID;
+	flow->matches[0].header = HEADER_METADATA;
+	flow->matches[0].field = HEADER_METADATA_L2_GROUP_ID;
+	flow->matches[0].mask_type = NET_FLOW_MASK_TYPE_EXACT;
+	flow->matches[0].type = NET_FLOW_FIELD_REF_ATTR_TYPE_U32;
+	flow->matches[0].value_u32 = ~ROCKER_GROUP_TYPE_MASK & entry->group_id;
+
+	memset(&flow->matches[1], 0, sizeof(flow->matches[1]));
+
+	if (!entry->l2_interface.pop_vlan) {
+		flow->actions = NULL;
+		return 0;
+	}
+
+	flow->actions = kmalloc(2 * sizeof(struct net_flow_action), GFP_KERNEL);
+	if (!flow->actions) {
+		kfree(flow->matches);
+		return -ENOMEM;
+	}
+
+	if (entry->l2_interface.pop_vlan) {
+		flow->actions[0].uid = ACTION_POP_VLAN;
+		flow->actions[0].args = NULL;
+	}
+
+	memset(&flow->actions[1], 0, sizeof(flow->actions[1]));
+	return 0;
+}
+
+static int rocker_get_flows(struct sk_buff *skb, struct net_device *dev,
+			    int table, int min, int max)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct net_flow_flow flow;
+	struct rocker_flow_tbl_entry *entry;
+	struct rocker_group_tbl_entry *group;
+	struct hlist_node *tmp;
+	unsigned long flags;
+	int bkt, err;
+
+	spin_lock_irqsave(&rocker_port->rocker->flow_tbl_lock, flags);
+	hash_for_each_safe(rocker_port->rocker->flow_tbl,
+			   bkt, tmp, entry, entry) {
+		struct rocker_flow_tbl_key *key = &entry->key;
+
+		if (rocker_goto_value(table) != key->tbl_id)
+			continue;
+
+		flow.table_id = table;
+		flow.uid = entry->cookie;
+		flow.priority = key->priority;
+
+		switch (table) {
+		case ROCKER_FLOW_TABLE_ID_INGRESS_PORT:
+			err = rocker_ig_port_to_flow(key, &flow);
+			if (err)
+				return err;
+			break;
+		case ROCKER_FLOW_TABLE_ID_VLAN:
+			err = rocker_vlan_to_flow(key, &flow);
+			if (err)
+				return err;
+			break;
+		case ROCKER_FLOW_TABLE_ID_TERMINATION_MAC:
+			err = rocker_term_to_flow(key, &flow);
+			break;
+		case ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING:
+			err = rocker_ucast_to_flow(key, &flow);
+			break;
+		case ROCKER_FLOW_TABLE_ID_BRIDGING:
+			err = rocker_bridge_to_flow(key, &flow);
+			break;
+		case ROCKER_FLOW_TABLE_ID_ACL_POLICY:
+			err = rocker_acl_to_flow(key, &flow);
+			break;
+		default:
+			continue;
+		}
+
+		net_flow_put_flow(skb, &flow);
+	}
+	spin_unlock_irqrestore(&rocker_port->rocker->flow_tbl_lock, flags);
+
+	spin_lock_irqsave(&rocker_port->rocker->group_tbl_lock, flags);
+	hash_for_each_safe(rocker_port->rocker->group_tbl,
+			   bkt, tmp, group, entry) {
+		if (rocker_goto_value(table) !=
+			ROCKER_GROUP_TYPE_GET(group->group_id))
+			continue;
+
+		flow.table_id = table;
+		flow.uid = group->group_id;
+		flow.priority = 1;
+
+		switch (table) {
+		case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L3_UNICAST:
+			err = rocker_l3_unicast_to_flow(group, &flow);
+			break;
+		case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2_REWRITE:
+			err = rocker_l2_rewrite_to_flow(group, &flow);
+			break;
+		case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2:
+			err = rocker_l2_interface_to_flow(group, &flow);
+			break;
+		default:
+			continue;
+		}
+
+		net_flow_put_flow(skb, &flow);
+	}
+	spin_unlock_irqrestore(&rocker_port->rocker->group_tbl_lock, flags);
+
+	return 0;
+}
 #endif
 
 static const struct net_device_ops rocker_port_netdev_ops = {
@@ -4517,6 +5335,7 @@ static const struct net_device_ops rocker_port_netdev_ops = {
 
 	.ndo_flow_set_flows		= rocker_set_flows,
 	.ndo_flow_del_flows		= rocker_del_flows,
+	.ndo_flow_get_flows		= rocker_get_flows,
 #endif
 };
 

^ permalink raw reply related

* [net-next PATCH v1 07/11] net: rocker: add multicast path to bridging
From: John Fastabend @ 2014-12-31 19:48 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

Add path in table graph to send packets to the bridge table.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/rocker/rocker_pipeline.h |   10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/rocker/rocker_pipeline.h b/drivers/net/ethernet/rocker/rocker_pipeline.h
index 7e689c0..0835bcc 100644
--- a/drivers/net/ethernet/rocker/rocker_pipeline.h
+++ b/drivers/net/ethernet/rocker/rocker_pipeline.h
@@ -708,7 +708,15 @@ struct net_flow_tbl_node table_node_vlan = {
 	.uid = ROCKER_FLOW_TABLE_ID_VLAN,
 	.jump = table_node_vlan_next};
 
-struct net_flow_jump_table table_node_term_mac_next[2] = {
+struct net_flow_jump_table table_node_term_mac_next[3] = {
+	{ .field = {.instance = HEADER_INSTANCE_ETHERNET,
+		    .header = HEADER_ETHERNET,
+		    .field = HEADER_ETHERNET_DST_MAC,
+		    .mask_type = NET_FLOW_MASK_TYPE_LPM,
+		    .type = NET_FLOW_FIELD_REF_ATTR_TYPE_U64,
+		    .value_u64 = (__u64)0x1,
+		    .mask_u64 = (__u64)0x1,
+	}, .node = ROCKER_FLOW_TABLE_ID_BRIDGING},
 	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING},
 	{ .field = {0}, .node = 0},
 };

^ permalink raw reply related

* [net-next PATCH v1 06/11] net: rocker: add group_id slices and drop explicit goto
From: John Fastabend @ 2014-12-31 19:48 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

This adds the group tables for l3_unicast, l2_rewrite and l2. In
addition to adding the tables we extend the metadata fields to
support three different group id lookups. One for each table and
drop the more generic one previously being used.

Finally we can also drop the goto action as it is not used anymore.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/rocker/rocker.c          |  192 +++++++++++++++++++-
 drivers/net/ethernet/rocker/rocker_pipeline.h |  235 ++++++++++++++++++-------
 2 files changed, 355 insertions(+), 72 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index c40c58d..8ce9933 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -3964,9 +3964,6 @@ static int rocker_flow_set_vlan(struct net_device *dev,
 		struct net_flow_action_arg *arg = &flow->actions[i].args[0];
 
 		switch (flow->actions[i].uid) {
-		case ACTION_SET_GOTO_TABLE:
-			goto_tbl = rocker_goto_value(arg->value_u16);
-			break;
 		case ACTION_SET_VLAN_ID:
 			new_vlan_id = htons(arg->value_u16);
 			if (new_vlan_id)
@@ -4147,14 +4144,11 @@ static int rocker_flow_set_bridge(struct net_device *dev,
 		struct net_flow_action_arg *arg = &flow->actions[i].args[0];
 
 		switch (flow->actions[i].uid) {
-		case ACTION_SET_GOTO_TABLE:
-			goto_tbl = rocker_goto_value(arg->value_u16);
-			break;
 		case ACTION_COPY_TO_CPU:
 			copy_to_cpu = true;
 			break;
-		case ACTION_SET_GROUP_ID:
-			group_id = arg->value_u32;
+		case ACTION_SET_L3_UNICAST_GROUP_ID:
+			group_id = ROCKER_GROUP_L3_UNICAST(arg->value_u32);
 			break;
 		default:
 			return -EINVAL;
@@ -4258,9 +4252,11 @@ static int rocker_flow_set_acl(struct net_device *dev,
 	group_id = ROCKER_GROUP_NONE;
 
 	for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+		struct net_flow_action_arg *arg = &flow->actions[i].args[0];
+
 		switch (flow->actions[i].uid) {
-		case ACTION_SET_GROUP_ID:
-			group_id = flow->actions[i].args[0].value_u32;
+		case ACTION_SET_L3_UNICAST_GROUP_ID:
+			group_id = ROCKER_GROUP_L3_UNICAST(arg->value_u32);
 			break;
 		default:
 			return -EINVAL;
@@ -4278,6 +4274,173 @@ static int rocker_flow_set_acl(struct net_device *dev,
 	return err;
 }
 
+static int rocker_flow_set_group_slice_l3_unicast(struct net_device *dev,
+						  struct net_flow_flow *flow)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct rocker_group_tbl_entry *entry;
+	int i, flags = 0, err = 0;
+
+	err = is_valid_net_flow(&group_slice_l3_unicast_table, flow);
+	if (err)
+		return err;
+
+	entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags));
+	if (!entry)
+		return -ENOMEM;
+
+	for (i = 0; flow->matches && flow->matches[i].instance; i++) {
+		struct net_flow_field_ref *r = &flow->matches[i];
+
+		switch (r->instance) {
+		case HEADER_INSTANCE_L3_UNICAST_GROUP_ID:
+			entry->group_id = ROCKER_GROUP_L3_UNICAST(r->value_u32);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+		struct net_flow_action_arg *arg = &flow->actions[i].args[0];
+
+		switch (flow->actions[i].uid) {
+		case ACTION_SET_ETH_SRC:
+			ether_addr_copy(entry->l3_unicast.eth_src,
+					(u8 *)&arg->value_u64);
+			break;
+		case ACTION_SET_ETH_DST:
+			ether_addr_copy(entry->l3_unicast.eth_dst,
+					(u8 *)&arg->value_u64);
+			break;
+		case ACTION_SET_VLAN_ID:
+			entry->l3_unicast.vlan_id = htons(arg->value_u16);
+			break;
+		case ACTION_CHECK_TTL_DROP:
+			entry->l3_unicast.ttl_check = true;
+			break;
+		case ACTION_SET_L2_REWRITE_GROUP_ID:
+			entry->l3_unicast.group_id =
+				ROCKER_GROUP_L2_REWRITE(arg->value_u32);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return rocker_group_tbl_do(rocker_port, flags, entry);
+}
+
+static int rocker_flow_set_group_slice_l2_rewrite(struct net_device *dev,
+						  struct net_flow_flow *flow)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct rocker_group_tbl_entry *entry;
+	int i, flags = 0, err = 0;
+
+	err = is_valid_net_flow(&group_slice_l2_rewrite_table, flow);
+	if (err)
+		return err;
+
+	entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags));
+	if (!entry)
+		return -ENOMEM;
+
+	for (i = 0; flow->matches && flow->matches[i].instance; i++) {
+		struct net_flow_field_ref *r = &flow->matches[i];
+
+		switch (r->instance) {
+		case HEADER_INSTANCE_L2_REWRITE_GROUP_ID:
+			entry->group_id = ROCKER_GROUP_L2_REWRITE(r->value_u32);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+		struct net_flow_action_arg *arg = &flow->actions[i].args[0];
+
+		switch (flow->actions[i].uid) {
+		case ACTION_SET_ETH_SRC:
+			ether_addr_copy(entry->l2_rewrite.eth_src,
+					(u8 *)&arg->value_u64);
+			break;
+		case ACTION_SET_ETH_DST:
+			ether_addr_copy(entry->l2_rewrite.eth_dst,
+					(u8 *)&arg->value_u64);
+			break;
+		case ACTION_SET_VLAN_ID:
+			entry->l2_rewrite.vlan_id = htons(arg->value_u16);
+			break;
+		case ACTION_SET_L2_GROUP_ID:
+			entry->l2_rewrite.group_id =
+				ROCKER_GROUP_L2_INTERFACE(arg->value_u32,
+							  rocker_port->lport);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return rocker_group_tbl_do(rocker_port, flags, entry);
+}
+
+static int rocker_flow_set_group_slice_l2(struct net_device *dev,
+					  struct net_flow_flow *flow)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	struct rocker_group_tbl_entry *entry;
+	int i, flags = 0, err = 0;
+	u32 lport;
+
+	err = is_valid_net_flow(&group_slice_l2_table, flow);
+	if (err)
+		return err;
+
+	entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags));
+	if (!entry)
+		return -ENOMEM;
+
+	lport = rocker_port->lport;
+
+	/* Use the dev lport if we don't have a specified lport instance
+	 * from the user. We need to walk the list once before to extract
+	 * any lport attribute.
+	 */
+	for (i = 0; flow->matches && flow->matches[i].instance; i++) {
+		switch (flow->matches[i].instance) {
+		case HEADER_METADATA_IN_LPORT:
+			lport = flow->matches[i].value_u32;
+		}
+	}
+
+	for (i = 0; flow->matches && flow->matches[i].instance; i++) {
+		struct net_flow_field_ref *r = &flow->matches[i];
+
+		switch (r->instance) {
+		case HEADER_INSTANCE_L2_GROUP_ID:
+			entry->group_id =
+				ROCKER_GROUP_L2_INTERFACE(r->value_u32, lport);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+		switch (flow->actions[i].uid) {
+		case ACTION_POP_VLAN:
+			entry->l2_interface.pop_vlan = true;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return rocker_group_tbl_do(rocker_port, flags, entry);
+}
+
 static int rocker_set_flows(struct net_device *dev,
 			    struct net_flow_flow *flow)
 {
@@ -4308,6 +4471,15 @@ static int rocker_set_flows(struct net_device *dev,
 	case ROCKER_FLOW_TABLE_ID_ACL_POLICY:
 		err = rocker_flow_set_acl(dev, flow);
 		break;
+	case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L3_UNICAST:
+		err = rocker_flow_set_group_slice_l3_unicast(dev, flow);
+		break;
+	case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2_REWRITE:
+		err = rocker_flow_set_group_slice_l2_rewrite(dev, flow);
+		break;
+	case ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2:
+		err = rocker_flow_set_group_slice_l2(dev, flow);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/net/ethernet/rocker/rocker_pipeline.h b/drivers/net/ethernet/rocker/rocker_pipeline.h
index 701e139..7e689c0 100644
--- a/drivers/net/ethernet/rocker/rocker_pipeline.h
+++ b/drivers/net/ethernet/rocker/rocker_pipeline.h
@@ -111,16 +111,21 @@ struct net_flow_header ipv4 = {
 
 #define HEADER_METADATA_IN_LPORT 1
 #define HEADER_METADATA_GOTO_TBL 2
-#define HEADER_METADATA_GROUP_ID 3
-struct net_flow_field metadata_fields[3] = {
+#define HEADER_METADATA_L3_UNICAST_GROUP_ID	3
+#define HEADER_METADATA_L2_REWRITE_GROUP_ID	4
+#define HEADER_METADATA_L2_GROUP_ID		5
+struct net_flow_field metadata_fields[5] = {
 	{ .name = "in_lport",
 	  .uid = HEADER_METADATA_IN_LPORT,
 	  .bitwidth = 32,},
-	{ .name = "goto_tbl",
-	  .uid = HEADER_METADATA_GOTO_TBL,
-	  .bitwidth = 16,},
-	{ .name = "group_id",
-	  .uid = HEADER_METADATA_GROUP_ID,
+	{ .name = "l3_unicast_group_id",
+	  .uid = HEADER_METADATA_L3_UNICAST_GROUP_ID,
+	  .bitwidth = 32,},
+	{ .name = "l2_rewrite_group_id",
+	  .uid = HEADER_METADATA_L2_REWRITE_GROUP_ID,
+	  .bitwidth = 32,},
+	{ .name = "l2_group_id",
+	  .uid = HEADER_METADATA_L2_GROUP_ID,
 	  .bitwidth = 32,},
 };
 
@@ -128,7 +133,7 @@ struct net_flow_field metadata_fields[3] = {
 struct net_flow_header metadata_t = {
 	.name = "metadata_t",
 	.uid = HEADER_METADATA,
-	.field_sz = 3,
+	.field_sz = 5,
 	.fields = metadata_fields,
 };
 
@@ -157,25 +162,6 @@ struct net_flow_action null_action = {
 	.name = "", .uid = 0, .args = NULL,
 };
 
-struct net_flow_action_arg set_goto_table_args[2] = {
-	{
-		.name = "table",
-		.type = NET_FLOW_ACTION_ARG_TYPE_U16,
-		.value_u16 = 0,
-	},
-	{
-		.name = "",
-		.type = NET_FLOW_ACTION_ARG_TYPE_NULL,
-	},
-};
-
-#define ACTION_SET_GOTO_TABLE 1
-struct net_flow_action set_goto_table = {
-	.name = "set_goto_table",
-	.uid = ACTION_SET_GOTO_TABLE,
-	.args = set_goto_table_args,
-};
-
 struct net_flow_action_arg set_vlan_id_args[2] = {
 	{
 		.name = "vlan_id",
@@ -188,7 +174,7 @@ struct net_flow_action_arg set_vlan_id_args[2] = {
 	},
 };
 
-#define ACTION_SET_VLAN_ID 2
+#define ACTION_SET_VLAN_ID 1
 struct net_flow_action set_vlan_id = {
 	.name = "set_vlan_id",
 	.uid = ACTION_SET_VLAN_ID,
@@ -196,7 +182,7 @@ struct net_flow_action set_vlan_id = {
 };
 
 /* TBD: what is the untagged bool about in vlan table */
-#define ACTION_COPY_TO_CPU 3
+#define ACTION_COPY_TO_CPU 2
 struct net_flow_action copy_to_cpu = {
 	.name = "copy_to_cpu",
 	.uid = ACTION_COPY_TO_CPU,
@@ -215,14 +201,28 @@ struct net_flow_action_arg set_group_id_args[2] = {
 	},
 };
 
-#define ACTION_SET_GROUP_ID 4
-struct net_flow_action set_group_id = {
-	.name = "set_group_id",
-	.uid = ACTION_SET_GROUP_ID,
+#define ACTION_SET_L3_UNICAST_GROUP_ID 3
+struct net_flow_action set_l3_unicast_group_id = {
+	.name = "set_l3_unicast_group_id",
+	.uid = ACTION_SET_L3_UNICAST_GROUP_ID,
 	.args = set_group_id_args,
 };
 
-#define ACTION_POP_VLAN 5
+#define ACTION_SET_L2_REWRITE_GROUP_ID 4
+struct net_flow_action set_l2_rewrite_group_id = {
+	.name = "set_l2_rewrite_group_id",
+	.uid = ACTION_SET_L2_REWRITE_GROUP_ID,
+	.args = set_group_id_args,
+};
+
+#define ACTION_SET_L2_GROUP_ID 5
+struct net_flow_action set_l2_group_id = {
+	.name = "set_l2_group_id",
+	.uid = ACTION_SET_L2_GROUP_ID,
+	.args = set_group_id_args,
+};
+
+#define ACTION_POP_VLAN 6
 struct net_flow_action pop_vlan = {
 	.name = "pop_vlan",
 	.uid = ACTION_POP_VLAN,
@@ -241,7 +241,7 @@ struct net_flow_action_arg set_eth_src_args[2] = {
 	},
 };
 
-#define ACTION_SET_ETH_SRC 6
+#define ACTION_SET_ETH_SRC 7
 struct net_flow_action set_eth_src = {
 	.name = "set_eth_src",
 	.uid = ACTION_SET_ETH_SRC,
@@ -260,7 +260,7 @@ struct net_flow_action_arg set_eth_dst_args[2] = {
 	},
 };
 
-#define ACTION_SET_ETH_DST 7
+#define ACTION_SET_ETH_DST 8
 struct net_flow_action set_eth_dst = {
 	.name = "set_eth_dst",
 	.uid = ACTION_SET_ETH_DST,
@@ -279,21 +279,30 @@ struct net_flow_action_arg set_out_port_args[2] = {
 	},
 };
 
-#define ACTION_SET_OUT_PORT 8
+#define ACTION_SET_OUT_PORT 9
 struct net_flow_action set_out_port = {
 	.name = "set_out_port",
 	.uid = ACTION_SET_OUT_PORT,
 	.args = set_out_port_args,
 };
 
-struct net_flow_action *rocker_action_list[8] = {
-	&set_goto_table,
+#define ACTION_CHECK_TTL_DROP 10
+struct net_flow_action check_ttl_drop = {
+	.name = "check_ttl_drop",
+	.uid = ACTION_CHECK_TTL_DROP,
+	.args = null_args,
+};
+
+struct net_flow_action *rocker_action_list[10] = {
 	&set_vlan_id,
 	&copy_to_cpu,
-	&set_group_id,
+	&set_l3_unicast_group_id,
+	&set_l2_rewrite_group_id,
+	&set_l2_group_id,
 	&pop_vlan,
 	&set_eth_src,
 	&set_eth_dst,
+	&check_ttl_drop,
 	&null_action,
 };
 
@@ -302,8 +311,9 @@ struct net_flow_action *rocker_action_list[8] = {
 #define HEADER_INSTANCE_VLAN_OUTER 2
 #define HEADER_INSTANCE_IPV4 3
 #define HEADER_INSTANCE_IN_LPORT 4
-#define HEADER_INSTANCE_GOTO_TABLE 5
-#define HEADER_INSTANCE_GROUP_ID 6
+#define HEADER_INSTANCE_L3_UNICAST_GROUP_ID 5
+#define HEADER_INSTANCE_L2_REWRITE_GROUP_ID 6
+#define HEADER_INSTANCE_L2_GROUP_ID 7
 
 struct net_flow_jump_table parse_ethernet[3] = {
 	{
@@ -390,29 +400,37 @@ struct net_flow_hdr_node in_lport_header_node = {
 	.jump = terminal_headers,
 };
 
-struct net_flow_hdr_node goto_table_header_node = {
-	.name = "goto_table",
-	.uid = HEADER_INSTANCE_GOTO_TABLE,
+struct net_flow_hdr_node l2_group_id_header_node = {
+	.name = "l2_group_id",
+	.uid = HEADER_INSTANCE_L2_GROUP_ID,
+	.hdrs = metadata_headers,
+	.jump = terminal_headers,
+};
+
+struct net_flow_hdr_node l2_rewrite_group_id_header_node = {
+	.name = "l2_rewrite_group_id",
+	.uid = HEADER_INSTANCE_L2_REWRITE_GROUP_ID,
 	.hdrs = metadata_headers,
 	.jump = terminal_headers,
 };
 
-struct net_flow_hdr_node group_id_header_node = {
-	.name = "group_id",
-	.uid = HEADER_INSTANCE_GROUP_ID,
+struct net_flow_hdr_node l3_unicast_group_id_header_node = {
+	.name = "l3_uniscast_group_id",
+	.uid = HEADER_INSTANCE_L3_UNICAST_GROUP_ID,
 	.hdrs = metadata_headers,
 	.jump = terminal_headers,
 };
 
 struct net_flow_hdr_node null_header = {.name = "", .uid = 0,};
 
-struct net_flow_hdr_node *rocker_header_nodes[7] = {
+struct net_flow_hdr_node *rocker_header_nodes[] = {
 	&ethernet_header_node,
 	&vlan_header_node,
 	&ipv4_header_node,
 	&in_lport_header_node,
-	&goto_table_header_node,
-	&group_id_header_node,
+	&l3_unicast_group_id_header_node,
+	&l2_rewrite_group_id_header_node,
+	&l2_group_id_header_node,
 	&null_header,
 };
 
@@ -513,14 +531,46 @@ struct net_flow_field_ref matches_acl[8] = {
 	{ .instance = 0, .field = 0},
 };
 
-int actions_ig_port[2] = {ACTION_SET_GOTO_TABLE, 0};
-int actions_vlan[3] = {ACTION_SET_GOTO_TABLE, ACTION_SET_VLAN_ID, 0};
-int actions_term_mac[3] = {ACTION_SET_GOTO_TABLE, ACTION_COPY_TO_CPU, 0};
-int actions_ucast_routing[3] = {ACTION_SET_GOTO_TABLE, ACTION_SET_GROUP_ID, 0};
-int actions_bridge[4] = {ACTION_SET_GOTO_TABLE,
-			 ACTION_SET_GROUP_ID,
-			 ACTION_COPY_TO_CPU, 0};
-int actions_acl[2] = {ACTION_SET_GROUP_ID, 0};
+struct net_flow_field_ref matches_l3_unicast_group_slice[2] = {
+	{ .instance = HEADER_INSTANCE_L3_UNICAST_GROUP_ID,
+	  .header = HEADER_METADATA,
+	  .field = HEADER_METADATA_L3_UNICAST_GROUP_ID,
+	  .mask_type = NET_FLOW_MASK_TYPE_EXACT},
+	{ .instance = 0, .field = 0},
+};
+
+struct net_flow_field_ref matches_l2_rewrite_group_slice[2] = {
+	{ .instance = HEADER_INSTANCE_L2_REWRITE_GROUP_ID,
+	  .header = HEADER_METADATA,
+	  .field = HEADER_METADATA_L2_REWRITE_GROUP_ID,
+	  .mask_type = NET_FLOW_MASK_TYPE_EXACT},
+	{ .instance = 0, .field = 0},
+};
+
+struct net_flow_field_ref matches_l2_group_slice[2] = {
+	{ .instance = HEADER_INSTANCE_L2_GROUP_ID,
+	  .header = HEADER_METADATA,
+	  .field = HEADER_METADATA_L2_GROUP_ID,
+	  .mask_type = NET_FLOW_MASK_TYPE_EXACT},
+	{ .instance = 0, .field = 0},
+};
+
+int actions_ig_port[] = {0};
+int actions_vlan[] = {ACTION_SET_VLAN_ID, 0};
+int actions_term_mac[] = {ACTION_COPY_TO_CPU, 0};
+int actions_ucast_routing[] = {ACTION_SET_L3_UNICAST_GROUP_ID, 0};
+int actions_bridge[] = {ACTION_SET_L2_GROUP_ID, ACTION_COPY_TO_CPU, 0};
+int actions_acl[] = {ACTION_SET_L3_UNICAST_GROUP_ID, 0};
+int actions_group_slice_l3_unicast[] = {ACTION_SET_ETH_SRC,
+					ACTION_SET_ETH_DST,
+					ACTION_SET_VLAN_ID,
+					ACTION_SET_L2_REWRITE_GROUP_ID,
+					ACTION_CHECK_TTL_DROP, 0};
+int actions_group_slice_l2_rewrite[] = {ACTION_SET_ETH_SRC,
+					ACTION_SET_ETH_DST,
+					ACTION_SET_VLAN_ID,
+					ACTION_SET_L2_GROUP_ID, 0};
+int actions_group_slice_l2[] = {ACTION_POP_VLAN, 0};
 
 enum rocker_flow_table_id_space {
 	ROCKER_FLOW_TABLE_ID_INGRESS_PORT = 1,
@@ -530,6 +580,9 @@ enum rocker_flow_table_id_space {
 	ROCKER_FLOW_TABLE_ID_MULTICAST_ROUTING,
 	ROCKER_FLOW_TABLE_ID_BRIDGING,
 	ROCKER_FLOW_TABLE_ID_ACL_POLICY,
+	ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L3_UNICAST,
+	ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2_REWRITE,
+	ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2,
 	ROCKER_FLOW_TABLE_NULL = 0,
 };
 
@@ -587,6 +640,33 @@ struct net_flow_table acl_table = {
 	.actions = actions_acl,
 };
 
+struct net_flow_table group_slice_l3_unicast_table = {
+	.name = "group_slice_l3_unicast",
+	.uid = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L3_UNICAST,
+	.source = 1,
+	.size = -1,
+	.matches = matches_l3_unicast_group_slice,
+	.actions = actions_group_slice_l3_unicast,
+};
+
+struct net_flow_table group_slice_l2_rewrite_table = {
+	.name = "group_slice_l2_rewrite",
+	.uid = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2_REWRITE,
+	.source = 1,
+	.size = -1,
+	.matches = matches_l2_rewrite_group_slice,
+	.actions = actions_group_slice_l2_rewrite,
+};
+
+struct net_flow_table group_slice_l2_table = {
+	.name = "group_slice_l2",
+	.uid = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2,
+	.source = 1,
+	.size = -1,
+	.matches = matches_l2_group_slice,
+	.actions = actions_group_slice_l2,
+};
+
 struct net_flow_table null_table = {
 	.name = "",
 	.uid = ROCKER_FLOW_TABLE_NULL,
@@ -596,13 +676,16 @@ struct net_flow_table null_table = {
 	.actions = NULL,
 };
 
-struct net_flow_table *rocker_table_list[7] = {
+struct net_flow_table *rocker_table_list[10] = {
 	&ingress_port_table,
 	&vlan_table,
 	&term_mac_table,
 	&ucast_routing_table,
 	&bridge_table,
 	&acl_table,
+	&group_slice_l3_unicast_table,
+	&group_slice_l2_rewrite_table,
+	&group_slice_l2_table,
 	&null_table,
 };
 
@@ -652,7 +735,8 @@ struct net_flow_tbl_node table_node_ucast_routing = {
 	.uid = ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING,
 	.jump = table_node_ucast_routing_next};
 
-struct net_flow_jump_table table_node_acl_next[1] = {
+struct net_flow_jump_table table_node_acl_next[2] = {
+	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L3_UNICAST},
 	{ .field = {0}, .node = 0},
 };
 
@@ -660,15 +744,42 @@ struct net_flow_tbl_node table_node_acl = {
 	.uid = ROCKER_FLOW_TABLE_ID_ACL_POLICY,
 	.jump = table_node_acl_next};
 
+struct net_flow_jump_table table_node_group_l3_unicast_next[1] = {
+	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2_REWRITE},
+};
+
+struct net_flow_tbl_node table_node_group_l3_unicast = {
+	.uid = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L3_UNICAST,
+	.jump = table_node_group_l3_unicast_next};
+
+struct net_flow_jump_table table_node_group_l2_rewrite_next[1] = {
+	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2},
+};
+
+struct net_flow_tbl_node table_node_group_l2_rewrite = {
+	.uid = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2_REWRITE,
+	.jump = table_node_group_l2_rewrite_next};
+
+struct net_flow_jump_table table_node_group_l2_next[1] = {
+	{ .field = {0}, .node = 0},
+};
+
+struct net_flow_tbl_node table_node_group_l2 = {
+	.uid = ROCKER_FLOW_TABLE_ID_GROUP_SLICE_L2,
+	.jump = table_node_group_l2_next};
+
 struct net_flow_tbl_node table_node_nil = {.uid = 0, .jump = NULL};
 
-struct net_flow_tbl_node *rocker_table_nodes[7] = {
+struct net_flow_tbl_node *rocker_table_nodes[10] = {
 	&table_node_ingress_port,
 	&table_node_vlan,
 	&table_node_term_mac,
 	&table_node_ucast_routing,
 	&table_node_bridge,
 	&table_node_acl,
+	&table_node_group_l3_unicast,
+	&table_node_group_l2_rewrite,
+	&table_node_group_l2,
 	&table_node_nil,
 };
 #endif /*_MY_PIPELINE_H*/

^ permalink raw reply related

* [net-next PATCH v1 05/11] net: rocker: add set flow rules
From: John Fastabend @ 2014-12-31 19:47 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

Implement set flow operations for existing rocker tables.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/rocker/rocker.c          |  517 +++++++++++++++++++++++++
 drivers/net/ethernet/rocker/rocker_pipeline.h |    3 
 2 files changed, 519 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 4c6787a..c40c58d 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -3806,6 +3806,520 @@ static struct net_flow_hdr_node **rocker_get_hgraph(struct net_device *d)
 {
 	return rocker_header_nodes;
 }
+
+static int is_valid_net_flow_action_arg(struct net_flow_action *a, int id)
+{
+	struct net_flow_action_arg *args = a->args;
+	int i;
+
+	for (i = 0; args[i].type != NET_FLOW_ACTION_ARG_TYPE_NULL; i++) {
+		if (a->args[i].type == NET_FLOW_ACTION_ARG_TYPE_NULL ||
+		    args[i].type != a->args[i].type)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int is_valid_net_flow_action(struct net_flow_action *a, int *actions)
+{
+	int i;
+
+	for (i = 0; actions[i]; i++) {
+		if (actions[i] == a->uid)
+			return is_valid_net_flow_action_arg(a, a->uid);
+	}
+	return -EINVAL;
+}
+
+static int is_valid_net_flow_match(struct net_flow_field_ref *f,
+				   struct net_flow_field_ref *fields)
+{
+	int i;
+
+	for (i = 0; fields[i].header; i++) {
+		if (f->header == fields[i].header &&
+		    f->field == fields[i].field)
+			return 0;
+	}
+
+	return -EINVAL;
+}
+
+int is_valid_net_flow(struct net_flow_table *table, struct net_flow_flow *flow)
+{
+	struct net_flow_field_ref *fields = table->matches;
+	int *actions = table->actions;
+	int i, err;
+
+	for (i = 0; flow->actions[i].uid; i++) {
+		err = is_valid_net_flow_action(&flow->actions[i], actions);
+		if (err)
+			return -EINVAL;
+	}
+
+	for (i = 0; flow->matches[i].header; i++) {
+		err = is_valid_net_flow_match(&flow->matches[i], fields);
+		if (err)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static u32 rocker_goto_value(u32 id)
+{
+	switch (id) {
+	case ROCKER_FLOW_TABLE_ID_INGRESS_PORT:
+		return ROCKER_OF_DPA_TABLE_ID_INGRESS_PORT;
+	case ROCKER_FLOW_TABLE_ID_VLAN:
+		return ROCKER_OF_DPA_TABLE_ID_VLAN;
+	case ROCKER_FLOW_TABLE_ID_TERMINATION_MAC:
+		return ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC;
+	case ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING:
+		return ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING;
+	case ROCKER_FLOW_TABLE_ID_MULTICAST_ROUTING:
+		return ROCKER_OF_DPA_TABLE_ID_MULTICAST_ROUTING;
+	case ROCKER_FLOW_TABLE_ID_BRIDGING:
+		return ROCKER_OF_DPA_TABLE_ID_BRIDGING;
+	case ROCKER_FLOW_TABLE_ID_ACL_POLICY:
+		return ROCKER_OF_DPA_TABLE_ID_ACL_POLICY;
+	default:
+		return 0;
+	}
+}
+
+static int rocker_flow_set_ig_port(struct net_device *dev,
+				   struct net_flow_flow *flow)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	enum rocker_of_dpa_table_id goto_tbl;
+	u32 in_lport_mask = 0xffff0000;
+	u32 in_lport = 0;
+	int err, flags = 0;
+
+	err = is_valid_net_flow(&ingress_port_table, flow);
+	if (err)
+		return err;
+
+	/* ingress port table only supports one field/mask/action this
+	 * simplifies the key construction and we can assume the values
+	 * are the correct types/mask/action by valid check above. The
+	 * user could pass multiple match/actions in a message with the
+	 * same field multiple times currently the valid test does not
+	 * catch this and we just use the first specified.
+	 */
+	in_lport = flow->matches[0].value_u32;
+	in_lport_mask = flow->matches[0].mask_u32;
+	goto_tbl = rocker_goto_value(flow->actions[0].args[0].value_u16);
+
+	err = rocker_flow_tbl_ig_port(rocker_port, flags,
+				      in_lport, in_lport_mask,
+				      goto_tbl);
+	return err;
+}
+
+static int rocker_flow_set_vlan(struct net_device *dev,
+				struct net_flow_flow *flow)
+{
+	enum rocker_of_dpa_table_id goto_tbl;
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	int i, err = 0, flags = 0;
+	u32 in_lport;
+	__be16 vlan_id, vlan_id_mask, new_vlan_id;
+	bool untagged, have_in_lport = false;
+
+	err = is_valid_net_flow(&vlan_table, flow);
+	if (err)
+		return err;
+
+	goto_tbl = ROCKER_OF_DPA_TABLE_ID_TERMINATION_MAC;
+
+	/* If user does not specify vid match default to any */
+	vlan_id = 1;
+	vlan_id_mask = 0;
+
+	for (i = 0; flow->matches && flow->matches[i].instance; i++) {
+		switch (flow->matches[i].instance) {
+		case HEADER_INSTANCE_IN_LPORT:
+			in_lport = flow->matches[i].value_u32;
+			have_in_lport = true;
+			break;
+		case HEADER_INSTANCE_VLAN_OUTER:
+			if (flow->matches[i].field != HEADER_VLAN_VID)
+				break;
+
+			vlan_id = htons(flow->matches[i].value_u16);
+			vlan_id_mask = htons(flow->matches[i].mask_u16);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* If user does not specify a new vlan id use default vlan id */
+	new_vlan_id = rocker_port_vid_to_vlan(rocker_port, vlan_id, &untagged);
+
+	for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+		struct net_flow_action_arg *arg = &flow->actions[i].args[0];
+
+		switch (flow->actions[i].uid) {
+		case ACTION_SET_GOTO_TABLE:
+			goto_tbl = rocker_goto_value(arg->value_u16);
+			break;
+		case ACTION_SET_VLAN_ID:
+			new_vlan_id = htons(arg->value_u16);
+			if (new_vlan_id)
+				untagged = false;
+			break;
+		}
+	}
+
+	if (!have_in_lport)
+		return -EINVAL;
+
+	err = rocker_flow_tbl_vlan(rocker_port, flags, in_lport,
+				   vlan_id, vlan_id_mask, goto_tbl,
+				   untagged, new_vlan_id);
+	return err;
+}
+
+static int rocker_flow_set_term_mac(struct net_device *dev,
+				    struct net_flow_flow *flow)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	__be16 vlan_id, vlan_id_mask, ethtype = 0;
+	const u8 *eth_dst, *eth_dst_mask;
+	u32 in_lport, in_lport_mask;
+	int i, err = 0, flags = 0;
+	bool copy_to_cpu;
+
+	eth_dst = NULL;
+	eth_dst_mask = NULL;
+
+	err = is_valid_net_flow(&term_mac_table, flow);
+	if (err)
+		return err;
+
+	/* If user does not specify vid match default to any */
+	vlan_id = rocker_port->internal_vlan_id;
+	vlan_id_mask = 0;
+
+	/* If user does not specify in_lport match default to any */
+	in_lport = rocker_port->lport;
+	in_lport_mask = 0;
+
+	/* If user does not specify a mac address match any */
+	eth_dst = rocker_port->dev->dev_addr;
+	eth_dst_mask = zero_mac;
+
+	for (i = 0; flow->matches && flow->matches[i].instance; i++) {
+		switch (flow->matches[i].instance) {
+		case HEADER_INSTANCE_IN_LPORT:
+			in_lport = flow->matches[i].value_u32;
+			in_lport_mask = flow->matches[i].mask_u32;
+			break;
+		case HEADER_INSTANCE_VLAN_OUTER:
+			if (flow->matches[i].field != HEADER_VLAN_VID)
+				break;
+
+			vlan_id = htons(flow->matches[i].value_u16);
+			vlan_id_mask = htons(flow->matches[i].mask_u16);
+			break;
+		case HEADER_INSTANCE_ETHERNET:
+			switch (flow->matches[i].field) {
+			case HEADER_ETHERNET_DST_MAC:
+				eth_dst = (u8 *)&flow->matches[i].value_u64;
+				eth_dst_mask = (u8 *)&flow->matches[i].mask_u64;
+				break;
+			case HEADER_ETHERNET_ETHERTYPE:
+				ethtype = htons(flow->matches[i].value_u16);
+				break;
+			default:
+				return -EINVAL;
+			}
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if (!ethtype)
+		return -EINVAL;
+
+	/* By default do not copy to cpu */
+	copy_to_cpu = false;
+
+	for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+		switch (flow->actions[i].uid) {
+		case ACTION_COPY_TO_CPU:
+			copy_to_cpu = true;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	err = rocker_flow_tbl_term_mac(rocker_port, in_lport, in_lport_mask,
+				       ethtype, eth_dst, eth_dst_mask,
+				       vlan_id, vlan_id_mask,
+				       copy_to_cpu, flags);
+	return err;
+}
+
+static int rocker_flow_set_ucast_routing(struct net_device *dev,
+					 struct net_flow_flow *flow)
+{
+	return -EOPNOTSUPP;
+}
+
+static int rocker_flow_set_mcast_routing(struct net_device *dev,
+					 struct net_flow_flow *flow)
+{
+	return -EOPNOTSUPP;
+}
+
+static int rocker_flow_set_bridge(struct net_device *dev,
+				  struct net_flow_flow *flow)
+{
+	enum rocker_of_dpa_table_id goto_tbl;
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	u32 in_lport, in_lport_mask, group_id, tunnel_id;
+	__be16 vlan_id, vlan_id_mask;
+	const u8 *eth_dst, *eth_dst_mask;
+	int i, err = 0, flags = 0;
+	bool copy_to_cpu;
+
+	err = is_valid_net_flow(&bridge_table, flow);
+	if (err)
+		return err;
+
+	goto_tbl = ROCKER_OF_DPA_TABLE_ID_ACL_POLICY;
+
+	/* If user does not specify vid match default to any */
+	vlan_id = rocker_port->internal_vlan_id;
+	vlan_id_mask = 0;
+
+	/* If user does not specify in_lport match default to any */
+	in_lport = rocker_port->lport;
+	in_lport_mask = 0;
+
+	/* If user does not specify a mac address match any */
+	eth_dst = rocker_port->dev->dev_addr;
+	eth_dst_mask = NULL;
+
+	/* Do not support for tunnel_id yet. */
+	tunnel_id = 0;
+
+	for (i = 0; flow->matches && flow->matches[i].instance; i++) {
+		switch (flow->matches[i].instance) {
+		case HEADER_INSTANCE_IN_LPORT:
+			in_lport = flow->matches[i].value_u32;
+			in_lport_mask = flow->matches[i].mask_u32;
+			break;
+		case HEADER_INSTANCE_VLAN_OUTER:
+			if (flow->matches[i].field != HEADER_VLAN_VID)
+				break;
+
+			vlan_id = htons(flow->matches[i].value_u16);
+			vlan_id_mask = htons(flow->matches[i].mask_u16);
+			break;
+		case HEADER_INSTANCE_ETHERNET:
+			switch (flow->matches[i].field) {
+			case HEADER_ETHERNET_DST_MAC:
+				eth_dst = (u8 *)&flow->matches[i].value_u64;
+				eth_dst_mask = (u8 *)&flow->matches[i].mask_u64;
+				break;
+			default:
+				return -EINVAL;
+			}
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* By default do not copy to cpu and skip group assignment */
+	copy_to_cpu = false;
+	group_id = ROCKER_GROUP_NONE;
+
+	for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+		struct net_flow_action_arg *arg = &flow->actions[i].args[0];
+
+		switch (flow->actions[i].uid) {
+		case ACTION_SET_GOTO_TABLE:
+			goto_tbl = rocker_goto_value(arg->value_u16);
+			break;
+		case ACTION_COPY_TO_CPU:
+			copy_to_cpu = true;
+			break;
+		case ACTION_SET_GROUP_ID:
+			group_id = arg->value_u32;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* Ignoring eth_dst_mask it seems to cause a EINVAL return code */
+	err = rocker_flow_tbl_bridge(rocker_port, flags,
+				     eth_dst, eth_dst_mask,
+				     vlan_id, tunnel_id,
+				     goto_tbl, group_id, copy_to_cpu);
+	return err;
+}
+
+static int rocker_flow_set_acl(struct net_device *dev,
+			       struct net_flow_flow *flow)
+{
+	struct rocker_port *rocker_port = netdev_priv(dev);
+	u32 in_lport, in_lport_mask, group_id, tunnel_id;
+	__be16 vlan_id, vlan_id_mask, ethtype = 0;
+	const u8 *eth_dst, *eth_src, *eth_dst_mask, *eth_src_mask;
+	u8 protocol, protocol_mask, dscp, dscp_mask;
+	int i, err = 0, flags = 0;
+
+	err = is_valid_net_flow(&bridge_table, flow);
+	if (err)
+		return err;
+
+	/* If user does not specify vid match default to any */
+	vlan_id = rocker_port->internal_vlan_id;
+	vlan_id_mask = 0;
+
+	/* If user does not specify in_lport match default to any */
+	in_lport = rocker_port->lport;
+	in_lport_mask = 0;
+
+	/* If user does not specify a mac address match any */
+	eth_dst = rocker_port->dev->dev_addr;
+	eth_src = zero_mac;
+	eth_dst_mask = NULL;
+	eth_src_mask = NULL;
+
+	/* If user does not set protocol/dscp mask them out */
+	protocol = 0;
+	dscp = 0;
+	protocol_mask = 0;
+	dscp_mask = 0;
+
+	/* Do not support for tunnel_id yet. */
+	tunnel_id = 0;
+
+	for (i = 0; flow->matches && flow->matches[i].instance; i++) {
+		switch (flow->matches[i].instance) {
+		case HEADER_INSTANCE_IN_LPORT:
+			in_lport = flow->matches[i].value_u32;
+			in_lport_mask = flow->matches[i].mask_u32;
+			break;
+		case HEADER_INSTANCE_VLAN_OUTER:
+			if (flow->matches[i].field != HEADER_VLAN_VID)
+				break;
+
+			vlan_id = htons(flow->matches[i].value_u16);
+			vlan_id_mask = htons(flow->matches[i].mask_u16);
+			break;
+		case HEADER_INSTANCE_ETHERNET:
+			switch (flow->matches[i].field) {
+			case HEADER_ETHERNET_SRC_MAC:
+				eth_src = (u8 *)&flow->matches[i].value_u64;
+				eth_src_mask = (u8 *)&flow->matches[i].mask_u64;
+				break;
+			case HEADER_ETHERNET_DST_MAC:
+				eth_dst = (u8 *)&flow->matches[i].value_u64;
+				eth_dst_mask = (u8 *)&flow->matches[i].mask_u64;
+				break;
+			case HEADER_ETHERNET_ETHERTYPE:
+				ethtype = htons(flow->matches[i].value_u16);
+				break;
+			default:
+				return -EINVAL;
+			}
+			break;
+		case HEADER_INSTANCE_IPV4:
+			switch (flow->matches[i].field) {
+			case HEADER_IPV4_PROTOCOL:
+				protocol = flow->matches[i].value_u8;
+				protocol_mask = flow->matches[i].mask_u8;
+				break;
+			case HEADER_IPV4_DSCP:
+				dscp = flow->matches[i].value_u8;
+				dscp_mask = flow->matches[i].mask_u8;
+				break;
+			default:
+				return -EINVAL;
+			}
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* By default do not copy to cpu and skip group assignment */
+	group_id = ROCKER_GROUP_NONE;
+
+	for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+		switch (flow->actions[i].uid) {
+		case ACTION_SET_GROUP_ID:
+			group_id = flow->actions[i].args[0].value_u32;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	err = rocker_flow_tbl_acl(rocker_port, flags,
+				  in_lport, in_lport_mask,
+				  eth_src, eth_src_mask,
+				  eth_dst, eth_dst_mask, ethtype,
+				  vlan_id, vlan_id_mask,
+				  protocol, protocol_mask,
+				  dscp, dscp_mask,
+				  group_id);
+	return err;
+}
+
+static int rocker_set_flows(struct net_device *dev,
+			    struct net_flow_flow *flow)
+{
+	int err = -EINVAL;
+
+	if (!flow->matches || !flow->actions)
+		return -EINVAL;
+
+	switch (flow->table_id) {
+	case ROCKER_FLOW_TABLE_ID_INGRESS_PORT:
+		err = rocker_flow_set_ig_port(dev, flow);
+		break;
+	case ROCKER_FLOW_TABLE_ID_VLAN:
+		err = rocker_flow_set_vlan(dev, flow);
+		break;
+	case ROCKER_FLOW_TABLE_ID_TERMINATION_MAC:
+		err = rocker_flow_set_term_mac(dev, flow);
+		break;
+	case ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING:
+		err = rocker_flow_set_ucast_routing(dev, flow);
+		break;
+	case ROCKER_FLOW_TABLE_ID_MULTICAST_ROUTING:
+		err = rocker_flow_set_mcast_routing(dev, flow);
+		break;
+	case ROCKER_FLOW_TABLE_ID_BRIDGING:
+		err = rocker_flow_set_bridge(dev, flow);
+		break;
+	case ROCKER_FLOW_TABLE_ID_ACL_POLICY:
+		err = rocker_flow_set_acl(dev, flow);
+		break;
+	default:
+		break;
+	}
+
+	return err;
+}
+
+static int rocker_del_flows(struct net_device *dev,
+			    struct net_flow_flow *flow)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 static const struct net_device_ops rocker_port_netdev_ops = {
@@ -3828,6 +4342,9 @@ static const struct net_device_ops rocker_port_netdev_ops = {
 	.ndo_flow_get_actions		= rocker_get_actions,
 	.ndo_flow_get_tbl_graph		= rocker_get_tgraph,
 	.ndo_flow_get_hdr_graph		= rocker_get_hgraph,
+
+	.ndo_flow_set_flows		= rocker_set_flows,
+	.ndo_flow_del_flows		= rocker_del_flows,
 #endif
 };
 
diff --git a/drivers/net/ethernet/rocker/rocker_pipeline.h b/drivers/net/ethernet/rocker/rocker_pipeline.h
index 9544339..701e139 100644
--- a/drivers/net/ethernet/rocker/rocker_pipeline.h
+++ b/drivers/net/ethernet/rocker/rocker_pipeline.h
@@ -527,6 +527,7 @@ enum rocker_flow_table_id_space {
 	ROCKER_FLOW_TABLE_ID_VLAN,
 	ROCKER_FLOW_TABLE_ID_TERMINATION_MAC,
 	ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING,
+	ROCKER_FLOW_TABLE_ID_MULTICAST_ROUTING,
 	ROCKER_FLOW_TABLE_ID_BRIDGING,
 	ROCKER_FLOW_TABLE_ID_ACL_POLICY,
 	ROCKER_FLOW_TABLE_NULL = 0,
@@ -588,7 +589,7 @@ struct net_flow_table acl_table = {
 
 struct net_flow_table null_table = {
 	.name = "",
-	.uid = 0,
+	.uid = ROCKER_FLOW_TABLE_NULL,
 	.source = 0,
 	.size = 0,
 	.matches = NULL,

^ permalink raw reply related

* [net-next PATCH v1 04/11] rocker: add pipeline model for rocker switch
From: John Fastabend @ 2014-12-31 19:47 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

This adds rocker support for the net_flow_get_* operations. With this
we can interrogate rocker.

Here we see that for static configurations enabling the get operations
is simply a matter of defining a pipeline model and returning the
structures for the core infrastructure to encapsulate into netlink
messages.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/ethernet/rocker/rocker.c          |   35 +
 drivers/net/ethernet/rocker/rocker_pipeline.h |  673 +++++++++++++++++++++++++
 2 files changed, 708 insertions(+)
 create mode 100644 drivers/net/ethernet/rocker/rocker_pipeline.h

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index fded127..4c6787a 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -36,6 +36,7 @@
 #include <generated/utsrelease.h>
 
 #include "rocker.h"
+#include "rocker_pipeline.h"
 
 static const char rocker_driver_name[] = "rocker";
 
@@ -3780,6 +3781,33 @@ static int rocker_port_switch_port_stp_update(struct net_device *dev, u8 state)
 	return rocker_port_stp_update(rocker_port, state);
 }
 
+#ifdef CONFIG_NET_FLOW_TABLES
+static struct net_flow_table **rocker_get_tables(struct net_device *d)
+{
+	return rocker_table_list;
+}
+
+static struct net_flow_header **rocker_get_headers(struct net_device *d)
+{
+	return rocker_header_list;
+}
+
+static struct net_flow_action **rocker_get_actions(struct net_device *d)
+{
+	return rocker_action_list;
+}
+
+static struct net_flow_tbl_node **rocker_get_tgraph(struct net_device *d)
+{
+	return rocker_table_nodes;
+}
+
+static struct net_flow_hdr_node **rocker_get_hgraph(struct net_device *d)
+{
+	return rocker_header_nodes;
+}
+#endif
+
 static const struct net_device_ops rocker_port_netdev_ops = {
 	.ndo_open			= rocker_port_open,
 	.ndo_stop			= rocker_port_stop,
@@ -3794,6 +3822,13 @@ static const struct net_device_ops rocker_port_netdev_ops = {
 	.ndo_bridge_getlink		= rocker_port_bridge_getlink,
 	.ndo_switch_parent_id_get	= rocker_port_switch_parent_id_get,
 	.ndo_switch_port_stp_update	= rocker_port_switch_port_stp_update,
+#ifdef CONFIG_NET_FLOW_TABLES
+	.ndo_flow_get_tables		= rocker_get_tables,
+	.ndo_flow_get_headers		= rocker_get_headers,
+	.ndo_flow_get_actions		= rocker_get_actions,
+	.ndo_flow_get_tbl_graph		= rocker_get_tgraph,
+	.ndo_flow_get_hdr_graph		= rocker_get_hgraph,
+#endif
 };
 
 /********************
diff --git a/drivers/net/ethernet/rocker/rocker_pipeline.h b/drivers/net/ethernet/rocker/rocker_pipeline.h
new file mode 100644
index 0000000..9544339
--- /dev/null
+++ b/drivers/net/ethernet/rocker/rocker_pipeline.h
@@ -0,0 +1,673 @@
+#ifndef _MY_PIPELINE_H_
+#define _MY_PIPELINE_H_
+
+#include <linux/if_flow.h>
+
+/* header definition */
+#define HEADER_ETHERNET_SRC_MAC 1
+#define HEADER_ETHERNET_DST_MAC 2
+#define HEADER_ETHERNET_ETHERTYPE 3
+struct net_flow_field ethernet_fields[3] = {
+	{ .name = "src_mac", .uid = HEADER_ETHERNET_SRC_MAC, .bitwidth = 48},
+	{ .name = "dst_mac", .uid = HEADER_ETHERNET_DST_MAC, .bitwidth = 48},
+	{ .name = "ethertype",
+	  .uid = HEADER_ETHERNET_ETHERTYPE,
+	  .bitwidth = 16},
+};
+
+#define HEADER_ETHERNET 1
+struct net_flow_header ethernet = {
+	.name = "ethernet",
+	.uid = HEADER_ETHERNET,
+	.field_sz = 3,
+	.fields = ethernet_fields,
+};
+
+#define HEADER_VLAN_PCP 1
+#define HEADER_VLAN_CFI 2
+#define HEADER_VLAN_VID 3
+#define HEADER_VLAN_ETHERTYPE 4
+struct net_flow_field vlan_fields[4] = {
+	{ .name = "pcp", .uid = HEADER_VLAN_PCP, .bitwidth = 3,},
+	{ .name = "cfi", .uid = HEADER_VLAN_CFI, .bitwidth = 1,},
+	{ .name = "vid", .uid = HEADER_VLAN_VID, .bitwidth = 12,},
+	{ .name = "ethertype", .uid = HEADER_VLAN_ETHERTYPE, .bitwidth = 16,},
+};
+
+#define HEADER_VLAN 2
+struct net_flow_header vlan = {
+	.name = "vlan",
+	.uid = HEADER_VLAN,
+	.field_sz = 4,
+	.fields = vlan_fields,
+};
+
+#define HEADER_IPV4_VERSION 1
+#define HEADER_IPV4_IHL 2
+#define HEADER_IPV4_DSCP 3
+#define HEADER_IPV4_ECN 4
+#define HEADER_IPV4_LENGTH 5
+#define HEADER_IPV4_IDENTIFICATION 6
+#define HEADER_IPV4_FLAGS 7
+#define HEADER_IPV4_FRAGMENT_OFFSET 8
+#define HEADER_IPV4_TTL 9
+#define HEADER_IPV4_PROTOCOL 10
+#define HEADER_IPV4_CSUM 11
+#define HEADER_IPV4_SRC_IP 12
+#define HEADER_IPV4_DST_IP 13
+#define HEADER_IPV4_OPTIONS 14
+struct net_flow_field ipv4_fields[14] = {
+	{ .name = "version",
+	  .uid = HEADER_IPV4_VERSION,
+	  .bitwidth = 4,},
+	{ .name = "ihl",
+	  .uid = HEADER_IPV4_IHL,
+	  .bitwidth = 4,},
+	{ .name = "dscp",
+	  .uid = HEADER_IPV4_DSCP,
+	  .bitwidth = 6,},
+	{ .name = "ecn",
+	  .uid = HEADER_IPV4_ECN,
+	  .bitwidth = 2,},
+	{ .name = "length",
+	  .uid = HEADER_IPV4_LENGTH,
+	  .bitwidth = 8,},
+	{ .name = "identification",
+	  .uid = HEADER_IPV4_IDENTIFICATION,
+	  .bitwidth = 8,},
+	{ .name = "flags",
+	  .uid = HEADER_IPV4_FLAGS,
+	  .bitwidth = 3,},
+	{ .name = "fragment_offset",
+	  .uid = HEADER_IPV4_FRAGMENT_OFFSET,
+	  .bitwidth = 13,},
+	{ .name = "ttl",
+	  .uid = HEADER_IPV4_TTL,
+	  .bitwidth = 1,},
+	{ .name = "protocol",
+	  .uid = HEADER_IPV4_PROTOCOL,
+	  .bitwidth = 8,},
+	{ .name = "csum",
+	  .uid = HEADER_IPV4_CSUM,
+	  .bitwidth = 8,},
+	{ .name = "src_ip",
+	  .uid = HEADER_IPV4_SRC_IP,
+	  .bitwidth = 32,},
+	{ .name = "dst_ip",
+	  .uid = HEADER_IPV4_DST_IP,
+	  .bitwidth = 32,},
+	{ .name = "options",
+	  .uid = HEADER_IPV4_OPTIONS,
+	  .bitwidth = -1,},
+};
+
+#define HEADER_IPV4 3
+struct net_flow_header ipv4 = {
+	.name = "ipv4",
+	.uid = HEADER_IPV4,
+	.field_sz = 14,
+	.fields = ipv4_fields,
+};
+
+#define HEADER_METADATA_IN_LPORT 1
+#define HEADER_METADATA_GOTO_TBL 2
+#define HEADER_METADATA_GROUP_ID 3
+struct net_flow_field metadata_fields[3] = {
+	{ .name = "in_lport",
+	  .uid = HEADER_METADATA_IN_LPORT,
+	  .bitwidth = 32,},
+	{ .name = "goto_tbl",
+	  .uid = HEADER_METADATA_GOTO_TBL,
+	  .bitwidth = 16,},
+	{ .name = "group_id",
+	  .uid = HEADER_METADATA_GROUP_ID,
+	  .bitwidth = 32,},
+};
+
+#define HEADER_METADATA 4
+struct net_flow_header metadata_t = {
+	.name = "metadata_t",
+	.uid = HEADER_METADATA,
+	.field_sz = 3,
+	.fields = metadata_fields,
+};
+
+struct net_flow_header null_hdr = {.name = "",
+				   .uid = 0,
+				   .field_sz = 0,
+				   .fields = NULL};
+
+struct net_flow_header *rocker_header_list[8] = {
+	&ethernet,
+	&vlan,
+	&ipv4,
+	&metadata_t,
+	&null_hdr,
+};
+
+/* action definitions */
+struct net_flow_action_arg null_args[1] = {
+	{
+		.name = "",
+		.type = NET_FLOW_ACTION_ARG_TYPE_NULL,
+	},
+};
+
+struct net_flow_action null_action = {
+	.name = "", .uid = 0, .args = NULL,
+};
+
+struct net_flow_action_arg set_goto_table_args[2] = {
+	{
+		.name = "table",
+		.type = NET_FLOW_ACTION_ARG_TYPE_U16,
+		.value_u16 = 0,
+	},
+	{
+		.name = "",
+		.type = NET_FLOW_ACTION_ARG_TYPE_NULL,
+	},
+};
+
+#define ACTION_SET_GOTO_TABLE 1
+struct net_flow_action set_goto_table = {
+	.name = "set_goto_table",
+	.uid = ACTION_SET_GOTO_TABLE,
+	.args = set_goto_table_args,
+};
+
+struct net_flow_action_arg set_vlan_id_args[2] = {
+	{
+		.name = "vlan_id",
+		.type = NET_FLOW_ACTION_ARG_TYPE_U16,
+		.value_u16 = 0,
+	},
+	{
+		.name = "",
+		.type = NET_FLOW_ACTION_ARG_TYPE_NULL,
+	},
+};
+
+#define ACTION_SET_VLAN_ID 2
+struct net_flow_action set_vlan_id = {
+	.name = "set_vlan_id",
+	.uid = ACTION_SET_VLAN_ID,
+	.args = set_vlan_id_args,
+};
+
+/* TBD: what is the untagged bool about in vlan table */
+#define ACTION_COPY_TO_CPU 3
+struct net_flow_action copy_to_cpu = {
+	.name = "copy_to_cpu",
+	.uid = ACTION_COPY_TO_CPU,
+	.args = null_args,
+};
+
+struct net_flow_action_arg set_group_id_args[2] = {
+	{
+		.name = "group_id",
+		.type = NET_FLOW_ACTION_ARG_TYPE_U32,
+		.value_u32 = 0,
+	},
+	{
+		.name = "",
+		.type = NET_FLOW_ACTION_ARG_TYPE_NULL,
+	},
+};
+
+#define ACTION_SET_GROUP_ID 4
+struct net_flow_action set_group_id = {
+	.name = "set_group_id",
+	.uid = ACTION_SET_GROUP_ID,
+	.args = set_group_id_args,
+};
+
+#define ACTION_POP_VLAN 5
+struct net_flow_action pop_vlan = {
+	.name = "pop_vlan",
+	.uid = ACTION_POP_VLAN,
+	.args = null_args,
+};
+
+struct net_flow_action_arg set_eth_src_args[2] = {
+	{
+		.name = "eth_src",
+		.type = NET_FLOW_ACTION_ARG_TYPE_U64,
+		.value_u64 = 0,
+	},
+	{
+		.name = "",
+		.type = NET_FLOW_ACTION_ARG_TYPE_NULL,
+	},
+};
+
+#define ACTION_SET_ETH_SRC 6
+struct net_flow_action set_eth_src = {
+	.name = "set_eth_src",
+	.uid = ACTION_SET_ETH_SRC,
+	.args = set_eth_src_args,
+};
+
+struct net_flow_action_arg set_eth_dst_args[2] = {
+	{
+		.name = "eth_dst",
+		.type = NET_FLOW_ACTION_ARG_TYPE_U64,
+		.value_u64 = 0,
+	},
+	{
+		.name = "",
+		.type = NET_FLOW_ACTION_ARG_TYPE_NULL,
+	},
+};
+
+#define ACTION_SET_ETH_DST 7
+struct net_flow_action set_eth_dst = {
+	.name = "set_eth_dst",
+	.uid = ACTION_SET_ETH_DST,
+	.args = set_eth_dst_args,
+};
+
+struct net_flow_action_arg set_out_port_args[2] = {
+	{
+		.name = "set_out_port",
+		.type = NET_FLOW_ACTION_ARG_TYPE_U32,
+		.value_u32 = 0,
+	},
+	{
+		.name = "",
+		.type = NET_FLOW_ACTION_ARG_TYPE_NULL,
+	},
+};
+
+#define ACTION_SET_OUT_PORT 8
+struct net_flow_action set_out_port = {
+	.name = "set_out_port",
+	.uid = ACTION_SET_OUT_PORT,
+	.args = set_out_port_args,
+};
+
+struct net_flow_action *rocker_action_list[8] = {
+	&set_goto_table,
+	&set_vlan_id,
+	&copy_to_cpu,
+	&set_group_id,
+	&pop_vlan,
+	&set_eth_src,
+	&set_eth_dst,
+	&null_action,
+};
+
+/* headers graph */
+#define HEADER_INSTANCE_ETHERNET 1
+#define HEADER_INSTANCE_VLAN_OUTER 2
+#define HEADER_INSTANCE_IPV4 3
+#define HEADER_INSTANCE_IN_LPORT 4
+#define HEADER_INSTANCE_GOTO_TABLE 5
+#define HEADER_INSTANCE_GROUP_ID 6
+
+struct net_flow_jump_table parse_ethernet[3] = {
+	{
+		.field = {
+		   .header = HEADER_ETHERNET,
+		   .field = HEADER_ETHERNET_ETHERTYPE,
+		   .type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16,
+		   .value_u16 = 0x0800,
+		},
+		.node = HEADER_INSTANCE_IPV4,
+	},
+	{
+		.field = {
+		   .header = HEADER_ETHERNET,
+		   .field = HEADER_ETHERNET_ETHERTYPE,
+		   .type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16,
+		   .value_u16 = 0x8100,
+		},
+		.node = HEADER_INSTANCE_VLAN_OUTER,
+	},
+	{
+		.field = {0},
+		.node = 0,
+	},
+};
+
+int ethernet_headers[2] = {HEADER_ETHERNET, 0};
+
+struct net_flow_hdr_node ethernet_header_node = {
+	.name = "ethernet",
+	.uid = HEADER_INSTANCE_ETHERNET,
+	.hdrs = ethernet_headers,
+	.jump = parse_ethernet,
+};
+
+struct net_flow_jump_table parse_vlan[2] = {
+	{
+		.field = {
+		   .header = HEADER_VLAN,
+		   .field = HEADER_VLAN_ETHERTYPE,
+		   .type = NET_FLOW_FIELD_REF_ATTR_TYPE_U16,
+		   .value_u16 = 0x0800,
+		},
+		.node = HEADER_INSTANCE_IPV4,
+	},
+	{
+		.field = {0},
+		.node = 0,
+	},
+};
+
+int vlan_headers[2] = {HEADER_VLAN, 0};
+struct net_flow_hdr_node vlan_header_node = {
+	.name = "vlan",
+	.uid = HEADER_INSTANCE_VLAN_OUTER,
+	.hdrs = vlan_headers,
+	.jump = parse_vlan,
+};
+
+struct net_flow_jump_table terminal_headers[2] = {
+	{
+		.field = {0},
+		.node = NET_FLOW_JUMP_TABLE_DONE,
+	},
+	{
+		.field = {0},
+		.node = 0,
+	},
+};
+
+int ipv4_headers[2] = {HEADER_IPV4, 0};
+struct net_flow_hdr_node ipv4_header_node = {
+	.name = "ipv4",
+	.uid = HEADER_INSTANCE_IPV4,
+	.hdrs = ipv4_headers,
+	.jump = terminal_headers,
+};
+
+int metadata_headers[2] = {HEADER_METADATA, 0};
+struct net_flow_hdr_node in_lport_header_node = {
+	.name = "in_lport",
+	.uid = HEADER_INSTANCE_IN_LPORT,
+	.hdrs = metadata_headers,
+	.jump = terminal_headers,
+};
+
+struct net_flow_hdr_node goto_table_header_node = {
+	.name = "goto_table",
+	.uid = HEADER_INSTANCE_GOTO_TABLE,
+	.hdrs = metadata_headers,
+	.jump = terminal_headers,
+};
+
+struct net_flow_hdr_node group_id_header_node = {
+	.name = "group_id",
+	.uid = HEADER_INSTANCE_GROUP_ID,
+	.hdrs = metadata_headers,
+	.jump = terminal_headers,
+};
+
+struct net_flow_hdr_node null_header = {.name = "", .uid = 0,};
+
+struct net_flow_hdr_node *rocker_header_nodes[7] = {
+	&ethernet_header_node,
+	&vlan_header_node,
+	&ipv4_header_node,
+	&in_lport_header_node,
+	&goto_table_header_node,
+	&group_id_header_node,
+	&null_header,
+};
+
+/* table definition */
+struct net_flow_field_ref matches_ig_port[2] = {
+	{ .instance = HEADER_INSTANCE_IN_LPORT,
+	  .header = HEADER_METADATA,
+	  .field = HEADER_METADATA_IN_LPORT,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = 0, .field = 0},
+};
+
+struct net_flow_field_ref matches_vlan[3] = {
+	{ .instance = HEADER_INSTANCE_IN_LPORT,
+	  .header = HEADER_METADATA,
+	  .field = HEADER_METADATA_IN_LPORT,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_VLAN_OUTER,
+	  .header = HEADER_VLAN,
+	  .field = HEADER_VLAN_VID,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = 0, .field = 0},
+};
+
+struct net_flow_field_ref matches_term_mac[5] = {
+	{ .instance = HEADER_INSTANCE_IN_LPORT,
+	  .header = HEADER_METADATA,
+	  .field = HEADER_METADATA_IN_LPORT,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_ETHERNET,
+	  .header = HEADER_ETHERNET,
+	  .field = HEADER_ETHERNET_ETHERTYPE,
+	  .mask_type = NET_FLOW_MASK_TYPE_EXACT},
+	{ .instance = HEADER_INSTANCE_ETHERNET,
+	  .header = HEADER_ETHERNET,
+	  .field = HEADER_ETHERNET_DST_MAC,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_VLAN_OUTER,
+	  .header = HEADER_VLAN,
+	  .field = HEADER_VLAN_VID,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = 0, .field = 0},
+};
+
+struct net_flow_field_ref matches_ucast_routing[3] = {
+	{ .instance = HEADER_INSTANCE_ETHERNET,
+	  .header = HEADER_ETHERNET,
+	  .field = HEADER_ETHERNET_ETHERTYPE,
+	  .mask_type = NET_FLOW_MASK_TYPE_EXACT},
+	{ .instance = HEADER_INSTANCE_IPV4,
+	  .header = HEADER_IPV4,
+	  .field = HEADER_IPV4_DST_IP,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = 0, .field = 0},
+};
+
+struct net_flow_field_ref matches_bridge[3] = {
+	{ .instance = HEADER_INSTANCE_ETHERNET,
+	  .header = HEADER_ETHERNET,
+	  .field = HEADER_ETHERNET_DST_MAC,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_VLAN_OUTER,
+	  .header = HEADER_VLAN,
+	  .field = HEADER_VLAN_VID,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = 0, .field = 0},
+};
+
+struct net_flow_field_ref matches_acl[8] = {
+	{ .instance = HEADER_INSTANCE_IN_LPORT,
+	  .header = HEADER_METADATA,
+	  .field = HEADER_METADATA_IN_LPORT,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_ETHERNET,
+	  .header = HEADER_ETHERNET,
+	  .field = HEADER_ETHERNET_SRC_MAC,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_ETHERNET,
+	  .header = HEADER_ETHERNET,
+	  .field = HEADER_ETHERNET_DST_MAC,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_ETHERNET,
+	  .header = HEADER_ETHERNET,
+	  .field = HEADER_ETHERNET_ETHERTYPE,
+	  .mask_type = NET_FLOW_MASK_TYPE_EXACT},
+	{ .instance = HEADER_INSTANCE_VLAN_OUTER,
+	  .header = HEADER_VLAN,
+	  .field = HEADER_VLAN_VID,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_IPV4,
+	  .header = HEADER_IPV4,
+	  .field = HEADER_IPV4_PROTOCOL,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = HEADER_INSTANCE_IPV4,
+	  .header = HEADER_IPV4,
+	  .field = HEADER_IPV4_DSCP,
+	  .mask_type = NET_FLOW_MASK_TYPE_LPM},
+	{ .instance = 0, .field = 0},
+};
+
+int actions_ig_port[2] = {ACTION_SET_GOTO_TABLE, 0};
+int actions_vlan[3] = {ACTION_SET_GOTO_TABLE, ACTION_SET_VLAN_ID, 0};
+int actions_term_mac[3] = {ACTION_SET_GOTO_TABLE, ACTION_COPY_TO_CPU, 0};
+int actions_ucast_routing[3] = {ACTION_SET_GOTO_TABLE, ACTION_SET_GROUP_ID, 0};
+int actions_bridge[4] = {ACTION_SET_GOTO_TABLE,
+			 ACTION_SET_GROUP_ID,
+			 ACTION_COPY_TO_CPU, 0};
+int actions_acl[2] = {ACTION_SET_GROUP_ID, 0};
+
+enum rocker_flow_table_id_space {
+	ROCKER_FLOW_TABLE_ID_INGRESS_PORT = 1,
+	ROCKER_FLOW_TABLE_ID_VLAN,
+	ROCKER_FLOW_TABLE_ID_TERMINATION_MAC,
+	ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING,
+	ROCKER_FLOW_TABLE_ID_BRIDGING,
+	ROCKER_FLOW_TABLE_ID_ACL_POLICY,
+	ROCKER_FLOW_TABLE_NULL = 0,
+};
+
+struct net_flow_table ingress_port_table = {
+	.name = "ingress_port",
+	.uid = ROCKER_FLOW_TABLE_ID_INGRESS_PORT,
+	.source = 1,
+	.size = -1,
+	.matches = matches_ig_port,
+	.actions = actions_ig_port,
+};
+
+struct net_flow_table vlan_table = {
+	.name = "vlan",
+	.uid = ROCKER_FLOW_TABLE_ID_VLAN,
+	.source = 1,
+	.size = -1,
+	.matches = matches_vlan,
+	.actions = actions_vlan,
+};
+
+struct net_flow_table term_mac_table = {
+	.name = "term_mac",
+	.uid = ROCKER_FLOW_TABLE_ID_TERMINATION_MAC,
+	.source = 1,
+	.size = -1,
+	.matches = matches_term_mac,
+	.actions = actions_term_mac,
+};
+
+struct net_flow_table ucast_routing_table = {
+	.name = "ucast_routing",
+	.uid = ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING,
+	.source = 1,
+	.size = -1,
+	.matches = matches_ucast_routing,
+	.actions = actions_ucast_routing,
+};
+
+struct net_flow_table bridge_table = {
+	.name = "bridge",
+	.uid = ROCKER_FLOW_TABLE_ID_BRIDGING,
+	.source = 1,
+	.size = -1,
+	.matches = matches_bridge,
+	.actions = actions_bridge,
+};
+
+struct net_flow_table acl_table = {
+	.name = "acl",
+	.uid = ROCKER_FLOW_TABLE_ID_ACL_POLICY,
+	.source = 1,
+	.size = -1,
+	.matches = matches_acl,
+	.actions = actions_acl,
+};
+
+struct net_flow_table null_table = {
+	.name = "",
+	.uid = 0,
+	.source = 0,
+	.size = 0,
+	.matches = NULL,
+	.actions = NULL,
+};
+
+struct net_flow_table *rocker_table_list[7] = {
+	&ingress_port_table,
+	&vlan_table,
+	&term_mac_table,
+	&ucast_routing_table,
+	&bridge_table,
+	&acl_table,
+	&null_table,
+};
+
+/* Define the table graph layout */
+struct net_flow_jump_table table_node_ig_port_next[2] = {
+	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_VLAN},
+	{ .field = {0}, .node = 0},
+};
+
+struct net_flow_tbl_node table_node_ingress_port = {
+	.uid = ROCKER_FLOW_TABLE_ID_INGRESS_PORT,
+	.jump = table_node_ig_port_next};
+
+struct net_flow_jump_table table_node_vlan_next[2] = {
+	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_TERMINATION_MAC},
+	{ .field = {0}, .node = 0},
+};
+
+struct net_flow_tbl_node table_node_vlan = {
+	.uid = ROCKER_FLOW_TABLE_ID_VLAN,
+	.jump = table_node_vlan_next};
+
+struct net_flow_jump_table table_node_term_mac_next[2] = {
+	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING},
+	{ .field = {0}, .node = 0},
+};
+
+struct net_flow_tbl_node table_node_term_mac = {
+	.uid = ROCKER_FLOW_TABLE_ID_TERMINATION_MAC,
+	.jump = table_node_term_mac_next};
+
+struct net_flow_jump_table table_node_bridge_next[2] = {
+	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_ACL_POLICY},
+	{ .field = {0}, .node = 0},
+};
+
+struct net_flow_tbl_node table_node_bridge = {
+	.uid = ROCKER_FLOW_TABLE_ID_BRIDGING,
+	.jump = table_node_bridge_next};
+
+struct net_flow_jump_table table_node_ucast_routing_next[2] = {
+	{ .field = {0}, .node = ROCKER_FLOW_TABLE_ID_ACL_POLICY},
+	{ .field = {0}, .node = 0},
+};
+
+struct net_flow_tbl_node table_node_ucast_routing = {
+	.uid = ROCKER_FLOW_TABLE_ID_UNICAST_ROUTING,
+	.jump = table_node_ucast_routing_next};
+
+struct net_flow_jump_table table_node_acl_next[1] = {
+	{ .field = {0}, .node = 0},
+};
+
+struct net_flow_tbl_node table_node_acl = {
+	.uid = ROCKER_FLOW_TABLE_ID_ACL_POLICY,
+	.jump = table_node_acl_next};
+
+struct net_flow_tbl_node table_node_nil = {.uid = 0, .jump = NULL};
+
+struct net_flow_tbl_node *rocker_table_nodes[7] = {
+	&table_node_ingress_port,
+	&table_node_vlan,
+	&table_node_term_mac,
+	&table_node_ucast_routing,
+	&table_node_bridge,
+	&table_node_acl,
+	&table_node_nil,
+};
+#endif /*_MY_PIPELINE_H*/

^ permalink raw reply related

* [net-next PATCH v1 03/11] net: flow_table: add apply action argument to tables
From: John Fastabend @ 2014-12-31 19:46 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

Actions may not always be applied after exiting a table. For example
some pipelines may accumulate actions and then apply them at the end
of a pipeline.

To model this we use a table type called APPLY. Tables who share an
apply identifier have their actions applied in one step.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/linux/if_flow.h      |    1 +
 include/uapi/linux/if_flow.h |    1 +
 net/core/flow_table.c        |    1 +
 3 files changed, 3 insertions(+)

diff --git a/include/linux/if_flow.h b/include/linux/if_flow.h
index 20fa752..a042a3d 100644
--- a/include/linux/if_flow.h
+++ b/include/linux/if_flow.h
@@ -67,6 +67,7 @@ struct net_flow_table {
 	char name[NET_FLOW_NAMSIZ];
 	int uid;
 	int source;
+	int apply_action;
 	int size;
 	struct net_flow_field_ref *matches;
 	int *actions;
diff --git a/include/uapi/linux/if_flow.h b/include/uapi/linux/if_flow.h
index 125cdc6..3c1a860 100644
--- a/include/uapi/linux/if_flow.h
+++ b/include/uapi/linux/if_flow.h
@@ -265,6 +265,7 @@ enum {
 	NET_FLOW_TABLE_ATTR_NAME,
 	NET_FLOW_TABLE_ATTR_UID,
 	NET_FLOW_TABLE_ATTR_SOURCE,
+	NET_FLOW_TABLE_ATTR_APPLY,
 	NET_FLOW_TABLE_ATTR_SIZE,
 	NET_FLOW_TABLE_ATTR_MATCHES,
 	NET_FLOW_TABLE_ATTR_ACTIONS,
diff --git a/net/core/flow_table.c b/net/core/flow_table.c
index f4cf293..97cdf92 100644
--- a/net/core/flow_table.c
+++ b/net/core/flow_table.c
@@ -223,6 +223,7 @@ static int net_flow_put_table(struct net_device *dev,
 	if (nla_put_string(skb, NET_FLOW_TABLE_ATTR_NAME, t->name) ||
 	    nla_put_u32(skb, NET_FLOW_TABLE_ATTR_UID, t->uid) ||
 	    nla_put_u32(skb, NET_FLOW_TABLE_ATTR_SOURCE, t->source) ||
+	    nla_put_u32(skb, NET_FLOW_TABLE_ATTR_APPLY, t->apply_action) ||
 	    nla_put_u32(skb, NET_FLOW_TABLE_ATTR_SIZE, t->size))
 		return -EMSGSIZE;
 

^ permalink raw reply related

* [net-next PATCH v1 02/11] net: flow_table: add flow, delete flow
From: John Fastabend @ 2014-12-31 19:46 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

Now that the device capabilities are exposed we can add support to
add and delete flows from the tables.

The two operations are

table_set_flows :

  The set flow operations is used to program a set of flows into a
  hardware device table. The message is consumed via netlink encoded
  message which is then decoded into a null terminated  array of
  flow entry structures. A flow entry structure is defined as

     struct net_flow_flow {
			  int table_id;
			  int uid;
			  int priority;
			  struct net_flow_field_ref *matches;
			  struct net_flow_action *actions;
     }

  The table id is the _uid_ returned from 'get_tables' operatoins.
  Matches is a set of match criteria for packets with a logical AND
  operation done on the set so packets match the entire criteria.
  Actions provide a set of actions to perform when the flow rule is
  hit. Both matches and actions are null terminated arrays.

  The flows are configured in hardware using an ndo op. We do not
  provide a commit operation at the moment and expect hardware
  commits the flows one at a time. Future work may require a commit
  operation to tell the hardware we are done loading flow rules. On
  some hardware this will help bulk updates.

  Its possible for hardware to return an error from a flow set
  operation. This can occur for many reasons both transient and
  resource constraints. We have different error handling strategies
  built in and listed here,

    *_ERROR_ABORT      abort on first error with errmsg

    *_ERROR_CONTINUE   continue programming flows no errmsg

    *_ERROR_ABORT_LOG  abort on first error and return flow that
 		       failed to user space in reply msg

    *_ERROR_CONT_LOG   continue programming flows and return a list
		       of flows that failed to user space in a reply
		       msg.

  notably missing is a rollback error strategy. I don't have a
  use for this in software yet but the strategy can be added with
  *_ERROR_ROLLBACK for example.

table_del_flows

  The delete flow operation uses the same structures and error
  handling strategies as the table_set_flows operations. Although on
  delete messges ommit the matches/actions arrays because they are
  not needed to lookup the flow.

Also thanks to Simon Horman for fixes and other help.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/linux/if_flow.h      |   21 ++
 include/linux/netdevice.h    |    8 +
 include/uapi/linux/if_flow.h |   49 ++++
 net/core/flow_table.c        |  501 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 579 insertions(+)

diff --git a/include/linux/if_flow.h b/include/linux/if_flow.h
index 1b6c1ea..20fa752 100644
--- a/include/linux/if_flow.h
+++ b/include/linux/if_flow.h
@@ -90,4 +90,25 @@ struct net_flow_tbl_node {
 	__u32 flags;
 	struct net_flow_jump_table *jump;
 };
+
+/**
+ * @struct net_flow_flow
+ * @brief describes the match/action entry
+ *
+ * @uid unique identifier for flow
+ * @priority priority to execute flow match/action in table
+ * @match null terminated set of match uids match criteria
+ * @actoin null terminated set of action uids to apply to match
+ *
+ * Flows must match all entries in match set.
+ */
+struct net_flow_flow {
+	int table_id;
+	int uid;
+	int priority;
+	struct net_flow_field_ref *matches;
+	struct net_flow_action *actions;
+};
+
+int net_flow_put_flow(struct sk_buff *skb, struct net_flow_flow *flow);
 #endif
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c3c856..be8d4e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1197,6 +1197,14 @@ struct net_device_ops {
 	struct net_flow_header	**(*ndo_flow_get_headers)(struct net_device *dev);
 	struct net_flow_hdr_node **(*ndo_flow_get_hdr_graph)(struct net_device *dev);
 	struct net_flow_tbl_node **(*ndo_flow_get_tbl_graph)(struct net_device *dev);
+	int		        (*ndo_flow_get_flows)(struct sk_buff *skb,
+						      struct net_device *dev,
+						      int table,
+						      int min, int max);
+	int		        (*ndo_flow_set_flows)(struct net_device *dev,
+						      struct net_flow_flow *f);
+	int		        (*ndo_flow_del_flows)(struct net_device *dev,
+						      struct net_flow_flow *f);
 #endif
 };
 
diff --git a/include/uapi/linux/if_flow.h b/include/uapi/linux/if_flow.h
index 2acdb38..125cdc6 100644
--- a/include/uapi/linux/if_flow.h
+++ b/include/uapi/linux/if_flow.h
@@ -329,6 +329,48 @@ enum {
 #define NET_FLOW_TABLE_GRAPH_MAX (__NET_FLOW_TABLE_GRAPH_MAX - 1)
 
 enum {
+	NET_FLOW_NET_FLOW_UNSPEC,
+	NET_FLOW_FLOW,
+	__NET_FLOW_NET_FLOW_MAX,
+};
+#define NET_FLOW_NET_FLOW_MAX (__NET_FLOW_NET_FLOW_MAX - 1)
+
+enum {
+	NET_FLOW_TABLE_FLOWS_UNSPEC,
+	NET_FLOW_TABLE_FLOWS_TABLE,
+	NET_FLOW_TABLE_FLOWS_MINPRIO,
+	NET_FLOW_TABLE_FLOWS_MAXPRIO,
+	NET_FLOW_TABLE_FLOWS_FLOWS,
+	__NET_FLOW_TABLE_FLOWS_MAX,
+};
+#define NET_FLOW_TABLE_FLOWS_MAX (__NET_FLOW_TABLE_FLOWS_MAX - 1)
+
+enum {
+	/* Abort with normal errmsg */
+	NET_FLOW_FLOWS_ERROR_ABORT,
+	/* Ignore errors and continue without logging */
+	NET_FLOW_FLOWS_ERROR_CONTINUE,
+	/* Abort and reply with invalid flow fields */
+	NET_FLOW_FLOWS_ERROR_ABORT_LOG,
+	/* Continue and reply with list of invalid flows */
+	NET_FLOW_FLOWS_ERROR_CONT_LOG,
+	__NET_FLOWS_FLOWS_ERROR_MAX,
+};
+#define NET_FLOWS_FLOWS_ERROR_MAX (__NET_FLOWS_FLOWS_ERROR_MAX - 1)
+
+enum {
+	NET_FLOW_ATTR_UNSPEC,
+	NET_FLOW_ATTR_ERROR,
+	NET_FLOW_ATTR_TABLE,
+	NET_FLOW_ATTR_UID,
+	NET_FLOW_ATTR_PRIORITY,
+	NET_FLOW_ATTR_MATCHES,
+	NET_FLOW_ATTR_ACTIONS,
+	__NET_FLOW_ATTR_MAX,
+};
+#define NET_FLOW_ATTR_MAX (__NET_FLOW_ATTR_MAX - 1)
+
+enum {
 	NET_FLOW_IDENTIFIER_IFINDEX, /* net_device ifindex */
 };
 
@@ -343,6 +385,9 @@ enum {
 	NET_FLOW_HEADER_GRAPH,
 	NET_FLOW_TABLE_GRAPH,
 
+	NET_FLOW_FLOWS,
+	NET_FLOW_FLOWS_ERROR,
+
 	__NET_FLOW_MAX,
 	NET_FLOW_MAX = (__NET_FLOW_MAX - 1),
 };
@@ -354,6 +399,10 @@ enum {
 	NET_FLOW_TABLE_CMD_GET_HDR_GRAPH,
 	NET_FLOW_TABLE_CMD_GET_TABLE_GRAPH,
 
+	NET_FLOW_TABLE_CMD_GET_FLOWS,
+	NET_FLOW_TABLE_CMD_SET_FLOWS,
+	NET_FLOW_TABLE_CMD_DEL_FLOWS,
+
 	__NET_FLOW_CMD_MAX,
 	NET_FLOW_CMD_MAX = (__NET_FLOW_CMD_MAX - 1),
 };
diff --git a/net/core/flow_table.c b/net/core/flow_table.c
index ec3f06d..f4cf293 100644
--- a/net/core/flow_table.c
+++ b/net/core/flow_table.c
@@ -774,6 +774,489 @@ static int net_flow_cmd_get_table_graph(struct sk_buff *skb,
 	return genlmsg_reply(msg, info);
 }
 
+static struct sk_buff *net_flow_build_flows_msg(struct net_device *dev,
+						u32 portid, int seq, u8 cmd,
+						int min, int max, int table)
+{
+	struct genlmsghdr *hdr;
+	struct nlattr *flows;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOBUFS);
+
+	hdr = genlmsg_put(skb, portid, seq, &net_flow_nl_family, 0, cmd);
+	if (!hdr)
+		goto out;
+
+	if (nla_put_u32(skb,
+			NET_FLOW_IDENTIFIER_TYPE,
+			NET_FLOW_IDENTIFIER_IFINDEX) ||
+	    nla_put_u32(skb, NET_FLOW_IDENTIFIER, dev->ifindex)) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	flows = nla_nest_start(skb, NET_FLOW_FLOWS);
+	if (!flows) {
+		err = -EMSGSIZE;
+		goto out;
+	}
+
+	err = dev->netdev_ops->ndo_flow_get_flows(skb, dev, table, min, max);
+	if (err < 0)
+		goto out_cancel;
+
+	nla_nest_end(skb, flows);
+
+	err = genlmsg_end(skb, hdr);
+	if (err < 0)
+		goto out;
+
+	return skb;
+out_cancel:
+	nla_nest_cancel(skb, flows);
+out:
+	nlmsg_free(skb);
+	return ERR_PTR(err);
+}
+
+static const
+struct nla_policy net_flow_table_flows_policy[NET_FLOW_TABLE_FLOWS_MAX + 1] = {
+	[NET_FLOW_TABLE_FLOWS_TABLE]   = { .type = NLA_U32,},
+	[NET_FLOW_TABLE_FLOWS_MINPRIO] = { .type = NLA_U32,},
+	[NET_FLOW_TABLE_FLOWS_MAXPRIO] = { .type = NLA_U32,},
+	[NET_FLOW_TABLE_FLOWS_FLOWS]   = { .type = NLA_NESTED,},
+};
+
+static int net_flow_table_cmd_get_flows(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct nlattr *tb[NET_FLOW_TABLE_FLOWS_MAX+1];
+	int table, min = -1, max = -1;
+	struct net_device *dev;
+	struct sk_buff *msg;
+	int err = -EINVAL;
+
+	dev = net_flow_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->netdev_ops->ndo_flow_get_flows) {
+		dev_put(dev);
+		return -EOPNOTSUPP;
+	}
+
+	if (!info->attrs[NET_FLOW_IDENTIFIER_TYPE] ||
+	    !info->attrs[NET_FLOW_IDENTIFIER] ||
+	    !info->attrs[NET_FLOW_FLOWS])
+		goto out;
+
+	err = nla_parse_nested(tb, NET_FLOW_TABLE_FLOWS_MAX,
+			       info->attrs[NET_FLOW_FLOWS],
+			       net_flow_table_flows_policy);
+	if (err)
+		goto out;
+
+	if (!tb[NET_FLOW_TABLE_FLOWS_TABLE])
+		goto out;
+
+	table = nla_get_u32(tb[NET_FLOW_TABLE_FLOWS_TABLE]);
+
+	if (tb[NET_FLOW_TABLE_FLOWS_MINPRIO])
+		min = nla_get_u32(tb[NET_FLOW_TABLE_FLOWS_MINPRIO]);
+	if (tb[NET_FLOW_TABLE_FLOWS_MAXPRIO])
+		max = nla_get_u32(tb[NET_FLOW_TABLE_FLOWS_MAXPRIO]);
+
+	msg = net_flow_build_flows_msg(dev,
+				       info->snd_portid,
+				       info->snd_seq,
+				       NET_FLOW_TABLE_CMD_GET_FLOWS,
+				       min, max, table);
+	dev_put(dev);
+
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	return genlmsg_reply(msg, info);
+out:
+	dev_put(dev);
+	return err;
+}
+
+static struct sk_buff *net_flow_start_errmsg(struct net_device *dev,
+					     struct genlmsghdr **hdr,
+					     u32 portid, int seq, u8 cmd)
+{
+	struct genlmsghdr *h;
+	struct sk_buff *skb;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-EMSGSIZE);
+
+	h = genlmsg_put(skb, portid, seq, &net_flow_nl_family, 0, cmd);
+	if (!h)
+		return ERR_PTR(-EMSGSIZE);
+
+	if (nla_put_u32(skb,
+			NET_FLOW_IDENTIFIER_TYPE,
+			NET_FLOW_IDENTIFIER_IFINDEX) ||
+	    nla_put_u32(skb, NET_FLOW_IDENTIFIER, dev->ifindex))
+		return ERR_PTR(-EMSGSIZE);
+
+	*hdr = h;
+	return skb;
+}
+
+static struct sk_buff *net_flow_end_flow_errmsg(struct sk_buff *skb,
+						struct genlmsghdr *hdr)
+{
+	int err;
+
+	err = genlmsg_end(skb, hdr);
+	if (err < 0) {
+		nlmsg_free(skb);
+		return ERR_PTR(err);
+	}
+
+	return skb;
+}
+
+static int net_flow_put_flow_action(struct sk_buff *skb,
+				    struct net_flow_action *a)
+{
+	struct nlattr *action, *sigs;
+	int i, err = 0;
+
+	action = nla_nest_start(skb, NET_FLOW_ACTION);
+	if (!action)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, NET_FLOW_ACTION_ATTR_UID, a->uid))
+		return -EMSGSIZE;
+
+	if (!a->args)
+		goto done;
+
+	for (i = 0; a->args[i].type; i++) {
+		sigs = nla_nest_start(skb, NET_FLOW_ACTION_ATTR_SIGNATURE);
+		if (!sigs) {
+			nla_nest_cancel(skb, action);
+			return -EMSGSIZE;
+		}
+
+		err = net_flow_put_act_types(skb, a[i].args);
+		if (err) {
+			nla_nest_cancel(skb, action);
+			nla_nest_cancel(skb, sigs);
+			return err;
+		}
+		nla_nest_end(skb, sigs);
+	}
+
+done:
+	nla_nest_end(skb, action);
+	return 0;
+}
+
+int net_flow_put_flow(struct sk_buff *skb, struct net_flow_flow *flow)
+{
+	struct nlattr *flows, *matches;
+	struct nlattr *actions = NULL; /* must be null to unwind */
+	int err, j, i = 0;
+
+	flows = nla_nest_start(skb, NET_FLOW_FLOW);
+	if (!flows)
+		goto put_failure;
+
+	if (nla_put_u32(skb, NET_FLOW_ATTR_TABLE, flow->table_id) ||
+	    nla_put_u32(skb, NET_FLOW_ATTR_UID, flow->uid) ||
+	    nla_put_u32(skb, NET_FLOW_ATTR_PRIORITY, flow->priority))
+		goto flows_put_failure;
+
+	if (flow->matches) {
+		matches = nla_nest_start(skb, NET_FLOW_ATTR_MATCHES);
+		if (!matches)
+			goto flows_put_failure;
+
+		for (j = 0; flow->matches && flow->matches[j].header; j++) {
+			struct net_flow_field_ref *f = &flow->matches[j];
+
+			if (!f->header)
+				continue;
+
+			nla_put(skb, NET_FLOW_FIELD_REF, sizeof(*f), f);
+		}
+		nla_nest_end(skb, matches);
+	}
+
+	if (flow->actions) {
+		actions = nla_nest_start(skb, NET_FLOW_ATTR_ACTIONS);
+		if (!actions)
+			goto flows_put_failure;
+
+		for (i = 0; flow->actions && flow->actions[i].uid; i++) {
+			err = net_flow_put_flow_action(skb, &flow->actions[i]);
+			if (err) {
+				nla_nest_cancel(skb, actions);
+				goto flows_put_failure;
+			}
+		}
+		nla_nest_end(skb, actions);
+	}
+
+	nla_nest_end(skb, flows);
+	return 0;
+
+flows_put_failure:
+	nla_nest_cancel(skb, flows);
+put_failure:
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL(net_flow_put_flow);
+
+static int net_flow_get_field(struct net_flow_field_ref *field,
+			      struct nlattr *nla)
+{
+	if (nla_type(nla) != NET_FLOW_FIELD_REF)
+		return -EINVAL;
+
+	if (nla_len(nla) < sizeof(*field))
+		return -EINVAL;
+
+	*field = *(struct net_flow_field_ref *)nla_data(nla);
+	return 0;
+}
+
+static int net_flow_get_action(struct net_flow_action *a, struct nlattr *attr)
+{
+	struct nlattr *act[NET_FLOW_ACTION_ATTR_MAX+1];
+	struct nlattr *args;
+	int rem;
+	int err, count = 0;
+
+	if (nla_type(attr) != NET_FLOW_ACTION) {
+		pr_warn("%s: expected NET_FLOW_ACTION\n", __func__);
+		return 0;
+	}
+
+	err = nla_parse_nested(act, NET_FLOW_ACTION_ATTR_MAX,
+			       attr, net_flow_action_policy);
+	if (err < 0)
+		return err;
+
+	if (!act[NET_FLOW_ACTION_ATTR_UID] ||
+	    !act[NET_FLOW_ACTION_ATTR_SIGNATURE])
+		return -EINVAL;
+
+	a->uid = nla_get_u32(act[NET_FLOW_ACTION_ATTR_UID]);
+
+	nla_for_each_nested(args, act[NET_FLOW_ACTION_ATTR_SIGNATURE], rem)
+		count++; /* unoptimized max possible */
+
+	a->args = kcalloc(count + 1,
+			  sizeof(struct net_flow_action_arg),
+			  GFP_KERNEL);
+	count = 0;
+
+	nla_for_each_nested(args, act[NET_FLOW_ACTION_ATTR_SIGNATURE], rem) {
+		if (nla_type(args) != NET_FLOW_ACTION_ARG)
+			continue;
+
+		if (nla_len(args) < sizeof(struct net_flow_action_arg)) {
+			kfree(a->args);
+			return -EINVAL;
+		}
+
+		a->args[count] = *(struct net_flow_action_arg *)nla_data(args);
+	}
+	return 0;
+}
+
+static const
+struct nla_policy net_flow_flow_policy[NET_FLOW_ATTR_MAX + 1] = {
+	[NET_FLOW_ATTR_TABLE]		= { .type = NLA_U32 },
+	[NET_FLOW_ATTR_UID]		= { .type = NLA_U32 },
+	[NET_FLOW_ATTR_PRIORITY]	= { .type = NLA_U32 },
+	[NET_FLOW_ATTR_MATCHES]		= { .type = NLA_NESTED },
+	[NET_FLOW_ATTR_ACTIONS]		= { .type = NLA_NESTED },
+};
+
+static int net_flow_get_flow(struct net_flow_flow *flow, struct nlattr *attr)
+{
+	struct nlattr *f[NET_FLOW_ATTR_MAX+1];
+	struct nlattr *attr2;
+	int rem, err;
+	int count = 0;
+
+	err = nla_parse_nested(f, NET_FLOW_ATTR_MAX,
+			       attr, net_flow_flow_policy);
+	if (err < 0)
+		return -EINVAL;
+
+	if (!f[NET_FLOW_ATTR_TABLE] || !f[NET_FLOW_ATTR_UID] ||
+	    !f[NET_FLOW_ATTR_PRIORITY])
+		return -EINVAL;
+
+	flow->table_id = nla_get_u32(f[NET_FLOW_ATTR_TABLE]);
+	flow->uid = nla_get_u32(f[NET_FLOW_ATTR_UID]);
+	flow->priority = nla_get_u32(f[NET_FLOW_ATTR_PRIORITY]);
+
+	flow->matches = NULL;
+	flow->actions = NULL;
+
+	if (f[NET_FLOW_ATTR_MATCHES]) {
+		nla_for_each_nested(attr2, f[NET_FLOW_ATTR_MATCHES], rem)
+			count++;
+
+		/* Null terminated list of matches */
+		flow->matches = kcalloc(count + 1,
+					sizeof(struct net_flow_field_ref),
+					GFP_KERNEL);
+		if (!flow->matches)
+			return -ENOMEM;
+
+		count = 0;
+		nla_for_each_nested(attr2, f[NET_FLOW_ATTR_MATCHES], rem) {
+			err = net_flow_get_field(&flow->matches[count], attr2);
+			if (err) {
+				kfree(flow->matches);
+				return err;
+			}
+			count++;
+		}
+	}
+
+	if (f[NET_FLOW_ATTR_ACTIONS]) {
+		count = 0;
+		nla_for_each_nested(attr2, f[NET_FLOW_ATTR_ACTIONS], rem)
+			count++;
+
+		/* Null terminated list of actions */
+		flow->actions = kcalloc(count + 1,
+					sizeof(struct net_flow_action),
+					GFP_KERNEL);
+		if (!flow->actions) {
+			kfree(flow->matches);
+			return -ENOMEM;
+		}
+
+		count = 0;
+		nla_for_each_nested(attr2, f[NET_FLOW_ATTR_ACTIONS], rem) {
+			err = net_flow_get_action(&flow->actions[count], attr2);
+			if (err) {
+				kfree(flow->matches);
+				kfree(flow->actions);
+				return err;
+			}
+			count++;
+		}
+	}
+
+	return 0;
+}
+
+static int net_flow_table_cmd_flows(struct sk_buff *recv_skb,
+				    struct genl_info *info)
+{
+	int rem, err_handle = NET_FLOW_FLOWS_ERROR_ABORT;
+	struct sk_buff *skb = NULL;
+	struct net_flow_flow this;
+	struct genlmsghdr *hdr;
+	struct net_device *dev;
+	struct nlattr *flow, *flows;
+	int cmd = info->genlhdr->cmd;
+	int err = -EOPNOTSUPP;
+
+	dev = net_flow_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->netdev_ops->ndo_flow_set_flows ||
+	    !dev->netdev_ops->ndo_flow_del_flows)
+		goto out;
+
+	if (!info->attrs[NET_FLOW_IDENTIFIER_TYPE] ||
+	    !info->attrs[NET_FLOW_IDENTIFIER] ||
+	    !info->attrs[NET_FLOW_FLOWS]) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (info->attrs[NET_FLOW_FLOWS_ERROR])
+		err_handle = nla_get_u32(info->attrs[NET_FLOW_FLOWS_ERROR]);
+
+	nla_for_each_nested(flow, info->attrs[NET_FLOW_FLOWS], rem) {
+		if (nla_type(flow) != NET_FLOW_FLOW)
+			continue;
+
+		err = net_flow_get_flow(&this, flow);
+		if (err)
+			goto out;
+
+		switch (cmd) {
+		case NET_FLOW_TABLE_CMD_SET_FLOWS:
+			err = dev->netdev_ops->ndo_flow_set_flows(dev, &this);
+			break;
+		case NET_FLOW_TABLE_CMD_DEL_FLOWS:
+			err = dev->netdev_ops->ndo_flow_del_flows(dev, &this);
+			break;
+		default:
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		if (err && err_handle != NET_FLOW_FLOWS_ERROR_CONTINUE) {
+			if (!skb) {
+				skb = net_flow_start_errmsg(dev, &hdr,
+							    info->snd_portid,
+							    info->snd_seq,
+							    cmd);
+				if (IS_ERR(skb)) {
+					err = PTR_ERR(skb);
+					goto out_plus_free;
+				}
+
+				flows = nla_nest_start(skb, NET_FLOW_FLOWS);
+				if (!flows) {
+					err = -EMSGSIZE;
+					goto out_plus_free;
+				}
+			}
+
+			net_flow_put_flow(skb, &this);
+		}
+
+		/* Cleanup flow */
+		kfree(this.matches);
+		kfree(this.actions);
+
+		if (err && err_handle == NET_FLOW_FLOWS_ERROR_ABORT)
+			goto out;
+	}
+
+	dev_put(dev);
+
+	if (skb) {
+		nla_nest_end(skb, flows);
+		net_flow_end_flow_errmsg(skb, hdr);
+		return genlmsg_reply(skb, info);
+	}
+	return 0;
+
+out_plus_free:
+	kfree(this.matches);
+	kfree(this.actions);
+out:
+	if (skb)
+		nlmsg_free(skb);
+	dev_put(dev);
+	return -EINVAL;
+}
+
 static const struct nla_policy net_flow_cmd_policy[NET_FLOW_MAX + 1] = {
 	[NET_FLOW_IDENTIFIER_TYPE] = {.type = NLA_U32, },
 	[NET_FLOW_IDENTIFIER]	   = {.type = NLA_U32, },
@@ -815,6 +1298,24 @@ static const struct genl_ops net_flow_table_nl_ops[] = {
 		.policy = net_flow_cmd_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NET_FLOW_TABLE_CMD_GET_FLOWS,
+		.doit = net_flow_table_cmd_get_flows,
+		.policy = net_flow_cmd_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NET_FLOW_TABLE_CMD_SET_FLOWS,
+		.doit = net_flow_table_cmd_flows,
+		.policy = net_flow_cmd_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NET_FLOW_TABLE_CMD_DEL_FLOWS,
+		.doit = net_flow_table_cmd_flows,
+		.policy = net_flow_cmd_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 static int __init net_flow_nl_module_init(void)

^ permalink raw reply related

* [net-next PATCH v1 01/11] net: flow_table: create interface for hw match/action tables
From: John Fastabend @ 2014-12-31 19:45 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy
In-Reply-To: <20141231194057.31070.5244.stgit@nitbit.x32>

Currently, we do not have an interface to query hardware and learn
the capabilities of the device. This makes it very difficult to use
hardware flow tables.

At the moment the only interface we have to work with hardware flow
tables is ethtool. This has many deficiencies, first its ioctl based
making it difficult to use in systems that need to monitor interfaces
because there is no support for multicast, notifiers, etc.

The next big gap is it doesn't support querying devices for
capabilities. The only way to learn hardware entries is by doing a
"try and see" operation. An error perhaps indicating the device can
not support your request but could be possibly for other reasons.
Maybe a table is full for example. The existing flow interface only
supports a single ingress table which is sufficient for some of the
existing NIC host interfaces but limiting for more advanced NIC
interfaces and switch devices.

Also it is not extensible without recompiling both drivers and core
interfaces. It may be possible to reprogram a device with additional
header types, new protocols, whatever and it would be great if the
flow table infrastructure can handle this.

So this patch scraps the ethtool flow classifier interface and
creates a new flow table interface. It is expected that device that
support the existing ethtool interface today can support both
interfaces without too much difficulty. I did a proof point on the
ixgbe driver. Only choosing ixgbe because I have a 82599 10Gbps
device in my development system. A more thorough implementation
was done for the rocker switch showing how to use the interface.

In this patch we create interfaces to get the headers a device
supports, the actions it supports, a header graph showing the
relationship between headers the device supports, the tables
supported by the device and how they are connected.

This patch _only_ provides the get routines in an attempt to
make the patch sequence manageable.

get_headers :

   report a set of headers/fields the device supports. These
   are specified as length/offsets so we can support standard
   protocols or vendor specific headers. This is more flexible
   then bitmasks of pre-defined packet types. In 'tc' for example
   I may use u32 to match on proprietary or vendor specific fields.
   A bitmask approach does not allow for this, but defining the
   fields as a set of offsets and lengths allows for this.

   A device that supports Openflow version 1.x for example could
   provide the set of field/offsets that are equivelent to the
   specification.

   One property of this type of interface is I don't have to
   rebuild my kernel/driver header interfaces, etc to support the
   latest and greatest trendy protocol foo.

   For some types of metadata the device understands we also
   use header fields to represent these. One example of this is
   we may have an ingress_port metadata field to report the
   port a packet was received on. At the moment we expect the
   metadata fields to be defined outside the interface. We can
   standardize on common ones such "ingress_port" across devices.

   Some examples of outside definitions specifying metadata
   might be OVS, internal definitions like skb->mark, or some
   FoRCES definitions.

get_header_graph :

   Simply providing a header/field offset I support is not sufficient
   to learn how many nested 802.1Q tags I can support and other
   similar cases where the ordering of headers matters.

   So we use this operation to query the device for a header
   graph showing how the headers need to be related.
   With this operation and the 'get_headers' operation you can
   interrogate the driver with questions like "do you support
   Q'in'Q?", "how many VLAN tags can I nest before the parser
   breaks?", "Do you support MPLS?", "How about Foo Header in
   a VXLAN tunnel?".

get_actions :

   Report a list of actions supported by the device along with the
   arguments they take. So "drop_packet" action takes no arguments
   and "set_field" action takes two arguments a field and value.

   This suffers again from being slightly opaque. Meaning if a device
   reports back action "foo_bar" with three arguments how do I as a
   consumer of this "know" what that action is? The easy thing to do
   is punt on it and say it should be described outside the driver
   somewhere. OVS for example defines a set of actions. If my FoRCeS
   quick read is correct they define actions using text in the
   messaging interface. A follow up patch series could use a
   description language to describe actions. Possibly using something
   from eBPF or nftables for example. This patch will not try to
   solve the isuse now and expect actions are defined outside the API
   or are well known.

get_tables :

   Hardware may support one or more tables. Each table supports a set
   of matches and a set of actions. The match fields supported are
   defined above by the 'get_headers' operations. Similarly the actions
   supported are defined by the 'get_actions' operation.

   This allows the hardware to report several tables all with distinct
   capabilities. Tables also have table attributes used to describe
   features of the table. Because netlink messages are TLV based we
   can easily add new table attribues as needed.

   Currently a table has two attributes size and source. The size
   indicates how many "slots" are in the table for flow entries. One
   caveat here is a rule in the flow table may consume multiple slots
   in the table. We deal with this in a subsequent patch.

   The source field is used to indicate table boundaries where actions
   are applied. A table with the same source value will not "see"
   actions from tables with the same source. An example where this is
   relavent would be to have an action to re-write the destiniation
   IP address of a packet. If you have a match rule in a table with
   the same source that matches on the new IP address it will not be
   hit. However if it is in a table with a different source value
   _and_ in another table that gets applied the rule will be hit. See
   the next operatoin for querying table ordering.

   Some basic hardware may only support a single table which simplifies
   some things. But even the simple 10/40Gbps NICs support multiple
   tables and different tables depending on ingress/egress.

get_table_graph :

   When a device supports multiple tables we need to identify how the
   tables are connected when each table is executed.

   To do this we provide a table graph which gives the pipeline of the
   device. The graph gives nodes representing each table and the edges
   indicate the criteria to progress to the next flow table. There are
   examples of this type of thing in both FoRCES and OVS. OVS
   prescribes a set of tables reachable with goto actions and FoRCES a
   slightly more flexible arrangement. In software tc's u32 classifier
   allows "linking" hash tables together. The OVS dataplane with the
   support of 'goto' action is completely connected. Without the
   'goto' action the tables are progressed linearly.

   By querying the graph from hardware we can "learn" what table flows
   are supported and map them into software.

   We also provide a bit to indicate if the node is a root node of the
   ingress pipeline or egress pipeline. This is used on devices that
   have different pipelines for ingres and egress. This appears to be
   fairly common for devices. The realtek chip presented at LPC in
   Dusseldorf for example appeared to have a separate ingress/egress
   pipeline.

With these five operations software can learn what types of fields
the hardware flow table supports and how they are arranged. Subsequent
patches will address programming the flow tables.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/linux/if_flow.h      |   93 +++++
 include/linux/netdevice.h    |   12 +
 include/uapi/linux/if_flow.h |  363 ++++++++++++++++++
 net/Kconfig                  |    7 
 net/core/Makefile            |    1 
 net/core/flow_table.c        |  837 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 1313 insertions(+)
 create mode 100644 include/linux/if_flow.h
 create mode 100644 include/uapi/linux/if_flow.h
 create mode 100644 net/core/flow_table.c

diff --git a/include/linux/if_flow.h b/include/linux/if_flow.h
new file mode 100644
index 0000000..1b6c1ea
--- /dev/null
+++ b/include/linux/if_flow.h
@@ -0,0 +1,93 @@
+/*
+ * include/linux/net/if_flow.h - Flow table interface for Switch devices
+ * Copyright (c) 2014 John Fastabend <john.r.fastabend@intel.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Author: John Fastabend <john.r.fastabend@intel.com>
+ */
+
+#ifndef _IF_FLOW_H
+#define _IF_FLOW_H
+
+#include <uapi/linux/if_flow.h>
+
+/**
+ * @struct net_flow_header
+ * @brief defines a match (header/field) an endpoint can use
+ *
+ * @uid unique identifier for header
+ * @field_sz number of fields are in the set
+ * @fields the set of fields in the net_flow_header
+ */
+struct net_flow_header {
+	char name[NET_FLOW_NAMSIZ];
+	int uid;
+	int field_sz;
+	struct net_flow_field *fields;
+};
+
+/**
+ * @struct net_flow_action
+ * @brief a description of a endpoint defined action
+ *
+ * @name printable name
+ * @uid unique action identifier
+ * @types NET_FLOW_ACTION_TYPE_NULL terminated list of action types
+ */
+struct net_flow_action {
+	char name[NET_FLOW_NAMSIZ];
+	int uid;
+	struct net_flow_action_arg *args;
+};
+
+/**
+ * @struct net_flow_table
+ * @brief define flow table with supported match/actions
+ *
+ * @uid unique identifier for table
+ * @source uid of parent table
+ * @size max number of entries for table or -1 for unbounded
+ * @matches null terminated set of supported match types given by match uid
+ * @actions null terminated set of supported action types given by action uid
+ * @flows set of flows
+ */
+struct net_flow_table {
+	char name[NET_FLOW_NAMSIZ];
+	int uid;
+	int source;
+	int size;
+	struct net_flow_field_ref *matches;
+	int *actions;
+};
+
+/* net_flow_hdr_node: node in a header graph of header fields.
+ *
+ * @uid : unique id of the graph node
+ * @flwo_header_ref : identify the hdrs that can handled by this node
+ * @net_flow_jump_table : give a case jump statement
+ */
+struct net_flow_hdr_node {
+	char name[NET_FLOW_NAMSIZ];
+	int uid;
+	int *hdrs;
+	struct net_flow_jump_table *jump;
+};
+
+struct net_flow_tbl_node {
+	int uid;
+	__u32 flags;
+	struct net_flow_jump_table *jump;
+};
+#endif
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 29c92ee..3c3c856 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,6 +52,11 @@
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
 
+#ifdef CONFIG_NET_FLOW_TABLES
+#include <linux/if_flow.h>
+#include <uapi/linux/if_flow.h>
+#endif
+
 struct netpoll_info;
 struct device;
 struct phy_device;
@@ -1186,6 +1191,13 @@ struct net_device_ops {
 	int			(*ndo_switch_port_stp_update)(struct net_device *dev,
 							      u8 state);
 #endif
+#ifdef CONFIG_NET_FLOW_TABLES
+	struct net_flow_action  **(*ndo_flow_get_actions)(struct net_device *dev);
+	struct net_flow_table	**(*ndo_flow_get_tables)(struct net_device *dev);
+	struct net_flow_header	**(*ndo_flow_get_headers)(struct net_device *dev);
+	struct net_flow_hdr_node **(*ndo_flow_get_hdr_graph)(struct net_device *dev);
+	struct net_flow_tbl_node **(*ndo_flow_get_tbl_graph)(struct net_device *dev);
+#endif
 };
 
 /**
diff --git a/include/uapi/linux/if_flow.h b/include/uapi/linux/if_flow.h
new file mode 100644
index 0000000..2acdb38
--- /dev/null
+++ b/include/uapi/linux/if_flow.h
@@ -0,0 +1,363 @@
+/*
+ * include/uapi/linux/if_flow.h - Flow table interface for Switch devices
+ * Copyright (c) 2014 John Fastabend <john.r.fastabend@intel.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Author: John Fastabend <john.r.fastabend@intel.com>
+ */
+
+/* Netlink description:
+ *
+ * Table definition used to describe running tables. The following
+ * describes the netlink message returned from a flow API messages.
+ *
+ * Flow table definitions used to define tables.
+ *
+ * [NET_FLOW_TABLE_IDENTIFIER_TYPE]
+ * [NET_FLOW_TABLE_IDENTIFIER]
+ * [NET_FLOW_TABLE_TABLES]
+ *     [NET_FLOW_TABLE]
+ *       [NET_FLOW_TABLE_ATTR_NAME]
+ *       [NET_FLOW_TABLE_ATTR_UID]
+ *       [NET_FLOW_TABLE_ATTR_SOURCE]
+ *       [NET_FLOW_TABLE_ATTR_SIZE]
+ *	 [NET_FLOW_TABLE_ATTR_MATCHES]
+ *	   [NET_FLOW_FIELD_REF]
+ *	   [NET_FLOW_FIELD_REF]
+ *	     [...]
+ *	   [...]
+ *	 [NET_FLOW_TABLE_ATTR_ACTIONS]
+ *	   [NET_FLOW_ACTION]
+ *	     [NET_FLOW_ACTION_ATTR_NAME]
+ *	     [NET_FLOW_ACTION_ATTR_UID]
+ *	     [NET_FLOW_ACTION_ATTR_SIGNATURE]
+ *		 [NET_FLOW_ACTION_ARG]
+ *	         [NET_FLOW_ACTION_ARG]
+ *	         [...]
+ *	   [NET_FLOW_ACTION]
+ *	     [...]
+ *	   [...]
+ *     [NET_FLOW_TABLE]
+ *       [...]
+ *
+ * Header definitions used to define headers with user friendly
+ * names.
+ *
+ * [NET_FLOW_TABLE_HEADERS]
+ *   [NET_FLOW_HEADER]
+ *	[NET_FLOW_HEADER_ATTR_NAME]
+ *	[NET_FLOW_HEADER_ATTR_UID]
+ *	[NET_FLOW_HEADER_ATTR_FIELDS]
+ *	  [NET_FLOW_HEADER_ATTR_FIELD]
+ *	    [NET_FLOW_FIELD_ATTR_NAME]
+ *	    [NET_FLOW_FIELD_ATTR_UID]
+ *	    [NET_FLOW_FIELD_ATTR_BITWIDTH]
+ *	  [NET_FLOW_HEADER_ATTR_FIELD]
+ *	    [...]
+ *	  [...]
+ *   [NET_FLOW_HEADER]
+ *      [...]
+ *   [...]
+ *
+ * Action definitions supported by tables
+ *
+ * [NET_FLOW_TABLE_ACTIONS]
+ *   [NET_FLOW_TABLE_ATTR_ACTIONS]
+ *	[NET_FLOW_ACTION]
+ *	  [NET_FLOW_ACTION_ATTR_NAME]
+ *	  [NET_FLOW_ACTION_ATTR_UID]
+ *	  [NET_FLOW_ACTION_ATTR_SIGNATURE]
+ *		 [NET_FLOW_ACTION_ARG]
+ *	         [NET_FLOW_ACTION_ARG]
+ *               [...]
+ *	[NET_FLOW_ACTION]
+ *	     [...]
+ *
+ * Parser definition used to unambiguously define match headers.
+ *
+ * [NET_FLOW_TABLE_PARSE_GRAPH]
+ *
+ * Primitive Type descriptions
+ *
+ * Get Table Graph <Request> only requires msg preamble.
+ *
+ * Get Table Graph <Reply> description
+ *
+ * [NET_FLOW_TABLE_TABLE_GRAPH]
+ *   [TABLE_GRAPH_NODE]
+ *	[TABLE_GRAPH_NODE_UID]
+ *	[TABLE_GRAPH_NODE_JUMP]
+ *	  [NET_FLOW_JUMP_TABLE_ENTRY]
+ *	  [NET_FLOW_JUMP_TABLE_ENTRY]
+ *	    [...]
+ *   [TABLE_GRAPH_NODE]
+ *	[..]
+ */
+
+#ifndef _UAPI_LINUX_IF_FLOW
+#define _UAPI_LINUX_IF_FLOW
+
+#include <linux/types.h>
+#include <linux/netlink.h>
+#include <linux/if.h>
+
+#define NET_FLOW_NAMSIZ 80
+
+/**
+ * @struct net_flow_fields
+ * @brief defines a field in a header
+ */
+struct net_flow_field {
+	char name[NET_FLOW_NAMSIZ];
+	int uid;
+	int bitwidth;
+};
+
+enum {
+	NET_FLOW_FIELD_UNSPEC,
+	NET_FLOW_FIELD,
+	__NET_FLOW_FIELD_MAX,
+};
+#define NET_FLOW_FIELD_MAX (__NET_FLOW_FIELD_MAX - 1)
+
+enum {
+	NET_FLOW_FIELD_ATTR_UNSPEC,
+	NET_FLOW_FIELD_ATTR_NAME,
+	NET_FLOW_FIELD_ATTR_UID,
+	NET_FLOW_FIELD_ATTR_BITWIDTH,
+	__NET_FLOW_FIELD_ATTR_MAX,
+};
+#define NET_FLOW_FIELD_ATTR_MAX (__NET_FLOW_FIELD_ATTR_MAX - 1)
+
+enum {
+	NET_FLOW_HEADER_UNSPEC,
+	NET_FLOW_HEADER,
+	__NET_FLOW_HEADER_MAX,
+};
+#define NET_FLOW_HEADER_MAX (__NET_FLOW_HEADER_MAX - 1)
+
+enum {
+	NET_FLOW_HEADER_ATTR_UNSPEC,
+	NET_FLOW_HEADER_ATTR_NAME,
+	NET_FLOW_HEADER_ATTR_UID,
+	NET_FLOW_HEADER_ATTR_FIELDS,
+	__NET_FLOW_HEADER_ATTR_MAX,
+};
+#define NET_FLOW_HEADER_ATTR_MAX (__NET_FLOW_HEADER_ATTR_MAX - 1)
+
+enum {
+	NET_FLOW_MASK_TYPE_UNSPEC,
+	NET_FLOW_MASK_TYPE_EXACT,
+	NET_FLOW_MASK_TYPE_LPM,
+};
+
+/**
+ * @struct net_flow_field_ref
+ * @brief uniquely identify field as header:field tuple
+ */
+struct net_flow_field_ref {
+	int instance;
+	int header;
+	int field;
+	int mask_type;
+	int type;
+	union {	/* Are these all the required data types */
+		__u8 value_u8;
+		__u16 value_u16;
+		__u32 value_u32;
+		__u64 value_u64;
+	};
+	union {	/* Are these all the required data types */
+		__u8 mask_u8;
+		__u16 mask_u16;
+		__u32 mask_u32;
+		__u64 mask_u64;
+	};
+};
+
+enum {
+	NET_FLOW_FIELD_REF_UNSPEC,
+	NET_FLOW_FIELD_REF,
+	__NET_FLOW_FIELD_REF_MAX,
+};
+#define NET_FLOW_FIELD_REF_MAX (__NET_FLOW_FIELD_REF_MAX - 1)
+
+enum {
+	NET_FLOW_FIELD_REF_ATTR_TYPE_UNSPEC,
+	NET_FLOW_FIELD_REF_ATTR_TYPE_U8,
+	NET_FLOW_FIELD_REF_ATTR_TYPE_U16,
+	NET_FLOW_FIELD_REF_ATTR_TYPE_U32,
+	NET_FLOW_FIELD_REF_ATTR_TYPE_U64,
+	/* Need more types for ether.addrs, ip.addrs, ... */
+};
+
+enum net_flow_action_arg_type {
+	NET_FLOW_ACTION_ARG_TYPE_NULL,
+	NET_FLOW_ACTION_ARG_TYPE_U8,
+	NET_FLOW_ACTION_ARG_TYPE_U16,
+	NET_FLOW_ACTION_ARG_TYPE_U32,
+	NET_FLOW_ACTION_ARG_TYPE_U64,
+	__NET_FLOW_ACTION_ARG_TYPE_VAL_MAX,
+};
+
+struct net_flow_action_arg {
+	char name[NET_FLOW_NAMSIZ];
+	enum net_flow_action_arg_type type;
+	union {
+		__u8  value_u8;
+		__u16 value_u16;
+		__u32 value_u32;
+		__u64 value_u64;
+	};
+};
+
+enum {
+	NET_FLOW_ACTION_ARG_UNSPEC,
+	NET_FLOW_ACTION_ARG,
+	__NET_FLOW_ACTION_ARG_MAX,
+};
+#define NET_FLOW_ACTION_ARG_MAX (__NET_FLOW_ACTION_ARG_MAX - 1)
+
+enum {
+	NET_FLOW_ACTION_UNSPEC,
+	NET_FLOW_ACTION,
+	__NET_FLOW_ACTION_MAX,
+};
+#define NET_FLOW_ACTION_MAX (__NET_FLOW_ACTION_MAX - 1)
+
+enum {
+	NET_FLOW_ACTION_ATTR_UNSPEC,
+	NET_FLOW_ACTION_ATTR_NAME,
+	NET_FLOW_ACTION_ATTR_UID,
+	NET_FLOW_ACTION_ATTR_SIGNATURE,
+	__NET_FLOW_ACTION_ATTR_MAX,
+};
+#define NET_FLOW_ACTION_ATTR_MAX (__NET_FLOW_ACTION_ATTR_MAX - 1)
+
+enum {
+	NET_FLOW_ACTION_SET_UNSPEC,
+	NET_FLOW_ACTION_SET_ACTIONS,
+	__NET_FLOW_ACTION_SET_MAX,
+};
+#define NET_FLOW_ACTION_SET_MAX (__NET_FLOW_ACTION_SET_MAX - 1)
+
+enum {
+	NET_FLOW_TABLE_UNSPEC,
+	NET_FLOW_TABLE,
+	__NET_FLOW_TABLE_MAX,
+};
+#define NET_FLOW_TABLE_MAX (__NET_FLOW_TABLE_MAX - 1)
+
+enum {
+	NET_FLOW_TABLE_ATTR_UNSPEC,
+	NET_FLOW_TABLE_ATTR_NAME,
+	NET_FLOW_TABLE_ATTR_UID,
+	NET_FLOW_TABLE_ATTR_SOURCE,
+	NET_FLOW_TABLE_ATTR_SIZE,
+	NET_FLOW_TABLE_ATTR_MATCHES,
+	NET_FLOW_TABLE_ATTR_ACTIONS,
+	__NET_FLOW_TABLE_ATTR_MAX,
+};
+#define NET_FLOW_TABLE_ATTR_MAX (__NET_FLOW_TABLE_ATTR_MAX - 1)
+
+struct net_flow_jump_table {
+	struct net_flow_field_ref field;
+	int node; /* <0 is a parser error */
+};
+
+#define NET_FLOW_JUMP_TABLE_DONE	-1
+
+enum {
+	NET_FLOW_JUMP_TABLE_ENTRY_UNSPEC,
+	NET_FLOW_JUMP_TABLE_ENTRY,
+	__NET_FLOW_JUMP_TABLE_ENTRY_MAX,
+};
+
+enum {
+	NET_FLOW_HEADER_NODE_HDRS_UNSPEC,
+	NET_FLOW_HEADER_NODE_HDRS_VALUE,
+	__NET_FLOW_HEADER_NODE_HDRS_MAX,
+};
+#define NET_FLOW_HEADER_NODE_HDRS_MAX (__NET_FLOW_HEADER_NODE_HDRS_MAX - 1)
+
+enum {
+	NET_FLOW_HEADER_NODE_UNSPEC,
+	NET_FLOW_HEADER_NODE_NAME,
+	NET_FLOW_HEADER_NODE_UID,
+	NET_FLOW_HEADER_NODE_HDRS,
+	NET_FLOW_HEADER_NODE_JUMP,
+	__NET_FLOW_HEADER_NODE_MAX,
+};
+#define NET_FLOW_HEADER_NODE_MAX (__NET_FLOW_HEADER_NODE_MAX - 1)
+
+enum {
+	NET_FLOW_HEADER_GRAPH_UNSPEC,
+	NET_FLOW_HEADER_GRAPH_NODE,
+	__NET_FLOW_HEADER_GRAPH_MAX,
+};
+#define NET_FLOW_HEADER_GRAPH_MAX (__NET_FLOW_HEADER_GRAPH_MAX - 1)
+
+#define NET_FLOW_TABLE_EGRESS_ROOT 1
+#define	NET_FLOW_TABLE_INGRESS_ROOT 2
+
+enum {
+	NET_FLOW_TABLE_GRAPH_NODE_UNSPEC,
+	NET_FLOW_TABLE_GRAPH_NODE_UID,
+	NET_FLOW_TABLE_GRAPH_NODE_FLAGS,
+	NET_FLOW_TABLE_GRAPH_NODE_JUMP,
+	__NET_FLOW_TABLE_GRAPH_NODE_MAX,
+};
+#define NET_FLOW_TABLE_GRAPH_NODE_MAX (__NET_FLOW_TABLE_GRAPH_NODE_MAX - 1)
+
+enum {
+	NET_FLOW_TABLE_GRAPH_UNSPEC,
+	NET_FLOW_TABLE_GRAPH_NODE,
+	__NET_FLOW_TABLE_GRAPH_MAX,
+};
+#define NET_FLOW_TABLE_GRAPH_MAX (__NET_FLOW_TABLE_GRAPH_MAX - 1)
+
+enum {
+	NET_FLOW_IDENTIFIER_IFINDEX, /* net_device ifindex */
+};
+
+enum {
+	NET_FLOW_UNSPEC,
+	NET_FLOW_IDENTIFIER_TYPE,
+	NET_FLOW_IDENTIFIER,
+
+	NET_FLOW_TABLES,
+	NET_FLOW_HEADERS,
+	NET_FLOW_ACTIONS,
+	NET_FLOW_HEADER_GRAPH,
+	NET_FLOW_TABLE_GRAPH,
+
+	__NET_FLOW_MAX,
+	NET_FLOW_MAX = (__NET_FLOW_MAX - 1),
+};
+
+enum {
+	NET_FLOW_TABLE_CMD_GET_TABLES,
+	NET_FLOW_TABLE_CMD_GET_HEADERS,
+	NET_FLOW_TABLE_CMD_GET_ACTIONS,
+	NET_FLOW_TABLE_CMD_GET_HDR_GRAPH,
+	NET_FLOW_TABLE_CMD_GET_TABLE_GRAPH,
+
+	__NET_FLOW_CMD_MAX,
+	NET_FLOW_CMD_MAX = (__NET_FLOW_CMD_MAX - 1),
+};
+
+#define NET_FLOW_GENL_NAME "net_flow_table"
+#define NET_FLOW_GENL_VERSION 0x1
+#endif /* _UAPI_LINUX_IF_FLOW */
diff --git a/net/Kconfig b/net/Kconfig
index ff9ffc1..8380bfe 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -293,6 +293,13 @@ config NET_FLOW_LIMIT
 	  with many clients some protection against DoS by a single (spoofed)
 	  flow that greatly exceeds average workload.
 
+config NET_FLOW_TABLES
+	boolean "Support network flow tables"
+	---help---
+	This feature provides an interface for device drivers to report
+	flow tables and supported matches and actions. If you do not
+	want to support hardware offloads for flow tables, say N here.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/Makefile b/net/core/Makefile
index 235e6c5..1eea785 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
 obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_NET_FLOW_TABLES) += flow_table.o
diff --git a/net/core/flow_table.c b/net/core/flow_table.c
new file mode 100644
index 0000000..ec3f06d
--- /dev/null
+++ b/net/core/flow_table.c
@@ -0,0 +1,837 @@
+/*
+ * include/uapi/linux/if_flow.h - Flow table interface for Switch devices
+ * Copyright (c) 2014 John Fastabend <john.r.fastabend@intel.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Author: John Fastabend <john.r.fastabend@intel.com>
+ */
+
+#include <uapi/linux/if_flow.h>
+#include <linux/if_flow.h>
+#include <linux/if_bridge.h>
+#include <linux/types.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <linux/module.h>
+
+static struct genl_family net_flow_nl_family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= NET_FLOW_GENL_NAME,
+	.version	= NET_FLOW_GENL_VERSION,
+	.maxattr	= NET_FLOW_MAX,
+	.netnsok	= true,
+};
+
+static struct net_device *net_flow_get_dev(struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	int type, ifindex;
+
+	if (!info->attrs[NET_FLOW_IDENTIFIER_TYPE] ||
+	    !info->attrs[NET_FLOW_IDENTIFIER])
+		return NULL;
+
+	type = nla_get_u32(info->attrs[NET_FLOW_IDENTIFIER_TYPE]);
+	switch (type) {
+	case NET_FLOW_IDENTIFIER_IFINDEX:
+		ifindex = nla_get_u32(info->attrs[NET_FLOW_IDENTIFIER]);
+		break;
+	default:
+		return NULL;
+	}
+
+	return dev_get_by_index(net, ifindex);
+}
+
+static int net_flow_put_act_types(struct sk_buff *skb,
+				  struct net_flow_action_arg *args)
+{
+	int i, err;
+
+	for (i = 0; args[i].type; i++) {
+		err = nla_put(skb, NET_FLOW_ACTION_ARG,
+			      sizeof(struct net_flow_action_arg), &args[i]);
+		if (err)
+			return -EMSGSIZE;
+	}
+	return 0;
+}
+
+static const
+struct nla_policy net_flow_action_policy[NET_FLOW_ACTION_ATTR_MAX + 1] = {
+	[NET_FLOW_ACTION_ATTR_NAME]	 = {.type = NLA_STRING,
+					    .len = NET_FLOW_NAMSIZ-1 },
+	[NET_FLOW_ACTION_ATTR_UID]	 = {.type = NLA_U32 },
+	[NET_FLOW_ACTION_ATTR_SIGNATURE] = {.type = NLA_NESTED },
+};
+
+static int net_flow_put_action(struct sk_buff *skb, struct net_flow_action *a)
+{
+	struct net_flow_action_arg *this;
+	struct nlattr *nest;
+	int err, args = 0;
+
+	if (a->name && nla_put_string(skb, NET_FLOW_ACTION_ATTR_NAME, a->name))
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, NET_FLOW_ACTION_ATTR_UID, a->uid))
+		return -EMSGSIZE;
+
+	if (!a->args)
+		return 0;
+
+	for (this = &a->args[0]; strlen(this->name) > 0; this++)
+		args++;
+
+	if (args) {
+		nest = nla_nest_start(skb, NET_FLOW_ACTION_ATTR_SIGNATURE);
+		if (!nest)
+			goto nest_put_failure;
+
+		err = net_flow_put_act_types(skb, a->args);
+		if (err) {
+			nla_nest_cancel(skb, nest);
+			return err;
+		}
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+nest_put_failure:
+	return -EMSGSIZE;
+}
+
+static int net_flow_put_actions(struct sk_buff *skb,
+				struct net_flow_action **acts)
+{
+	struct nlattr *actions;
+	int err, i;
+
+	actions = nla_nest_start(skb, NET_FLOW_ACTIONS);
+	if (!actions)
+		return -EMSGSIZE;
+
+	for (i = 0; acts[i]->uid; i++) {
+		struct nlattr *action = nla_nest_start(skb, NET_FLOW_ACTION);
+
+		if (!action)
+			goto action_put_failure;
+
+		err = net_flow_put_action(skb, acts[i]);
+		if (err)
+			goto action_put_failure;
+		nla_nest_end(skb, action);
+	}
+	nla_nest_end(skb, actions);
+
+	return 0;
+action_put_failure:
+	nla_nest_cancel(skb, actions);
+	return -EMSGSIZE;
+}
+
+struct sk_buff *net_flow_build_actions_msg(struct net_flow_action **a,
+					   struct net_device *dev,
+					   u32 portid, int seq, u8 cmd)
+{
+	struct genlmsghdr *hdr;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOBUFS);
+
+	hdr = genlmsg_put(skb, portid, seq, &net_flow_nl_family, 0, cmd);
+	if (!hdr)
+		goto out;
+
+	if (nla_put_u32(skb,
+			NET_FLOW_IDENTIFIER_TYPE,
+			NET_FLOW_IDENTIFIER_IFINDEX) ||
+	    nla_put_u32(skb, NET_FLOW_IDENTIFIER, dev->ifindex)) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	err = net_flow_put_actions(skb, a);
+	if (err < 0)
+		goto out;
+
+	err = genlmsg_end(skb, hdr);
+	if (err < 0)
+		goto out;
+
+	return skb;
+out:
+	nlmsg_free(skb);
+	return ERR_PTR(err);
+}
+
+static int net_flow_cmd_get_actions(struct sk_buff *skb,
+				    struct genl_info *info)
+{
+	struct net_flow_action **a;
+	struct net_device *dev;
+	struct sk_buff *msg;
+
+	dev = net_flow_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->netdev_ops->ndo_flow_get_actions) {
+		dev_put(dev);
+		return -EOPNOTSUPP;
+	}
+
+	a = dev->netdev_ops->ndo_flow_get_actions(dev);
+	if (!a)
+		return -EBUSY;
+
+	msg = net_flow_build_actions_msg(a, dev,
+					 info->snd_portid,
+					 info->snd_seq,
+					 NET_FLOW_TABLE_CMD_GET_ACTIONS);
+	dev_put(dev);
+
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	return genlmsg_reply(msg, info);
+}
+
+static int net_flow_put_table(struct net_device *dev,
+			      struct sk_buff *skb,
+			      struct net_flow_table *t)
+{
+	struct nlattr *matches, *actions;
+	int i;
+
+	if (nla_put_string(skb, NET_FLOW_TABLE_ATTR_NAME, t->name) ||
+	    nla_put_u32(skb, NET_FLOW_TABLE_ATTR_UID, t->uid) ||
+	    nla_put_u32(skb, NET_FLOW_TABLE_ATTR_SOURCE, t->source) ||
+	    nla_put_u32(skb, NET_FLOW_TABLE_ATTR_SIZE, t->size))
+		return -EMSGSIZE;
+
+	matches = nla_nest_start(skb, NET_FLOW_TABLE_ATTR_MATCHES);
+	if (!matches)
+		return -EMSGSIZE;
+
+	for (i = 0; t->matches[i].instance; i++)
+		nla_put(skb, NET_FLOW_FIELD_REF,
+			sizeof(struct net_flow_field_ref),
+			&t->matches[i]);
+	nla_nest_end(skb, matches);
+
+	actions = nla_nest_start(skb, NET_FLOW_TABLE_ATTR_ACTIONS);
+	if (!actions)
+		return -EMSGSIZE;
+
+	for (i = 0; t->actions[i]; i++) {
+		if (nla_put_u32(skb,
+				NET_FLOW_ACTION_ATTR_UID,
+				t->actions[i])) {
+			nla_nest_cancel(skb, actions);
+			return -EMSGSIZE;
+		}
+	}
+	nla_nest_end(skb, actions);
+
+	return 0;
+}
+
+static int net_flow_put_tables(struct net_device *dev,
+			       struct sk_buff *skb,
+			       struct net_flow_table **tables)
+{
+	struct nlattr *nest, *t;
+	int i, err = 0;
+
+	nest = nla_nest_start(skb, NET_FLOW_TABLES);
+	if (!nest)
+		return -EMSGSIZE;
+
+	for (i = 0; tables[i]->uid; i++) {
+		t = nla_nest_start(skb, NET_FLOW_TABLE);
+		if (!t) {
+			err = -EMSGSIZE;
+			goto errout;
+		}
+
+		err = net_flow_put_table(dev, skb, tables[i]);
+		if (err) {
+			nla_nest_cancel(skb, t);
+			goto errout;
+		}
+		nla_nest_end(skb, t);
+	}
+	nla_nest_end(skb, nest);
+	return 0;
+errout:
+	nla_nest_cancel(skb, nest);
+	return err;
+}
+
+static struct sk_buff *net_flow_build_tables_msg(struct net_flow_table **t,
+						 struct net_device *dev,
+						 u32 portid, int seq, u8 cmd)
+{
+	struct genlmsghdr *hdr;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOBUFS);
+
+	hdr = genlmsg_put(skb, portid, seq, &net_flow_nl_family, 0, cmd);
+	if (!hdr)
+		goto out;
+
+	if (nla_put_u32(skb,
+			NET_FLOW_IDENTIFIER_TYPE,
+			NET_FLOW_IDENTIFIER_IFINDEX) ||
+	    nla_put_u32(skb, NET_FLOW_IDENTIFIER, dev->ifindex)) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	err = net_flow_put_tables(dev, skb, t);
+	if (err < 0)
+		goto out;
+
+	err = genlmsg_end(skb, hdr);
+	if (err < 0)
+		goto out;
+
+	return skb;
+out:
+	nlmsg_free(skb);
+	return ERR_PTR(err);
+}
+
+static int net_flow_cmd_get_tables(struct sk_buff *skb,
+				   struct genl_info *info)
+{
+	struct net_flow_table **tables;
+	struct net_device *dev;
+	struct sk_buff *msg;
+
+	dev = net_flow_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->netdev_ops->ndo_flow_get_tables) {
+		dev_put(dev);
+		return -EOPNOTSUPP;
+	}
+
+	tables = dev->netdev_ops->ndo_flow_get_tables(dev);
+	if (!tables) /* transient failure should always have some table */
+		return -EBUSY;
+
+	msg = net_flow_build_tables_msg(tables, dev,
+					info->snd_portid,
+					info->snd_seq,
+					NET_FLOW_TABLE_CMD_GET_TABLES);
+	dev_put(dev);
+
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	return genlmsg_reply(msg, info);
+}
+
+static
+int net_flow_put_fields(struct sk_buff *skb, const struct net_flow_header *h)
+{
+	struct net_flow_field *f;
+	int count = h->field_sz;
+	struct nlattr *field;
+
+	for (f = h->fields; count; count--, f++) {
+		field = nla_nest_start(skb, NET_FLOW_FIELD);
+		if (!field)
+			goto field_put_failure;
+
+		if (nla_put_string(skb, NET_FLOW_FIELD_ATTR_NAME, f->name) ||
+		    nla_put_u32(skb, NET_FLOW_FIELD_ATTR_UID, f->uid) ||
+		    nla_put_u32(skb, NET_FLOW_FIELD_ATTR_BITWIDTH, f->bitwidth))
+			goto out;
+
+		nla_nest_end(skb, field);
+	}
+
+	return 0;
+out:
+	nla_nest_cancel(skb, field);
+field_put_failure:
+	return -EMSGSIZE;
+}
+
+static int net_flow_put_headers(struct sk_buff *skb,
+				struct net_flow_header **headers)
+{
+	struct nlattr *nest, *hdr, *fields;
+	struct net_flow_header *h;
+	int i, err;
+
+	nest = nla_nest_start(skb, NET_FLOW_HEADERS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	for (i = 0; headers[i]->uid; i++) {
+		err = -EMSGSIZE;
+		h = headers[i];
+
+		hdr = nla_nest_start(skb, NET_FLOW_HEADER);
+		if (!hdr)
+			goto hdr_put_failure;
+
+		if (nla_put_string(skb, NET_FLOW_HEADER_ATTR_NAME, h->name) ||
+		    nla_put_u32(skb, NET_FLOW_HEADER_ATTR_UID, h->uid))
+			goto attr_put_failure;
+
+		fields = nla_nest_start(skb, NET_FLOW_HEADER_ATTR_FIELDS);
+		if (!fields)
+			goto attr_put_failure;
+
+		err = net_flow_put_fields(skb, h);
+		if (err)
+			goto fields_put_failure;
+
+		nla_nest_end(skb, fields);
+
+		nla_nest_end(skb, hdr);
+	}
+	nla_nest_end(skb, nest);
+
+	return 0;
+fields_put_failure:
+	nla_nest_cancel(skb, fields);
+attr_put_failure:
+	nla_nest_cancel(skb, hdr);
+hdr_put_failure:
+	nla_nest_cancel(skb, nest);
+	return err;
+}
+
+static struct sk_buff *net_flow_build_headers_msg(struct net_flow_header **h,
+						  struct net_device *dev,
+						  u32 portid, int seq, u8 cmd)
+{
+	struct genlmsghdr *hdr;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOBUFS);
+
+	hdr = genlmsg_put(skb, portid, seq, &net_flow_nl_family, 0, cmd);
+	if (!hdr)
+		goto out;
+
+	if (nla_put_u32(skb,
+			NET_FLOW_IDENTIFIER_TYPE,
+			NET_FLOW_IDENTIFIER_IFINDEX) ||
+	    nla_put_u32(skb, NET_FLOW_IDENTIFIER, dev->ifindex)) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	err = net_flow_put_headers(skb, h);
+	if (err < 0)
+		goto out;
+
+	err = genlmsg_end(skb, hdr);
+	if (err < 0)
+		goto out;
+
+	return skb;
+out:
+	nlmsg_free(skb);
+	return ERR_PTR(err);
+}
+
+static int net_flow_cmd_get_headers(struct sk_buff *skb,
+				    struct genl_info *info)
+{
+	struct net_flow_header **h;
+	struct net_device *dev;
+	struct sk_buff *msg;
+
+	dev = net_flow_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->netdev_ops->ndo_flow_get_headers) {
+		dev_put(dev);
+		return -EOPNOTSUPP;
+	}
+
+	h = dev->netdev_ops->ndo_flow_get_headers(dev);
+	if (!h)
+		return -EBUSY;
+
+	msg = net_flow_build_headers_msg(h, dev,
+					 info->snd_portid,
+					 info->snd_seq,
+					 NET_FLOW_TABLE_CMD_GET_HEADERS);
+	dev_put(dev);
+
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	return genlmsg_reply(msg, info);
+}
+
+static int net_flow_put_header_node(struct sk_buff *skb,
+				    struct net_flow_hdr_node *node)
+{
+	struct nlattr *hdrs, *jumps;
+	int i, err;
+
+	if (nla_put_string(skb, NET_FLOW_HEADER_NODE_NAME, node->name) ||
+	    nla_put_u32(skb, NET_FLOW_HEADER_NODE_UID, node->uid))
+		return -EMSGSIZE;
+
+	/* Insert the set of headers that get extracted at this node */
+	hdrs = nla_nest_start(skb, NET_FLOW_HEADER_NODE_HDRS);
+	if (!hdrs)
+		return -EMSGSIZE;
+	for (i = 0; node->hdrs[i]; i++) {
+		if (nla_put_u32(skb, NET_FLOW_HEADER_NODE_HDRS_VALUE,
+				node->hdrs[i])) {
+			nla_nest_cancel(skb, hdrs);
+			return -EMSGSIZE;
+		}
+	}
+	nla_nest_end(skb, hdrs);
+
+	/* Then give the jump table to find next header node in graph */
+	jumps = nla_nest_start(skb, NET_FLOW_HEADER_NODE_JUMP);
+	if (!jumps)
+		return -EMSGSIZE;
+
+	for (i = 0; node->jump[i].node; i++) {
+		err = nla_put(skb, NET_FLOW_JUMP_TABLE_ENTRY,
+			      sizeof(struct net_flow_jump_table),
+			      &node->jump[i]);
+		if (err) {
+			nla_nest_cancel(skb, jumps);
+			return -EMSGSIZE;
+		}
+	}
+	nla_nest_end(skb, jumps);
+
+	return 0;
+}
+
+static int net_flow_put_header_graph(struct sk_buff *skb,
+				     struct net_flow_hdr_node **g)
+{
+	struct nlattr *nodes, *node;
+	int err, i;
+
+	nodes = nla_nest_start(skb, NET_FLOW_HEADER_GRAPH);
+	if (!nodes)
+		return -EMSGSIZE;
+
+	for (i = 0; g[i]->uid; i++) {
+		node = nla_nest_start(skb, NET_FLOW_HEADER_GRAPH_NODE);
+		if (!node) {
+			err = -EMSGSIZE;
+			goto nodes_put_error;
+		}
+
+		err = net_flow_put_header_node(skb, g[i]);
+		if (err)
+			goto node_put_error;
+
+		nla_nest_end(skb, node);
+	}
+
+	nla_nest_end(skb, nodes);
+	return 0;
+node_put_error:
+	nla_nest_cancel(skb, node);
+nodes_put_error:
+	nla_nest_cancel(skb, nodes);
+	return err;
+}
+
+static
+struct sk_buff *net_flow_build_header_graph_msg(struct net_flow_hdr_node **g,
+						struct net_device *dev,
+						u32 portid, int seq, u8 cmd)
+{
+	struct genlmsghdr *hdr;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOBUFS);
+
+	hdr = genlmsg_put(skb, portid, seq, &net_flow_nl_family, 0, cmd);
+	if (!hdr)
+		goto out;
+
+	if (nla_put_u32(skb,
+			NET_FLOW_IDENTIFIER_TYPE,
+			NET_FLOW_IDENTIFIER_IFINDEX) ||
+	    nla_put_u32(skb, NET_FLOW_IDENTIFIER, dev->ifindex)) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	err = net_flow_put_header_graph(skb, g);
+	if (err < 0)
+		goto out;
+
+	err = genlmsg_end(skb, hdr);
+	if (err < 0)
+		goto out;
+
+	return skb;
+out:
+	nlmsg_free(skb);
+	return ERR_PTR(err);
+}
+
+static int net_flow_cmd_get_header_graph(struct sk_buff *skb,
+					 struct genl_info *info)
+{
+	struct net_flow_hdr_node **h;
+	struct net_device *dev;
+	struct sk_buff *msg;
+
+	dev = net_flow_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->netdev_ops->ndo_flow_get_hdr_graph) {
+		dev_put(dev);
+		return -EOPNOTSUPP;
+	}
+
+	h = dev->netdev_ops->ndo_flow_get_hdr_graph(dev);
+	if (!h)
+		return -EBUSY;
+
+	msg = net_flow_build_header_graph_msg(h, dev,
+					      info->snd_portid,
+					      info->snd_seq,
+					      NET_FLOW_TABLE_CMD_GET_HDR_GRAPH);
+	dev_put(dev);
+
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	return genlmsg_reply(msg, info);
+}
+
+static int net_flow_put_table_node(struct sk_buff *skb,
+				   struct net_flow_tbl_node *node)
+{
+	struct nlattr *nest, *jump;
+	int i, err = -EMSGSIZE;
+
+	nest = nla_nest_start(skb, NET_FLOW_TABLE_GRAPH_NODE);
+	if (!nest)
+		return err;
+
+	if (nla_put_u32(skb, NET_FLOW_TABLE_GRAPH_NODE_UID, node->uid) ||
+	    nla_put_u32(skb, NET_FLOW_TABLE_GRAPH_NODE_FLAGS, node->flags))
+		goto node_put_failure;
+
+	jump = nla_nest_start(skb, NET_FLOW_TABLE_GRAPH_NODE_JUMP);
+	if (!jump)
+		goto node_put_failure;
+
+	for (i = 0; node->jump[i].node; i++) {
+		err = nla_put(skb, NET_FLOW_JUMP_TABLE_ENTRY,
+			      sizeof(struct net_flow_jump_table),
+			      &node->jump[i]);
+		if (err)
+			goto jump_put_failure;
+	}
+
+	nla_nest_end(skb, jump);
+	nla_nest_end(skb, nest);
+	return 0;
+jump_put_failure:
+	nla_nest_cancel(skb, jump);
+node_put_failure:
+	nla_nest_cancel(skb, nest);
+	return err;
+}
+
+static int net_flow_put_table_graph(struct sk_buff *skb,
+				    struct net_flow_tbl_node **nodes)
+{
+	struct nlattr *graph;
+	int err, i = 0;
+
+	graph = nla_nest_start(skb, NET_FLOW_TABLE_GRAPH);
+	if (!graph)
+		return -EMSGSIZE;
+
+	for (i = 0; nodes[i]->uid; i++) {
+		err = net_flow_put_table_node(skb, nodes[i]);
+		if (err) {
+			nla_nest_cancel(skb, graph);
+			return -EMSGSIZE;
+		}
+	}
+
+	nla_nest_end(skb, graph);
+	return 0;
+}
+
+static
+struct sk_buff *net_flow_build_graph_msg(struct net_flow_tbl_node **g,
+					 struct net_device *dev,
+					 u32 portid, int seq, u8 cmd)
+{
+	struct genlmsghdr *hdr;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOBUFS);
+
+	hdr = genlmsg_put(skb, portid, seq, &net_flow_nl_family, 0, cmd);
+	if (!hdr)
+		goto out;
+
+	if (nla_put_u32(skb,
+			NET_FLOW_IDENTIFIER_TYPE,
+			NET_FLOW_IDENTIFIER_IFINDEX) ||
+	    nla_put_u32(skb, NET_FLOW_IDENTIFIER, dev->ifindex)) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	err = net_flow_put_table_graph(skb, g);
+	if (err < 0)
+		goto out;
+
+	err = genlmsg_end(skb, hdr);
+	if (err < 0)
+		goto out;
+
+	return skb;
+out:
+	nlmsg_free(skb);
+	return ERR_PTR(err);
+}
+
+static int net_flow_cmd_get_table_graph(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct net_flow_tbl_node **g;
+	struct net_device *dev;
+	struct sk_buff *msg;
+
+	dev = net_flow_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->netdev_ops->ndo_flow_get_tbl_graph) {
+		dev_put(dev);
+		return -EOPNOTSUPP;
+	}
+
+	g = dev->netdev_ops->ndo_flow_get_tbl_graph(dev);
+	if (!g)
+		return -EBUSY;
+
+	msg = net_flow_build_graph_msg(g, dev,
+				       info->snd_portid,
+				       info->snd_seq,
+				       NET_FLOW_TABLE_CMD_GET_TABLE_GRAPH);
+	dev_put(dev);
+
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	return genlmsg_reply(msg, info);
+}
+
+static const struct nla_policy net_flow_cmd_policy[NET_FLOW_MAX + 1] = {
+	[NET_FLOW_IDENTIFIER_TYPE] = {.type = NLA_U32, },
+	[NET_FLOW_IDENTIFIER]	   = {.type = NLA_U32, },
+	[NET_FLOW_TABLES]	   = {.type = NLA_NESTED, },
+	[NET_FLOW_HEADERS]	   = {.type = NLA_NESTED, },
+	[NET_FLOW_ACTIONS]	   = {.type = NLA_NESTED, },
+	[NET_FLOW_HEADER_GRAPH]	   = {.type = NLA_NESTED, },
+	[NET_FLOW_TABLE_GRAPH]	   = {.type = NLA_NESTED, },
+};
+
+static const struct genl_ops net_flow_table_nl_ops[] = {
+	{
+		.cmd = NET_FLOW_TABLE_CMD_GET_TABLES,
+		.doit = net_flow_cmd_get_tables,
+		.policy = net_flow_cmd_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NET_FLOW_TABLE_CMD_GET_HEADERS,
+		.doit = net_flow_cmd_get_headers,
+		.policy = net_flow_cmd_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NET_FLOW_TABLE_CMD_GET_ACTIONS,
+		.doit = net_flow_cmd_get_actions,
+		.policy = net_flow_cmd_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NET_FLOW_TABLE_CMD_GET_HDR_GRAPH,
+		.doit = net_flow_cmd_get_header_graph,
+		.policy = net_flow_cmd_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NET_FLOW_TABLE_CMD_GET_TABLE_GRAPH,
+		.doit = net_flow_cmd_get_table_graph,
+		.policy = net_flow_cmd_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+};
+
+static int __init net_flow_nl_module_init(void)
+{
+	return genl_register_family_with_ops(&net_flow_nl_family,
+					     net_flow_table_nl_ops);
+}
+
+static void net_flow_nl_module_fini(void)
+{
+	genl_unregister_family(&net_flow_nl_family);
+}
+
+module_init(net_flow_nl_module_init);
+module_exit(net_flow_nl_module_fini);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("John Fastabend <john.r.fastabend@intel.com>");
+MODULE_DESCRIPTION("Netlink interface to Flow Tables");
+MODULE_ALIAS_GENL_FAMILY(NET_FLOW_GENL_NAME);

^ permalink raw reply related

* [net-next PATCH v1 00/11] A flow API
From: John Fastabend @ 2014-12-31 19:45 UTC (permalink / raw)
  To: tgraf, sfeldma, jiri, jhs, simon.horman; +Cc: netdev, davem, andy

So... I could continue to mull over this and tweak bits and pieces
here and there but I decided its best to get a wider group of folks
looking at it and hopefulyl with any luck using it so here it is.

This set creates a new netlink family and set of messages to configure
flow tables in hardware. I tried to make the commit messages
reasonably verbose at least in the flow_table patches.

What we get at the end of this series is a working API to get device
capabilities and program flows using the rocker switch.

I created a user space tool 'flow' that I use to configure and query
the devices it is posted here,

	https://github.com/jrfastab/iprotue2-flow-tool

For now it is a stand-alone tool but once the kernel bits get sorted
out (I'm guessing there will need to be a few versions of this series
to get it right) I would like to port it into the iproute2 package.
This way we can keep all of our tooling in one package see 'bridge'
for example.

As far as testing, I've tested various combinations of tables and
rules on the rocker switch and it seems to work. I have not tested
100% of the rocker code paths though. It would be great to get some
sort of automated framework around the API to do this. I don't
think should gate the inclusion of the API though.

I could use some help reviewing,

  (a) error paths and netlink validation code paths

  (b) Break down of structures vs netlink attributes. I
      am trying to balance flexibility given by having
      netlinnk TLV attributes vs conciseness. So some
      things are passed as structures.

  (c) are there any devices that have pipelines that we
      can't represent with this API? It would be good to
      know about these so we can design it in probably
      in a future series.

For some examples and maybe a bit more illustrative description I
posted a quickly typed up set of notes on github io pages. Here we
can show the description along with images produced by the flow tool
showing the pipeline. Once we settle a bit more on the API we should
probably do a clean up of this and other threads happening and commit
something to the Documentation directory.

 http://jrfastab.github.io/jekyll/update/2014/12/21/flow-api.html

Finally I have more patches to add support for creating and destroying
tables. This allows users to define the pipeline at runtime rather
than statically as rocker does now. After this set gets some traction
I'll look at pushing them in a next round. However it likely requires
adding another "world" to rocker. Another piece that I want to add is
a description of the actions and metadata. This way user space can
"learn" what an action is and how metadata interacts with the system.
This work is under development.

Thanks! Any comments/feedback always welcome.

And also thanks to everyone who helped with this flow API so far. All
the folks at Dusseldorf LPC, OVS summit Santa Clara, P4 authors for
some inspiration, the collection of IETF FoRCES documents I mulled
over, Netfilter workshop where I started to realize fixing ethtool
was most likely not going to work, etc.

---

John Fastabend (11):
      net: flow_table: create interface for hw match/action tables
      net: flow_table: add flow, delete flow
      net: flow_table: add apply action argument to tables
      rocker: add pipeline model for rocker switch
      net: rocker: add set flow rules
      net: rocker: add group_id slices and drop explicit goto
      net: rocker: add multicast path to bridging
      net: rocker: add get flow API operation
      net: rocker: add cookie to group acls and use flow_id to set cookie
      net: rocker: have flow api calls set cookie value
      net: rocker: implement delete flow routine

 drivers/net/ethernet/rocker/rocker.c          | 1641 +++++++++++++++++++++++++
 drivers/net/ethernet/rocker/rocker_pipeline.h |  793 ++++++++++++
 include/linux/if_flow.h                       |  115 ++
 include/linux/netdevice.h                     |   20 
 include/uapi/linux/if_flow.h                  |  413 ++++++
 net/Kconfig                                   |    7 
 net/core/Makefile                             |    1 
 net/core/flow_table.c                         | 1339 ++++++++++++++++++++
 8 files changed, 4312 insertions(+), 17 deletions(-)
 create mode 100644 drivers/net/ethernet/rocker/rocker_pipeline.h
 create mode 100644 include/linux/if_flow.h
 create mode 100644 include/uapi/linux/if_flow.h
 create mode 100644 net/core/flow_table.c

-- 
Signature

^ permalink raw reply

* Re: [PATCH net-next v1 0/3] net: fec: add Wake-on-LAN support
From: Fabio Estevam @ 2014-12-31 19:22 UTC (permalink / raw)
  To: David Miller
  Cc: Duan Fugang-B38611, Shawn Guo, netdev@vger.kernel.org,
	Ben Hutchings, Stephen Hemminger
In-Reply-To: <20141231.142034.494966377580113808.davem@davemloft.net>

On Wed, Dec 31, 2014 at 5:20 PM, David Miller <davem@davemloft.net> wrote:
> From: Fabio Estevam <festevam@gmail.com>
> Date: Wed, 31 Dec 2014 17:03:28 -0200
>
>> ,but when we see your patches applied they appear with Nimrod Andy as
>> the author instead:
>> https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=de40ed31b3c577cefd7b54972365a272ecbe9dd6
>>
>> It would be nice if you could use the From field to match the name in
>> the Signed-off tag.
>
> It's his business to setup things properly so they match, not
> mine.  I just take what is in the patch as-is.

Exactly. This was what I suggested him to do.

^ permalink raw reply

* Re: [PATCH net-next v1 0/3] net: fec: add Wake-on-LAN support
From: David Miller @ 2014-12-31 19:20 UTC (permalink / raw)
  To: festevam; +Cc: b38611, shawn.guo, netdev, bhutchings, stephen
In-Reply-To: <CAOMZO5BFbL-J2gM_T0jD3N3P9wHguaNLENE24BRaMQT7E6Lhiw@mail.gmail.com>

From: Fabio Estevam <festevam@gmail.com>
Date: Wed, 31 Dec 2014 17:03:28 -0200

> ,but when we see your patches applied they appear with Nimrod Andy as
> the author instead:
> https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=de40ed31b3c577cefd7b54972365a272ecbe9dd6
> 
> It would be nice if you could use the From field to match the name in
> the Signed-off tag.

It's his business to setup things properly so they match, not
mine.  I just take what is in the patch as-is.

^ permalink raw reply

* Re: [PATCH net] gre: allow live address change
From: David Miller @ 2014-12-31 19:18 UTC (permalink / raw)
  To: shemming; +Cc: netdev
In-Reply-To: <20141227100142.13b7ed79@urahara>

From: Stephen Hemminger <shemming@brocade.com>
Date: Sat, 27 Dec 2014 10:01:42 -0800

> The GRE tap device supports Ethernet over GRE, but doesn't
> care about the source address of the tunnel, therefore it
> can be changed without bring device down.
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] l2tp : multicast notification to the registered listeners
From: David Miller @ 2014-12-31 19:17 UTC (permalink / raw)
  To: stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ
  Cc: therbert-hpIqsD4AKlfQT0dZR+AlfA, g.nault-pHk1y4uTXVDytLWWfqlThQ,
	linux-api-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20141227101239.3ea24d70@urahara>

From: Stephen Hemminger <stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ@public.gmane.org>
Date: Sat, 27 Dec 2014 10:12:39 -0800

> From: Bill Hong <bhong-43mecJUBy8ZBDgjK7y7TUQ@public.gmane.org>
> 
> Previously l2tp module did not provide any means for the user space to
> get notified when tunnels/sessions are added/modified/deleted.
> This change contains the following
> - create a multicast group for the listeners to register.
> - notify the registered listeners when the tunnels/sessions are
>   created/modified/deleted.
> 
> Signed-off-by: Bill Hong <bhong-43mecJUBy8ZBDgjK7y7TUQ@public.gmane.org>
> Reviewed-by: Stephen Hemminger <stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ@public.gmane.org>
> Reviewed-by: Sven-Thorsten Dietrich <sven-43mecJUBy8ZBDgjK7y7TUQ@public.gmane.org>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] tun: return proper error code from tun_do_read
From: David Miller @ 2014-12-31 19:15 UTC (permalink / raw)
  To: agartrell; +Cc: herbert, netdev, linux-kernel, kernel-team
In-Reply-To: <1419578569-13132-1-git-send-email-agartrell@fb.com>

From: Alex Gartrell <agartrell@fb.com>
Date: Thu, 25 Dec 2014 23:22:49 -0800

> Instead of -1 with EAGAIN, read on a O_NONBLOCK tun fd will return 0.  This
> fixes this by properly returning the error code from __skb_recv_datagram.
> 
> Signed-off-by: Alex Gartrell <agartrell@fb.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] tun: Fixed unsigned/signed comparison
From: David Miller @ 2014-12-31 19:15 UTC (permalink / raw)
  To: agartrell; +Cc: herbert, netdev, linux-kernel, kernel-team
In-Reply-To: <1419577503-12862-1-git-send-email-agartrell@fb.com>

From: Alex Gartrell <agartrell@fb.com>
Date: Thu, 25 Dec 2014 23:05:03 -0800

> Validated that this was actually using the unsigned comparison with gdb.
> 
> Signed-off-by: Alex Gartrell <agartrell@fb.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next v1 0/3] net: fec: add Wake-on-LAN support
From: Fabio Estevam @ 2014-12-31 19:03 UTC (permalink / raw)
  To: David Miller
  Cc: Duan Fugang-B38611, Shawn Guo, netdev@vger.kernel.org,
	Ben Hutchings, Stephen Hemminger
In-Reply-To: <20141231.130727.1660985956287926494.davem@davemloft.net>

Hi Fugang,

Just a minor comment about your patches:

On Wed, Dec 31, 2014 at 4:07 PM, David Miller <davem@davemloft.net> wrote:
> From: Fugang Duan <b38611@freescale.com>

Here we see the From field as Fugang Duan.

> Date: Wed, 24 Dec 2014 17:30:38 +0800
>
>> The patch series enable FEC Wake-on-LAN feature for i.MX6q/dl and i.MX6SX SOCs.
>> FEC HW support sleep mode, when system in suspend status with FEC all clock gate
>> off, magic packet can wake up system. For different SOCs, there have special SOC
>> GPR register to let FEC enter sleep mode or exit sleep mode, add these to platform
>> callback for driver' call.
>>
>> Patch#1: add WOL interface supports.
>> Patch#2: add SOC special sleep of/off operations for driver's sleep callback.
>> Patch#3: add magic pattern support for devicetree.
>
> Series applied, thanks.

,but when we see your patches applied they appear with Nimrod Andy as
the author instead:
https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=de40ed31b3c577cefd7b54972365a272ecbe9dd6

It would be nice if you could use the From field to match the name in
the Signed-off tag.

Regards,

Fabio Estevam

^ permalink raw reply

* [net-next PATCH 17/17] fib_trie: Add tracking value for suffix length
From: Alexander Duyck @ 2014-12-31 18:57 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20141231184649.3006.29958.stgit@ahduyck-vm-fedora20>

This change adds a tracking value for the maximum suffix length of all
prefixes stored in any given tnode.  With this value we can determine if we
need to backtrace or not based on if the suffix is greater than the pos
value.

By doing this we can reduce the CPU overhead for lookups in the local table
as many of the prefixes there are 32b long and have a suffix length of 0
meaning we can immediately backtrace to the root node without needing to
test any of the nodes between it and where we ended up.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  122 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 116 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 87fc9a4..281e5e0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -96,6 +96,7 @@ struct tnode {
 	t_key key;
 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
+	unsigned char slen;
 	struct tnode __rcu *parent;
 	struct rcu_head rcu;
 	union {
@@ -311,6 +312,7 @@ static struct tnode *leaf_new(t_key key)
 		 * as the nodes are searched
 		 */
 		l->key = key;
+		l->slen = 0;
 		l->pos = 0;
 		/* set bits to 0 indicating we are not a tnode */
 		l->bits = 0;
@@ -342,6 +344,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 
 	if (tn) {
 		tn->parent = NULL;
+		tn->slen = pos;
 		tn->pos = pos;
 		tn->bits = bits;
 		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
@@ -387,6 +390,9 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 	else if (!wasfull && isfull)
 		tn->full_children++;
 
+	if (n && (tn->slen < n->slen))
+		tn->slen = n->slen;
+
 	rcu_assign_pointer(tn->child[i], n);
 }
 
@@ -635,6 +641,41 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 	return 0;
 }
 
+static unsigned char update_suffix(struct tnode *tn)
+{
+	unsigned char slen = tn->pos;
+	unsigned long stride, i;
+
+	/* search though the list of children looking for nodes that might
+	 * have a suffix greater than the one we currently have.  This is
+	 * why we start with a stride of 2 since a stride of 1 would
+	 * represent the nodes with suffix length equal to tn->pos
+	 */
+	for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
+		struct tnode *n = tnode_get_child(tn, i);
+
+		if (!n || (n->slen <= slen))
+			continue;
+
+		/* update stride and slen based on new value */
+		stride <<= (n->slen - slen);
+		slen = n->slen;
+		i &= ~(stride - 1);
+
+		/* if slen covers all but the last bit we can stop here
+		 * there will be nothing longer than that since only node
+		 * 0 and 1 << (bits - 1) could have that as their suffix
+		 * length.
+		 */
+		if ((slen + 1) >= (tn->pos + tn->bits))
+			break;
+	}
+
+	tn->slen = slen;
+
+	return slen;
+}
+
 /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
  * the Helsinki University of Technology and Matti Tikkanen of Nokia
  * Telecommunications, page 6:
@@ -790,6 +831,19 @@ no_children:
 		/* drop dead node */
 		tnode_free_init(tn);
 		tnode_free(tn);
+		return;
+	}
+
+	/* Return if at least one deflate was run */
+	if (max_work != MAX_WORK)
+		return;
+
+	/* push the suffix length to the parent node */
+	if (tn->slen > tn->pos) {
+		unsigned char slen = update_suffix(tn);
+
+		if (tp && (slen > tp->slen))
+			tp->slen = slen;
 	}
 }
 
@@ -818,8 +872,58 @@ static inline struct list_head *get_fa_head(struct tnode *l, int plen)
 	return &li->falh;
 }
 
-static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
+static void leaf_pull_suffix(struct tnode *l)
+{
+	struct tnode *tp = node_parent(l);
+
+	while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
+		if (update_suffix(tp) > l->slen)
+			break;
+		tp = node_parent(tp);
+	}
+}
+
+static void leaf_push_suffix(struct tnode *l)
 {
+	struct tnode *tn = node_parent(l);
+
+	/* if this is a new leaf then tn will be NULL and we can sort
+	 * out parent suffix lengths as a part of trie_rebalance
+	 */
+	while (tn && (tn->slen < l->slen)) {
+		tn->slen = l->slen;
+		tn = node_parent(tn);
+	}
+}
+
+static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
+{
+	struct hlist_node *prev;
+
+	/* record the location of the pointer to this object */
+	prev = rtnl_dereference(hlist_pprev_rcu(&old->hlist));
+
+	/* remove the leaf info from the list */
+	hlist_del_rcu(&old->hlist);
+
+	/* if we emptied the list this leaf will be freed and we can sort
+	 * out parent suffix lengths as a part of trie_rebalance
+	 */
+	if (hlist_empty(&l->list))
+		return;
+
+	/* if we removed the tail then we need to update slen */
+	if (!rcu_access_pointer(hlist_next_rcu(prev))) {
+		struct leaf_info *li = hlist_entry(prev, typeof(*li), hlist);
+
+		l->slen = KEYLENGTH - li->plen;
+		leaf_pull_suffix(l);
+	}
+}
+
+static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
+{
+	struct hlist_head *head = &l->list;
 	struct leaf_info *li = NULL, *last = NULL;
 
 	if (hlist_empty(head)) {
@@ -836,6 +940,12 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 		else
 			hlist_add_before_rcu(&new->hlist, &li->hlist);
 	}
+
+	/* if we added to the tail node then we need to update slen */
+	if (!rcu_access_pointer(hlist_next_rcu(&new->hlist))) {
+		l->slen = KEYLENGTH - new->plen;
+		leaf_push_suffix(l);
+	}
 }
 
 /* rcu_read_lock needs to be hold by caller from readside */
@@ -925,7 +1035,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		/* we have found a leaf. Prefixes have already been compared */
 		if (IS_LEAF(n)) {
 			/* Case 1: n is a leaf, and prefixes match*/
-			insert_leaf_info(&n->list, li);
+			insert_leaf_info(n, li);
 			return fa_head;
 		}
 
@@ -939,7 +1049,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		return NULL;
 	}
 
-	insert_leaf_info(&l->list, li);
+	insert_leaf_info(l, li);
 
 	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
 	 *
@@ -1206,7 +1316,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		/* only record pn and cindex if we are going to be chopping
 		 * bits later.  Otherwise we are just wasting cycles.
 		 */
-		if (index) {
+		if (n->slen > n->pos) {
 			pn = n;
 			cindex = index;
 		}
@@ -1225,7 +1335,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		 * between the key and the prefix exist in the region of
 		 * the lsb and higher in the prefix.
 		 */
-		if (unlikely(prefix_mismatch(key, n)))
+		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
 			goto backtrace;
 
 		/* exit out and process leaf */
@@ -1425,7 +1535,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 		tb->tb_num_default--;
 
 	if (list_empty(fa_head)) {
-		hlist_del_rcu(&li->hlist);
+		remove_leaf_info(l, li);
 		free_leaf_info(li);
 	}
 

^ permalink raw reply related

* [net-next PATCH 16/17] fib_trie: Remove checks for index >= tnode_child_length from tnode_get_child
From: Alexander Duyck @ 2014-12-31 18:57 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20141231184649.3006.29958.stgit@ahduyck-vm-fedora20>

For some reason the compiler doesn't seem to understand that when we are in
a loop that runs from tnode_child_length - 1 to 0 we don't expect the value
of tn->bits to change.  As such every call to tnode_get_child was rerunning
tnode_chile_length which ended up consuming quite a bit of space in the
resultant assembly code.

I have gone though and verified that in all cases where tnode_get_child
is used we are either winding though a fixed loop from tnode_child_length -
1 to 0, or are in a fastpath case where we are verifying the value by
either checking for any remaining bits after shifting index by bits and
testing for leaf, or by using tnode_child_length.

size net/ipv4/fib_trie.o
Before:
   text	   data	    bss	    dec	    hex	filename
  15506	    376	      8	  15890	   3e12	net/ipv4/fib_trie.o

After:
   text	   data	    bss	    dec	    hex	filename
  14827	    376	      8	  15211	   3b6b	net/ipv4/fib_trie.o

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |   14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0c88df0..87fc9a4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,8 +186,6 @@ static inline unsigned long tnode_child_length(const struct tnode *tn)
 static inline struct tnode *tnode_get_child(const struct tnode *tn,
 					    unsigned long i)
 {
-	BUG_ON(i >= tnode_child_length(tn));
-
 	return rtnl_dereference(tn->child[i]);
 }
 
@@ -195,8 +193,6 @@ static inline struct tnode *tnode_get_child(const struct tnode *tn,
 static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
 						unsigned long i)
 {
-	BUG_ON(i >= tnode_child_length(tn));
-
 	return rcu_dereference_rtnl(tn->child[i]);
 }
 
@@ -371,7 +367,7 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
  */
 static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 {
-	struct tnode *chi = rtnl_dereference(tn->child[i]);
+	struct tnode *chi = tnode_get_child(tn, i);
 	int isfull, wasfull;
 
 	BUG_ON(i >= tnode_child_length(tn));
@@ -867,7 +863,7 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
 		if (IS_LEAF(n))
 			break;
 
-		n = rcu_dereference_rtnl(n->child[index]);
+		n = tnode_get_child_rcu(n, index);
 	}
 
 	return n;
@@ -934,7 +930,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		}
 
 		tp = n;
-		n = rcu_dereference_rtnl(n->child[index]);
+		n = tnode_get_child_rcu(n, index);
 	}
 
 	l = leaf_new(key);
@@ -1215,7 +1211,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 			cindex = index;
 		}
 
-		n = rcu_dereference(n->child[index]);
+		n = tnode_get_child_rcu(n, index);
 		if (unlikely(!n))
 			goto backtrace;
 	}
@@ -1835,7 +1831,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 			if (n->bits < MAX_STAT_DEPTH)
 				s->nodesizes[n->bits]++;
 
-			for (i = 0; i < tnode_child_length(n); i++) {
+			for (i = tnode_child_length(n); i--;) {
 				if (!rcu_access_pointer(n->child[i]))
 					s->nullpointers++;
 			}

^ permalink raw reply related

* [net-next PATCH 15/17] fib_trie: inflate/halve nodes in a more RCU friendly way
From: Alexander Duyck @ 2014-12-31 18:56 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20141231184649.3006.29958.stgit@ahduyck-vm-fedora20>

This change pulls the node_set_parent functionality out of put_child_reorg
and instead leaves that to the function to take care of as well.  By doing
this we can fully construct the new cluster of tnodes and all of the
pointers out of it before we start routing pointers into it.

I am suspecting this will likely fix some concurency issues though I don't
have a good test to show as such.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  236 +++++++++++++++++++++++++--------------------------
 1 file changed, 115 insertions(+), 121 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 485983e..0c88df0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -391,8 +391,6 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 	else if (!wasfull && isfull)
 		tn->full_children++;
 
-	node_set_parent(n, tn);
-
 	rcu_assign_pointer(tn->child[i], n);
 }
 
@@ -436,10 +434,8 @@ static void tnode_free(struct tnode *tn)
 
 static int inflate(struct trie *t, struct tnode *oldtnode)
 {
-	unsigned long olen = tnode_child_length(oldtnode);
-	struct tnode *tp = node_parent(oldtnode);
-	struct tnode *tn;
-	unsigned long i;
+	struct tnode *inode, *node0, *node1, *tn, *tp;
+	unsigned long i, j, k;
 	t_key m;
 
 	pr_debug("In inflate\n");
@@ -448,43 +444,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 	if (!tn)
 		return -ENOMEM;
 
-	/*
-	 * Preallocate and store tnodes before the actual work so we
-	 * don't get into an inconsistent state if memory allocation
-	 * fails. In case of failure we return the oldnode and  inflate
-	 * of tnode is ignored.
+	/* Assemble all of the pointers in our cluster, in this case that
+	 * represents all of the pointers out of our allocated nodes that
+	 * point to existing tnodes and the links between our allocated
+	 * nodes.
 	 */
-	for (i = 0, m = 1u << tn->pos; i < olen; i++) {
-		struct tnode *inode = tnode_get_child(oldtnode, i);
-
-		if (tnode_full(oldtnode, inode) && (inode->bits > 1)) {
-			struct tnode *left, *right;
-
-			left = tnode_new(inode->key & ~m, inode->pos,
-					 inode->bits - 1);
-			if (!left)
-				goto nomem;
-			tnode_free_append(tn, left);
-
-			right = tnode_new(inode->key | m, inode->pos,
-					  inode->bits - 1);
-
-			if (!right)
-				goto nomem;
-			tnode_free_append(tn, right);
-
-			put_child(tn, 2*i, left);
-			put_child(tn, 2*i+1, right);
-		}
-	}
-
-	/* prepare oldtnode to be freed */
-	tnode_free_init(oldtnode);
-
-	for (i = 0; i < olen; i++) {
-		struct tnode *inode = tnode_get_child(oldtnode, i);
-		struct tnode *left, *right;
-		unsigned long size, j;
+	for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
+		inode = tnode_get_child(oldtnode, --i);
 
 		/* An empty child */
 		if (inode == NULL)
@@ -496,65 +462,99 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 			continue;
 		}
 
-		/* drop the node in the old tnode free list */
-		tnode_free_append(oldtnode, inode);
-
 		/* An internal node with two children */
 		if (inode->bits == 1) {
-			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
-			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
+			put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
+			put_child(tn, 2 * i, tnode_get_child(inode, 0));
 			continue;
 		}
 
-		/* An internal node with more than two children */
-
 		/* We will replace this node 'inode' with two new
-		 * ones, 'left' and 'right', each with half of the
+		 * ones, 'node0' and 'node1', each with half of the
 		 * original children. The two new nodes will have
 		 * a position one bit further down the key and this
 		 * means that the "significant" part of their keys
 		 * (see the discussion near the top of this file)
 		 * will differ by one bit, which will be "0" in
-		 * left's key and "1" in right's key. Since we are
+		 * node0's key and "1" in node1's key. Since we are
 		 * moving the key position by one step, the bit that
 		 * we are moving away from - the bit at position
-		 * (inode->pos) - is the one that will differ between
-		 * left and right. So... we synthesize that bit in the
-		 * two  new keys.
-		 * The mask 'm' below will be a single "one" bit at
-		 * the position (inode->pos)
+		 * (tn->pos) - is the one that will differ between
+		 * node0 and node1. So... we synthesize that bit in the
+		 * two new keys.
 		 */
+		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
+		if (!node1)
+			goto nomem;
+		tnode_free_append(tn, node1);
+
+		node0 = tnode_new(inode->key & ~m, inode->pos, inode->bits - 1);
+		if (!node0)
+			goto nomem;
+		tnode_free_append(tn, node0);
+
+		/* populate child pointers in new nodes */
+		for (k = tnode_child_length(inode), j = k / 2; j;) {
+			put_child(node1, --j, tnode_get_child(inode, --k));
+			put_child(node0, j, tnode_get_child(inode, j));
+			put_child(node1, --j, tnode_get_child(inode, --k));
+			put_child(node0, j, tnode_get_child(inode, j));
+		}
 
-		/* Use the old key, but set the new significant
-		 *   bit to zero.
-		 */
+		/* link new nodes to parent */
+		NODE_INIT_PARENT(node1, tn);
+		NODE_INIT_PARENT(node0, tn);
+
+		/* link parent to nodes */
+		put_child(tn, 2 * i + 1, node1);
+		put_child(tn, 2 * i, node0);
+	}
 
-		left = tnode_get_child(tn, 2*i);
-		put_child(tn, 2*i, NULL);
+	/* setup the parent pointer into and out of this node */
+	tp = node_parent(oldtnode);
+	NODE_INIT_PARENT(tn, tp);
+	put_child_root(tp, t, tn->key, tn);
+
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
 
-		BUG_ON(!left);
+	/* update all child nodes parent pointers to route to us */
+	for (i = tnode_child_length(oldtnode); i;) {
+		inode = tnode_get_child(oldtnode, --i);
+
+		/* A leaf or an internal node with skipped bits */
+		if (!tnode_full(oldtnode, inode)) {
+			node_set_parent(inode, tn);
+			continue;
+		}
 
-		right = tnode_get_child(tn, 2*i+1);
-		put_child(tn, 2*i+1, NULL);
+		/* drop the node in the old tnode free list */
+		tnode_free_append(oldtnode, inode);
 
-		BUG_ON(!right);
+		/* fetch new nodes */
+		node1 = tnode_get_child(tn, 2 * i + 1);
+		node0 = tnode_get_child(tn, 2 * i);
 
-		size = tnode_child_length(left);
-		for (j = 0; j < size; j++) {
-			put_child(left, j, rtnl_dereference(inode->child[j]));
-			put_child(right, j, rtnl_dereference(inode->child[j + size]));
+		/* bits == 1 then node0 and node1 represent inode's children */
+		if (inode->bits == 1) {
+			node_set_parent(node1, tn);
+			node_set_parent(node0, tn);
+			continue;
 		}
 
-		put_child(tn, 2 * i, left);
-		put_child(tn, 2 * i + 1, right);
+		/* update parent pointers in child node's children */
+		for (k = tnode_child_length(inode), j = k / 2; j;) {
+			node_set_parent(tnode_get_child(inode, --k), node1);
+			node_set_parent(tnode_get_child(inode, --j), node0);
+			node_set_parent(tnode_get_child(inode, --k), node1);
+			node_set_parent(tnode_get_child(inode, --j), node0);
+		}
 
 		/* resize child nodes */
-		resize(t, left);
-		resize(t, right);
+		resize(t, node1);
+		resize(t, node0);
 	}
 
-	put_child_root(tp, t, tn->key, tn);
-
 	/* we completed without error, prepare to free old node */
 	tnode_free(oldtnode);
 	return 0;
@@ -566,10 +566,8 @@ nomem:
 
 static int halve(struct trie *t, struct tnode *oldtnode)
 {
-	unsigned long olen = tnode_child_length(oldtnode);
-	struct tnode *tp = node_parent(oldtnode);
-	struct tnode *tn, *left, *right;
-	int i;
+	struct tnode *tn, *tp, *inode, *node0, *node1;
+	unsigned long i;
 
 	pr_debug("In halve\n");
 
@@ -577,68 +575,64 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 	if (!tn)
 		return -ENOMEM;
 
-	/*
-	 * Preallocate and store tnodes before the actual work so we
-	 * don't get into an inconsistent state if memory allocation
-	 * fails. In case of failure we return the oldnode and halve
-	 * of tnode is ignored.
+	/* Assemble all of the pointers in our cluster, in this case that
+	 * represents all of the pointers out of our allocated nodes that
+	 * point to existing tnodes and the links between our allocated
+	 * nodes.
 	 */
+	for (i = tnode_child_length(oldtnode); i;) {
+		node1 = tnode_get_child(oldtnode, --i);
+		node0 = tnode_get_child(oldtnode, --i);
 
-	for (i = 0; i < olen; i += 2) {
-		left = tnode_get_child(oldtnode, i);
-		right = tnode_get_child(oldtnode, i+1);
+		/* At least one of the children is empty */
+		if (!node1 || !node0) {
+			put_child(tn, i / 2, node1 ? : node0);
+			continue;
+		}
 
 		/* Two nonempty children */
-		if (left && right) {
-			struct tnode *newn;
-
-			newn = tnode_new(left->key, oldtnode->pos, 1);
-			if (!newn) {
-				tnode_free(tn);
-				return -ENOMEM;
-			}
-			tnode_free_append(tn, newn);
-
-			put_child(tn, i/2, newn);
+		inode = tnode_new(node0->key, oldtnode->pos, 1);
+		if (!inode) {
+			tnode_free(tn);
+			return -ENOMEM;
 		}
+		tnode_free_append(tn, inode);
 
+		/* initialize pointers out of node */
+		put_child(inode, 1, node1);
+		put_child(inode, 0, node0);
+		NODE_INIT_PARENT(inode, tn);
+
+		/* link parent to node */
+		put_child(tn, i / 2, inode);
 	}
 
+	/* setup the parent pointer out of and back into this node */
+	tp = node_parent(oldtnode);
+	NODE_INIT_PARENT(tn, tp);
+	put_child_root(tp, t, tn->key, tn);
+
 	/* prepare oldtnode to be freed */
 	tnode_free_init(oldtnode);
 
-	for (i = 0; i < olen; i += 2) {
-		struct tnode *newBinNode;
-
-		left = tnode_get_child(oldtnode, i);
-		right = tnode_get_child(oldtnode, i+1);
-
-		/* At least one of the children is empty */
-		if (left == NULL) {
-			if (right == NULL)    /* Both are empty */
-				continue;
-			put_child(tn, i/2, right);
-			continue;
-		}
+	/* update all of the child parent pointers */
+	for (i = tnode_child_length(tn); i;) {
+		inode = tnode_get_child(tn, --i);
 
-		if (right == NULL) {
-			put_child(tn, i/2, left);
+		/* only new tnodes will be considered "full" nodes */
+		if (!tnode_full(tn, inode)) {
+			node_set_parent(inode, tn);
 			continue;
 		}
 
 		/* Two nonempty children */
-		newBinNode = tnode_get_child(tn, i/2);
-		put_child(newBinNode, 0, left);
-		put_child(newBinNode, 1, right);
-
-		put_child(tn, i / 2, newBinNode);
+		node_set_parent(tnode_get_child(inode, 1), inode);
+		node_set_parent(tnode_get_child(inode, 0), inode);
 
 		/* resize child node */
-		resize(t, newBinNode);
+		resize(t, inode);
 	}
 
-	put_child_root(tp, t, tn->key, tn);
-
 	/* all pointers should be clean so we are done */
 	tnode_free(oldtnode);
 

^ permalink raw reply related

* [net-next PATCH 14/17] fib_trie: Push tnode flushing down to inflate/halve
From: Alexander Duyck @ 2014-12-31 18:56 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20141231184649.3006.29958.stgit@ahduyck-vm-fedora20>

This change pushes the tnode freeing down into the inflate and halve
functions.  It makes more sense here as we have a better grasp of what is
going on and when a given cluster of nodes is ready to be freed.

I believe this may address a bug in the freeing logic as well.  For some
reason if the freelist got to a certain size we would call
synchronize_rcu().  I'm assuming that what they meant to do is call
synchronize_rcu() after they had handed off that much memory via
call_rcu().  As such that is what I have updated the behavior to be.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  103 +++++++++++++++++++++++++--------------------------
 1 file changed, 50 insertions(+), 53 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 7fbd2c5..485983e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -147,8 +147,6 @@ struct trie {
 };
 
 static void resize(struct trie *t, struct tnode *tn);
-/* tnodes to free after resize(); protected by RTNL */
-static struct callback_head *tnode_free_head;
 static size_t tnode_free_size;
 
 /*
@@ -307,32 +305,6 @@ static struct tnode *tnode_alloc(size_t size)
 		return vzalloc(size);
 }
 
-static void tnode_free_safe(struct tnode *tn)
-{
-	BUG_ON(IS_LEAF(tn));
-	tn->rcu.next = tnode_free_head;
-	tnode_free_head = &tn->rcu;
-}
-
-static void tnode_free_flush(void)
-{
-	struct callback_head *head;
-
-	while ((head = tnode_free_head)) {
-		struct tnode *tn = container_of(head, struct tnode, rcu);
-
-		tnode_free_head = head->next;
-		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
-
-		node_free(tn);
-	}
-
-	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
-		tnode_free_size = 0;
-		synchronize_rcu();
-	}
-}
-
 static struct tnode *leaf_new(t_key key)
 {
 	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
@@ -433,17 +405,33 @@ static void put_child_root(struct tnode *tp, struct trie *t,
 		rcu_assign_pointer(t->trie, n);
 }
 
-static void tnode_clean_free(struct tnode *tn)
+static inline void tnode_free_init(struct tnode *tn)
 {
-	struct tnode *tofree;
-	unsigned long i;
+	tn->rcu.next = NULL;
+}
+
+static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
+{
+	n->rcu.next = tn->rcu.next;
+	tn->rcu.next = &n->rcu;
+}
 
-	for (i = 0; i < tnode_child_length(tn); i++) {
-		tofree = tnode_get_child(tn, i);
-		if (tofree)
-			node_free(tofree);
+static void tnode_free(struct tnode *tn)
+{
+	struct callback_head *head = &tn->rcu;
+
+	while (head) {
+		head = head->next;
+		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
+		node_free(tn);
+
+		tn = container_of(head, struct tnode, rcu);
+	}
+
+	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+		tnode_free_size = 0;
+		synchronize_rcu();
 	}
-	node_free(tn);
 }
 
 static int inflate(struct trie *t, struct tnode *oldtnode)
@@ -476,20 +464,23 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 					 inode->bits - 1);
 			if (!left)
 				goto nomem;
+			tnode_free_append(tn, left);
 
 			right = tnode_new(inode->key | m, inode->pos,
 					  inode->bits - 1);
 
-			if (!right) {
-				node_free(left);
+			if (!right)
 				goto nomem;
-			}
+			tnode_free_append(tn, right);
 
 			put_child(tn, 2*i, left);
 			put_child(tn, 2*i+1, right);
 		}
 	}
 
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
+
 	for (i = 0; i < olen; i++) {
 		struct tnode *inode = tnode_get_child(oldtnode, i);
 		struct tnode *left, *right;
@@ -505,12 +496,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 			continue;
 		}
 
+		/* drop the node in the old tnode free list */
+		tnode_free_append(oldtnode, inode);
+
 		/* An internal node with two children */
 		if (inode->bits == 1) {
 			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
 			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
-
-			tnode_free_safe(inode);
 			continue;
 		}
 
@@ -556,17 +548,19 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 		put_child(tn, 2 * i, left);
 		put_child(tn, 2 * i + 1, right);
 
-		tnode_free_safe(inode);
-
+		/* resize child nodes */
 		resize(t, left);
 		resize(t, right);
 	}
 
 	put_child_root(tp, t, tn->key, tn);
-	tnode_free_safe(oldtnode);
+
+	/* we completed without error, prepare to free old node */
+	tnode_free(oldtnode);
 	return 0;
 nomem:
-	tnode_clean_free(tn);
+	/* all pointers should be clean so we are done */
+	tnode_free(tn);
 	return -ENOMEM;
 }
 
@@ -599,17 +593,20 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 			struct tnode *newn;
 
 			newn = tnode_new(left->key, oldtnode->pos, 1);
-
 			if (!newn) {
-				tnode_clean_free(tn);
+				tnode_free(tn);
 				return -ENOMEM;
 			}
+			tnode_free_append(tn, newn);
 
 			put_child(tn, i/2, newn);
 		}
 
 	}
 
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
+
 	for (i = 0; i < olen; i += 2) {
 		struct tnode *newBinNode;
 
@@ -636,11 +633,14 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 
 		put_child(tn, i / 2, newBinNode);
 
+		/* resize child node */
 		resize(t, newBinNode);
 	}
 
 	put_child_root(tp, t, tn->key, tn);
-	tnode_free_safe(oldtnode);
+
+	/* all pointers should be clean so we are done */
+	tnode_free(oldtnode);
 
 	return 0;
 }
@@ -798,7 +798,8 @@ no_children:
 		node_set_parent(n, tp);
 
 		/* drop dead node */
-		tnode_free_safe(tn);
+		tnode_free_init(tn);
+		tnode_free(tn);
 	}
 }
 
@@ -884,16 +885,12 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 
 	while ((tp = node_parent(tn)) != NULL) {
 		resize(t, tn);
-
-		tnode_free_flush();
 		tn = tp;
 	}
 
 	/* Handle last (top) tnode */
 	if (IS_TNODE(tn))
 		resize(t, tn);
-
-	tnode_free_flush();
 }
 
 /* only used from updater-side */

^ permalink raw reply related

* [net-next PATCH 13/17] fib_trie: Push assignment of child to parent down into inflate/halve
From: Alexander Duyck @ 2014-12-31 18:56 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20141231184649.3006.29958.stgit@ahduyck-vm-fedora20>

This change makes it so that the assignment of the tnode to the parent is
handled directly within whatever function is currently handling the node be
it inflate, halve, or resize.  By doing this we can avoid some of the need
to set NULL pointers in the tree while we are resizing the subnodes.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  149 +++++++++++++++++++++++----------------------------
 1 file changed, 66 insertions(+), 83 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 58e1224..7fbd2c5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -146,9 +146,7 @@ struct trie {
 #endif
 };
 
-static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
-				  struct tnode *n, int wasfull);
-static struct tnode *resize(struct trie *t, struct tnode *tn);
+static void resize(struct trie *t, struct tnode *tn);
 /* tnodes to free after resize(); protected by RTNL */
 static struct callback_head *tnode_free_head;
 static size_t tnode_free_size;
@@ -396,22 +394,13 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
 	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
 }
 
-static inline void put_child(struct tnode *tn, unsigned long i,
-			     struct tnode *n)
-{
-	tnode_put_child_reorg(tn, i, n, -1);
-}
-
- /*
-  * Add a child at position i overwriting the old value.
-  * Update the value of full_children and empty_children.
-  */
-
-static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
-				  struct tnode *n, int wasfull)
+/* Add a child at position i overwriting the old value.
+ * Update the value of full_children and empty_children.
+ */
+static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 {
 	struct tnode *chi = rtnl_dereference(tn->child[i]);
-	int isfull;
+	int isfull, wasfull;
 
 	BUG_ON(i >= tnode_child_length(tn));
 
@@ -422,10 +411,9 @@ static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
 		tn->empty_children--;
 
 	/* update fullChildren */
-	if (wasfull == -1)
-		wasfull = tnode_full(tn, chi);
-
+	wasfull = tnode_full(tn, chi);
 	isfull = tnode_full(tn, n);
+
 	if (wasfull && !isfull)
 		tn->full_children--;
 	else if (!wasfull && isfull)
@@ -458,9 +446,10 @@ static void tnode_clean_free(struct tnode *tn)
 	node_free(tn);
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
+static int inflate(struct trie *t, struct tnode *oldtnode)
 {
 	unsigned long olen = tnode_child_length(oldtnode);
+	struct tnode *tp = node_parent(oldtnode);
 	struct tnode *tn;
 	unsigned long i;
 	t_key m;
@@ -468,9 +457,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 	pr_debug("In inflate\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
-
 	if (!tn)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -564,30 +552,36 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
 			put_child(left, j, rtnl_dereference(inode->child[j]));
 			put_child(right, j, rtnl_dereference(inode->child[j + size]));
 		}
-		put_child(tn, 2*i, resize(t, left));
-		put_child(tn, 2*i+1, resize(t, right));
+
+		put_child(tn, 2 * i, left);
+		put_child(tn, 2 * i + 1, right);
 
 		tnode_free_safe(inode);
+
+		resize(t, left);
+		resize(t, right);
 	}
+
+	put_child_root(tp, t, tn->key, tn);
 	tnode_free_safe(oldtnode);
-	return tn;
+	return 0;
 nomem:
 	tnode_clean_free(tn);
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 }
 
-static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
+static int halve(struct trie *t, struct tnode *oldtnode)
 {
 	unsigned long olen = tnode_child_length(oldtnode);
+	struct tnode *tp = node_parent(oldtnode);
 	struct tnode *tn, *left, *right;
 	int i;
 
 	pr_debug("In halve\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
-
 	if (!tn)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -606,8 +600,10 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
 
 			newn = tnode_new(left->key, oldtnode->pos, 1);
 
-			if (!newn)
-				goto nomem;
+			if (!newn) {
+				tnode_clean_free(tn);
+				return -ENOMEM;
+			}
 
 			put_child(tn, i/2, newn);
 		}
@@ -635,16 +631,18 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
 
 		/* Two nonempty children */
 		newBinNode = tnode_get_child(tn, i/2);
-		put_child(tn, i/2, NULL);
 		put_child(newBinNode, 0, left);
 		put_child(newBinNode, 1, right);
-		put_child(tn, i/2, resize(t, newBinNode));
+
+		put_child(tn, i / 2, newBinNode);
+
+		resize(t, newBinNode);
 	}
+
+	put_child_root(tp, t, tn->key, tn);
 	tnode_free_safe(oldtnode);
-	return tn;
-nomem:
-	tnode_clean_free(tn);
-	return ERR_PTR(-ENOMEM);
+
+	return 0;
 }
 
 /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
@@ -704,45 +702,48 @@ nomem:
  *    tnode_child_length(tn)
  *
  */
-static bool should_inflate(const struct tnode *tn)
+static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
 {
 	unsigned long used = tnode_child_length(tn);
 	unsigned long threshold = used;
 
 	/* Keep root node larger */
-	threshold *= node_parent(tn) ? inflate_threshold :
-				       inflate_threshold_root;
+	threshold *= tp ? inflate_threshold : inflate_threshold_root;
 	used += tn->full_children;
 	used -= tn->empty_children;
 
 	return tn->pos && ((50 * used) >= threshold);
 }
 
-static bool should_halve(const struct tnode *tn)
+static bool should_halve(const struct tnode *tp, const struct tnode *tn)
 {
 	unsigned long used = tnode_child_length(tn);
 	unsigned long threshold = used;
 
 	/* Keep root node larger */
-	threshold *= node_parent(tn) ? halve_threshold :
-				       halve_threshold_root;
+	threshold *= tp ? halve_threshold : halve_threshold_root;
 	used -= tn->empty_children;
 
 	return (tn->bits > 1) && ((100 * used) < threshold);
 }
 
 #define MAX_WORK 10
-static struct tnode *resize(struct trie *t, struct tnode *tn)
+static void resize(struct trie *t, struct tnode *tn)
 {
-	struct tnode *old_tn, *n = NULL;
+	struct tnode *tp = node_parent(tn), *n = NULL;
+	struct tnode __rcu **cptr;
 	int max_work;
 
-	if (!tn)
-		return NULL;
-
 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 		 tn, inflate_threshold, halve_threshold);
 
+	/* track the tnode via the pointer from the parent instead of
+	 * doing it ourselves.  This way we can let RCU fully do its
+	 * thing without us interfering
+	 */
+	cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
+	BUG_ON(tn != rtnl_dereference(*cptr));
+
 	/* No children */
 	if (tn->empty_children > (tnode_child_length(tn) - 1))
 		goto no_children;
@@ -755,39 +756,35 @@ static struct tnode *resize(struct trie *t, struct tnode *tn)
 	 * nonempty nodes that are above the threshold.
 	 */
 	max_work = MAX_WORK;
-	while (should_inflate(tn) && max_work--) {
-		old_tn = tn;
-		tn = inflate(t, tn);
-
-		if (IS_ERR(tn)) {
-			tn = old_tn;
+	while (should_inflate(tp, tn) && max_work--) {
+		if (inflate(t, tn)) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(t->stats->resize_node_skipped);
 #endif
 			break;
 		}
+
+		tn = rtnl_dereference(*cptr);
 	}
 
 	/* Return if at least one inflate is run */
 	if (max_work != MAX_WORK)
-		return tn;
+		return;
 
 	/* Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
 	max_work = MAX_WORK;
-	while (should_halve(tn) && max_work--) {
-		old_tn = tn;
-		tn = halve(t, tn);
-		if (IS_ERR(tn)) {
-			tn = old_tn;
+	while (should_halve(tp, tn) && max_work--) {
+		if (halve(t, tn)) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(t->stats->resize_node_skipped);
 #endif
 			break;
 		}
-	}
 
+		tn = rtnl_dereference(*cptr);
+	}
 
 	/* Only one child remains */
 	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
@@ -797,11 +794,12 @@ one_child:
 			n = tnode_get_child(tn, --i);
 no_children:
 		/* compress one level */
-		node_set_parent(n, NULL);
+		put_child_root(tp, t, tn->key, n);
+		node_set_parent(n, tp);
+
+		/* drop dead node */
 		tnode_free_safe(tn);
-		return n;
 	}
-	return tn;
 }
 
 /* readside must use rcu_read_lock currently dump routines
@@ -882,34 +880,19 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
 
 static void trie_rebalance(struct trie *t, struct tnode *tn)
 {
-	int wasfull;
-	t_key cindex, key;
 	struct tnode *tp;
 
-	key = tn->key;
-
-	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
-		cindex = get_index(key, tp);
-		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
-		tn = resize(t, tn);
-
-		tnode_put_child_reorg(tp, cindex, tn, wasfull);
-
-		tp = node_parent(tn);
-		if (!tp)
-			rcu_assign_pointer(t->trie, tn);
+	while ((tp = node_parent(tn)) != NULL) {
+		resize(t, tn);
 
 		tnode_free_flush();
-		if (!tp)
-			break;
 		tn = tp;
 	}
 
 	/* Handle last (top) tnode */
 	if (IS_TNODE(tn))
-		tn = resize(t, tn);
+		resize(t, tn);
 
-	rcu_assign_pointer(t->trie, tn);
 	tnode_free_flush();
 }
 

^ permalink raw reply related

* [net-next PATCH 12/17] fib_trie: Add functions should_inflate and should_halve
From: Alexander Duyck @ 2014-12-31 18:56 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20141231184649.3006.29958.stgit@ahduyck-vm-fedora20>

This change pulls the logic for if we should inflate/halve the nodes out
into separate functions.  It also addresses what I believe is a bug where 1
full node is all that is needed to keep a node from ever being halved.

Simple script to reproduce the issue:
	modprobe dummy;	ifconfig dummy0 up
	for i in `seq 0 255`; do ifconfig dummy0:$i 10.0.${i}.1/24 up; done
	ifconfig dummy0:256 10.0.255.33/16 up
	for i in `seq 0 254`; do ifconfig dummy0:$i down; done

Results from /proc/net/fib_triestat
Before:
	Local:
		Aver depth:     3.00
		Max depth:      4
		Leaves:         17
		Prefixes:       18
		Internal nodes: 11
		  1: 8  2: 2  10: 1
		Pointers: 1048
	Null ptrs: 1021
	Total size: 11  kB
After:
	Local:
		Aver depth:     3.41
		Max depth:      5
		Leaves:         17
		Prefixes:       18
		Internal nodes: 12
		  1: 8  2: 3  3: 1
		Pointers: 36
	Null ptrs: 8
	Total size: 3  kB

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  175 ++++++++++++++++++++++++++-------------------------
 1 file changed, 89 insertions(+), 86 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d044fa3..58e1224 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -647,12 +647,94 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
+/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * tnode_child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ *     tn->full_children;
+ *
+ * new_child_length = tnode_child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ *      new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ *      inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ *      inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ *    tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ *    tn->full_children) >=
+ *      inflate_threshold * tnode_child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + tnode_child_length(tn) -
+ *    tn->empty_children) >= inflate_threshold *
+ *    tnode_child_length(tn)
+ *
+ */
+static bool should_inflate(const struct tnode *tn)
+{
+	unsigned long used = tnode_child_length(tn);
+	unsigned long threshold = used;
+
+	/* Keep root node larger */
+	threshold *= node_parent(tn) ? inflate_threshold :
+				       inflate_threshold_root;
+	used += tn->full_children;
+	used -= tn->empty_children;
+
+	return tn->pos && ((50 * used) >= threshold);
+}
+
+static bool should_halve(const struct tnode *tn)
+{
+	unsigned long used = tnode_child_length(tn);
+	unsigned long threshold = used;
+
+	/* Keep root node larger */
+	threshold *= node_parent(tn) ? halve_threshold :
+				       halve_threshold_root;
+	used -= tn->empty_children;
+
+	return (tn->bits > 1) && ((100 * used) < threshold);
+}
+
 #define MAX_WORK 10
 static struct tnode *resize(struct trie *t, struct tnode *tn)
 {
 	struct tnode *old_tn, *n = NULL;
-	int inflate_threshold_use;
-	int halve_threshold_use;
 	int max_work;
 
 	if (!tn)
@@ -668,86 +750,12 @@ static struct tnode *resize(struct trie *t, struct tnode *tn)
 	/* One child */
 	if (tn->empty_children == (tnode_child_length(tn) - 1))
 		goto one_child;
-	/*
-	 * Double as long as the resulting node has a number of
-	 * nonempty nodes that are above the threshold.
-	 */
 
-	/*
-	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
-	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
-	 * Telecommunications, page 6:
-	 * "A node is doubled if the ratio of non-empty children to all
-	 * children in the *doubled* node is at least 'high'."
-	 *
-	 * 'high' in this instance is the variable 'inflate_threshold'. It
-	 * is expressed as a percentage, so we multiply it with
-	 * tnode_child_length() and instead of multiplying by 2 (since the
-	 * child array will be doubled by inflate()) and multiplying
-	 * the left-hand side by 100 (to handle the percentage thing) we
-	 * multiply the left-hand side by 50.
-	 *
-	 * The left-hand side may look a bit weird: tnode_child_length(tn)
-	 * - tn->empty_children is of course the number of non-null children
-	 * in the current node. tn->full_children is the number of "full"
-	 * children, that is non-null tnodes with a skip value of 0.
-	 * All of those will be doubled in the resulting inflated tnode, so
-	 * we just count them one extra time here.
-	 *
-	 * A clearer way to write this would be:
-	 *
-	 * to_be_doubled = tn->full_children;
-	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
-	 *     tn->full_children;
-	 *
-	 * new_child_length = tnode_child_length(tn) * 2;
-	 *
-	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
-	 *      new_child_length;
-	 * if (new_fill_factor >= inflate_threshold)
-	 *
-	 * ...and so on, tho it would mess up the while () loop.
-	 *
-	 * anyway,
-	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
-	 *      inflate_threshold
-	 *
-	 * avoid a division:
-	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
-	 *      inflate_threshold * new_child_length
-	 *
-	 * expand not_to_be_doubled and to_be_doubled, and shorten:
-	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children) >= inflate_threshold * new_child_length
-	 *
-	 * expand new_child_length:
-	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children) >=
-	 *      inflate_threshold * tnode_child_length(tn) * 2
-	 *
-	 * shorten again:
-	 * 50 * (tn->full_children + tnode_child_length(tn) -
-	 *    tn->empty_children) >= inflate_threshold *
-	 *    tnode_child_length(tn)
-	 *
+	/* Double as long as the resulting node has a number of
+	 * nonempty nodes that are above the threshold.
 	 */
-
-	/* Keep root node larger  */
-
-	if (!node_parent(tn)) {
-		inflate_threshold_use = inflate_threshold_root;
-		halve_threshold_use = halve_threshold_root;
-	} else {
-		inflate_threshold_use = inflate_threshold;
-		halve_threshold_use = halve_threshold;
-	}
-
 	max_work = MAX_WORK;
-	while ((tn->full_children > 0 &&  max_work-- &&
-		50 * (tn->full_children + tnode_child_length(tn)
-		      - tn->empty_children)
-		>= inflate_threshold_use * tnode_child_length(tn))) {
-
+	while (should_inflate(tn) && max_work--) {
 		old_tn = tn;
 		tn = inflate(t, tn);
 
@@ -764,16 +772,11 @@ static struct tnode *resize(struct trie *t, struct tnode *tn)
 	if (max_work != MAX_WORK)
 		return tn;
 
-	/*
-	 * Halve as long as the number of empty children in this
+	/* Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
-
 	max_work = MAX_WORK;
-	while (tn->bits > 1 &&  max_work-- &&
-	       100 * (tnode_child_length(tn) - tn->empty_children) <
-	       halve_threshold_use * tnode_child_length(tn)) {
-
+	while (should_halve(tn) && max_work--) {
 		old_tn = tn;
 		tn = halve(t, tn);
 		if (IS_ERR(tn)) {

^ permalink raw reply related

* [net-next PATCH 11/17] fib_trie: Move resize to after inflate/halve
From: Alexander Duyck @ 2014-12-31 18:56 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20141231184649.3006.29958.stgit@ahduyck-vm-fedora20>

This change consists of a cut/paste of resize to behind inflate and halve
so that I could remove the two function prototypes.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 net/ipv4/fib_trie.c |  311 +++++++++++++++++++++++++--------------------------
 1 file changed, 154 insertions(+), 157 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 987b06d..d044fa3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -149,8 +149,6 @@ struct trie {
 static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
 				  struct tnode *n, int wasfull);
 static struct tnode *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn);
-static struct tnode *halve(struct trie *t, struct tnode *tn);
 /* tnodes to free after resize(); protected by RTNL */
 static struct callback_head *tnode_free_head;
 static size_t tnode_free_size;
@@ -447,161 +445,6 @@ static void put_child_root(struct tnode *tp, struct trie *t,
 		rcu_assign_pointer(t->trie, n);
 }
 
-#define MAX_WORK 10
-static struct tnode *resize(struct trie *t, struct tnode *tn)
-{
-	struct tnode *old_tn, *n = NULL;
-	int inflate_threshold_use;
-	int halve_threshold_use;
-	int max_work;
-
-	if (!tn)
-		return NULL;
-
-	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
-		 tn, inflate_threshold, halve_threshold);
-
-	/* No children */
-	if (tn->empty_children > (tnode_child_length(tn) - 1))
-		goto no_children;
-
-	/* One child */
-	if (tn->empty_children == (tnode_child_length(tn) - 1))
-		goto one_child;
-	/*
-	 * Double as long as the resulting node has a number of
-	 * nonempty nodes that are above the threshold.
-	 */
-
-	/*
-	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
-	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
-	 * Telecommunications, page 6:
-	 * "A node is doubled if the ratio of non-empty children to all
-	 * children in the *doubled* node is at least 'high'."
-	 *
-	 * 'high' in this instance is the variable 'inflate_threshold'. It
-	 * is expressed as a percentage, so we multiply it with
-	 * tnode_child_length() and instead of multiplying by 2 (since the
-	 * child array will be doubled by inflate()) and multiplying
-	 * the left-hand side by 100 (to handle the percentage thing) we
-	 * multiply the left-hand side by 50.
-	 *
-	 * The left-hand side may look a bit weird: tnode_child_length(tn)
-	 * - tn->empty_children is of course the number of non-null children
-	 * in the current node. tn->full_children is the number of "full"
-	 * children, that is non-null tnodes with a skip value of 0.
-	 * All of those will be doubled in the resulting inflated tnode, so
-	 * we just count them one extra time here.
-	 *
-	 * A clearer way to write this would be:
-	 *
-	 * to_be_doubled = tn->full_children;
-	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
-	 *     tn->full_children;
-	 *
-	 * new_child_length = tnode_child_length(tn) * 2;
-	 *
-	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
-	 *      new_child_length;
-	 * if (new_fill_factor >= inflate_threshold)
-	 *
-	 * ...and so on, tho it would mess up the while () loop.
-	 *
-	 * anyway,
-	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
-	 *      inflate_threshold
-	 *
-	 * avoid a division:
-	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
-	 *      inflate_threshold * new_child_length
-	 *
-	 * expand not_to_be_doubled and to_be_doubled, and shorten:
-	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children) >= inflate_threshold * new_child_length
-	 *
-	 * expand new_child_length:
-	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children) >=
-	 *      inflate_threshold * tnode_child_length(tn) * 2
-	 *
-	 * shorten again:
-	 * 50 * (tn->full_children + tnode_child_length(tn) -
-	 *    tn->empty_children) >= inflate_threshold *
-	 *    tnode_child_length(tn)
-	 *
-	 */
-
-	/* Keep root node larger  */
-
-	if (!node_parent(tn)) {
-		inflate_threshold_use = inflate_threshold_root;
-		halve_threshold_use = halve_threshold_root;
-	} else {
-		inflate_threshold_use = inflate_threshold;
-		halve_threshold_use = halve_threshold;
-	}
-
-	max_work = MAX_WORK;
-	while ((tn->full_children > 0 &&  max_work-- &&
-		50 * (tn->full_children + tnode_child_length(tn)
-		      - tn->empty_children)
-		>= inflate_threshold_use * tnode_child_length(tn))) {
-
-		old_tn = tn;
-		tn = inflate(t, tn);
-
-		if (IS_ERR(tn)) {
-			tn = old_tn;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-			this_cpu_inc(t->stats->resize_node_skipped);
-#endif
-			break;
-		}
-	}
-
-	/* Return if at least one inflate is run */
-	if (max_work != MAX_WORK)
-		return tn;
-
-	/*
-	 * Halve as long as the number of empty children in this
-	 * node is above threshold.
-	 */
-
-	max_work = MAX_WORK;
-	while (tn->bits > 1 &&  max_work-- &&
-	       100 * (tnode_child_length(tn) - tn->empty_children) <
-	       halve_threshold_use * tnode_child_length(tn)) {
-
-		old_tn = tn;
-		tn = halve(t, tn);
-		if (IS_ERR(tn)) {
-			tn = old_tn;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-			this_cpu_inc(t->stats->resize_node_skipped);
-#endif
-			break;
-		}
-	}
-
-
-	/* Only one child remains */
-	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
-		unsigned long i;
-one_child:
-		for (i = tnode_child_length(tn); !n && i;)
-			n = tnode_get_child(tn, --i);
-no_children:
-		/* compress one level */
-		node_set_parent(n, NULL);
-		tnode_free_safe(tn);
-		return n;
-	}
-	return tn;
-}
-
-
 static void tnode_clean_free(struct tnode *tn)
 {
 	struct tnode *tofree;
@@ -804,6 +647,160 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
+#define MAX_WORK 10
+static struct tnode *resize(struct trie *t, struct tnode *tn)
+{
+	struct tnode *old_tn, *n = NULL;
+	int inflate_threshold_use;
+	int halve_threshold_use;
+	int max_work;
+
+	if (!tn)
+		return NULL;
+
+	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+		 tn, inflate_threshold, halve_threshold);
+
+	/* No children */
+	if (tn->empty_children > (tnode_child_length(tn) - 1))
+		goto no_children;
+
+	/* One child */
+	if (tn->empty_children == (tnode_child_length(tn) - 1))
+		goto one_child;
+	/*
+	 * Double as long as the resulting node has a number of
+	 * nonempty nodes that are above the threshold.
+	 */
+
+	/*
+	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
+	 * Telecommunications, page 6:
+	 * "A node is doubled if the ratio of non-empty children to all
+	 * children in the *doubled* node is at least 'high'."
+	 *
+	 * 'high' in this instance is the variable 'inflate_threshold'. It
+	 * is expressed as a percentage, so we multiply it with
+	 * tnode_child_length() and instead of multiplying by 2 (since the
+	 * child array will be doubled by inflate()) and multiplying
+	 * the left-hand side by 100 (to handle the percentage thing) we
+	 * multiply the left-hand side by 50.
+	 *
+	 * The left-hand side may look a bit weird: tnode_child_length(tn)
+	 * - tn->empty_children is of course the number of non-null children
+	 * in the current node. tn->full_children is the number of "full"
+	 * children, that is non-null tnodes with a skip value of 0.
+	 * All of those will be doubled in the resulting inflated tnode, so
+	 * we just count them one extra time here.
+	 *
+	 * A clearer way to write this would be:
+	 *
+	 * to_be_doubled = tn->full_children;
+	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+	 *     tn->full_children;
+	 *
+	 * new_child_length = tnode_child_length(tn) * 2;
+	 *
+	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+	 *      new_child_length;
+	 * if (new_fill_factor >= inflate_threshold)
+	 *
+	 * ...and so on, tho it would mess up the while () loop.
+	 *
+	 * anyway,
+	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+	 *      inflate_threshold
+	 *
+	 * avoid a division:
+	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+	 *      inflate_threshold * new_child_length
+	 *
+	 * expand not_to_be_doubled and to_be_doubled, and shorten:
+	 * 100 * (tnode_child_length(tn) - tn->empty_children +
+	 *    tn->full_children) >= inflate_threshold * new_child_length
+	 *
+	 * expand new_child_length:
+	 * 100 * (tnode_child_length(tn) - tn->empty_children +
+	 *    tn->full_children) >=
+	 *      inflate_threshold * tnode_child_length(tn) * 2
+	 *
+	 * shorten again:
+	 * 50 * (tn->full_children + tnode_child_length(tn) -
+	 *    tn->empty_children) >= inflate_threshold *
+	 *    tnode_child_length(tn)
+	 *
+	 */
+
+	/* Keep root node larger  */
+
+	if (!node_parent(tn)) {
+		inflate_threshold_use = inflate_threshold_root;
+		halve_threshold_use = halve_threshold_root;
+	} else {
+		inflate_threshold_use = inflate_threshold;
+		halve_threshold_use = halve_threshold;
+	}
+
+	max_work = MAX_WORK;
+	while ((tn->full_children > 0 &&  max_work-- &&
+		50 * (tn->full_children + tnode_child_length(tn)
+		      - tn->empty_children)
+		>= inflate_threshold_use * tnode_child_length(tn))) {
+
+		old_tn = tn;
+		tn = inflate(t, tn);
+
+		if (IS_ERR(tn)) {
+			tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			this_cpu_inc(t->stats->resize_node_skipped);
+#endif
+			break;
+		}
+	}
+
+	/* Return if at least one inflate is run */
+	if (max_work != MAX_WORK)
+		return tn;
+
+	/*
+	 * Halve as long as the number of empty children in this
+	 * node is above threshold.
+	 */
+
+	max_work = MAX_WORK;
+	while (tn->bits > 1 &&  max_work-- &&
+	       100 * (tnode_child_length(tn) - tn->empty_children) <
+	       halve_threshold_use * tnode_child_length(tn)) {
+
+		old_tn = tn;
+		tn = halve(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			this_cpu_inc(t->stats->resize_node_skipped);
+#endif
+			break;
+		}
+	}
+
+
+	/* Only one child remains */
+	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
+		unsigned long i;
+one_child:
+		for (i = tnode_child_length(tn); !n && i;)
+			n = tnode_get_child(tn, --i);
+no_children:
+		/* compress one level */
+		node_set_parent(n, NULL);
+		tnode_free_safe(tn);
+		return n;
+	}
+	return tn;
+}
+
 /* readside must use rcu_read_lock currently dump routines
  via get_fa_head and dump */
 

^ permalink raw reply related

* [net-next PATCH 10/17] fib_trie: Push rcu_read_lock/unlock to callers
From: Alexander Duyck @ 2014-12-31 18:56 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20141231184649.3006.29958.stgit@ahduyck-vm-fedora20>

This change is to start cleaning up some of the rcu_read_lock/unlock
handling.  I realized while reviewing the code there are several spots that
I don't believe are being handled correctly or are masking warnings by
locally calling rcu_read_lock/unlock instead of calling them at the correct
level.

A common example is a call to fib_get_table followed by fib_table_lookup.
The rcu_read_lock/unlock ought to wrap both but there are several spots where
they were not wrapped.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
 include/net/ip_fib.h    |   50 ++++++++++-------
 net/ipv4/fib_frontend.c |   27 +++++----
 net/ipv4/fib_rules.c    |   22 +++-----
 net/ipv4/fib_trie.c     |  137 +++++++++++++++++++++--------------------------
 4 files changed, 114 insertions(+), 122 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 09a819e..5bd120e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -222,16 +222,19 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id)
 static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
 			     struct fib_result *res)
 {
-	struct fib_table *table;
+	int err = -ENETUNREACH;
+
+	rcu_read_lock();
+
+	if (!fib_table_lookup(fib_get_table(net, RT_TABLE_LOCAL), flp, res,
+			      FIB_LOOKUP_NOREF) ||
+	    !fib_table_lookup(fib_get_table(net, RT_TABLE_MAIN), flp, res,
+			      FIB_LOOKUP_NOREF))
+		err = 0;
 
-	table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
-		return 0;
+	rcu_read_unlock();
 
-	table = fib_get_table(net, RT_TABLE_MAIN);
-	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
-		return 0;
-	return -ENETUNREACH;
+	return err;
 }
 
 #else /* CONFIG_IP_MULTIPLE_TABLES */
@@ -247,20 +250,25 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 			     struct fib_result *res)
 {
 	if (!net->ipv4.fib_has_custom_rules) {
+		int err = -ENETUNREACH;
+
+		rcu_read_lock();
+
 		res->tclassid = 0;
-		if (net->ipv4.fib_local &&
-		    !fib_table_lookup(net->ipv4.fib_local, flp, res,
-				      FIB_LOOKUP_NOREF))
-			return 0;
-		if (net->ipv4.fib_main &&
-		    !fib_table_lookup(net->ipv4.fib_main, flp, res,
-				      FIB_LOOKUP_NOREF))
-			return 0;
-		if (net->ipv4.fib_default &&
-		    !fib_table_lookup(net->ipv4.fib_default, flp, res,
-				      FIB_LOOKUP_NOREF))
-			return 0;
-		return -ENETUNREACH;
+		if ((net->ipv4.fib_local &&
+		     !fib_table_lookup(net->ipv4.fib_local, flp, res,
+				       FIB_LOOKUP_NOREF)) ||
+		    (net->ipv4.fib_main &&
+		     !fib_table_lookup(net->ipv4.fib_main, flp, res,
+				       FIB_LOOKUP_NOREF)) ||
+		    (net->ipv4.fib_default &&
+		     !fib_table_lookup(net->ipv4.fib_default, flp, res,
+				       FIB_LOOKUP_NOREF)))
+			err = 0;
+
+		rcu_read_unlock();
+
+		return err;
 	}
 	return __fib_lookup(net, flp, res);
 }
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6689020..57be71d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -109,6 +109,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	return tb;
 }
 
+/* caller must hold either rtnl or rcu read lock */
 struct fib_table *fib_get_table(struct net *net, u32 id)
 {
 	struct fib_table *tb;
@@ -119,15 +120,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 		id = RT_TABLE_MAIN;
 	h = id & (FIB_TABLE_HASHSZ - 1);
 
-	rcu_read_lock();
 	head = &net->ipv4.fib_table_hash[h];
 	hlist_for_each_entry_rcu(tb, head, tb_hlist) {
-		if (tb->tb_id == id) {
-			rcu_read_unlock();
+		if (tb->tb_id == id)
 			return tb;
-		}
 	}
-	rcu_read_unlock();
 	return NULL;
 }
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
@@ -167,16 +164,18 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	if (ipv4_is_multicast(addr))
 		return RTN_MULTICAST;
 
+	rcu_read_lock();
+
 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
 	if (local_table) {
 		ret = RTN_UNICAST;
-		rcu_read_lock();
 		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 			if (!dev || dev == res.fi->fib_dev)
 				ret = res.type;
 		}
-		rcu_read_unlock();
 	}
+
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -919,7 +918,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 #undef BRD1_OK
 }
 
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
+static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
 {
 
 	struct fib_result       res;
@@ -929,6 +928,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 		.flowi4_tos = frn->fl_tos,
 		.flowi4_scope = frn->fl_scope,
 	};
+	struct fib_table *tb;
+
+	rcu_read_lock();
+
+	tb = fib_get_table(net, frn->tb_id_in);
 
 	frn->err = -ENOENT;
 	if (tb) {
@@ -945,6 +949,8 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 		}
 		local_bh_enable();
 	}
+
+	rcu_read_unlock();
 }
 
 static void nl_fib_input(struct sk_buff *skb)
@@ -952,7 +958,6 @@ static void nl_fib_input(struct sk_buff *skb)
 	struct net *net;
 	struct fib_result_nl *frn;
 	struct nlmsghdr *nlh;
-	struct fib_table *tb;
 	u32 portid;
 
 	net = sock_net(skb->sk);
@@ -967,9 +972,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	nlh = nlmsg_hdr(skb);
 
 	frn = (struct fib_result_nl *) nlmsg_data(nlh);
-	tb = fib_get_table(net, frn->tb_id_in);
-
-	nl_fib_lookup(frn, tb);
+	nl_fib_lookup(net, frn);
 
 	portid = NETLINK_CB(skb).portid;      /* netlink portid */
 	NETLINK_CB(skb).portid = 0;        /* from kernel */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 8f7bd56..d3db718 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -81,27 +81,25 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 		break;
 
 	case FR_ACT_UNREACHABLE:
-		err = -ENETUNREACH;
-		goto errout;
+		return -ENETUNREACH;
 
 	case FR_ACT_PROHIBIT:
-		err = -EACCES;
-		goto errout;
+		return -EACCES;
 
 	case FR_ACT_BLACKHOLE:
 	default:
-		err = -EINVAL;
-		goto errout;
+		return -EINVAL;
 	}
 
+	rcu_read_lock();
+
 	tbl = fib_get_table(rule->fr_net, rule->table);
-	if (!tbl)
-		goto errout;
+	if (tbl)
+		err = fib_table_lookup(tbl, &flp->u.ip4,
+				       (struct fib_result *)arg->result,
+				       arg->flags);
 
-	err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
-	if (err > 0)
-		err = -EAGAIN;
-errout:
+	rcu_read_unlock();
 	return err;
 }
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 28a3065..987b06d 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1181,72 +1181,6 @@ err:
 	return err;
 }
 
-/* should be called with rcu_read_lock */
-static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
-		      t_key key,  const struct flowi4 *flp,
-		      struct fib_result *res, int fib_flags)
-{
-	struct leaf_info *li;
-	struct hlist_head *hhead = &l->list;
-
-	hlist_for_each_entry_rcu(li, hhead, hlist) {
-		struct fib_alias *fa;
-
-		if (l->key != (key & li->mask_plen))
-			continue;
-
-		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
-			struct fib_info *fi = fa->fa_info;
-			int nhsel, err;
-
-			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
-				continue;
-			if (fi->fib_dead)
-				continue;
-			if (fa->fa_info->fib_scope < flp->flowi4_scope)
-				continue;
-			fib_alias_accessed(fa);
-			err = fib_props[fa->fa_type].error;
-			if (unlikely(err < 0)) {
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-				this_cpu_inc(t->stats->semantic_match_passed);
-#endif
-				return err;
-			}
-			if (fi->fib_flags & RTNH_F_DEAD)
-				continue;
-			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
-				const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
-				if (nh->nh_flags & RTNH_F_DEAD)
-					continue;
-				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
-					continue;
-
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-				this_cpu_inc(t->stats->semantic_match_passed);
-#endif
-				res->prefixlen = li->plen;
-				res->nh_sel = nhsel;
-				res->type = fa->fa_type;
-				res->scope = fi->fib_scope;
-				res->fi = fi;
-				res->table = tb;
-				res->fa_head = &li->falh;
-				if (!(fib_flags & FIB_LOOKUP_NOREF))
-					atomic_inc(&fi->fib_clntref);
-				return 0;
-			}
-		}
-
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-		this_cpu_inc(t->stats->semantic_match_miss);
-#endif
-	}
-
-	return 1;
-}
-
 static inline t_key prefix_mismatch(t_key key, struct tnode *n)
 {
 	t_key prefix = n->key;
@@ -1254,6 +1188,7 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n)
 	return (key ^ prefix) & (prefix | -prefix);
 }
 
+/* should be called with rcu_read_lock */
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags)
 {
@@ -1263,14 +1198,12 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 #endif
 	const t_key key = ntohl(flp->daddr);
 	struct tnode *n, *pn;
+	struct leaf_info *li;
 	t_key cindex;
-	int ret = 1;
-
-	rcu_read_lock();
 
 	n = rcu_dereference(t->trie);
 	if (!n)
-		goto failed;
+		return -EAGAIN;
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	this_cpu_inc(stats->gets);
@@ -1350,7 +1283,7 @@ backtrace:
 
 				pn = node_parent_rcu(pn);
 				if (unlikely(!pn))
-					goto failed;
+					return -EAGAIN;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 				this_cpu_inc(stats->backtrack);
 #endif
@@ -1368,12 +1301,62 @@ backtrace:
 
 found:
 	/* Step 3: Process the leaf, if that fails fall back to backtracing */
-	ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
-	if (unlikely(ret > 0))
-		goto backtrace;
-failed:
-	rcu_read_unlock();
-	return ret;
+	hlist_for_each_entry_rcu(li, &n->list, hlist) {
+		struct fib_alias *fa;
+
+		if ((key ^ n->key) & li->mask_plen)
+			continue;
+
+		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+			struct fib_info *fi = fa->fa_info;
+			int nhsel, err;
+
+			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+				continue;
+			if (fi->fib_dead)
+				continue;
+			if (fa->fa_info->fib_scope < flp->flowi4_scope)
+				continue;
+			fib_alias_accessed(fa);
+			err = fib_props[fa->fa_type].error;
+			if (unlikely(err < 0)) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+				this_cpu_inc(stats->semantic_match_passed);
+#endif
+				return err;
+			}
+			if (fi->fib_flags & RTNH_F_DEAD)
+				continue;
+			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+				const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+				if (nh->nh_flags & RTNH_F_DEAD)
+					continue;
+				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+					continue;
+
+				if (!(fib_flags & FIB_LOOKUP_NOREF))
+					atomic_inc(&fi->fib_clntref);
+
+				res->prefixlen = li->plen;
+				res->nh_sel = nhsel;
+				res->type = fa->fa_type;
+				res->scope = fi->fib_scope;
+				res->fi = fi;
+				res->table = tb;
+				res->fa_head = &li->falh;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+				this_cpu_inc(stats->semantic_match_passed);
+#endif
+				return err;
+			}
+		}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+		this_cpu_inc(stats->semantic_match_miss);
+#endif
+	}
+	goto backtrace;
 }
 EXPORT_SYMBOL_GPL(fib_table_lookup);
 

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox