* [PATCH net-next 2/3] bridge: suppress arp pkts on BR_NEIGH_SUPPRESS ports
From: Roopa Prabhu @ 2017-10-02 4:36 UTC (permalink / raw)
To: davem; +Cc: netdev, nikolay, stephen, bridge
In-Reply-To: <1506919018-27875-1-git-send-email-roopa@cumulusnetworks.com>
From: Roopa Prabhu <roopa@cumulusnetworks.com>
This patch avoids flooding and proxies arp packets
for BR_NEIGH_SUPPRESS ports.
Moves existing br_do_proxy_arp to br_do_proxy_suppress_arp
to support both proxy arp and neigh suppress.
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
net/bridge/br_arp_nd_proxy.c | 186 +++++++++++++++++++++++++++++++++++++++++++
net/bridge/br_device.c | 8 ++
net/bridge/br_input.c | 63 ++-------------
net/bridge/br_private.h | 3 +
4 files changed, 202 insertions(+), 58 deletions(-)
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index e191bb5..5153510 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -11,6 +11,13 @@
*/
#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/neighbour.h>
+#include <net/arp.h>
+#include <linux/if_vlan.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
#include "br_private.h"
void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
@@ -25,3 +32,182 @@ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
}
}
}
+
+static void br_arp_send(struct net_bridge_port *p, int type, int ptype,
+ __be32 dest_ip, struct net_device *dev,
+ __be32 src_ip, const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw,
+ __be16 vlan_proto, u16 vlan_tci)
+{
+ struct sk_buff *skb;
+
+ netdev_dbg(dev, "arp send dev %s dst %pI4 dst_hw %pM src %pI4 src_hw %pM\n",
+ dev->name, &dest_ip, dest_hw, &src_ip, src_hw);
+
+ if (!vlan_tci) {
+ arp_send(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ return;
+ }
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ if (p) {
+ struct net_bridge_vlan_group *vg;
+ u16 pvid;
+
+ vg = nbp_vlan_group_rcu(p);
+ pvid = br_get_pvid(vg);
+ if (pvid && pvid == vlan_tci)
+ vlan_tci = 0;
+ }
+
+ if (vlan_tci) {
+ skb = vlan_insert_tag_set_proto(skb, vlan_proto,
+ vlan_tci);
+ if (!skb) {
+ net_err_ratelimited("%s: failed to insert VLAN tag\n",
+ __func__);
+ return;
+ }
+ }
+
+ arp_xmit(skb);
+}
+
+static int br_chk_addr_ip(struct net_device *dev, void *data)
+{
+ __be32 ip = *(__be32 *)data;
+ struct in_device *in_dev;
+ __be32 addr = 0;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev)
+ addr = inet_confirm_addr(dev_net(dev), in_dev, 0, ip,
+ RT_SCOPE_HOST);
+
+ if (addr == ip)
+ return 1;
+
+ return 0;
+}
+
+static bool br_is_local_ip(struct net_device *dev, __be32 ip)
+{
+ if (br_chk_addr_ip(dev, &ip))
+ return true;
+
+ /* check if ip is configured on upper dev */
+ if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip))
+ return true;
+
+ return false;
+}
+
+void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p)
+{
+ struct net_device *dev = br->dev;
+ struct net_device *vlandev = NULL;
+ struct neighbour *n;
+ struct arphdr *parp;
+ u8 *arpptr, *sha;
+ __be32 sip, tip;
+
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+
+ if ((dev->flags & IFF_NOARP) ||
+ !pskb_may_pull(skb, arp_hdr_len(dev)))
+ return;
+
+ parp = arp_hdr(skb);
+
+ if (parp->ar_pro != htons(ETH_P_IP) ||
+ parp->ar_hln != dev->addr_len ||
+ parp->ar_pln != 4)
+ return;
+
+ arpptr = (u8 *)parp + sizeof(struct arphdr);
+ sha = arpptr;
+ arpptr += dev->addr_len; /* sha */
+ memcpy(&sip, arpptr, sizeof(sip));
+ arpptr += sizeof(sip);
+ arpptr += dev->addr_len; /* tha */
+ memcpy(&tip, arpptr, sizeof(tip));
+
+ if (ipv4_is_loopback(tip) ||
+ ipv4_is_multicast(tip))
+ return;
+
+ if (br->neigh_suppress_enabled) {
+ if (p && (p->flags & BR_NEIGH_SUPPRESS))
+ return;
+ if (ipv4_is_zeronet(sip) || sip == tip) {
+ /* prevent flooding to neigh suppress ports */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ return;
+ }
+ }
+
+ if (parp->ar_op != htons(ARPOP_REQUEST))
+ return;
+
+ if (vid != 0) {
+ vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto,
+ vid);
+ if (!vlandev)
+ return;
+ } else {
+ vlandev = dev;
+ }
+
+ if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) {
+ /* its our local ip, so don't proxy reply
+ * and don't forward to neigh suppress ports
+ */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ return;
+ }
+
+ n = neigh_lookup(&arp_tbl, &tip, vlandev);
+ if (n) {
+ struct net_bridge_fdb_entry *f;
+
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_release(n);
+ return;
+ }
+
+ f = br_fdb_find_rcu(br, n->ha, vid);
+ if (f) {
+ bool replied = false;
+
+ if (f->dst && ((p->flags & BR_PROXYARP) ||
+ (f->dst->flags & BR_PROXYARP_WIFI) ||
+ (f->dst->flags & BR_NEIGH_SUPPRESS))) {
+ if (!vid)
+ br_arp_send(p, ARPOP_REPLY, ETH_P_ARP,
+ sip, skb->dev, tip, sha,
+ n->ha, sha, 0, 0);
+ else
+ br_arp_send(p, ARPOP_REPLY, ETH_P_ARP,
+ sip, skb->dev, tip, sha,
+ n->ha, sha, skb->vlan_proto,
+ skb_vlan_tag_get(skb));
+ replied = true;
+ }
+
+ /* If we have replied or as long as we know the
+ * mac, indicate to arp replied
+ */
+ if (replied || br->neigh_suppress_enabled)
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ }
+
+ neigh_release(n);
+ }
+}
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f6b6a92..8961c25 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -39,6 +39,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
const struct nf_br_ops *nf_ops;
const unsigned char *dest;
+ struct ethhdr *eth;
u16 vid = 0;
rcu_read_lock();
@@ -57,11 +58,18 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
BR_INPUT_SKB_CB(skb)->brdev = dev;
skb_reset_mac_header(skb);
+ eth = eth_hdr(skb);
skb_pull(skb, ETH_HLEN);
if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
goto out;
+ if (IS_ENABLED(CONFIG_INET) && br->neigh_suppress_enabled &&
+ (ntohs(eth->h_proto) == ETH_P_ARP ||
+ ntohs(eth->h_proto) == ETH_P_RARP)) {
+ br_do_proxy_suppress_arp(skb, br, vid, NULL);
+ }
+
dest = eth_hdr(skb)->h_dest;
if (is_broadcast_ether_addr(dest)) {
br_flood(br, skb, BR_PKT_BROADCAST, false, true);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7637f58..7637a23 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -71,62 +71,6 @@ static int br_pass_frame_up(struct sk_buff *skb)
br_netif_receive_skb);
}
-static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
- u16 vid, struct net_bridge_port *p)
-{
- struct net_device *dev = br->dev;
- struct neighbour *n;
- struct arphdr *parp;
- u8 *arpptr, *sha;
- __be32 sip, tip;
-
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
-
- if ((dev->flags & IFF_NOARP) ||
- !pskb_may_pull(skb, arp_hdr_len(dev)))
- return;
-
- parp = arp_hdr(skb);
-
- if (parp->ar_pro != htons(ETH_P_IP) ||
- parp->ar_op != htons(ARPOP_REQUEST) ||
- parp->ar_hln != dev->addr_len ||
- parp->ar_pln != 4)
- return;
-
- arpptr = (u8 *)parp + sizeof(struct arphdr);
- sha = arpptr;
- arpptr += dev->addr_len; /* sha */
- memcpy(&sip, arpptr, sizeof(sip));
- arpptr += sizeof(sip);
- arpptr += dev->addr_len; /* tha */
- memcpy(&tip, arpptr, sizeof(tip));
-
- if (ipv4_is_loopback(tip) ||
- ipv4_is_multicast(tip))
- return;
-
- n = neigh_lookup(&arp_tbl, &tip, dev);
- if (n) {
- struct net_bridge_fdb_entry *f;
-
- if (!(n->nud_state & NUD_VALID)) {
- neigh_release(n);
- return;
- }
-
- f = br_fdb_find_rcu(br, n->ha, vid);
- if (f && ((p->flags & BR_PROXYARP) ||
- (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
- arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
- sha, n->ha, sha);
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
- }
-
- neigh_release(n);
- }
-}
-
/* note: already called with rcu_read_lock */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
@@ -171,8 +115,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
- if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
- br_do_proxy_arp(skb, br, vid, p);
+ if (IS_ENABLED(CONFIG_INET) &&
+ (skb->protocol == htons(ETH_P_ARP) ||
+ skb->protocol == htons(ETH_P_RARP))) {
+ br_do_proxy_suppress_arp(skb, br, vid, p);
+ }
switch (pkt_type) {
case BR_PKT_MULTICAST:
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 7dacd83..3006f0d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1131,5 +1131,8 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
}
#endif /* CONFIG_NET_SWITCHDEV */
+/* br_arp_nd_proxy.c */
void br_recalculate_neigh_suppress_enabled(struct net_bridge *br);
+void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p);
#endif
--
2.1.4
^ permalink raw reply related
* [PATCH net-next 3/3] bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports
From: Roopa Prabhu @ 2017-10-02 4:36 UTC (permalink / raw)
To: davem; +Cc: netdev, nikolay, stephen, bridge
In-Reply-To: <1506919018-27875-1-git-send-email-roopa@cumulusnetworks.com>
From: Roopa Prabhu <roopa@cumulusnetworks.com>
This patch avoids flooding and proxies ndisc packets
for BR_NEIGH_SUPPRESS ports.
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
net/bridge/br_arp_nd_proxy.c | 246 +++++++++++++++++++++++++++++++++++++++++++
net/bridge/br_device.c | 10 ++
net/bridge/br_input.c | 10 ++
net/bridge/br_private.h | 3 +
4 files changed, 269 insertions(+)
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index 5153510..1028e5ab 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -211,3 +211,249 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
neigh_release(n);
}
}
+
+#if IS_ENABLED(CONFIG_IPV6)
+struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg)
+{
+ struct nd_msg *m;
+
+ m = skb_header_pointer(skb, skb_network_offset(skb) +
+ sizeof(struct ipv6hdr), sizeof(*msg), msg);
+ if (!m)
+ return NULL;
+
+ if (m->icmph.icmp6_code != 0 ||
+ (m->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION &&
+ m->icmph.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT))
+ return NULL;
+
+ return m;
+}
+
+static void br_nd_send(struct net_bridge_port *p, struct sk_buff *request,
+ struct neighbour *n, __be16 vlan_proto, u16 vlan_tci,
+ struct nd_msg *ns)
+{
+ struct net_device *dev = request->dev;
+ struct sk_buff *reply;
+ struct nd_msg *na;
+ struct ipv6hdr *pip6;
+ u8 *daddr;
+ int na_olen = 8; /* opt hdr + ETH_ALEN for target */
+ int ns_olen;
+ int i, len;
+
+ if (!dev)
+ return;
+
+ len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
+ sizeof(*na) + na_olen + dev->needed_tailroom;
+
+ reply = alloc_skb(len, GFP_ATOMIC);
+ if (!reply)
+ return;
+
+ reply->protocol = htons(ETH_P_IPV6);
+ reply->dev = dev;
+ skb_reserve(reply, LL_RESERVED_SPACE(dev));
+ skb_push(reply, sizeof(struct ethhdr));
+ skb_set_mac_header(reply, 0);
+
+ daddr = eth_hdr(request)->h_source;
+
+ /* Do we need option processing ? */
+ ns_olen = request->len - (skb_network_offset(request) +
+ sizeof(struct ipv6hdr)) - sizeof(*ns);
+ for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) {
+ if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
+ daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
+ break;
+ }
+ }
+
+ /* Ethernet header */
+ ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
+ ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
+ eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
+ reply->protocol = htons(ETH_P_IPV6);
+
+ skb_pull(reply, sizeof(struct ethhdr));
+ skb_set_network_header(reply, 0);
+ skb_put(reply, sizeof(struct ipv6hdr));
+
+ /* IPv6 header */
+ pip6 = ipv6_hdr(reply);
+ memset(pip6, 0, sizeof(struct ipv6hdr));
+ pip6->version = 6;
+ pip6->priority = ipv6_hdr(request)->priority;
+ pip6->nexthdr = IPPROTO_ICMPV6;
+ pip6->hop_limit = 255;
+ pip6->daddr = ipv6_hdr(request)->saddr;
+ pip6->saddr = *(struct in6_addr *)n->primary_key;
+
+ skb_pull(reply, sizeof(struct ipv6hdr));
+ skb_set_transport_header(reply, 0);
+
+ na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);
+
+ /* Neighbor Advertisement */
+ memset(na, 0, sizeof(*na) + na_olen);
+ na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
+ na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */
+ na->icmph.icmp6_override = 1;
+ na->icmph.icmp6_solicited = 1;
+ na->target = ns->target;
+ ether_addr_copy(&na->opt[2], n->ha);
+ na->opt[0] = ND_OPT_TARGET_LL_ADDR;
+ na->opt[1] = na_olen >> 3;
+
+ na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
+ &pip6->daddr,
+ sizeof(*na) + na_olen,
+ IPPROTO_ICMPV6,
+ csum_partial(na, sizeof(*na) + na_olen, 0));
+
+ pip6->payload_len = htons(sizeof(*na) + na_olen);
+
+ skb_push(reply, sizeof(struct ipv6hdr));
+ skb_push(reply, sizeof(struct ethhdr));
+
+ reply->ip_summed = CHECKSUM_UNNECESSARY;
+
+ if (p) {
+ struct net_bridge_vlan_group *vg;
+ u16 pvid;
+
+ vg = nbp_vlan_group_rcu(p);
+ pvid = br_get_pvid(vg);
+ if (pvid && pvid == vlan_tci)
+ vlan_tci = 0;
+ }
+
+ if (vlan_tci != 0) {
+ reply = vlan_insert_tag_set_proto(reply, vlan_proto, vlan_tci);
+ if (!reply) {
+ net_err_ratelimited("evpn: failed to insert VLAN tag\n");
+ return;
+ }
+ }
+
+ netdev_dbg(dev, "nd send dev %s dst %pI6 dst_hw %pM src %pI6 src_hw %pM\n",
+ dev->name, &pip6->daddr, daddr, &pip6->saddr, n->ha);
+
+ dev_queue_xmit(reply);
+}
+
+static int br_chk_addr_ip6(struct net_device *dev, void *data)
+{
+ struct in6_addr *addr = (struct in6_addr *)data;
+
+ if (ipv6_chk_addr(dev_net(dev), addr, dev, 0))
+ return 1;
+
+ return 0;
+}
+
+static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr)
+
+{
+ if (br_chk_addr_ip6(dev, addr))
+ return true;
+
+ /* check if ip is configured on upper dev */
+ if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr))
+ return true;
+
+ return false;
+}
+
+void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p, struct nd_msg *msg)
+{
+ struct net_device *dev = br->dev;
+ struct net_device *vlandev = NULL;
+ struct in6_addr *saddr, *daddr;
+ struct ipv6hdr *iphdr;
+ struct inet6_dev *in6_dev;
+ struct neighbour *n;
+
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+
+ if (p && (p->flags & BR_NEIGH_SUPPRESS))
+ return;
+
+ if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT &&
+ !msg->icmph.icmp6_solicited) {
+ /* prevent flooding to neigh suppress ports */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ return;
+ }
+
+ if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
+ return;
+
+ in6_dev = __in6_dev_get(dev);
+ if (!in6_dev)
+ return;
+
+ iphdr = ipv6_hdr(skb);
+ saddr = &iphdr->saddr;
+ daddr = &iphdr->daddr;
+
+ if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) {
+ /* prevent flooding to neigh suppress ports */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ return;
+ }
+
+ if (vid != 0) {
+ /* build neigh table lookup on the vlan device */
+ vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto,
+ vid);
+ if (!vlandev)
+ return;
+ } else {
+ vlandev = dev;
+ }
+
+ if (br_is_local_ip6(vlandev, &msg->target)) {
+ /* its our own ip, so don't proxy reply
+ * and don't forward to arp suppress ports
+ */
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ return;
+ }
+
+ n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, vlandev);
+ if (n) {
+ struct net_bridge_fdb_entry *f;
+
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_release(n);
+ return;
+ }
+
+ f = br_fdb_find_rcu(br, n->ha, vid);
+ if (f) {
+ bool replied = false;
+
+ if (f->dst && (f->dst->flags & BR_NEIGH_SUPPRESS)) {
+ if (vid != 0)
+ br_nd_send(p, skb, n, skb->vlan_proto,
+ skb_vlan_tag_get(skb), msg);
+ else
+ br_nd_send(p, skb, n, 0, 0, msg);
+ replied = true;
+ }
+
+ /* If we have replied or as long as we know the
+ * mac, indicate to NEIGH_SUPPRESS ports that we
+ * have replied
+ */
+ if (replied || br->neigh_suppress_enabled)
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ }
+ neigh_release(n);
+ }
+}
+#endif
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 8961c25..89c5d01 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -68,6 +68,16 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
(ntohs(eth->h_proto) == ETH_P_ARP ||
ntohs(eth->h_proto) == ETH_P_RARP)) {
br_do_proxy_suppress_arp(skb, br, vid, NULL);
+ } else if (IS_ENABLED(CONFIG_IPV6) && br->neigh_suppress_enabled &&
+ skb->protocol == htons(ETH_P_IPV6) &&
+ pskb_may_pull(skb, sizeof(struct ipv6hdr) +
+ sizeof(struct nd_msg)) &&
+ ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+ struct nd_msg *msg, _msg;
+
+ msg = br_is_nd_neigh_msg(skb, &_msg);
+ if (msg)
+ br_do_suppress_nd(skb, br, vid, NULL, msg);
}
dest = eth_hdr(skb)->h_dest;
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7637a23..491e4dd 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -119,6 +119,16 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
(skb->protocol == htons(ETH_P_ARP) ||
skb->protocol == htons(ETH_P_RARP))) {
br_do_proxy_suppress_arp(skb, br, vid, p);
+ } else if (IS_ENABLED(CONFIG_IPV6) && br->neigh_suppress_enabled &&
+ skb->protocol == htons(ETH_P_IPV6) &&
+ pskb_may_pull(skb, sizeof(struct ipv6hdr) +
+ sizeof(struct nd_msg)) &&
+ ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+ struct nd_msg *msg, _msg;
+
+ msg = br_is_nd_neigh_msg(skb, &_msg);
+ if (msg)
+ br_do_suppress_nd(skb, br, vid, p, msg);
}
switch (pkt_type) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 3006f0d..ff36891 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1135,4 +1135,7 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
void br_recalculate_neigh_suppress_enabled(struct net_bridge *br);
void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
u16 vid, struct net_bridge_port *p);
+void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
+struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m);
#endif
--
2.1.4
^ permalink raw reply related
* Re: [PATCH net 3/3] net: skb_queue_purge(): lock/unlock the queue only once
From: Michael Witten @ 2017-10-02 5:15 UTC (permalink / raw)
To: Stephen Hemminger
Cc: David S. Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
Eric Dumazet, netdev, linux-kernel
In-Reply-To: <20171001175909.4b9e53d8@xeon-e3>
On Sun, 1 Oct 2017 17:59:09 -0700, Stephen Hemminger wrote:
> On Sun, 01 Oct 2017 22:19:20 -0000 Michael Witten wrote:
>
>> + spin_lock_irqsave(&q->lock, flags);
>> + skb = q->next;
>> + __skb_queue_head_init(q);
>> + spin_unlock_irqrestore(&q->lock, flags);
>
> Other code manipulating lists uses splice operation and
> a sk_buff_head temporary on the stack. That would be easier
> to understand.
>
> struct sk_buf_head head;
>
> __skb_queue_head_init(&head);
> spin_lock_irqsave(&q->lock, flags);
> skb_queue_splice_init(q, &head);
> spin_unlock_irqrestore(&q->lock, flags);
>
>
>> + while (skb != head) {
>> + next = skb->next;
>> kfree_skb(skb);
>> + skb = next;
>> + }
>
> It would be cleaner if you could use
> skb_queue_walk_safe rather than open coding the loop.
>
> skb_queue_walk_safe(&head, skb, tmp)
> kfree_skb(skb);
I appreciate abstraction as much as anybody, but I do not believe
that such abstractions would actually be an improvement here.
* Splice-initing seems more like an idiom than an abstraction;
at first blush, it wouldn't be clear to me what the intention
is.
* Such abstractions are fairly unnecessary.
* The function as written is already so short as to be
easily digested.
* More to the point, this function is not some generic,
higher-level algorithm that just happens to employ the
socket buffer interface; rather, it is a function that
implements part of that very interface, and may thus
twiddle the intimate bits of these data structures
without being accused of abusing a leaky abstraction.
* Such abstractions add overhead, if only conceptually. In this
case, a temporary socket buffer queue allocates *3* unnecessary
struct members, including a whole `spinlock_t' member:
prev
qlen
lock
It's possible that the compiler will be smart enough to leave
those out, but I have my suspicions that it won't, not only
given that the interface contract requires that the temporary
socket buffer queue be properly initialized before use, but
also because splicing into the temporary will manipulate its
`qlen'. Yet, why worry whether optimization happens? The whole
issue can simply be avoided by exploiting the intimate details
that are already philosophically available to us.
Similarly, the function `skb_queue_walk_safe' is nice, but it
loses value both because a temporary queue loses value (as just
described), and because it ignores the fact that legitimate
access to the internals of these data structures allows for
setting up the requested loop in advance; that is to say, the
two parts of the function that we are now debating can be woven
together more tightly than `skb_queue_walk_safe' allows.
For these reasons, I stand by the way that the patch currently
implements this function; it does exactly what is desired, no more
or less.
Sincerely,
Michael Witten
^ permalink raw reply
* Re: [PATCH net 0/4] ip_gre: a bunch of fixes for erspan
From: David Miller @ 2017-10-02 5:32 UTC (permalink / raw)
To: lucien.xin; +Cc: netdev, xeb, u9012063
In-Reply-To: <cover.1506866059.git.lucien.xin@gmail.com>
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 1 Oct 2017 22:00:52 +0800
> This patchset is to fix some issues that could cause 0 or low
> performance, and even unexpected truncated packets on erspan.
Series applied, thanks.
^ permalink raw reply
* Re: [PATCH net] l2tp: fix l2tp_eth module loading
From: David Miller @ 2017-10-02 5:35 UTC (permalink / raw)
To: g.nault; +Cc: netdev, jchapman, tparkin
In-Reply-To: <a7d351fd5948fc69ef17da3c170cd7c152f899c4.1506606179.git.g.nault@alphalink.fr>
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Thu, 28 Sep 2017 15:44:38 +0200
> The l2tp_eth module crashes if its netlink callbacks are run when the
> pernet data aren't initialised.
>
> We should normally register_pernet_device() before the genl callbacks.
> However, the pernet data only maintain a list of l2tpeth interfaces,
> and this list is never used. So let's just drop pernet handling
> instead.
>
> Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support")
> Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Applied and queued up for -stable, thanks.
^ permalink raw reply
* Re: [PATCH 00/18] use ARRAY_SIZE macro
From: Greg KH @ 2017-10-02 5:35 UTC (permalink / raw)
To: Jérémy Lefaure
Cc: alsa-devel, nouveau, dri-devel, dm-devel, brcm80211-dev-list,
devel, linux-scsi, linux-rdma, amd-gfx, Jason Gunthorpe,
linux-acpi, linux-video, intel-wired-lan, linux-media, intel-gfx,
ecryptfs, brcm80211-dev-list.pdl, linux-raid, openipmi-developer,
intel-gvt-dev, devel, linux-nfs, netdev, linux-usb,
linux-wireless, linux-kernel, linux-integrity, Tobin C. Harding
In-Reply-To: <20171001205220.10b78086@blatinox-laptop.localdomain>
On Sun, Oct 01, 2017 at 08:52:20PM -0400, Jérémy Lefaure wrote:
> On Mon, 2 Oct 2017 09:01:31 +1100
> "Tobin C. Harding" <me@tobin.cc> wrote:
>
> > > In order to reduce the size of the To: and Cc: lines, each patch of the
> > > series is sent only to the maintainers and lists concerned by the patch.
> > > This cover letter is sent to every list concerned by this series.
> >
> > Why don't you just send individual patches for each subsystem? I'm not a maintainer but I don't see
> > how any one person is going to be able to apply this whole series, it is making it hard for
> > maintainers if they have to pick patches out from among the series (if indeed any will bother
> > doing that).
> Yeah, maybe it would have been better to send individual patches.
>
> From my point of view it's a series because the patches are related (I
> did a git format-patch from my local branch). But for the maintainers
> point of view, they are individual patches.
And the maintainers view is what matters here, if you wish to get your
patches reviewed and accepted...
thanks,
greg k-h
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply
* Re: [PATCH V4] r8152: add Linksys USB3GIGV1 id
From: David Miller @ 2017-10-02 5:39 UTC (permalink / raw)
To: grundler; +Cc: hayeswang, oneukum, linux-usb, linux-kernel, netdev
In-Reply-To: <20170928183500.61199-1-grundler@chromium.org>
From: Grant Grundler <grundler@chromium.org>
Date: Thu, 28 Sep 2017 11:35:00 -0700
> This linksys dongle by default comes up in cdc_ether mode.
> This patch allows r8152 to claim the device:
> Bus 002 Device 002: ID 13b1:0041 Linksys
>
> Signed-off-by: Grant Grundler <grundler@chromium.org>
Applied, thanks.
^ permalink raw reply
* Re: [PATCH net-next] samples/bpf: fix warnings in xdp_monitor_user
From: Jesper Dangaard Brouer @ 2017-10-02 5:43 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: brouer, ast, daniel, netdev, Stephen Hemminger
In-Reply-To: <20171001210734.27010-1-sthemmin@microsoft.com>
On Sun, 1 Oct 2017 14:07:34 -0700
Stephen Hemminger <stephen@networkplumber.org> wrote:
> Make local functions static to fix
>
> HOSTCC samples/bpf/xdp_monitor_user.o
> samples/bpf/xdp_monitor_user.c:64:7: warning: no previous prototype for ‘gettime’ [-Wmissing-prototypes]
> __u64 gettime(void)
> ^~~~~~~
> samples/bpf/xdp_monitor_user.c:209:6: warning: no previous prototype for ‘print_bpf_prog_info’ [-Wmissing-prototypes]
> void print_bpf_prog_info(void)
> ^~~~~~~~~~~~~~~~~~~
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> samples/bpf/xdp_monitor_user.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [PATCH net-next] net: core: decouple ifalias get/set from rtnl lock
From: David Miller @ 2017-10-02 5:46 UTC (permalink / raw)
To: fw; +Cc: netdev, edumazet
In-Reply-To: <20170929112150.7424-1-fw@strlen.de>
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Sep 2017 13:21:50 +0200
> @@ -1488,7 +1484,7 @@ static void netdev_release(struct device *d)
>
> BUG_ON(dev->reg_state != NETREG_RELEASED);
>
> - kfree(dev->ifalias);
> + kfree(rcu_access_pointer(dev->ifalias));
> netdev_freemem(dev);
> }
>
"kfree_rcu()" at least?
If the deal is that you don't need to do and RCU free because
netdevice objects disappear synchronously, and you can therefore prove
that no RCU based async access can occur to dev->ifalias, then you
need to add a comment here.
^ permalink raw reply
* Re: [PATCH v2 net] net: mvpp2: Fix clock resource by adding an optional bus clock
From: David Miller @ 2017-10-02 5:51 UTC (permalink / raw)
To: gregory.clement
Cc: linux-kernel, netdev, jason, andrew, sebastian.hesselbarth,
thomas.petazzoni, linux-arm-kernel, antoine.tenart, miquel.raynal,
nadavh, shadi, yehuday, omrii, hannah, igall, mw
In-Reply-To: <20170929122739.5296-1-gregory.clement@free-electrons.com>
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Fri, 29 Sep 2017 14:27:39 +0200
> On Armada 7K/8K we need to explicitly enable the bus clock. The bus clock
> is optional because not all the SoCs need them but at least for Armada
> 7K/8K it is actually mandatory.
>
> The binding documentation is updating accordingly.
>
> Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Applied.
^ permalink raw reply
* Re: [PATCH net] RDS: IB: Limit the scope of has_fr/has_fmr variables
From: David Miller @ 2017-10-02 5:54 UTC (permalink / raw)
To: avinash.repaka-QHcLZuEGTsvQT0dZR+AlfA
Cc: santosh.shilimkar-QHcLZuEGTsvQT0dZR+AlfA,
netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
rds-devel-N0ozoZBvEnrZJqsBc5GL+g,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1506734030-15205-1-git-send-email-avinash.repaka-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
From: Avinash Repaka <avinash.repaka-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
Date: Fri, 29 Sep 2017 18:13:50 -0700
> This patch fixes the scope of has_fr and has_fmr variables as they are
> needed only in rds_ib_add_one().
>
> Signed-off-by: Avinash Repaka <avinash.repaka-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
Applied.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH net] RDS: IB: Limit the scope of has_fr/has_fmr variables
From: David Miller @ 2017-10-02 5:56 UTC (permalink / raw)
To: avinash.repaka-QHcLZuEGTsvQT0dZR+AlfA
Cc: santosh.shilimkar-QHcLZuEGTsvQT0dZR+AlfA,
netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
rds-devel-N0ozoZBvEnrZJqsBc5GL+g,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20171001.225419.1632354964848604960.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
From: David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Date: Sun, 01 Oct 2017 22:54:19 -0700 (PDT)
> From: Avinash Repaka <avinash.repaka-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
> Date: Fri, 29 Sep 2017 18:13:50 -0700
>
>> This patch fixes the scope of has_fr and has_fmr variables as they are
>> needed only in rds_ib_add_one().
>>
>> Signed-off-by: Avinash Repaka <avinash.repaka-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
>
> Applied.
Actually, reverted, this breaks the build.
net/rds/rdma_transport.c:38:10: fatal error: ib.h: No such file or directory
#include "ib.h"
Although I can't see how in the world this patch is causing such
an error.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH] net: hns3: Fix an error handling path in 'hclge_rss_init_hw()'
From: David Miller @ 2017-10-02 5:57 UTC (permalink / raw)
To: christophe.jaillet
Cc: yisen.zhuang, salil.mehta, linyunsheng, lipeng321, colin.king,
arnd, netdev, linux-kernel, kernel-janitors
In-Reply-To: <20170930053434.4558-1-christophe.jaillet@wanadoo.fr>
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 30 Sep 2017 07:34:34 +0200
> If this sanity check fails, we must free 'rss_indir'. Otherwise there is a
> memory leak.
> 'goto err' as done in the other error handling paths to fix it.
>
> Fixes: 46a3df9f9718 ("net: hns3: Fix for setting rss_size incorrectly")
> Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Applied.
^ permalink raw reply
* Re: [PATCH 0/6] bcm63xx_enet: small fixes and cleanups
From: David Miller @ 2017-10-02 6:06 UTC (permalink / raw)
To: jonas.gorski
Cc: netdev, linux-arm-kernel, linux-kernel, f.fainelli,
bcm-kernel-feedback-list
In-Reply-To: <20171001110220.27668-1-jonas.gorski@gmail.com>
From: Jonas Gorski <jonas.gorski@gmail.com>
Date: Sun, 1 Oct 2017 13:02:14 +0200
> This patch set fixes a few theoretical issues and cleans up the code a
> bit. It also adds a bit more managed function usage to simplify clock
> and iomem usage.
>
> Based on net-next.
Series applied.
^ permalink raw reply
* Re: [PATCH][next] mlxsw: spectrum: fix uninitialized value in err
From: David Miller @ 2017-10-02 6:06 UTC (permalink / raw)
To: colin.king; +Cc: jiri, idosch, netdev, kernel-janitors, linux-kernel
In-Reply-To: <20171001162735.8091-1-colin.king@canonical.com>
From: Colin King <colin.king@canonical.com>
Date: Sun, 1 Oct 2017 17:27:35 +0100
> From: Colin Ian King <colin.king@canonical.com>
>
> In the unlikely event that mfc->mfc_un.res.ttls[i] is 255 for all
> values of i from 0 to MAXIVS-1, the err is not set at all and hence
> has a garbage value on the error return at the end of the function,
> so initialize it to 0. Also, the error return check on err and goto
> to err: inside the for loop makes it impossible for err to be zero
> at the end of the for loop, so we can remove the redundant err check
> at the end of the loop.
>
> Detected by CoverityScan CID#1457207 ("Unitialized scalar value")
>
> Fixes: c011ec1bbfd6 ("mlxsw: spectrum: Add the multicast routing offloading logic")
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next] samples/bpf: fix warnings in xdp_monitor_user
From: David Miller @ 2017-10-02 6:08 UTC (permalink / raw)
To: stephen; +Cc: ast, daniel, netdev, sthemmin
In-Reply-To: <20171001210734.27010-1-sthemmin@microsoft.com>
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sun, 1 Oct 2017 14:07:34 -0700
> Make local functions static to fix
>
> HOSTCC samples/bpf/xdp_monitor_user.o
> samples/bpf/xdp_monitor_user.c:64:7: warning: no previous prototype for ‘gettime’ [-Wmissing-prototypes]
> __u64 gettime(void)
> ^~~~~~~
> samples/bpf/xdp_monitor_user.c:209:6: warning: no previous prototype for ‘print_bpf_prog_info’ [-Wmissing-prototypes]
> void print_bpf_prog_info(void)
> ^~~~~~~~~~~~~~~~~~~
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Applied.
^ permalink raw reply
* Re: [RFC net-next 0/5] net: dsa: LAG support
From: Ido Schimmel @ 2017-10-02 6:50 UTC (permalink / raw)
To: Florian Fainelli
Cc: netdev, andrew, vivien.didelot, jiri, idosch, Woojung.Huh, john,
sean.wang
In-Reply-To: <20171001194639.8647-1-f.fainelli@gmail.com>
Hi Florian,
On Sun, Oct 01, 2017 at 12:46:34PM -0700, Florian Fainelli wrote:
> Hi all,
>
> This patch series is sent as RFC since I have only been able to test LAG
> with dsa-loop and not with real HW yet (that should be tomorrow). I also
> looked at how the Marvell DSDT API is defined for adding ports to "trunk"
> groups and the API proposed here should work there too. Can't speak about
> QCA, Mediatek or KSZ switches though.
Thanks for working on this. I've yet to look at the patches, but I
thought I'll mention a few issues we bumped into with LAG devices:
1) It is possible for users to stack devices on top of the LAG and only
then enslave your port. This means that the underlying driver might not
be aware of all the necessary configuration. It's quite a complicated
problem to solve properly, so we currently forbid enslavements to
devices that already have uppers.
There's also an issue with IP addresses and routes configured on top of
the LAG, but I hope to fix that soon. I don't think you support L3 in
DSA yet, so it shouldn't be a problem for you.
2) Similarly, you're no longer guaranteed to have the bridge do proper
clean up in case you pull a port out of a bridged LAG, so you'll need to
handle that. Any context you store for the bridge port needs to be
destroyed upon the removal of the last port from the LAG.
> Few open questions that may need solving now or later:
>
> - on Broadcom switches, we should allow enslaving a port as a LAG group
> member if its speed does not match that of the other members of the group
>
> - not sure what to do with a switch fabric, naively, if adding two ports
> of two distinct switches as a LAG group, we may have to propagate that
> to "dsa" cross-chip interfaces as well
At least in mlxsw case, enslaving switch and non-switch ports to the
same LAG doesn't make sense. Any traffic routed by the switch will only
be load-balanced between the switch ports. One way to solve that is to
forbid such enslavements during NETDEV_PRECHANGEUPPER in case the lower
devices in the adjacency list of the LAG don't belong to the same
switch.
Note that such configurations are bound to fail anyway, as the
non-switch ports will not have `switchdev_ops` configured and thus fail
during __switchdev_port_obj_add() / __switchdev_port_attr_set().
^ permalink raw reply
* Re: [RFC net-next 1/5] net: dsa: Add infrastructure to support LAG
From: Ido Schimmel @ 2017-10-02 7:05 UTC (permalink / raw)
To: Andrew Lunn
Cc: Florian Fainelli, netdev, vivien.didelot, jiri, idosch,
Woojung.Huh, john, sean.wang
In-Reply-To: <20171002020327.GA21593@lunn.ch>
On Mon, Oct 02, 2017 at 04:03:27AM +0200, Andrew Lunn wrote:
> On Sun, Oct 01, 2017 at 12:46:35PM -0700, Florian Fainelli wrote:
> > +static bool dsa_slave_lag_check(struct net_device *dev, struct net_device *lag_dev,
> > + struct netdev_lag_upper_info *lag_upper_info)
> > +{
> > + struct dsa_slave_priv *p = netdev_priv(dev);
> > + u8 lag_id;
> > +
> > + /* No more lag identifiers available or already in use */
> > + if (dsa_switch_lag_get_index(p->dp->ds, lag_dev, &lag_id) != 0)
> > + return false;
> > +
> > + if (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH)
> > + return false;
>
> I wounder if the driver needs to decide this? Can different hardware
> support different tx_types?
FWIW, the same check exists in mlxsw, but maybe other devices support
more methods, so I think it makes sense to have the driver decide this.
^ permalink raw reply
* RE: [RESEND PATCH 2/6] staging: fsl-dpaa2/ethsw: Add Freescale DPAA2 Ethernet Switch driver
From: Razvan Stefanescu @ 2017-10-02 7:18 UTC (permalink / raw)
To: Florian Fainelli, Bogdan Purcareata, gregkh@linuxfoundation.org
Cc: devel@driverdev.osuosl.org, linux-kernel@vger.kernel.org,
netdev@vger.kernel.org, agraf@suse.de, arnd@arndb.de,
Alexandru Marginean, Ruxandra Ioana Radulescu, Laurentiu Tudor,
stuyoder@gmail.com
In-Reply-To: <A4C4732A-CC1F-4707-89F8-E7284F022DB0@gmail.com>
> -----Original Message-----
> From: Florian Fainelli [mailto:f.fainelli@gmail.com]
> Sent: Friday, September 29, 2017 19:11
> To: Razvan Stefanescu <razvan.stefanescu@nxp.com>; Bogdan Purcareata
> <bogdan.purcareata@nxp.com>; gregkh@linuxfoundation.org
> Cc: devel@driverdev.osuosl.org; linux-kernel@vger.kernel.org;
> netdev@vger.kernel.org; agraf@suse.de; arnd@arndb.de; Alexandru Marginean
> <alexandru.marginean@nxp.com>; Ruxandra Ioana Radulescu
> <ruxandra.radulescu@nxp.com>; Laurentiu Tudor <laurentiu.tudor@nxp.com>;
> stuyoder@gmail.com
> Subject: RE: [RESEND PATCH 2/6] staging: fsl-dpaa2/ethsw: Add Freescale DPAA2
> Ethernet Switch driver
>
> On September 29, 2017 6:59:18 AM PDT, Razvan Stefanescu
> <razvan.stefanescu@nxp.com> wrote:
> >
> >
> >> -----Original Message-----
> >> From: Bogdan Purcareata
> >> Sent: Friday, September 29, 2017 16:36
> >> To: Razvan Stefanescu <razvan.stefanescu@nxp.com>;
> >> gregkh@linuxfoundation.org
> >> Cc: devel@driverdev.osuosl.org; linux-kernel@vger.kernel.org;
> >> netdev@vger.kernel.org; agraf@suse.de; arnd@arndb.de; Alexandru
> >Marginean
> >> <alexandru.marginean@nxp.com>; Ruxandra Ioana Radulescu
> >> <ruxandra.radulescu@nxp.com>; Laurentiu Tudor
> ><laurentiu.tudor@nxp.com>;
> >> stuyoder@gmail.com
> >> Subject: RE: [RESEND PATCH 2/6] staging: fsl-dpaa2/ethsw: Add
> >Freescale DPAA2
> >> Ethernet Switch driver
> >>
> >> > Introduce the DPAA2 Ethernet Switch driver, which manages Datapath
> >Switch
> >> > (DPSW) objects discovered on the MC bus.
> >> >
> >> > Suggested-by: Alexandru Marginean <alexandru.marginean@nxp.com>
> >> > Signed-off-by: Razvan Stefanescu <razvan.stefanescu@nxp.com>
>
> This looks pretty good for a new switchdev driver, is there a reason you can't
> target drivers/net/ethernet instead of staging? Is it because the MC bus code is
> still in staging (AFAICT)?
>
Yes, driver depends on MC bus, which is still in staging. Also, control traffic
code will require access to DPIO functions, that are also in staging.
Best regards,
Razvan S.
> --
> Florian
^ permalink raw reply
* Re: RFC iproute2 doc files
From: Leon Romanovsky @ 2017-10-02 7:31 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20170920081159.321e426b@xeon-e3>
[-- Attachment #1: Type: text/plain, Size: 488 bytes --]
On Wed, Sep 20, 2017 at 08:11:59AM -0700, Stephen Hemminger wrote:
> I noticed that the iproute man pages are up to date but the LaTex documentation
> is very out of date. Rarely updated since the Linux 2.2 days.
>
> Either someone needs to do a massive editing job on them, or they should just
> be dropped. My preference would be to just drop everything in the doc/ directory.
> The current versions are so old, they can't be helping.
If my vote counts, I will say to drop it.
Thanks
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* Re: [PATCH v2 net-next 0/2] net/sched: support tunnel options in cls_flower and act_tunnel_key
From: Simon Horman @ 2017-10-02 7:50 UTC (permalink / raw)
To: David Miller; +Cc: jiri, jhs, xiyou.wangcong, netdev, oss-drivers
In-Reply-To: <20170929.055423.108055524887949393.davem@davemloft.net>
On Fri, Sep 29, 2017 at 05:54:23AM +0100, David Miller wrote:
> From: Simon Horman <simon.horman@netronome.com>
> Date: Wed, 27 Sep 2017 10:16:32 +0200
>
> > Users of options:
> >
> > * There are eBPF hooks to allow getting on and setting tunnel metadata:
> > bpf_skb_set_tunnel_opt, bpf_skb_get_tunnel_opt.
> >
> > * Open vSwitch is able to match and set Geneve and VXLAN-GBP options.
> >
> > Neither of the above appear to assume any structure for the data.
>
> I really worry about this.
>
> These metadata option blobs are internal kernel datastructure which we
> could change at any point in time. They are not exported to
> userspace as a UAPI.
>
> It's kinda OK for eBPF programs to access this stuff since they are
> expected to cope with changes to internal data-structures.
>
> But for anything user facing, this really doesn't work.
Hi Dave, Hi Jiri,
the feedback I got from Jiri is that there needs to be some exposure
of TLVs. What I have in mind is to describe Geneve option TLVs in the
UAPI and for the kernel - most likely cls_flower, possibly using helpers,
to translate between that encoding and the one used internally by the kernel
- which currently happens to be the on-the-wire format.
I believe that in order to avoid per-packet overhead and at the same time
code complexity the TLVs should be described in-order. So matching on
TLV-A,TLV-B,TLV-C would be a different match to TLV-C,TLV-A,TLV-B. An
order-independent match could be added if desired in future.
This would mean the feature is initially restricted to Geneve but could
be expended to offer a similar feature for other encapsulation protocols
as the need arises.
Would this address your concerns?
^ permalink raw reply
* [PATCH net-next 01/12] qed: Add ll2 option to limit the number of bds per packet
From: Michal Kalderon @ 2017-10-02 8:23 UTC (permalink / raw)
To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
dledford-H+wXaHxf7aLQT0dZR+AlfA, Michal Kalderon, Ariel Elior
In-Reply-To: <1506932638-26268-1-git-send-email-Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
iWARP uses 3 ll2 connections, the maximum number of bds is known
during connection setup. This patch modifies the static array in
the ll2_tx_packet descriptor to be a flexible array and
significantlly reduces memory size.
In addition, some redundant fields in the ll2_tx_packet were
removed, which also contributed to decreasing the descriptor size.
Signed-off-by: Michal Kalderon <Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Ariel Elior <Ariel.Elior-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
---
drivers/net/ethernet/qlogic/qed/qed_ll2.c | 25 +++++++++++++++++++------
drivers/net/ethernet/qlogic/qed/qed_ll2.h | 7 ++-----
2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 250afa5..10e3a43 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1105,6 +1105,7 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn,
struct qed_ll2_info *p_ll2_info)
{
struct qed_ll2_tx_packet *p_descq;
+ u32 desc_size;
u32 capacity;
int rc = 0;
@@ -1122,8 +1123,12 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn,
goto out;
capacity = qed_chain_get_capacity(&p_ll2_info->tx_queue.txq_chain);
- p_descq = kcalloc(capacity, sizeof(struct qed_ll2_tx_packet),
- GFP_KERNEL);
+ /* First element is part of the packet, rest are flexibly added */
+ desc_size = (sizeof(*p_descq) +
+ (p_ll2_info->input.tx_max_bds_per_packet - 1) *
+ sizeof(p_descq->bds_set));
+
+ p_descq = kcalloc(capacity, desc_size, GFP_KERNEL);
if (!p_descq) {
rc = -ENOMEM;
goto out;
@@ -1359,11 +1364,13 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
{
struct qed_hwfn *p_hwfn = cxt;
struct qed_ll2_info *p_ll2_conn;
+ struct qed_ll2_tx_packet *p_pkt;
struct qed_ll2_rx_queue *p_rx;
struct qed_ll2_tx_queue *p_tx;
struct qed_ptt *p_ptt;
int rc = -EINVAL;
u32 i, capacity;
+ u32 desc_size;
u8 qid;
p_ptt = qed_ptt_acquire(p_hwfn);
@@ -1397,9 +1404,15 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
INIT_LIST_HEAD(&p_tx->sending_descq);
spin_lock_init(&p_tx->lock);
capacity = qed_chain_get_capacity(&p_tx->txq_chain);
- for (i = 0; i < capacity; i++)
- list_add_tail(&p_tx->descq_array[i].list_entry,
- &p_tx->free_descq);
+ /* First element is part of the packet, rest are flexibly added */
+ desc_size = (sizeof(*p_pkt) +
+ (p_ll2_conn->input.tx_max_bds_per_packet - 1) *
+ sizeof(p_pkt->bds_set));
+
+ for (i = 0; i < capacity; i++) {
+ p_pkt = (void *)((u8 *)p_tx->descq_array + desc_size * i);
+ list_add_tail(&p_pkt->list_entry, &p_tx->free_descq);
+ }
p_tx->cur_completing_bd_idx = 0;
p_tx->bds_idx = 0;
p_tx->b_completing_packet = false;
@@ -1698,7 +1711,7 @@ int qed_ll2_prepare_tx_packet(void *cxt,
p_tx = &p_ll2_conn->tx_queue;
p_tx_chain = &p_tx->txq_chain;
- if (pkt->num_of_bds > CORE_LL2_TX_MAX_BDS_PER_PACKET)
+ if (pkt->num_of_bds > p_ll2_conn->input.tx_max_bds_per_packet)
return -EIO;
spin_lock_irqsave(&p_tx->lock, flags);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h
index a822528..8019336 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h
@@ -63,17 +63,14 @@ struct qed_ll2_rx_packet {
struct qed_ll2_tx_packet {
struct list_head list_entry;
u16 bd_used;
- u16 vlan;
- u16 l4_hdr_offset_w;
- u8 bd_flags;
bool notify_fw;
void *cookie;
-
+ /* Flexible Array of bds_set determined by max_bds_per_packet */
struct {
struct core_tx_bd *txq_bd;
dma_addr_t tx_frag;
u16 frag_len;
- } bds_set[ETH_TX_MAX_BDS_PER_NON_LSO_PACKET];
+ } bds_set[1];
};
struct qed_ll2_rx_queue {
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH net-next 03/12] qed: Add ll2 option for dropping a tx packet
From: Michal Kalderon @ 2017-10-02 8:23 UTC (permalink / raw)
To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
dledford-H+wXaHxf7aLQT0dZR+AlfA, Michal Kalderon, Ariel Elior
In-Reply-To: <1506932638-26268-1-git-send-email-Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
The option of sending a packet on the ll2 and dropping it exists in
hardware and was not used until now, thus not exposed.
The iWARP unaligned MPA flow requires this functionality for
flushing the tx queue.
Signed-off-by: Michal Kalderon <Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Ariel Elior <Ariel.Elior-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
---
drivers/net/ethernet/qlogic/qed/qed_ll2.c | 16 ++++++++++++++--
include/linux/qed/qed_ll2_if.h | 1 +
2 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 1dd0cca..49fcfda 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1597,8 +1597,20 @@ static void qed_ll2_prepare_tx_packet_set(struct qed_hwfn *p_hwfn,
roce_flavor = (pkt->qed_roce_flavor == QED_LL2_ROCE) ? CORE_ROCE
: CORE_RROCE;
- tx_dest = (pkt->tx_dest == QED_LL2_TX_DEST_NW) ? CORE_TX_DEST_NW
- : CORE_TX_DEST_LB;
+ switch (pkt->tx_dest) {
+ case QED_LL2_TX_DEST_NW:
+ tx_dest = CORE_TX_DEST_NW;
+ break;
+ case QED_LL2_TX_DEST_LB:
+ tx_dest = CORE_TX_DEST_LB;
+ break;
+ case QED_LL2_TX_DEST_DROP:
+ tx_dest = CORE_TX_DEST_DROP;
+ break;
+ default:
+ tx_dest = CORE_TX_DEST_LB;
+ break;
+ }
start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain);
start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan);
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index 25153ff..aa7cb3b 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -64,6 +64,7 @@ enum qed_ll2_roce_flavor_type {
enum qed_ll2_tx_dest {
QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */
QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */
+ QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */
QED_LL2_TX_DEST_MAX
};
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH net-next 07/12] qed: Add ll2 connection for processing unaligned MPA packets
From: Michal Kalderon @ 2017-10-02 8:23 UTC (permalink / raw)
To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
dledford-H+wXaHxf7aLQT0dZR+AlfA, Michal Kalderon, Ariel Elior
In-Reply-To: <1506932638-26268-1-git-send-email-Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
This patch adds only the establishment and termination of the
ll2 connection that handles unaligned MPA packets.
Signed-off-by: Michal Kalderon <Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Ariel Elior <Ariel.Elior-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
---
drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 65 +++++++++++++++++++++++++++++
drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 1 +
2 files changed, 66 insertions(+)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 8fc9c811..f413621 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -1713,6 +1713,19 @@ int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams)
return 0;
}
+/* fpdu can be fragmented over maximum 3 bds: header, partial mpa, unaligned */
+#define QED_IWARP_MAX_BDS_PER_FPDU 3
+static void
+qed_iwarp_ll2_comp_mpa_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
+{
+ struct qed_iwarp_info *iwarp_info;
+ struct qed_hwfn *p_hwfn = cxt;
+
+ iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+ qed_iwarp_ll2_post_rx(p_hwfn, data->cookie,
+ iwarp_info->ll2_mpa_handle);
+}
+
static void
qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
{
@@ -1877,6 +1890,13 @@ static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
kfree(buffer);
}
+void
+qed_iwarp_ll2_slowpath(void *cxt,
+ u8 connection_handle,
+ u32 opaque_data_0, u32 opaque_data_1)
+{
+}
+
static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
{
struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
@@ -1902,6 +1922,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
}
+ if (iwarp_info->ll2_mpa_handle != QED_IWARP_HANDLE_INVAL) {
+ rc = qed_ll2_terminate_connection(p_hwfn,
+ iwarp_info->ll2_mpa_handle);
+ if (rc)
+ DP_INFO(p_hwfn, "Failed to terminate mpa connection\n");
+
+ qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_mpa_handle);
+ iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL;
+ }
+
qed_llh_remove_mac_filter(p_hwfn,
p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr);
return rc;
@@ -1953,12 +1983,14 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
struct qed_iwarp_info *iwarp_info;
struct qed_ll2_acquire_data data;
struct qed_ll2_cbs cbs;
+ u32 mpa_buff_size;
u16 n_ooo_bufs;
int rc = 0;
iwarp_info = &p_hwfn->p_rdma_info->iwarp;
iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
+ iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL;
iwarp_info->max_mtu = params->max_mtu;
@@ -2029,6 +2061,39 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
if (rc)
goto err;
+ /* Start Unaligned MPA connection */
+ cbs.rx_comp_cb = qed_iwarp_ll2_comp_mpa_pkt;
+ cbs.slowpath_cb = qed_iwarp_ll2_slowpath;
+
+ memset(&data, 0, sizeof(data));
+ data.input.conn_type = QED_LL2_TYPE_IWARP;
+ data.input.mtu = params->max_mtu;
+ /* FW requires that once a packet arrives OOO, it must have at
+ * least 2 rx buffers available on the unaligned connection
+ * for handling the case that it is a partial fpdu.
+ */
+ data.input.rx_num_desc = n_ooo_bufs * 2;
+ data.input.tx_num_desc = data.input.rx_num_desc;
+ data.input.tx_max_bds_per_packet = QED_IWARP_MAX_BDS_PER_FPDU;
+ data.p_connection_handle = &iwarp_info->ll2_mpa_handle;
+ data.input.secondary_queue = true;
+ data.cbs = &cbs;
+
+ rc = qed_ll2_acquire_connection(p_hwfn, &data);
+ if (rc)
+ goto err;
+
+ rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_mpa_handle);
+ if (rc)
+ goto err;
+
+ mpa_buff_size = QED_IWARP_MAX_BUF_SIZE(params->max_mtu);
+ rc = qed_iwarp_ll2_alloc_buffers(p_hwfn,
+ data.input.rx_num_desc,
+ mpa_buff_size,
+ iwarp_info->ll2_mpa_handle);
+ if (rc)
+ goto err;
return rc;
err:
qed_iwarp_ll2_stop(p_hwfn, p_ptt);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
index 9e2bfde..9d33a1f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
@@ -73,6 +73,7 @@ struct qed_iwarp_info {
u8 tcp_flags;
u8 ll2_syn_handle;
u8 ll2_ooo_handle;
+ u8 ll2_mpa_handle;
u8 peer2peer;
enum mpa_negotiation_mode mpa_rev;
enum mpa_rtr_type rtr_type;
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH net-next 09/12] qed: Add unaligned and packed packet processing
From: Michal Kalderon @ 2017-10-02 8:23 UTC (permalink / raw)
To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
dledford-H+wXaHxf7aLQT0dZR+AlfA, Michal Kalderon, Ariel Elior
In-Reply-To: <1506932638-26268-1-git-send-email-Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
The fpdu data structure is preallocated per connection.
Each connection stores the current status of the connection:
either nothing pending, or there is a partial fpdu that is waiting for
the rest of the fpdu (incomplete bytes != 0).
The same structure is also used for splitting a packet when there are
packed fpdus. The structure is initialized with all data required
for sending the fpdu back to the FW. A fpdu will always be spanned across
a maximum of 3 tx bds. One for the header, one for the partial fdpu
received and one for the remainder (unaligned) packet.
In case of packed fpdu's, two fragments are used, one for the header
and one for the data.
Corner cases are not handled in the patch for clarity, and will be added
as a separate patch.
Signed-off-by: Michal Kalderon <Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
Signed-off-by: Ariel Elior <Ariel.Elior-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
---
drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 257 ++++++++++++++++++++++++++++
drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 13 ++
2 files changed, 270 insertions(+)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index efd4861..83b147f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -1419,6 +1419,7 @@ void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn)
qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, 1);
kfree(iwarp_info->mpa_bufs);
+ kfree(iwarp_info->partial_fpdus);
}
int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams)
@@ -1716,8 +1717,170 @@ int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams)
return 0;
}
+static struct qed_iwarp_fpdu *qed_iwarp_get_curr_fpdu(struct qed_hwfn *p_hwfn,
+ u16 cid)
+{
+ struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+ struct qed_iwarp_fpdu *partial_fpdu;
+ u32 idx;
+
+ idx = cid - qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_IWARP);
+ if (idx >= iwarp_info->max_num_partial_fpdus) {
+ DP_ERR(p_hwfn, "Invalid cid %x max_num_partial_fpdus=%x\n", cid,
+ iwarp_info->max_num_partial_fpdus);
+ return NULL;
+ }
+
+ partial_fpdu = &iwarp_info->partial_fpdus[idx];
+
+ return partial_fpdu;
+}
+
+enum qed_iwarp_mpa_pkt_type {
+ QED_IWARP_MPA_PKT_PACKED,
+ QED_IWARP_MPA_PKT_PARTIAL,
+ QED_IWARP_MPA_PKT_UNALIGNED
+};
+
+#define QED_IWARP_MPA_FPDU_LENGTH_SIZE (2)
+#define QED_IWARP_MPA_CRC32_DIGEST_SIZE (4)
+
+/* Pad to multiple of 4 */
+#define QED_IWARP_PDU_DATA_LEN_WITH_PAD(data_len) ALIGN(data_len, 4)
+#define QED_IWARP_FPDU_LEN_WITH_PAD(_mpa_len) \
+ (QED_IWARP_PDU_DATA_LEN_WITH_PAD((_mpa_len) + \
+ QED_IWARP_MPA_FPDU_LENGTH_SIZE) + \
+ QED_IWARP_MPA_CRC32_DIGEST_SIZE)
+
/* fpdu can be fragmented over maximum 3 bds: header, partial mpa, unaligned */
#define QED_IWARP_MAX_BDS_PER_FPDU 3
+
+char *pkt_type_str[] = {
+ "QED_IWARP_MPA_PKT_PACKED",
+ "QED_IWARP_MPA_PKT_PARTIAL",
+ "QED_IWARP_MPA_PKT_UNALIGNED"
+};
+
+static enum qed_iwarp_mpa_pkt_type
+qed_iwarp_mpa_classify(struct qed_hwfn *p_hwfn,
+ struct qed_iwarp_fpdu *fpdu,
+ u16 tcp_payload_len, u8 *mpa_data)
+{
+ enum qed_iwarp_mpa_pkt_type pkt_type;
+ u16 mpa_len;
+
+ if (fpdu->incomplete_bytes) {
+ pkt_type = QED_IWARP_MPA_PKT_UNALIGNED;
+ goto out;
+ }
+
+ mpa_len = ntohs(*((u16 *)(mpa_data)));
+ fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len);
+
+ if (fpdu->fpdu_length <= tcp_payload_len)
+ pkt_type = QED_IWARP_MPA_PKT_PACKED;
+ else
+ pkt_type = QED_IWARP_MPA_PKT_PARTIAL;
+
+out:
+ DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+ "MPA_ALIGN: %s: fpdu_length=0x%x tcp_payload_len:0x%x\n",
+ pkt_type_str[pkt_type], fpdu->fpdu_length, tcp_payload_len);
+
+ return pkt_type;
+}
+
+static void
+qed_iwarp_init_fpdu(struct qed_iwarp_ll2_buff *buf,
+ struct qed_iwarp_fpdu *fpdu,
+ struct unaligned_opaque_data *pkt_data,
+ u16 tcp_payload_size, u8 placement_offset)
+{
+ fpdu->mpa_buf = buf;
+ fpdu->pkt_hdr = buf->data_phys_addr + placement_offset;
+ fpdu->pkt_hdr_size = pkt_data->tcp_payload_offset;
+ fpdu->mpa_frag = buf->data_phys_addr + pkt_data->first_mpa_offset;
+ fpdu->mpa_frag_virt = (u8 *)(buf->data) + pkt_data->first_mpa_offset;
+
+ if (tcp_payload_size < fpdu->fpdu_length)
+ fpdu->incomplete_bytes = fpdu->fpdu_length - tcp_payload_size;
+ else
+ fpdu->incomplete_bytes = 0; /* complete fpdu */
+
+ fpdu->mpa_frag_len = fpdu->fpdu_length - fpdu->incomplete_bytes;
+}
+
+static int
+qed_iwarp_send_fpdu(struct qed_hwfn *p_hwfn,
+ struct qed_iwarp_fpdu *fpdu,
+ struct unaligned_opaque_data *curr_pkt,
+ struct qed_iwarp_ll2_buff *buf,
+ u16 tcp_payload_size, enum qed_iwarp_mpa_pkt_type pkt_type)
+{
+ struct qed_ll2_tx_pkt_info tx_pkt;
+ u8 ll2_handle;
+ int rc;
+
+ memset(&tx_pkt, 0, sizeof(tx_pkt));
+
+ /* An unaligned packet means it's split over two tcp segments. So the
+ * complete packet requires 3 bds, one for the header, one for the
+ * part of the fpdu of the first tcp segment, and the last fragment
+ * will point to the remainder of the fpdu. A packed pdu, requires only
+ * two bds, one for the header and one for the data.
+ */
+ tx_pkt.num_of_bds = (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED) ? 3 : 2;
+ tx_pkt.tx_dest = QED_LL2_TX_DEST_LB;
+ tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; /* offset in words */
+
+ /* Send the mpa_buf only with the last fpdu (in case of packed) */
+ if (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED ||
+ tcp_payload_size <= fpdu->fpdu_length)
+ tx_pkt.cookie = fpdu->mpa_buf;
+
+ tx_pkt.first_frag = fpdu->pkt_hdr;
+ tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+ tx_pkt.enable_ip_cksum = true;
+ tx_pkt.enable_l4_cksum = true;
+ tx_pkt.calc_ip_len = true;
+ /* vlan overload with enum iwarp_ll2_tx_queues */
+ tx_pkt.vlan = IWARP_LL2_ALIGNED_TX_QUEUE;
+
+ ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+ /* Set first fragment to header */
+ rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+ if (rc)
+ goto out;
+
+ /* Set second fragment to first part of packet */
+ rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn, ll2_handle,
+ fpdu->mpa_frag,
+ fpdu->mpa_frag_len);
+ if (rc)
+ goto out;
+
+ if (!fpdu->incomplete_bytes)
+ goto out;
+
+ /* Set third fragment to second part of the packet */
+ rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn,
+ ll2_handle,
+ buf->data_phys_addr +
+ curr_pkt->first_mpa_offset,
+ fpdu->incomplete_bytes);
+out:
+ DP_VERBOSE(p_hwfn,
+ QED_MSG_RDMA,
+ "MPA_ALIGN: Sent FPDU num_bds=%d first_frag_len=%x, mpa_frag_len=0x%x, incomplete_bytes:0x%x rc=%d\n",
+ tx_pkt.num_of_bds,
+ tx_pkt.first_frag_len,
+ fpdu->mpa_frag_len,
+ fpdu->incomplete_bytes, rc);
+
+ return rc;
+}
+
static void
qed_iwarp_mpa_get_data(struct qed_hwfn *p_hwfn,
struct unaligned_opaque_data *curr_pkt,
@@ -1741,9 +1904,79 @@ int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams)
qed_iwarp_process_mpa_pkt(struct qed_hwfn *p_hwfn,
struct qed_iwarp_ll2_mpa_buf *mpa_buf)
{
+ struct unaligned_opaque_data *curr_pkt = &mpa_buf->data;
struct qed_iwarp_ll2_buff *buf = mpa_buf->ll2_buf;
+ enum qed_iwarp_mpa_pkt_type pkt_type;
+ struct qed_iwarp_fpdu *fpdu;
int rc = -EINVAL;
+ u8 *mpa_data;
+
+ fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, curr_pkt->cid & 0xffff);
+ if (!fpdu) { /* something corrupt with cid, post rx back */
+ DP_ERR(p_hwfn, "Invalid cid, drop and post back to rx cid=%x\n",
+ curr_pkt->cid);
+ goto err;
+ }
+ do {
+ mpa_data = ((u8 *)(buf->data) + curr_pkt->first_mpa_offset);
+
+ pkt_type = qed_iwarp_mpa_classify(p_hwfn, fpdu,
+ mpa_buf->tcp_payload_len,
+ mpa_data);
+
+ switch (pkt_type) {
+ case QED_IWARP_MPA_PKT_PARTIAL:
+ qed_iwarp_init_fpdu(buf, fpdu,
+ curr_pkt,
+ mpa_buf->tcp_payload_len,
+ mpa_buf->placement_offset);
+
+ mpa_buf->tcp_payload_len = 0;
+ break;
+ case QED_IWARP_MPA_PKT_PACKED:
+ qed_iwarp_init_fpdu(buf, fpdu,
+ curr_pkt,
+ mpa_buf->tcp_payload_len,
+ mpa_buf->placement_offset);
+
+ rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
+ mpa_buf->tcp_payload_len,
+ pkt_type);
+ if (rc) {
+ DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+ "Can't send FPDU:reset rc=%d\n", rc);
+ memset(fpdu, 0, sizeof(*fpdu));
+ break;
+ }
+
+ mpa_buf->tcp_payload_len -= fpdu->fpdu_length;
+ curr_pkt->first_mpa_offset += fpdu->fpdu_length;
+ break;
+ case QED_IWARP_MPA_PKT_UNALIGNED:
+ rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
+ mpa_buf->tcp_payload_len,
+ pkt_type);
+ if (rc) {
+ DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+ "Can't send FPDU:delay rc=%d\n", rc);
+ /* don't reset fpdu -> we need it for next
+ * classify
+ */
+ break;
+ }
+
+ mpa_buf->tcp_payload_len -= fpdu->incomplete_bytes;
+ curr_pkt->first_mpa_offset += fpdu->incomplete_bytes;
+ /* The framed PDU was sent - no more incomplete bytes */
+ fpdu->incomplete_bytes = 0;
+ break;
+ }
+ } while (mpa_buf->tcp_payload_len && !rc);
+
+ return rc;
+
+err:
qed_iwarp_ll2_post_rx(p_hwfn,
buf,
p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle);
@@ -1989,11 +2222,27 @@ static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
kfree(buffer);
}
+/* The only slowpath for iwarp ll2 is unalign flush. When this completion
+ * is received, need to reset the FPDU.
+ */
void
qed_iwarp_ll2_slowpath(void *cxt,
u8 connection_handle,
u32 opaque_data_0, u32 opaque_data_1)
{
+ struct unaligned_opaque_data unalign_data;
+ struct qed_hwfn *p_hwfn = cxt;
+ struct qed_iwarp_fpdu *fpdu;
+
+ qed_iwarp_mpa_get_data(p_hwfn, &unalign_data,
+ opaque_data_0, opaque_data_1);
+
+ DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "(0x%x) Flush fpdu\n",
+ unalign_data.cid);
+
+ fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, (u16)unalign_data.cid);
+ if (fpdu)
+ memset(fpdu, 0, sizeof(*fpdu));
}
static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
@@ -2194,6 +2443,14 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
iwarp_info->ll2_mpa_handle);
if (rc)
goto err;
+
+ iwarp_info->partial_fpdus = kcalloc((u16)p_hwfn->p_rdma_info->num_qps,
+ sizeof(*iwarp_info->partial_fpdus),
+ GFP_KERNEL);
+ if (!iwarp_info->partial_fpdus)
+ goto err;
+
+ iwarp_info->max_num_partial_fpdus = (u16)p_hwfn->p_rdma_info->num_qps;
/* The mpa_bufs array serves for pending RX packets received on the
* mpa ll2 that don't have place on the tx ring and require later
* processing. We can't fail on allocation of such a struct therefore
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
index 2c53fe4..858755c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
@@ -68,6 +68,17 @@ struct qed_iwarp_ll2_mpa_buf {
u8 placement_offset;
};
+struct qed_iwarp_fpdu {
+ struct qed_iwarp_ll2_buff *mpa_buf;
+ void *mpa_frag_virt;
+ dma_addr_t mpa_frag;
+ dma_addr_t pkt_hdr;
+ u16 mpa_frag_len;
+ u16 fpdu_length;
+ u16 incomplete_bytes;
+ u8 pkt_hdr_size;
+};
+
struct qed_iwarp_info {
struct list_head listen_list; /* qed_iwarp_listener */
struct list_head ep_list; /* qed_iwarp_ep */
@@ -87,7 +98,9 @@ struct qed_iwarp_info {
u8 peer2peer;
enum mpa_negotiation_mode mpa_rev;
enum mpa_rtr_type rtr_type;
+ struct qed_iwarp_fpdu *partial_fpdus;
struct qed_iwarp_ll2_mpa_buf *mpa_bufs;
+ u16 max_num_partial_fpdus;
};
enum qed_iwarp_ep_state {
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox