* [RFC PATCH v2 net-next 08/12] net: ipv4: listify ip_rcv_finish
From: Edward Cree @ 2018-06-26 18:20 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
In-Reply-To: <fa3d7e58-e7b6-ad0c-619f-824c25ed0d97@solarflare.com>
ip_rcv_finish_core(), if it does not drop, sets skb->dst by either early
demux or route lookup. The last step, calling dst_input(skb), is left to
the caller; in the listified case, we split to form sublists with a common
dst, but then ip_sublist_rcv_finish() just calls dst_input(skb) in a loop.
The next step in listification would thus be to add a list_input() method
to struct dst_entry.
Early demux is an indirect call based on iph->protocol; this is another
opportunity for listification which is not taken here (it would require
slicing up ip_rcv_finish_core() to allow splitting on protocol changes).
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
net/ipv4/ip_input.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 53 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7a8af8ff3f07..63d4dfdb1766 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -307,7 +307,8 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
return true;
}
-static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
@@ -393,7 +394,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
goto drop;
}
- return dst_input(skb);
+ return NET_RX_SUCCESS;
drop:
kfree_skb(skb);
@@ -405,6 +406,15 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
goto drop;
}
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret = ip_rcv_finish_core(net, sk, skb);
+
+ if (ret != NET_RX_DROP)
+ ret = dst_input(skb);
+ return ret;
+}
+
/*
* Main IP Receive routine.
*/
@@ -515,16 +525,54 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
ip_rcv_finish);
}
+static void ip_sublist_rcv_finish(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(list)) != NULL)
+ dst_input(skb);
+}
+
+static void ip_list_rcv_finish(struct net *net, struct sock *sk,
+ struct sk_buff_head *list)
+{
+ struct dst_entry *curr_dst = NULL;
+ struct sk_buff_head sublist;
+ struct sk_buff *skb;
+
+ __skb_queue_head_init(&sublist);
+
+ while ((skb = __skb_dequeue(list)) != NULL) {
+ struct dst_entry *dst;
+
+ if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
+ continue;
+
+ dst = skb_dst(skb);
+ if (skb_queue_empty(&sublist)) {
+ curr_dst = dst;
+ } else if (curr_dst != dst) {
+ /* dispatch old sublist */
+ ip_sublist_rcv_finish(&sublist);
+ /* start new sublist */
+ __skb_queue_head_init(&sublist);
+ curr_dst = dst;
+ }
+ /* add to current sublist */
+ __skb_queue_tail(&sublist, skb);
+ }
+ /* dispatch final sublist */
+ ip_sublist_rcv_finish(&sublist);
+}
+
static void ip_sublist_rcv(struct sk_buff_head *list, struct net_device *dev,
struct net *net)
{
struct sk_buff_head sublist;
- struct sk_buff *skb;
NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
list, &sublist, dev, NULL, ip_rcv_finish);
- while ((skb = __skb_dequeue(&sublist)) != NULL)
- ip_rcv_finish(net, NULL, skb);
+ ip_list_rcv_finish(net, NULL, &sublist);
}
/* Receive a list of IP packets */
^ permalink raw reply related
* [RFC PATCH v2 net-next 07/12] net: ipv4: listified version of ip_rcv
From: Edward Cree @ 2018-06-26 18:20 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
In-Reply-To: <fa3d7e58-e7b6-ad0c-619f-824c25ed0d97@solarflare.com>
Also involved adding a way to run a netfilter hook over a list of packets.
Rather than attempting to make netfilter know about lists (which would be
a major project in itself) we just let it call the regular okfn (in this
case ip_rcv_finish()) for any packets it steals, and have it give us back
a list of packets it's synchronously accepted (which normally NF_HOOK
would automatically call okfn() on, but we want to be able to potentially
pass the list to a listified version of okfn().)
The netfilter hooks themselves are indirect calls that still happen per-
packet (see nf_hook_entry_hookfn()), but again, changing that can be left
for future work.
There is potential for out-of-order receives if the netfilter hook ends up
synchronously stealing packets, as they will be processed before any
accepts earlier in the list. However, it was already possible for an
asynchronous accept to cause out-of-order receives, so presumably this is
considered OK.
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
include/linux/netdevice.h | 3 ++
include/linux/netfilter.h | 27 +++++++++++++++++
include/net/ip.h | 2 ++
net/core/dev.c | 11 +++++--
net/ipv4/af_inet.c | 1 +
net/ipv4/ip_input.c | 75 ++++++++++++++++++++++++++++++++++++++++++-----
6 files changed, 110 insertions(+), 9 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 105087369779..5296354fa621 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2290,6 +2290,9 @@ struct packet_type {
struct net_device *,
struct packet_type *,
struct net_device *);
+ void (*list_func) (struct sk_buff_head *,
+ struct packet_type *,
+ struct net_device *);
bool (*id_match)(struct packet_type *ptype,
struct sock *sk);
void *af_packet_priv;
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index dd2052f0efb7..42395a8a6e70 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -288,6 +288,22 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct
return ret;
}
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+ struct sk_buff_head *list, struct sk_buff_head *sublist,
+ struct net_device *in, struct net_device *out,
+ int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+ struct sk_buff *skb;
+
+ __skb_queue_head_init(sublist); /* list of synchronously ACCEPTed skbs */
+ while ((skb = __skb_dequeue(list)) != NULL) {
+ int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
+ if (ret == 1)
+ __skb_queue_tail(sublist, skb);
+ }
+}
+
/* Call setsockopt() */
int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
unsigned int len);
@@ -369,6 +385,17 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
return okfn(net, sk, skb);
}
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+ struct sk_buff_head *list, struct sk_buff_head *sublist,
+ struct net_device *in, struct net_device *out,
+ int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+ __skb_queue_head_init(sublist);
+ /* Move everything to the sublist */
+ skb_queue_splice_init(list, sublist);
+}
+
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
struct sock *sk, struct sk_buff *skb,
struct net_device *indev, struct net_device *outdev,
diff --git a/include/net/ip.h b/include/net/ip.h
index 0d2281b4b27a..fb3dfed537c0 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -138,6 +138,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
struct ip_options_rcu *opt);
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev);
+void ip_list_rcv(struct sk_buff_head *list, struct packet_type *pt,
+ struct net_device *orig_dev);
int ip_local_deliver(struct sk_buff *skb);
int ip_mr_input(struct sk_buff *skb);
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 2f46ed07c8d8..f0eb00e9fb57 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4684,8 +4684,15 @@ static inline void __netif_receive_skb_list_ptype(struct sk_buff_head *list,
{
struct sk_buff *skb;
- while ((skb = __skb_dequeue(list)) != NULL)
- pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ if (!pt_prev)
+ return;
+ if (skb_queue_empty(list))
+ return;
+ if (pt_prev->list_func != NULL)
+ pt_prev->list_func(list, pt_prev, orig_dev);
+ else
+ while ((skb = __skb_dequeue(list)) != NULL)
+ pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15e125558c76..e54381fe4b00 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1882,6 +1882,7 @@ fs_initcall(ipv4_offload_init);
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
+ .list_func = ip_list_rcv,
};
static int __init inet_init(void)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7582713dd18f..7a8af8ff3f07 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -408,10 +408,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
/*
* Main IP Receive routine.
*/
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
- struct net *net;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
@@ -421,7 +420,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto drop;
- net = dev_net(dev);
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
@@ -489,9 +487,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
- net, NULL, skb, dev, NULL,
- ip_rcv_finish);
+ return skb;
csum_error:
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
@@ -500,5 +496,70 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
drop:
kfree_skb(skb);
out:
- return NET_RX_DROP;
+ return NULL;
+}
+
+/*
+ * IP receive entry point
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net *net = dev_net(dev);
+
+ skb = ip_rcv_core(skb, net);
+ if (skb == NULL)
+ return NET_RX_DROP;
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
+ ip_rcv_finish);
+}
+
+static void ip_sublist_rcv(struct sk_buff_head *list, struct net_device *dev,
+ struct net *net)
+{
+ struct sk_buff_head sublist;
+ struct sk_buff *skb;
+
+ NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+ list, &sublist, dev, NULL, ip_rcv_finish);
+ while ((skb = __skb_dequeue(&sublist)) != NULL)
+ ip_rcv_finish(net, NULL, skb);
+}
+
+/* Receive a list of IP packets */
+void ip_list_rcv(struct sk_buff_head *list, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net_device *curr_dev = NULL;
+ struct net *curr_net = NULL;
+ struct sk_buff_head sublist;
+ struct sk_buff *skb;
+
+ __skb_queue_head_init(&sublist);
+
+ while ((skb = __skb_dequeue(list)) != NULL) {
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+
+ skb = ip_rcv_core(skb, net);
+ if (skb == NULL)
+ continue;
+
+ if (skb_queue_empty(&sublist)) {
+ curr_dev = dev;
+ curr_net = net;
+ } else if (curr_dev != dev || curr_net != net) {
+ /* dispatch old sublist via its own dev/net */
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
+ /* start new sublist for this skb's dev/net */
+ __skb_queue_head_init(&sublist);
+ curr_dev = dev;
+ curr_net = net;
+ }
+ /* add to current sublist (all skbs share curr_dev/curr_net) */
+ __skb_queue_tail(&sublist, skb);
+ }
+ /* dispatch final sublist (may be empty if every skb was dropped) */
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
 }
^ permalink raw reply related
* [RFC PATCH v2 net-next 06/12] net: core: propagate SKB lists through packet_type lookup
From: Edward Cree @ 2018-06-26 18:19 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
In-Reply-To: <fa3d7e58-e7b6-ad0c-619f-824c25ed0d97@solarflare.com>
__netif_receive_skb_taps() does a depressingly large amount of per-packet
work that can't easily be listified, because the another_round looping
makes it nontrivial to slice up into smaller functions.
Fortunately, most of that work disappears in the fast path:
* Hardware devices generally don't have an rx_handler
* Unless you're tcpdumping or something, there is usually only one ptype
* VLAN processing comes before the protocol ptype lookup, so doesn't force
a pt_prev deliver
so normally, __netif_receive_skb_taps() will run straight through and return
the one ptype found in ptype_base[hash of skb->protocol].
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
include/trace/events/net.h | 7 +++
net/core/dev.c | 138 ++++++++++++++++++++++++++++++++-------------
2 files changed, 105 insertions(+), 40 deletions(-)
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 00aa72ce0e7c..3c9b262896c1 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -131,6 +131,13 @@ DEFINE_EVENT(net_dev_template, netif_receive_skb,
TP_ARGS(skb)
);
+DEFINE_EVENT(net_dev_template, netif_receive_skb_list,
+
+ TP_PROTO(struct sk_buff *skb),
+
+ TP_ARGS(skb)
+);
+
DEFINE_EVENT(net_dev_template, netif_rx,
TP_PROTO(struct sk_buff *skb),
diff --git a/net/core/dev.c b/net/core/dev.c
index 92d78b3de656..2f46ed07c8d8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4494,12 +4494,13 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
return 0;
}
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+static int __netif_receive_skb_taps(struct sk_buff *skb, bool pfmemalloc,
+ struct packet_type **pt_prev)
{
- struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
bool deliver_exact = false;
+ struct packet_type *ptype;
int ret = NET_RX_DROP;
__be16 type;
@@ -4514,7 +4515,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
- pt_prev = NULL;
+ *pt_prev = NULL;
another_round:
skb->skb_iif = skb->dev->ifindex;
@@ -4535,25 +4536,25 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
goto skip_taps;
list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
+ if (*pt_prev)
+ ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
+ if (*pt_prev)
+ ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = ptype;
}
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
- skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+ skb = sch_handle_ingress(skb, pt_prev, &ret, orig_dev);
if (!skb)
goto out;
- if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+ if (nf_ingress(skb, pt_prev, &ret, orig_dev) < 0)
goto out;
}
#endif
@@ -4563,9 +4564,9 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
goto drop;
if (skb_vlan_tag_present(skb)) {
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL;
+ if (*pt_prev) {
+ ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
}
if (vlan_do_receive(&skb))
goto another_round;
@@ -4575,9 +4576,9 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL;
+ if (*pt_prev) {
+ ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
@@ -4608,38 +4609,45 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
/* deliver only exact match when indicated */
if (likely(!deliver_exact)) {
- deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
}
- deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
&orig_dev->ptype_specific);
if (unlikely(skb->dev != orig_dev)) {
- deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
&skb->dev->ptype_specific);
}
-
- if (pt_prev) {
- if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
- goto drop;
- else
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
- } else {
+ if (*pt_prev && unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
+ goto drop;
+ return ret;
drop:
- if (!deliver_exact)
- atomic_long_inc(&skb->dev->rx_dropped);
- else
- atomic_long_inc(&skb->dev->rx_nohandler);
- kfree_skb(skb);
- /* Jamal, now you will not able to escape explaining
- * me how you were going to use this. :-)
- */
- ret = NET_RX_DROP;
- }
-
+ if (!deliver_exact)
+ atomic_long_inc(&skb->dev->rx_dropped);
+ else
+ atomic_long_inc(&skb->dev->rx_nohandler);
+ kfree_skb(skb);
+ /* Jamal, now you will not able to escape explaining
+ * me how you were going to use this. :-)
+ */
+ ret = NET_RX_DROP;
out:
+ *pt_prev = NULL;
+ return ret;
+}
+
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+{
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev;
+ int ret;
+
+ ret = __netif_receive_skb_taps(skb, pfmemalloc, &pt_prev);
+ if (pt_prev)
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
return ret;
}
@@ -4670,12 +4678,62 @@ int netif_receive_skb_core(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_receive_skb_core);
-static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
+static inline void __netif_receive_skb_list_ptype(struct sk_buff_head *list,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
{
struct sk_buff *skb;
while ((skb = __skb_dequeue(list)) != NULL)
- __netif_receive_skb_core(skb, pfmemalloc);
+ pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
+{
+ /* Fast-path assumptions:
+ * - There is no RX handler.
+ * - Only one packet_type matches.
+ * If either of these fails, we will end up doing some per-packet
+ * processing in-line, then handling the 'last ptype' for the whole
+ * sublist. This can't cause out-of-order delivery to any single ptype,
+ * because the 'last ptype' must be constant across the sublist, and all
+ * other ptypes are handled per-packet.
+ */
+ /* Current (common) ptype of sublist */
+ struct packet_type *pt_curr = NULL;
+ /* Current (common) orig_dev of sublist */
+ struct net_device *od_curr = NULL;
+ struct sk_buff_head sublist;
+ struct sk_buff *skb;
+
+ __skb_queue_head_init(&sublist);
+
+ while ((skb = __skb_dequeue(list)) != NULL) {
+ struct packet_type *pt_prev;
+ struct net_device *orig_dev = skb->dev;
+
+ __netif_receive_skb_taps(skb, pfmemalloc, &pt_prev);
+ if (pt_prev) {
+ if (skb_queue_empty(&sublist)) {
+ pt_curr = pt_prev;
+ od_curr = orig_dev;
+ } else if (!(pt_curr == pt_prev &&
+ od_curr == orig_dev)) {
+ /* dispatch old sublist */
+ __netif_receive_skb_list_ptype(&sublist,
+ pt_curr,
+ od_curr);
+ /* start new sublist */
+ __skb_queue_head_init(&sublist);
+ pt_curr = pt_prev;
+ od_curr = orig_dev;
+ }
+ __skb_queue_tail(&sublist, skb);
+ }
+ }
+
+ /* dispatch final sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
}
static int __netif_receive_skb(struct sk_buff *skb)
^ permalink raw reply related
* [RFC PATCH v2 net-next 05/12] net: core: another layer of lists, around PF_MEMALLOC skb handling
From: Edward Cree @ 2018-06-26 18:19 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
In-Reply-To: <fa3d7e58-e7b6-ad0c-619f-824c25ed0d97@solarflare.com>
First example of a layer splitting the list (rather than merely taking
individual packets off it).
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
net/core/dev.c | 46 ++++++++++++++++++++++++++++++++++++++--------
1 file changed, 38 insertions(+), 8 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 27980c13ad5c..92d78b3de656 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4670,6 +4670,14 @@ int netif_receive_skb_core(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_receive_skb_core);
+static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(list)) != NULL)
+ __netif_receive_skb_core(skb, pfmemalloc);
+}
+
static int __netif_receive_skb(struct sk_buff *skb)
{
int ret;
@@ -4695,6 +4703,36 @@ static int __netif_receive_skb(struct sk_buff *skb)
return ret;
}
+static void __netif_receive_skb_list(struct sk_buff_head *list)
+{
+ unsigned long noreclaim_flag = 0;
+ struct sk_buff_head sublist;
+ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+ struct sk_buff *skb;
+
+ __skb_queue_head_init(&sublist);
+
+ while ((skb = __skb_dequeue(list)) != NULL) {
+ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+ /* Handle the previous sublist */
+ __netif_receive_skb_list_core(&sublist, pfmemalloc);
+ pfmemalloc = !pfmemalloc;
+ /* See comments in __netif_receive_skb */
+ if (pfmemalloc)
+ noreclaim_flag = memalloc_noreclaim_save();
+ else
+ memalloc_noreclaim_restore(noreclaim_flag);
+ __skb_queue_head_init(&sublist);
+ }
+ __skb_queue_tail(&sublist, skb);
+ }
+ /* Handle the last sublist */
+ __netif_receive_skb_list_core(&sublist, pfmemalloc);
+ /* Restore pflags */
+ if (pfmemalloc)
+ memalloc_noreclaim_restore(noreclaim_flag);
+}
+
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
{
struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
@@ -4729,14 +4767,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
return ret;
}
-static void __netif_receive_skb_list(struct sk_buff_head *list)
-{
- struct sk_buff *skb;
-
- while ((skb = __skb_dequeue(list)) != NULL)
- __netif_receive_skb(skb);
-}
-
static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
^ permalink raw reply related
* [RFC PATCH v2 net-next 04/12] net: core: Another step of skb receive list processing
From: Edward Cree @ 2018-06-26 18:18 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
In-Reply-To: <fa3d7e58-e7b6-ad0c-619f-824c25ed0d97@solarflare.com>
netif_receive_skb_list_internal() now processes a list and hands it
on to the next function.
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
net/core/dev.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 69 insertions(+), 4 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 0ab16941a651..27980c13ad5c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4729,6 +4729,14 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
return ret;
}
+static void __netif_receive_skb_list(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(list)) != NULL)
+ __netif_receive_skb(skb);
+}
+
static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
@@ -4769,6 +4777,64 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
return ret;
}
+static void netif_receive_skb_list_internal(struct sk_buff_head *list)
+{
+ /* Two sublists so we can go back and forth between them */
+ struct sk_buff_head sublist, sublist2;
+ struct bpf_prog *xdp_prog = NULL;
+ struct sk_buff *skb;
+
+ __skb_queue_head_init(&sublist);
+
+ while ((skb = __skb_dequeue(list)) != NULL) {
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+ if (skb_defer_rx_timestamp(skb))
+ /* Handled, don't add to sublist */
+ continue;
+ __skb_queue_tail(&sublist, skb);
+ }
+
+ __skb_queue_head_init(&sublist2);
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
+ preempt_disable();
+ rcu_read_lock();
+ while ((skb = __skb_dequeue(&sublist)) != NULL) {
+ xdp_prog = rcu_dereference(skb->dev->xdp_prog);
+ if (do_xdp_generic(xdp_prog, skb) != XDP_PASS)
+ /* Dropped, don't add to sublist */
+ continue;
+ __skb_queue_tail(&sublist2, skb);
+ }
+ rcu_read_unlock();
+ preempt_enable();
+ /* Move all packets onto first sublist */
+ skb_queue_splice_init(&sublist2, &sublist);
+ }
+
+ rcu_read_lock();
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ while ((skb = __skb_dequeue(&sublist)) != NULL) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ /* Handled, don't add to sublist */
+ continue;
+ }
+
+ __skb_queue_tail(&sublist2, skb);
+ }
+
+ /* Move all packets onto first sublist */
+ skb_queue_splice_init(&sublist2, &sublist);
+ }
+#endif
+ __netif_receive_skb_list(&sublist);
+ rcu_read_unlock();
+}
+
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
@@ -4797,8 +4863,8 @@ EXPORT_SYMBOL(netif_receive_skb);
* @list: list of skbs to process. Must not be shareable (e.g. it may
* be on the stack)
*
- * For now, just calls netif_receive_skb() in a loop, ignoring the
- * return value.
+ * Since return value of netif_receive_skb() is normally ignored, and
+ * wouldn't be meaningful for a list, this function returns void.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
@@ -4809,8 +4875,7 @@ void netif_receive_skb_list(struct sk_buff_head *list)
skb_queue_for_each(skb, list)
trace_netif_receive_skb_list_entry(skb);
- while ((skb = __skb_dequeue(list)) != NULL)
- netif_receive_skb_internal(skb);
+ netif_receive_skb_list_internal(list);
}
EXPORT_SYMBOL(netif_receive_skb_list);
^ permalink raw reply related
* [RFC PATCH v2 net-next 03/12] net: core: unwrap skb list receive slightly further
From: Edward Cree @ 2018-06-26 18:18 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
In-Reply-To: <fa3d7e58-e7b6-ad0c-619f-824c25ed0d97@solarflare.com>
Adds iterator skb_queue_for_each() to run over a list without modifying it.
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
include/linux/skbuff.h | 16 ++++++++++++++++
include/trace/events/net.h | 7 +++++++
net/core/dev.c | 4 +++-
3 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c86885954994..a8c16c6700f3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1626,6 +1626,22 @@ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
}
/**
+ * skb_queue_for_each - iterate over an skb queue
+ * @pos: the &struct sk_buff to use as a loop cursor.
+ * @head: the &struct sk_buff_head for your list.
+ *
+ * The reference count is not incremented and the reference is therefore
+ * volatile; the list lock is not taken either. Use with caution.
+ *
+ * The list must not be modified (though the individual skbs can be)
+ * within the loop body.
+ *
+ * After loop completion, @pos will be %NULL.
+ */
+#define skb_queue_for_each(pos, head) \
+ for (pos = skb_peek(head); pos != NULL; pos = skb_peek_next(pos, head))
+
+/**
* skb_queue_len - get queue length
* @list_: list to measure
*
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 9c886739246a..00aa72ce0e7c 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -223,6 +223,13 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry,
TP_ARGS(skb)
);
+DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry,
+
+ TP_PROTO(const struct sk_buff *skb),
+
+ TP_ARGS(skb)
+);
+
DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,
TP_PROTO(const struct sk_buff *skb),
diff --git a/net/core/dev.c b/net/core/dev.c
index 473e24e31e38..0ab16941a651 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4807,8 +4807,10 @@ void netif_receive_skb_list(struct sk_buff_head *list)
{
struct sk_buff *skb;
+ skb_queue_for_each(skb, list)
+ trace_netif_receive_skb_list_entry(skb);
while ((skb = __skb_dequeue(list)) != NULL)
- netif_receive_skb(skb);
+ netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb_list);
^ permalink raw reply related
* Re: [patch net-next RFC 03/12] mlxsw: core: Add core environment module for port temperature reading
From: Andrew Lunn @ 2018-06-26 18:18 UTC (permalink / raw)
To: Vadim Pasternak
Cc: Guenter Roeck, linux-pm@vger.kernel.org, netdev@vger.kernel.org,
rui.zhang@intel.com, edubezval@gmail.com, jiri@resnulli.us
In-Reply-To: <HE1PR0502MB375314997425DFF8DA5D3D1BA2490@HE1PR0502MB3753.eurprd05.prod.outlook.com>
> However, I have some concerns on this matter.
> Our hardware provides bulk reading of the module temperatures, which means
> I can get all inputs with one hardware request, which is an important optimization.
> Reading each module individually would result in huge overhead and would
> probably require some caching of temperature inputs.
Well, you can cache the SFP calibration values, and the 4 limit
values. To get an actual temperature you need to read 2 bytes from
the SFP module. I don't see why that would be expensive. You talk to
the firmware over PCIe right? So you have lots of bandwidth.
Andrew
^ permalink raw reply
* [RFC PATCH v2 net-next 02/12] sfc: batch up RX delivery
From: Edward Cree @ 2018-06-26 18:17 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
In-Reply-To: <fa3d7e58-e7b6-ad0c-619f-824c25ed0d97@solarflare.com>
Improves packet rate of 1-byte UDP receives by up to 10%.
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
drivers/net/ethernet/sfc/efx.c | 12 ++++++++++++
drivers/net/ethernet/sfc/net_driver.h | 3 +++
drivers/net/ethernet/sfc/rx.c | 7 ++++++-
3 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index ad4a354ce570..e84e4437dbbd 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -264,11 +264,17 @@ static int efx_check_disabled(struct efx_nic *efx)
static int efx_process_channel(struct efx_channel *channel, int budget)
{
struct efx_tx_queue *tx_queue;
+ struct sk_buff_head rx_list;
int spent;
if (unlikely(!channel->enabled))
return 0;
+ /* Prepare the batch receive list */
+ EFX_WARN_ON_PARANOID(channel->rx_list != NULL);
+ channel->rx_list = &rx_list;
+ __skb_queue_head_init(channel->rx_list);
+
efx_for_each_channel_tx_queue(tx_queue, channel) {
tx_queue->pkts_compl = 0;
tx_queue->bytes_compl = 0;
@@ -291,6 +297,10 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
}
}
+ /* Receive any packets we queued up */
+ netif_receive_skb_list(channel->rx_list);
+ channel->rx_list = NULL;
+
return spent;
}
@@ -555,6 +565,8 @@ static int efx_probe_channel(struct efx_channel *channel)
goto fail;
}
+ channel->rx_list = NULL;
+
return 0;
fail:
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 65568925c3ef..e1d3ca3b90b5 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -448,6 +448,7 @@ enum efx_sync_events_state {
* __efx_rx_packet(), or zero if there is none
* @rx_pkt_index: Ring index of first buffer for next packet to be delivered
* by __efx_rx_packet(), if @rx_pkt_n_frags != 0
+ * @rx_list: list of SKBs from current RX, awaiting processing
* @rx_queue: RX queue for this channel
* @tx_queue: TX queues for this channel
* @sync_events_state: Current state of sync events on this channel
@@ -500,6 +501,8 @@ struct efx_channel {
unsigned int rx_pkt_n_frags;
unsigned int rx_pkt_index;
+ struct sk_buff_head *rx_list;
+
struct efx_rx_queue rx_queue;
struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index d2e254f2f72b..3e4d67d2d45d 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -634,7 +634,12 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
return;
/* Pass the packet up */
- netif_receive_skb(skb);
+ if (channel->rx_list != NULL)
+ /* Add to list, will pass up later */
+ __skb_queue_tail(channel->rx_list, skb);
+ else
+ /* No list, so pass it up now */
+ netif_receive_skb(skb);
}
/* Handle a received packet. Second half: Touches packet payload. */
^ permalink raw reply related
* [RFC PATCH v2 net-next 01/12] net: core: trivial netif_receive_skb_list() entry point
From: Edward Cree @ 2018-06-26 18:17 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
In-Reply-To: <fa3d7e58-e7b6-ad0c-619f-824c25ed0d97@solarflare.com>
Just calls netif_receive_skb() in a loop.
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
include/linux/netdevice.h | 1 +
net/core/dev.c | 20 ++++++++++++++++++++
2 files changed, 21 insertions(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850c7936..105087369779 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3364,6 +3364,7 @@ int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
+void netif_receive_skb_list(struct sk_buff_head *list);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/net/core/dev.c b/net/core/dev.c
index a5aa1c7444e6..473e24e31e38 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4792,6 +4792,26 @@ int netif_receive_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_receive_skb);
+/**
+ * netif_receive_skb_list - process many receive buffers from network
+ * @list: list of skbs to process. Must not be shareable (e.g. it may
+ * be on the stack)
+ *
+ * For now, just calls netif_receive_skb() in a loop, ignoring the
+ * return value.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ */
+void netif_receive_skb_list(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(list)) != NULL)
+ netif_receive_skb(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
DEFINE_PER_CPU(struct work_struct, flush_works);
/* Network device is going away, flush any packets still pending */
^ permalink raw reply related
* Re: Fwd: [PATCH 0/6] offload Linux LAG devices to the TC datapath
From: John Hurley @ 2018-06-26 18:16 UTC (permalink / raw)
To: Or Gerlitz
Cc: Jakub Kicinski, Jiri Pirko, Linux Netdev List, Simon Horman,
Andy Gospodarek
In-Reply-To: <c3e3790f-fd49-3106-718f-d87993a0c195@mellanox.com>
On Tue, Jun 26, 2018 at 3:57 PM, Or Gerlitz <ogerlitz@mellanox.com> wrote:
>> -------- Forwarded Message --------
>> Subject: [PATCH 0/6] offload Linux LAG devices to the TC datapath
>> Date: Thu, 21 Jun 2018 14:35:55 +0100
>> From: John Hurley <john.hurley@netronome.com>
>> To: dev@openvswitch.org, roid@mellanox.com, gavi@mellanox.com, paulb@mellanox.com, fbl@sysclose.org, simon.horman@netronome.com
>> CC: John Hurley <john.hurley@netronome.com>
>>
>> This patchset extends OvS TC and the linux-netdev implementation to
>> support the offloading of Linux Link Aggregation devices (LAG) and their
>> slaves. TC blocks are used to provide this offload. Blocks, in TC, group
>> together a series of qdiscs. If a filter is added to one of these qdiscs
>> then it is applied to all. Similarly, if a packet is matched on one of the
>> grouped qdiscs then the stats for the entire block are increased. The
>> basis of the LAG offload is that the LAG master (attached to the OvS
>> bridge) and slaves that may exist outside of OvS are all added to the same
>> TC block. OvS can then control the filters and collect the stats on the
>> slaves via its interaction with the LAG master.
>>
>> The TC API is extended within OvS to allow the addition of a block id to
>> ingress qdisc adds. Block ids are then assigned to each LAG master that is
>> attached to the OvS bridge. The linux netdev netlink socket is used to
>> monitor slave devices. If a LAG slave is found whose master is on the bridge
>> then it is added to the same block as its master. If the underlying slaves
>> belong to an offloadable device then the Linux LAG device can be offloaded
>> to hardware.
>
> Guys (J/J/J),
>
> Doing this here b/c
>
> a. this has impact on the kernel side of things
>
> b. I am more of a netdev and not openvswitch citizen..
>
> some comments,
>
> 1. this + Jakub's patch for the reply are really a great design
>
> 2. re the egress side of things. Some NIC HWs can't just use LAG
> as the egress port destination of an ACL (tc rule) and the HW rule
> needs to be duplicated to both HW ports. So... in that case, you
> see the HW driver doing the duplication (:() or we can somehow
> make it happen from user-space?
>
Hi Or,
I'm not sure how rule duplication would work for rules that egress to
a LAG device.
Perhaps this could be done for an active/backup mode where user-space
adds a rule to 1 port and deletes from another as appropriate.
For load balancing modes where the egress port is selected based on a
hash of packet fields, it would be a lot more complicated.
OvS can do this with its own bonds as far as I'm aware but (if
recirculation is turned off) it basically creates exact match datapath
entries for each packet flow.
Perhaps I do not fully understand your question?
> 3. for the case of overlay networks, e.g OVS based vxlan tunnel, the
> ingress (decap) rule is set on the vxlan device. Jakub, you mentioned
> a possible kernel patch to the HW (nfp, mlx5) drivers to have them bind
> to the tunnel device for ingress rules. If we have agreed way to identify
> uplink representors, can we do that from ovs too? does it matter if we are
> bonding + encapsulating or just encapsulating? note that under encap scheme
> the bond is typically not part of the OVS bridge.
>
If we have a way to bind the HW drivers to tunnel devs for ingress
rules then this should work fine with OvS (possibly requiring a small
patch - I'd need to check).
In terms of bonding + encap this probably needs to be handled in the
hw itself for the same reason I mentioned in point 2.
> Or.
^ permalink raw reply
* [RFC PATCH v2 net-next 00/12] Handle multiple received packets at each stage
From: Edward Cree @ 2018-06-26 18:15 UTC (permalink / raw)
To: linux-net-drivers, netdev; +Cc: davem
This patch series adds the capability for the network stack to receive a
list of packets and process them as a unit, rather than handling each
packet singly in sequence. This is done by factoring out the existing
datapath code at each layer and wrapping it in list handling code.
The motivation for this change is twofold:
* Instruction cache locality. Currently, running the entire network
stack receive path on a packet involves more code than will fit in the
lowest-level icache, meaning that when the next packet is handled, the
code has to be reloaded from more distant caches. By handling packets
in "row-major order", we ensure that the code at each layer is hot for
most of the list. (There is a corresponding downside in _data_ cache
locality, since we are now touching every packet at every layer, but in
practice there is easily enough room in dcache to hold one cacheline of
each of the 64 packets in a NAPI poll.)
* Reduction of indirect calls. Owing to Spectre mitigations, indirect
function calls are now more expensive than ever; they are also heavily
used in the network stack's architecture (see [1]). By replacing 64
indirect calls to the next-layer per-packet function with a single
indirect call to the next-layer list function, we can save CPU cycles.
Drivers pass an SKB list to the stack at the end of the NAPI poll; this
gives a natural batch size (the NAPI poll weight) and avoids waiting at
the software level for further packets to make a larger batch (which
would add latency). It also means that the batch size is automatically
tuned by the existing interrupt moderation mechanism.
The stack then runs each layer of processing over all the packets in the
list before proceeding to the next layer. Where the 'next layer' (or
the context in which it must run) differs among the packets, the stack
splits the list; this 'late demux' means that packets which differ only
in later headers (e.g. same L2/L3 but different L4) can traverse the
early part of the stack together.
Also, where the next layer is not (yet) list-aware, the stack can revert
to calling the rest of the stack in a loop; this allows gradual/creeping
listification, with no 'flag day' patch needed to listify everything.
Patches 1-2 simply place received packets on a list during the event
processing loop on the sfc EF10 architecture, then call the normal stack
for each packet singly at the end of the NAPI poll. (Analogues of patch
#2 for other NIC drivers should be fairly straightforward.)
Patches 3-9 extend the list processing as far as the IP receive handler.
Patches 10-12 apply the list techniques to Generic XDP, since the bpf_func
there is an indirect call. In patch #12 we JIT a list_func that performs
list unwrapping and makes direct calls to the bpf_func.
Patches 1-2 alone give about a 10% improvement in packet rate in the
baseline test; adding patches 3-9 raises this to around 25%. Patches 10-
12, intended to improve Generic XDP performance, have in fact slightly
worsened it; I am unsure why this is and have included them in this RFC
in the hopes that someone will spot the reason. If no progress is made I
will drop them from the series.
Performance measurements were made with NetPerf UDP_STREAM, using 1-byte
packets and a single core to handle interrupts on the RX side; this was
in order to measure as simply as possible the packet rate handled by a
single core. Figures are in Mbit/s; divide by 8 to obtain Mpps. The
setup was tuned for maximum reproducibility, rather than raw performance.
Full details and more results (both with and without retpolines) are
presented in [2].
The baseline test uses four streams, and multiple RXQs all bound to a
single CPU (the netperf binary is bound to a neighbouring CPU). These
tests were run with retpolines.
net-next: 6.60 Mb/s (datum)
after 9: 8.35 Mb/s (datum + 26.6%)
after 12: 8.29 Mb/s (datum + 25.6%)
Note however that these results are not robust; changes in the parameters
of the test often shrink the gain to single-digit percentages. For
instance, when using only a single RXQ, only a 4% gain was seen. The
results also seem to change significantly each time the patch series is
rebased onto a new net-next; for instance the results in [3] with
retpolines (slide 9) show only 11.6% gain in the same test as above (the
post-patch performance is the same but the pre-patch datum is 7.5Mb/s).
I also performed tests with Generic XDP enabled (using a simple map-based
UDP port drop program with no entries in the map), both with and without
the eBPF JIT enabled.
No JIT:
net-next: 3.52 Mb/s (datum)
after 9: 4.91 Mb/s (datum + 39.5%)
after 12: 4.83 Mb/s (datum + 37.3%)
With JIT:
net-next: 5.23 Mb/s (datum)
after 9: 6.64 Mb/s (datum + 27.0%)
after 12: 6.46 Mb/s (datum + 23.6%)
Another test variation was the use of software filtering/firewall rules.
Adding a single iptables rule (a UDP port drop on a port range not
matching the test traffic), thus making the netfilter hook have work to
do, reduced baseline performance but showed a similar delta from the
patches. Similarly, testing with a set of TC flower filters (kindly
supplied by Cong Wang) in the single-RXQ setup (that previously gave 4%)
slowed down the baseline but not the patched performance, giving a 5.7%
performance delta. These data suggest that the batching approach
remains effective in the presence of software switching rules.
Changes from v1 (see [3]):
* Rebased across 2 years' net-next movement (surprisingly straightforward).
- Added Generic XDP handling to netif_receive_skb_list_internal()
- Dealt with changes to PFMEMALLOC setting APIs
* General cleanup of code and comments.
* Skipped function calls for empty lists at various points in the stack
(patch #9).
* Added listified Generic XDP handling (patches 10-12), though it doesn't
seem to help (see above).
* Extended testing to cover software firewalls / netfilter etc.
[1] http://vger.kernel.org/netconf2018_files/DavidMiller_netconf2018.pdf
[2] http://vger.kernel.org/netconf2018_files/EdwardCree_netconf2018.pdf
[3] http://lists.openwall.net/netdev/2016/04/19/89
Edward Cree (12):
net: core: trivial netif_receive_skb_list() entry point
sfc: batch up RX delivery
net: core: unwrap skb list receive slightly further
net: core: Another step of skb receive list processing
net: core: another layer of lists, around PF_MEMALLOC skb handling
net: core: propagate SKB lists through packet_type lookup
net: ipv4: listified version of ip_rcv
net: ipv4: listify ip_rcv_finish
net: don't bother calling list RX functions on empty lists
net: listify Generic XDP processing, part 1
net: listify Generic XDP processing, part 2
net: listify jited Generic XDP processing on x86_64
arch/x86/net/bpf_jit_comp.c | 164 ++++++++++++++
drivers/net/ethernet/sfc/efx.c | 12 +
drivers/net/ethernet/sfc/net_driver.h | 3 +
drivers/net/ethernet/sfc/rx.c | 7 +-
include/linux/filter.h | 43 +++-
include/linux/netdevice.h | 4 +
include/linux/netfilter.h | 27 +++
include/linux/skbuff.h | 16 ++
include/net/ip.h | 2 +
include/trace/events/net.h | 14 ++
kernel/bpf/core.c | 38 +++-
net/core/dev.c | 415 +++++++++++++++++++++++++++++-----
net/core/filter.c | 10 +-
net/ipv4/af_inet.c | 1 +
net/ipv4/ip_input.c | 129 ++++++++++-
15 files changed, 810 insertions(+), 75 deletions(-)
^ permalink raw reply
* Re: [PATCH rdma-next 08/12] overflow.h: Add arithmetic shift helper
From: Jason Gunthorpe @ 2018-06-26 17:54 UTC (permalink / raw)
To: Rasmus Villemoes
Cc: Leon Romanovsky, Doug Ledford, Kees Cook, Leon Romanovsky,
RDMA mailing list, Hadar Hen Zion, Matan Barak, Michael J Ruhl,
Noa Osherovich, Raed Salem, Yishai Hadas, Saeed Mahameed,
linux-netdev, linux-kernel
In-Reply-To: <CAKwiHFiRYbyiJqDYCgKXKZYRr0KjCt8q9AwKwfqoCA1sT2KFyQ@mail.gmail.com>
On Tue, Jun 26, 2018 at 10:07:07AM +0200, Rasmus Villemoes wrote:
> On 25 June 2018 at 19:11, Jason Gunthorpe <[1]jgg@mellanox.com> wrote:
>
> On Mon, Jun 25, 2018 at 11:26:05AM +0200, Rasmus Villemoes wrote:
> > check_shift_overflow(a, s, d) {
> > unsigned _nbits = 8*sizeof(a);
> > typeof(a) _a = (a);
> > typeof(s) _s = (s);
> > typeof(d) _d = (d);
> >
> > *_d = ((u64)(_a) << (_s & (_nbits-1)));
> > _s >= _nbits || (_s > 0 && (_a >> (_nbits - _s -
> > is_signed_type(a))) != 0);
> > }
> Those types are not quite right.. What about this?
> check_shift_overflow(a, s, d) ({
> unsigned int _nbits = 8*sizeof(d) - is_signed_type(d);
> typeof(d) _a = a; // Shift is always performed on type 'd'
> typeof(s) _s = s;
> typeof(d) _d = d;
> *_d = (_a << (_s & (_nbits-1)));
> (((*_d) >> (_s & (_nbits-1)) != _a);
> })
>
> No, because, the check_*_overflow (and the __builtin_*_overflow
> cousins) functions must do their job without causing undefined
> behaviour, regardless of what crazy input values and types they are
> given.
Okay, I see you are concerned about a UB during shifting signed
values. I didn't consider that..
> Also, the output must be completely defined for all inputs [1].
> I omitted it for brevity, but I also wanted a and *d to have the same
> type, so there should also be one of those (void)(&_a == _d);
Humm. No, that doesn't match the use case. Typically this would take
an ABI constant like a u32 and shift it into a size_t for use with an
allocator. So demanding a and d have equal types is not good, and
requiring user casting is not good as the casting could be truncating.
> statements. See the other check_*_overflow and the commit adding them.
> Without the (u64) cast, any signed (and negative) a would cause UB in
> your suggestion.
When thinking about signed cases.. The explicit u64 cast, and
implicit promotion to typeof(d), produce something counter intuitive,
eg:
(u64)(s32)-1 == 0xffffffffffffffff
Which would result in a shift outcome that is not what anyone would
expect, IMHO... So the first version isn't what I'd expect either..
> Also, having _nbits be 31 when a (and/or *d) has type
> int, and then and'ing the shift by 30 doesn't make any sense; I have no
> idea what you're trying to do.
Yes, it is not helpful to avoid UB when a is signed..
> [1] For this one, it would probably be most consistent to say that the
> result is a*2^s computed in infinite-precision, then truncated to fit
> in d.
I think this does not match the usual use cases, this should strictly
be an unsigned shift. The output is guaranteed to always be positive
or overflow is signaled.
Signed types are allowed, but negative values are not.
What about more like this?
check_shift_overflow(a, s, d) ({
// Shift is always performed on the machine's largest unsigned
u64 _a = a;
typeof(s) _s = s;
typeof(d) _d = d;
// Make s safe against UB
unsigned int _to_shift = _s >= 0 && _s < 8*sizeof(*d) : _s ? 0;
*_d = (_a << _to_shift);
// s is malformed
(_to_shift != _s ||
// d is a signed type and became negative
*_d < 0 ||
// a is a signed type and was negative
_a < 0 ||
// Not invertable means a was truncated during shifting
(*_d >> _to_shift) != a))
})
I'm not seeing a UB with this?
Jason
^ permalink raw reply
* RE: [patch net-next RFC 03/12] mlxsw: core: Add core environment module for port temperature reading
From: Vadim Pasternak @ 2018-06-26 17:50 UTC (permalink / raw)
To: Guenter Roeck, Andrew Lunn
Cc: linux-pm@vger.kernel.org, netdev@vger.kernel.org,
rui.zhang@intel.com, edubezval@gmail.com, jiri@resnulli.us
In-Reply-To: <20180626170012.GA28370@roeck-us.net>
> -----Original Message-----
> From: Guenter Roeck [mailto:linux@roeck-us.net]
> Sent: Tuesday, June 26, 2018 8:00 PM
> To: Andrew Lunn <andrew@lunn.ch>
> Cc: Vadim Pasternak <vadimp@mellanox.com>; linux-pm@vger.kernel.org;
> netdev@vger.kernel.org; rui.zhang@intel.com; edubezval@gmail.com;
> jiri@resnulli.us
> Subject: Re: [patch net-next RFC 03/12] mlxsw: core: Add core environment
> module for port temperature reading
>
> On Tue, Jun 26, 2018 at 04:22:38PM +0200, Andrew Lunn wrote:
> > On Tue, Jun 26, 2018 at 12:10:28PM +0000, Vadim Pasternak wrote:
> >
> > Adding the linux-pm@vger.kernel.org list.
> >
> > > Add new core_env module to allow port temperature reading. This
> > > information has most critical impact on system's thermal monitoring
> > > and is to be used by core_hwmon and core_thermal modules.
> > >
> > > New internal API reads the temperature from all the modules, which
> > > are equipped with the thermal sensor and exposes temperature
> > > according to the worst measure. All individual temperature values
> > > are normalized to pre-defined range.
> >
> > This patchset has been sent to the netdev list before. I raised a few
> > questions about this, which is why it is now being posted to a bigger
> > group for review.
> >
> > The hardware has up to 64 temperature sensors. These sensors are
> > hot-plugable, since they are inside SFP modules, which are
> > hot-plugable. Different SFP modules can have different operating
> > temperature ranges. They contain an EEPROM which lists upper and lower
> > warning and fail temperatures, and report alarms when these thresholds
> > > are reached.
> >
> > This code takes the 64 sensors readings and calculates a single value
> > it passes to one thermal zone. That thermal zone then controls one fan
> > to keep this single value in range.
> >
> > > I queried if this is the correct way to do this? Would it not be
> > better to have up to 64 thermal zones? Leave the thermal core to
> > iterate over all the zones in order to determine how the fan should be
> > driven?
> >
> I very much think so. This problem must exist elsewhere; essentially it is the
> bundling of multiple temperature sensors into a single thermal zone. I am not
> sure if this should be 64 thermal zones or one thermal zone with up to 64
> sensors and some algorithm to select the relevant temperature; that would be
> up to the thermal subsystem maintainers to decide. Either case, the sensors
> should be handled and reported as individual sensors, with appropriate limits,
> not as single sensor.
> Yes, I understand that means we'll have hundreds of hwmon devices, but that
> should not be a problem (and if it is, we'll have to fix the problem, not the code
> exposing it).
I guess that many thermal zones with single PWM control will not work.
PWM will never stabilize in case there are some hot and some cold modules.
It seems it could be only temperature input array providing to the thermal
zone. And additionally it should have arrays at least for the warning and critical
thresholds.
We are using step-wise thermal algorithm as a default.
In case thermal zone will have multi temperature inputs this algorithm possibly
should be adapted for handling temperature arrays (input and thresholds)
along with the thermal zone normalization parameters - more or less the same
normalization process as I provided in this patch, but generic for the thermal
subsystem.
Or another possibility - to add some new thermal algorithm "step-wise-multi"
or something like that.
However, I have some concerns on this matter.
Our hardware provides bulk reading of the module temperatures, meaning
I can get all inputs with one hardware request, which is an important optimization.
Reading each module individually would result in huge overhead and would
maybe require some caching of temperature inputs.
And also, now we have up to 64 modules per system and on the way the
system supporting 128 modules.
Would it be good to have such huge number of hwmon configuration records,
like:
HWMON_T_INPUT | HWMON_T_MAX_ALARM | HWMON_T_CRIT_ALARM ?
>
> I understand that the thermal subsystem does not currently support handling this
> problem. There may also be some missing pieces between the hwmon and
> thermal subsystems, such as reporting limits or alarms when a hwmon driver
> registers with the thermal subsystem.
>
> Maybe it is time to add this support as part of this patch series ?
>
> > This is possibly the first board with so many sensors. However, i
> > doubt it is totally unique. Other big Ethernet switches with lots of
> > SFP modules may be added later. Also, 10G copper PHYs often have
> > temperature sensors, so this is not limited to just boards with
> > optical ports. So having a generic solution would be good.
>
> Agreed.
>
> Thanks,
> Guenter
>
> >
> > What do the Linux PM experts say about this?
> >
> > Thanks
> > Andrew
^ permalink raw reply
* [PATCH net-next v2 3/3] net: phy: xgmiitorgmii: Check read_status results
From: Brandon Maier @ 2018-06-26 17:50 UTC (permalink / raw)
To: netdev
Cc: andrew, f.fainelli, davem, michal.simek, clayton.shotwell,
kristopher.cory, linux-kernel, Brandon Maier
In-Reply-To: <20180626175050.71165-1-brandon.maier@rockwellcollins.com>
We're ignoring the result of the attached phy device's read_status().
Return it so we can detect errors.
Signed-off-by: Brandon Maier <brandon.maier@rockwellcollins.com>
---
v2:
- No change
v1: https://marc.info/?l=linux-netdev&m=152838766410559&w=2
drivers/net/phy/xilinx_gmii2rgmii.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c
index d6f8b64cddbe..74a8782313cf 100644
--- a/drivers/net/phy/xilinx_gmii2rgmii.c
+++ b/drivers/net/phy/xilinx_gmii2rgmii.c
@@ -42,8 +42,11 @@ static int xgmiitorgmii_read_status(struct phy_device *phydev)
struct mii_bus *bus = priv->mdio->bus;
int addr = priv->mdio->addr;
u16 val = 0;
+ int err;
- priv->phy_drv->read_status(phydev);
+ err = priv->phy_drv->read_status(phydev);
+ if (err < 0)
+ return err;
val = mdiobus_read(bus, addr, XILINX_GMII2RGMII_REG);
val &= ~XILINX_GMII2RGMII_SPEED_MASK;
--
2.17.1
^ permalink raw reply related
* [PATCH net-next v2 2/3] net: phy: xgmiitorgmii: Use correct mdio bus
From: Brandon Maier @ 2018-06-26 17:50 UTC (permalink / raw)
To: netdev
Cc: andrew, f.fainelli, davem, michal.simek, clayton.shotwell,
kristopher.cory, linux-kernel, Brandon Maier
In-Reply-To: <20180626175050.71165-1-brandon.maier@rockwellcollins.com>
The xgmiitorgmii is using the mii_bus of the device it's attached to,
instead of the bus it was given during probe.
Signed-off-by: Brandon Maier <brandon.maier@rockwellcollins.com>
---
v2:
- Fix trivial typo in commit message
v1: https://marc.info/?l=linux-netdev&m=152838761310537&w=2
drivers/net/phy/xilinx_gmii2rgmii.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c
index 04c8bec1c4c1..d6f8b64cddbe 100644
--- a/drivers/net/phy/xilinx_gmii2rgmii.c
+++ b/drivers/net/phy/xilinx_gmii2rgmii.c
@@ -33,17 +33,19 @@ struct gmii2rgmii {
struct phy_device *phy_dev;
struct phy_driver *phy_drv;
struct phy_driver conv_phy_drv;
- int addr;
+ struct mdio_device *mdio;
};
static int xgmiitorgmii_read_status(struct phy_device *phydev)
{
struct gmii2rgmii *priv = phydev->priv;
+ struct mii_bus *bus = priv->mdio->bus;
+ int addr = priv->mdio->addr;
u16 val = 0;
priv->phy_drv->read_status(phydev);
- val = mdiobus_read(phydev->mdio.bus, priv->addr, XILINX_GMII2RGMII_REG);
+ val = mdiobus_read(bus, addr, XILINX_GMII2RGMII_REG);
val &= ~XILINX_GMII2RGMII_SPEED_MASK;
if (phydev->speed == SPEED_1000)
@@ -53,7 +55,7 @@ static int xgmiitorgmii_read_status(struct phy_device *phydev)
else
val |= BMCR_SPEED10;
- mdiobus_write(phydev->mdio.bus, priv->addr, XILINX_GMII2RGMII_REG, val);
+ mdiobus_write(bus, addr, XILINX_GMII2RGMII_REG, val);
return 0;
}
@@ -86,7 +88,7 @@ static int xgmiitorgmii_probe(struct mdio_device *mdiodev)
return -EPROBE_DEFER;
}
- priv->addr = mdiodev->addr;
+ priv->mdio = mdiodev;
priv->phy_drv = priv->phy_dev->drv;
memcpy(&priv->conv_phy_drv, priv->phy_dev->drv,
sizeof(struct phy_driver));
--
2.17.1
^ permalink raw reply related
* [PATCH net-next v2 1/3] net: phy: xgmiitorgmii: Check phy_driver ready before accessing
From: Brandon Maier @ 2018-06-26 17:50 UTC (permalink / raw)
To: netdev
Cc: andrew, f.fainelli, davem, michal.simek, clayton.shotwell,
kristopher.cory, linux-kernel, Brandon Maier
Since a phy_device is added to the global mdio_bus list during
phy_device_register(), but a phy_device's phy_driver doesn't get
attached until phy_probe(), it's possible that of_phy_find_device() in
xgmiitorgmii will return a valid phy with a NULL phy_driver, leading to
a NULL pointer access during the memcpy().
Fixes this Oops:
Unable to handle kernel NULL pointer dereference at virtual address 00000000
pgd = c0004000
[00000000] *pgd=00000000
Internal error: Oops: 5 [#1] PREEMPT SMP ARM
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.14.40 #1
Hardware name: Xilinx Zynq Platform
task: ce4c8d00 task.stack: ce4ca000
PC is at memcpy+0x48/0x330
LR is at xgmiitorgmii_probe+0x90/0xe8
pc : [<c074bc68>] lr : [<c0529548>] psr: 20000013
sp : ce4cbb54 ip : 00000000 fp : ce4cbb8c
r10: 00000000 r9 : 00000000 r8 : c0c49178
r7 : 00000000 r6 : cdc14718 r5 : ce762800 r4 : cdc14710
r3 : 00000000 r2 : 00000054 r1 : 00000000 r0 : cdc14718
Flags: nzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
Control: 18c5387d Table: 0000404a DAC: 00000051
Process swapper/0 (pid: 1, stack limit = 0xce4ca210)
...
[<c074bc68>] (memcpy) from [<c0529548>] (xgmiitorgmii_probe+0x90/0xe8)
[<c0529548>] (xgmiitorgmii_probe) from [<c0526a94>] (mdio_probe+0x28/0x34)
[<c0526a94>] (mdio_probe) from [<c04db98c>] (driver_probe_device+0x254/0x414)
[<c04db98c>] (driver_probe_device) from [<c04dbd58>] (__device_attach_driver+0xac/0x10c)
[<c04dbd58>] (__device_attach_driver) from [<c04d96f4>] (bus_for_each_drv+0x84/0xc8)
[<c04d96f4>] (bus_for_each_drv) from [<c04db5bc>] (__device_attach+0xd0/0x134)
[<c04db5bc>] (__device_attach) from [<c04dbdd4>] (device_initial_probe+0x1c/0x20)
[<c04dbdd4>] (device_initial_probe) from [<c04da8fc>] (bus_probe_device+0x98/0xa0)
[<c04da8fc>] (bus_probe_device) from [<c04d8660>] (device_add+0x43c/0x5d0)
[<c04d8660>] (device_add) from [<c0526cb8>] (mdio_device_register+0x34/0x80)
[<c0526cb8>] (mdio_device_register) from [<c0580b48>] (of_mdiobus_register+0x170/0x30c)
[<c0580b48>] (of_mdiobus_register) from [<c05349c4>] (macb_probe+0x710/0xc00)
[<c05349c4>] (macb_probe) from [<c04dd700>] (platform_drv_probe+0x44/0x80)
[<c04dd700>] (platform_drv_probe) from [<c04db98c>] (driver_probe_device+0x254/0x414)
[<c04db98c>] (driver_probe_device) from [<c04dbc58>] (__driver_attach+0x10c/0x118)
[<c04dbc58>] (__driver_attach) from [<c04d9600>] (bus_for_each_dev+0x8c/0xd0)
[<c04d9600>] (bus_for_each_dev) from [<c04db1fc>] (driver_attach+0x2c/0x30)
[<c04db1fc>] (driver_attach) from [<c04daa98>] (bus_add_driver+0x50/0x260)
[<c04daa98>] (bus_add_driver) from [<c04dc440>] (driver_register+0x88/0x108)
[<c04dc440>] (driver_register) from [<c04dd6b4>] (__platform_driver_register+0x50/0x58)
[<c04dd6b4>] (__platform_driver_register) from [<c0b31248>] (macb_driver_init+0x24/0x28)
[<c0b31248>] (macb_driver_init) from [<c010203c>] (do_one_initcall+0x60/0x1a4)
[<c010203c>] (do_one_initcall) from [<c0b00f78>] (kernel_init_freeable+0x15c/0x1f8)
[<c0b00f78>] (kernel_init_freeable) from [<c0763d10>] (kernel_init+0x18/0x124)
[<c0763d10>] (kernel_init) from [<c0112d74>] (ret_from_fork+0x14/0x20)
Code: ba000002 f5d1f03c f5d1f05c f5d1f07c (e8b151f8)
---[ end trace 3e4ec21905820a1f ]---
Signed-off-by: Brandon Maier <brandon.maier@rockwellcollins.com>
---
v2:
- Add Oops to commit message
v1: https://marc.info/?l=linux-netdev&m=152838762210538&w=2
drivers/net/phy/xilinx_gmii2rgmii.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c
index 2e5150b0b8d5..04c8bec1c4c1 100644
--- a/drivers/net/phy/xilinx_gmii2rgmii.c
+++ b/drivers/net/phy/xilinx_gmii2rgmii.c
@@ -81,6 +81,11 @@ static int xgmiitorgmii_probe(struct mdio_device *mdiodev)
return -EPROBE_DEFER;
}
+ if (!priv->phy_dev->drv) {
+ dev_info(dev, "Attached phy not ready\n");
+ return -EPROBE_DEFER;
+ }
+
priv->addr = mdiodev->addr;
priv->phy_drv = priv->phy_dev->drv;
memcpy(&priv->conv_phy_drv, priv->phy_dev->drv,
--
2.17.1
^ permalink raw reply related
* Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net
From: Michael S. Tsirkin @ 2018-06-26 17:50 UTC (permalink / raw)
To: Cornelia Huck
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, konrad.wilk,
Jakub Kicinski, Samudrala, Sridhar, qemu-devel, virtualization,
Siwei Liu, Venu Busireddy, Netdev, boris.ostrovsky, aaron.f.brown,
Joao Martins
In-Reply-To: <20180626170813.4db094a1.cohuck@redhat.com>
On Tue, Jun 26, 2018 at 05:08:13PM +0200, Cornelia Huck wrote:
> On Fri, 22 Jun 2018 17:05:04 -0700
> Siwei Liu <loseweigh@gmail.com> wrote:
>
> > On Fri, Jun 22, 2018 at 3:33 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > I suspect the divergence will be lost on most users though
> > > simply because they don't even care about vfio. They just
> > > want things to go fast.
> >
> > Like Jason said, VF isn't faster than virtio-net in all cases. It
> > depends on the workload and performance metrics: throughput, latency,
> > or packet per second.
>
> So, will it be guest/admin-controllable then where the traffic flows
> through? Just because we do have a vf available after negotiation of
> the feature bit, it does not necessarily mean we want to use it? Do we
> (the guest) even want to make it visible in that case?
I think these ideas belong to what Alex Duyck wanted to do:
some kind of advanced device that isn't tied to
any network interfaces and allows workload and performance
specific tuning.
Way out of scope for a simple failover, and more importantly,
no one is looking at even enumerating the problems involved,
much less solving them.
--
MST
^ permalink raw reply
* Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net
From: Michael S. Tsirkin @ 2018-06-26 17:42 UTC (permalink / raw)
To: Cornelia Huck
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, konrad.wilk,
Jakub Kicinski, Samudrala, Sridhar, qemu-devel, virtualization,
Siwei Liu, Venu Busireddy, Netdev, boris.ostrovsky, aaron.f.brown,
Joao Martins
In-Reply-To: <20180626180316.3723422f.cohuck@redhat.com>
On Tue, Jun 26, 2018 at 06:03:16PM +0200, Cornelia Huck wrote:
> Ok, that makes me conclude that we definitely need to involve the
> libvirt folks before we proceed further with defining QEMU interfaces.
That's always a wise thing to do.
--
MST
^ permalink raw reply
* Re: [PATCH net-next] tcp: remove one indentation level in tcp_create_openreq_child
From: Neal Cardwell @ 2018-06-26 17:19 UTC (permalink / raw)
To: Yuchung Cheng; +Cc: Eric Dumazet, David Miller, Netdev, Eric Dumazet
In-Reply-To: <CAK6E8=deSubEg3QnLw7ZGAe8q=yOnuHJk7L1bO7KPFv2HB7Low@mail.gmail.com>
On Tue, Jun 26, 2018 at 11:46 AM Eric Dumazet <edumazet@google.com> wrote:
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> net/ipv4/tcp_minisocks.c | 223 ++++++++++++++++++++-------------------
> 1 file changed, 113 insertions(+), 110 deletions(-)
Yes, very nice clean-up! Thanks for doing this.
Acked-by: Neal Cardwell <ncardwell@google.com>
neal
^ permalink raw reply
* Re: [PATCH net-next] tcp: remove one indentation level in tcp_create_openreq_child
From: Yuchung Cheng @ 2018-06-26 17:16 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S . Miller, netdev, Eric Dumazet
In-Reply-To: <20180626154549.102366-1-edumazet@google.com>
On Tue, Jun 26, 2018 at 8:45 AM, Eric Dumazet <edumazet@google.com> wrote:
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
nice refactor!
Acked-by: Yuchung Cheng <ycheng@google.com>
> net/ipv4/tcp_minisocks.c | 223 ++++++++++++++++++++-------------------
> 1 file changed, 113 insertions(+), 110 deletions(-)
>
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 1dda1341a223937580b4efdbedb21ae50b221ff7..dac5893a52b4520d86ed2fcadbfb561a559fcd3d 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -449,119 +449,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
> struct sk_buff *skb)
> {
> struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
> -
> - if (newsk) {
> - const struct inet_request_sock *ireq = inet_rsk(req);
> - struct tcp_request_sock *treq = tcp_rsk(req);
> - struct inet_connection_sock *newicsk = inet_csk(newsk);
> - struct tcp_sock *newtp = tcp_sk(newsk);
> - struct tcp_sock *oldtp = tcp_sk(sk);
> -
> - smc_check_reset_syn_req(oldtp, req, newtp);
> -
> - /* Now setup tcp_sock */
> - newtp->pred_flags = 0;
> -
> - newtp->rcv_wup = newtp->copied_seq =
> - newtp->rcv_nxt = treq->rcv_isn + 1;
> - newtp->segs_in = 1;
> -
> - newtp->snd_sml = newtp->snd_una =
> - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
> -
> - INIT_LIST_HEAD(&newtp->tsq_node);
> - INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
> -
> - tcp_init_wl(newtp, treq->rcv_isn);
> -
> - newtp->srtt_us = 0;
> - newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
> - minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
> - newicsk->icsk_rto = TCP_TIMEOUT_INIT;
> - newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
> -
> - newtp->packets_out = 0;
> - newtp->retrans_out = 0;
> - newtp->sacked_out = 0;
> - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
> - newtp->tlp_high_seq = 0;
> - newtp->lsndtime = tcp_jiffies32;
> - newsk->sk_txhash = treq->txhash;
> - newtp->last_oow_ack_time = 0;
> - newtp->total_retrans = req->num_retrans;
> -
> - /* So many TCP implementations out there (incorrectly) count the
> - * initial SYN frame in their delayed-ACK and congestion control
> - * algorithms that we must have the following bandaid to talk
> - * efficiently to them. -DaveM
> - */
> - newtp->snd_cwnd = TCP_INIT_CWND;
> - newtp->snd_cwnd_cnt = 0;
> -
> - /* There's a bubble in the pipe until at least the first ACK. */
> - newtp->app_limited = ~0U;
> -
> - tcp_init_xmit_timers(newsk);
> - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
> -
> - newtp->rx_opt.saw_tstamp = 0;
> -
> - newtp->rx_opt.dsack = 0;
> - newtp->rx_opt.num_sacks = 0;
> -
> - newtp->urg_data = 0;
> -
> - if (sock_flag(newsk, SOCK_KEEPOPEN))
> - inet_csk_reset_keepalive_timer(newsk,
> - keepalive_time_when(newtp));
> -
> - newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
> - newtp->rx_opt.sack_ok = ireq->sack_ok;
> - newtp->window_clamp = req->rsk_window_clamp;
> - newtp->rcv_ssthresh = req->rsk_rcv_wnd;
> - newtp->rcv_wnd = req->rsk_rcv_wnd;
> - newtp->rx_opt.wscale_ok = ireq->wscale_ok;
> - if (newtp->rx_opt.wscale_ok) {
> - newtp->rx_opt.snd_wscale = ireq->snd_wscale;
> - newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
> - } else {
> - newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
> - newtp->window_clamp = min(newtp->window_clamp, 65535U);
> - }
> - newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
> - newtp->rx_opt.snd_wscale);
> - newtp->max_window = newtp->snd_wnd;
> -
> - if (newtp->rx_opt.tstamp_ok) {
> - newtp->rx_opt.ts_recent = req->ts_recent;
> - newtp->rx_opt.ts_recent_stamp = get_seconds();
> - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
> - } else {
> - newtp->rx_opt.ts_recent_stamp = 0;
> - newtp->tcp_header_len = sizeof(struct tcphdr);
> - }
> - newtp->tsoffset = treq->ts_off;
> + const struct inet_request_sock *ireq = inet_rsk(req);
> + struct tcp_request_sock *treq = tcp_rsk(req);
> + struct inet_connection_sock *newicsk;
> + struct tcp_sock *oldtp, *newtp;
> +
> + if (!newsk)
> + return NULL;
> +
> + newicsk = inet_csk(newsk);
> + newtp = tcp_sk(newsk);
> + oldtp = tcp_sk(sk);
> +
> + smc_check_reset_syn_req(oldtp, req, newtp);
> +
> + /* Now setup tcp_sock */
> + newtp->pred_flags = 0;
> +
> + newtp->rcv_wup = newtp->copied_seq =
> + newtp->rcv_nxt = treq->rcv_isn + 1;
> + newtp->segs_in = 1;
> +
> + newtp->snd_sml = newtp->snd_una =
> + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
> +
> + INIT_LIST_HEAD(&newtp->tsq_node);
> + INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
> +
> + tcp_init_wl(newtp, treq->rcv_isn);
> +
> + newtp->srtt_us = 0;
> + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
> + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
> + newicsk->icsk_rto = TCP_TIMEOUT_INIT;
> + newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
> +
> + newtp->packets_out = 0;
> + newtp->retrans_out = 0;
> + newtp->sacked_out = 0;
> + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
> + newtp->tlp_high_seq = 0;
> + newtp->lsndtime = tcp_jiffies32;
> + newsk->sk_txhash = treq->txhash;
> + newtp->last_oow_ack_time = 0;
> + newtp->total_retrans = req->num_retrans;
> +
> + /* So many TCP implementations out there (incorrectly) count the
> + * initial SYN frame in their delayed-ACK and congestion control
> + * algorithms that we must have the following bandaid to talk
> + * efficiently to them. -DaveM
> + */
> + newtp->snd_cwnd = TCP_INIT_CWND;
> + newtp->snd_cwnd_cnt = 0;
> +
> + /* There's a bubble in the pipe until at least the first ACK. */
> + newtp->app_limited = ~0U;
> +
> + tcp_init_xmit_timers(newsk);
> + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
> +
> + newtp->rx_opt.saw_tstamp = 0;
> +
> + newtp->rx_opt.dsack = 0;
> + newtp->rx_opt.num_sacks = 0;
> +
> + newtp->urg_data = 0;
> +
> + if (sock_flag(newsk, SOCK_KEEPOPEN))
> + inet_csk_reset_keepalive_timer(newsk,
> + keepalive_time_when(newtp));
> +
> + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
> + newtp->rx_opt.sack_ok = ireq->sack_ok;
> + newtp->window_clamp = req->rsk_window_clamp;
> + newtp->rcv_ssthresh = req->rsk_rcv_wnd;
> + newtp->rcv_wnd = req->rsk_rcv_wnd;
> + newtp->rx_opt.wscale_ok = ireq->wscale_ok;
> + if (newtp->rx_opt.wscale_ok) {
> + newtp->rx_opt.snd_wscale = ireq->snd_wscale;
> + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
> + } else {
> + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
> + newtp->window_clamp = min(newtp->window_clamp, 65535U);
> + }
> + newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
> + newtp->max_window = newtp->snd_wnd;
> +
> + if (newtp->rx_opt.tstamp_ok) {
> + newtp->rx_opt.ts_recent = req->ts_recent;
> + newtp->rx_opt.ts_recent_stamp = get_seconds();
> + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
> + } else {
> + newtp->rx_opt.ts_recent_stamp = 0;
> + newtp->tcp_header_len = sizeof(struct tcphdr);
> + }
> + newtp->tsoffset = treq->ts_off;
> #ifdef CONFIG_TCP_MD5SIG
> - newtp->md5sig_info = NULL; /*XXX*/
> - if (newtp->af_specific->md5_lookup(sk, newsk))
> - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
> + newtp->md5sig_info = NULL; /*XXX*/
> + if (newtp->af_specific->md5_lookup(sk, newsk))
> + newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
> #endif
> - if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
> - newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
> - newtp->rx_opt.mss_clamp = req->mss;
> - tcp_ecn_openreq_child(newtp, req);
> - newtp->fastopen_req = NULL;
> - newtp->fastopen_rsk = NULL;
> - newtp->syn_data_acked = 0;
> - newtp->rack.mstamp = 0;
> - newtp->rack.advanced = 0;
> - newtp->rack.reo_wnd_steps = 1;
> - newtp->rack.last_delivered = 0;
> - newtp->rack.reo_wnd_persist = 0;
> - newtp->rack.dsack_seen = 0;
> + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
> + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
> + newtp->rx_opt.mss_clamp = req->mss;
> + tcp_ecn_openreq_child(newtp, req);
> + newtp->fastopen_req = NULL;
> + newtp->fastopen_rsk = NULL;
> + newtp->syn_data_acked = 0;
> + newtp->rack.mstamp = 0;
> + newtp->rack.advanced = 0;
> + newtp->rack.reo_wnd_steps = 1;
> + newtp->rack.last_delivered = 0;
> + newtp->rack.reo_wnd_persist = 0;
> + newtp->rack.dsack_seen = 0;
> +
> + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
>
> - __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
> - }
> return newsk;
> }
> EXPORT_SYMBOL(tcp_create_openreq_child);
> --
> 2.18.0.rc2.346.g013aa6912e-goog
>
^ permalink raw reply
* [PATCH v3 net-next 4/4] selftests: rtnetlink: add ipsec offload API test
From: Shannon Nelson @ 2018-06-26 17:07 UTC (permalink / raw)
To: davem, netdev, jakub.kicinski; +Cc: anders.roxell, linux-kselftest
In-Reply-To: <1530032875-30482-1-git-send-email-shannon.nelson@oracle.com>
Using the netdevsim as a device for testing, try out the XFRM commands
for setting up IPsec hardware offloads.
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
tools/testing/selftests/net/rtnetlink.sh | 114 +++++++++++++++++++++++++++++++
1 file changed, 114 insertions(+)
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 15948cf..9e1a82e 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -608,6 +608,119 @@ kci_test_ipsec()
echo "PASS: ipsec"
}
+#-------------------------------------------------------------------
+# Example commands
+# ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+# spi 0x07 mode transport reqid 0x07 replay-window 32 \
+# aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+# sel src 14.0.0.52/24 dst 14.0.0.70/24 \
+# offload dev sim1 dir out
+# ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+# tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+# spi 0x07 mode transport reqid 0x07
+#
+#-------------------------------------------------------------------
+kci_test_ipsec_offload()
+{
+ ret=0
+ algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
+ srcip=192.168.123.3
+ dstip=192.168.123.4
+ dev=simx1
+ sysfsd=/sys/kernel/debug/netdevsim/$dev
+ sysfsf=$sysfsd/ipsec
+
+ # setup netdevsim since dummydev doesn't have offload support
+ modprobe netdevsim
+ check_err $?
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: ipsec_offload can't load netdevsim"
+ return 1
+ fi
+
+ ip link add $dev type netdevsim
+ ip addr add $srcip dev $dev
+ ip link set $dev up
+ if [ ! -d $sysfsd ] ; then
+ echo "FAIL: ipsec_offload can't create device $dev"
+ return 1
+ fi
+ if [ ! -f $sysfsf ] ; then
+ echo "FAIL: ipsec_offload netdevsim doesn't support IPsec offload"
+ return 1
+ fi
+
+ # flush to be sure there's nothing configured
+ ip x s flush ; ip x p flush
+
+ # create offloaded SAs, both in and out
+ ip x p add dir out src $srcip/24 dst $dstip/24 \
+ tmpl proto esp src $srcip dst $dstip spi 9 \
+ mode transport reqid 42
+ check_err $?
+ ip x p add dir out src $dstip/24 dst $srcip/24 \
+ tmpl proto esp src $dstip dst $srcip spi 9 \
+ mode transport reqid 42
+ check_err $?
+
+ ip x s add proto esp src $srcip dst $dstip spi 9 \
+ mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
+ offload dev $dev dir out
+ check_err $?
+ ip x s add proto esp src $dstip dst $srcip spi 9 \
+ mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
+ offload dev $dev dir in
+ check_err $?
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: ipsec_offload can't create SA"
+ return 1
+ fi
+
+ # does offload show up in ip output
+ lines=`ip x s list | grep -c "crypto offload parameters: dev $dev dir"`
+ if [ $lines -ne 2 ] ; then
+ echo "FAIL: ipsec_offload SA offload missing from list output"
+ check_err 1
+ fi
+
+ # use ping to exercise the Tx path
+ ping -I $dev -c 3 -W 1 -i 0 $dstip >/dev/null
+
+ # does driver have correct offload info
+ diff $sysfsf - << EOF
+SA count=2 tx=3
+sa[0] tx ipaddr=0x00000000 00000000 00000000 00000000
+sa[0] spi=0x00000009 proto=0x32 salt=0x61626364 crypt=1
+sa[0] key=0x34333231 38373635 32313039 36353433
+sa[1] rx ipaddr=0x00000000 00000000 00000000 037ba8c0
+sa[1] spi=0x00000009 proto=0x32 salt=0x61626364 crypt=1
+sa[1] key=0x34333231 38373635 32313039 36353433
+EOF
+ if [ $? -ne 0 ] ; then
+ echo "FAIL: ipsec_offload incorrect driver data"
+ check_err 1
+ fi
+
+ # does offload get removed from driver
+ ip x s flush
+ ip x p flush
+ lines=`grep -c "SA count=0" $sysfsf`
+ if [ $lines -ne 1 ] ; then
+ echo "FAIL: ipsec_offload SA not removed from driver"
+ check_err 1
+ fi
+
+ # clean up any leftovers
+ ip link del $dev
+ rmmod netdevsim
+
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: ipsec_offload"
+ return 1
+ fi
+ echo "PASS: ipsec_offload"
+}
+
kci_test_gretap()
{
testns="testns"
@@ -862,6 +975,7 @@ kci_test_rtnl()
kci_test_encap
kci_test_macsec
kci_test_ipsec
+ kci_test_ipsec_offload
kci_del_dummy
}
--
2.7.4
^ permalink raw reply related
* [PATCH v3 net-next 3/4] netdevsim: add ipsec offload testing
From: Shannon Nelson @ 2018-06-26 17:07 UTC (permalink / raw)
To: davem, netdev, jakub.kicinski; +Cc: anders.roxell, linux-kselftest
In-Reply-To: <1530032875-30482-1-git-send-email-shannon.nelson@oracle.com>
Implement the IPsec/XFRM offload API for testing.
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
V2 - addressed formatting comments from Jakub Kicinski
V3 - a couple more little xmas tree nits
drivers/net/netdevsim/Makefile | 4 +
drivers/net/netdevsim/ipsec.c | 297 ++++++++++++++++++++++++++++++++++++++
drivers/net/netdevsim/netdev.c | 7 +
drivers/net/netdevsim/netdevsim.h | 41 ++++++
4 files changed, 349 insertions(+)
create mode 100644 drivers/net/netdevsim/ipsec.c
diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile
index 449b2a1..0fee1d0 100644
--- a/drivers/net/netdevsim/Makefile
+++ b/drivers/net/netdevsim/Makefile
@@ -13,3 +13,7 @@ endif
ifneq ($(CONFIG_NET_DEVLINK),)
netdevsim-objs += devlink.o fib.o
endif
+
+ifneq ($(CONFIG_XFRM_OFFLOAD),)
+netdevsim-objs += ipsec.o
+endif
diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
new file mode 100644
index 0000000..ceff544
--- /dev/null
+++ b/drivers/net/netdevsim/ipsec.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */
+
+#include <crypto/aead.h>
+#include <linux/debugfs.h>
+#include <net/xfrm.h>
+
+#include "netdevsim.h"
+
+#define NSIM_IPSEC_AUTH_BITS 128
+
+static ssize_t nsim_dbg_netdev_ops_read(struct file *filp,
+ char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct netdevsim *ns = filp->private_data;
+ struct nsim_ipsec *ipsec = &ns->ipsec;
+ size_t bufsize;
+ char *buf, *p;
+ int len;
+ int i;
+
+ /* the buffer needed is
+ * (num SAs * 3 lines each * ~60 bytes per line) + one more line
+ */
+ bufsize = (ipsec->count * 4 * 60) + 60;
+ buf = kzalloc(bufsize, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ p = buf;
+ p += snprintf(p, bufsize - (p - buf),
+ "SA count=%u tx=%u\n",
+ ipsec->count, ipsec->tx);
+
+ for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+ struct nsim_sa *sap = &ipsec->sa[i];
+
+ if (!sap->used)
+ continue;
+
+ p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n",
+ i, (sap->rx ? 'r' : 't'), sap->ipaddr[0],
+ sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]);
+ p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] spi=0x%08x proto=0x%x salt=0x%08x crypt=%d\n",
+ i, be32_to_cpu(sap->xs->id.spi),
+ sap->xs->id.proto, sap->salt, sap->crypt);
+ p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] key=0x%08x %08x %08x %08x\n",
+ i, sap->key[0], sap->key[1],
+ sap->key[2], sap->key[3]);
+ }
+
+ len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf);
+
+ kfree(buf);
+ return len;
+}
+
+static const struct file_operations ipsec_dbg_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = nsim_dbg_netdev_ops_read,
+};
+
+static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec)
+{
+ u32 i;
+
+ if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT)
+ return -ENOSPC;
+
+ /* search sa table */
+ for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+ if (!ipsec->sa[i].used)
+ return i;
+ }
+
+ return -ENOSPC;
+}
+
+static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs,
+ u32 *mykey, u32 *mysalt)
+{
+ const char aes_gcm_name[] = "rfc4106(gcm(aes))";
+ struct net_device *dev = xs->xso.dev;
+ unsigned char *key_data;
+ char *alg_name = NULL;
+ int key_len;
+
+ if (!xs->aead) {
+ netdev_err(dev, "Unsupported IPsec algorithm\n");
+ return -EINVAL;
+ }
+
+ if (xs->aead->alg_icv_len != NSIM_IPSEC_AUTH_BITS) {
+ netdev_err(dev, "IPsec offload requires %d bit authentication\n",
+ NSIM_IPSEC_AUTH_BITS);
+ return -EINVAL;
+ }
+
+ key_data = &xs->aead->alg_key[0];
+ key_len = xs->aead->alg_key_len;
+ alg_name = xs->aead->alg_name;
+
+ if (strcmp(alg_name, aes_gcm_name)) {
+ netdev_err(dev, "Unsupported IPsec algorithm - please use %s\n",
+ aes_gcm_name);
+ return -EINVAL;
+ }
+
+ /* 160 accounts for 16 byte key and 4 byte salt */
+ if (key_len > NSIM_IPSEC_AUTH_BITS) {
+ *mysalt = ((u32 *)key_data)[4];
+ } else if (key_len == NSIM_IPSEC_AUTH_BITS) {
+ *mysalt = 0;
+ } else {
+ netdev_err(dev, "IPsec hw offload only supports 128 bit keys with optional 32 bit salt\n");
+ return -EINVAL;
+ }
+ memcpy(mykey, key_data, 16);
+
+ return 0;
+}
+
+static int nsim_ipsec_add_sa(struct xfrm_state *xs)
+{
+ struct nsim_ipsec *ipsec;
+ struct net_device *dev;
+ struct netdevsim *ns;
+ struct nsim_sa sa;
+ u16 sa_idx;
+ int ret;
+
+ dev = xs->xso.dev;
+ ns = netdev_priv(dev);
+ ipsec = &ns->ipsec;
+
+ if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) {
+ netdev_err(dev, "Unsupported protocol 0x%04x for ipsec offload\n",
+ xs->id.proto);
+ return -EINVAL;
+ }
+
+ if (xs->calg) {
+ netdev_err(dev, "Compression offload not supported\n");
+ return -EINVAL;
+ }
+
+ /* find the first unused index */
+ ret = nsim_ipsec_find_empty_idx(ipsec);
+ if (ret < 0) {
+ netdev_err(dev, "No space for SA in Rx table!\n");
+ return ret;
+ }
+ sa_idx = (u16)ret;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.used = true;
+ sa.xs = xs;
+
+ if (sa.xs->id.proto & IPPROTO_ESP)
+ sa.crypt = xs->ealg || xs->aead;
+
+ /* get the key and salt */
+ ret = nsim_ipsec_parse_proto_keys(xs, sa.key, &sa.salt);
+ if (ret) {
+ netdev_err(dev, "Failed to get key data for SA table\n");
+ return ret;
+ }
+
+ if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
+ sa.rx = true;
+
+ if (xs->props.family == AF_INET6)
+ memcpy(sa.ipaddr, &xs->id.daddr.a6, 16);
+ else
+ memcpy(&sa.ipaddr[3], &xs->id.daddr.a4, 4);
+ }
+
+ /* the preparations worked, so save the info */
+ memcpy(&ipsec->sa[sa_idx], &sa, sizeof(sa));
+
+ /* the XFRM stack doesn't like offload_handle == 0,
+ * so add a bitflag in case our array index is 0
+ */
+ xs->xso.offload_handle = sa_idx | NSIM_IPSEC_VALID;
+ ipsec->count++;
+
+ return 0;
+}
+
+static void nsim_ipsec_del_sa(struct xfrm_state *xs)
+{
+ struct netdevsim *ns = netdev_priv(xs->xso.dev);
+ struct nsim_ipsec *ipsec = &ns->ipsec;
+ u16 sa_idx;
+
+ sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID;
+ if (!ipsec->sa[sa_idx].used) {
+ netdev_err(ns->netdev, "Invalid SA for delete sa_idx=%d\n",
+ sa_idx);
+ return;
+ }
+
+ memset(&ipsec->sa[sa_idx], 0, sizeof(struct nsim_sa));
+ ipsec->count--;
+}
+
+static bool nsim_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
+{
+ struct netdevsim *ns = netdev_priv(xs->xso.dev);
+ struct nsim_ipsec *ipsec = &ns->ipsec;
+
+ ipsec->ok++;
+
+ return true;
+}
+
+static const struct xfrmdev_ops nsim_xfrmdev_ops = {
+ .xdo_dev_state_add = nsim_ipsec_add_sa,
+ .xdo_dev_state_delete = nsim_ipsec_del_sa,
+ .xdo_dev_offload_ok = nsim_ipsec_offload_ok,
+};
+
+bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb)
+{
+ struct nsim_ipsec *ipsec = &ns->ipsec;
+ struct xfrm_state *xs;
+ struct nsim_sa *tsa;
+ u32 sa_idx;
+
+ /* do we even need to check this packet? */
+ if (!skb->sp)
+ return true;
+
+ if (unlikely(!skb->sp->len)) {
+ netdev_err(ns->netdev, "no xfrm state len = %d\n",
+ skb->sp->len);
+ return false;
+ }
+
+ xs = xfrm_input_state(skb);
+ if (unlikely(!xs)) {
+ netdev_err(ns->netdev, "no xfrm_input_state() xs = %p\n", xs);
+ return false;
+ }
+
+ sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID;
+ if (unlikely(sa_idx > NSIM_IPSEC_MAX_SA_COUNT)) {
+ netdev_err(ns->netdev, "bad sa_idx=%d max=%d\n",
+ sa_idx, NSIM_IPSEC_MAX_SA_COUNT);
+ return false;
+ }
+
+ tsa = &ipsec->sa[sa_idx];
+ if (unlikely(!tsa->used)) {
+ netdev_err(ns->netdev, "unused sa_idx=%d\n", sa_idx);
+ return false;
+ }
+
+ if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) {
+ netdev_err(ns->netdev, "unexpected proto=%d\n", xs->id.proto);
+ return false;
+ }
+
+ ipsec->tx++;
+
+ return true;
+}
+
+void nsim_ipsec_init(struct netdevsim *ns)
+{
+ ns->netdev->xfrmdev_ops = &nsim_xfrmdev_ops;
+
+#define NSIM_ESP_FEATURES (NETIF_F_HW_ESP | \
+ NETIF_F_HW_ESP_TX_CSUM | \
+ NETIF_F_GSO_ESP)
+
+ ns->netdev->features |= NSIM_ESP_FEATURES;
+ ns->netdev->hw_enc_features |= NSIM_ESP_FEATURES;
+
+ ns->ipsec.pfile = debugfs_create_file("ipsec", 0400, ns->ddir, ns,
+ &ipsec_dbg_fops);
+}
+
+void nsim_ipsec_teardown(struct netdevsim *ns)
+{
+ struct nsim_ipsec *ipsec = &ns->ipsec;
+
+ if (ipsec->count)
+ netdev_err(ns->netdev, "tearing down IPsec offload with %d SAs left\n",
+ ipsec->count);
+ debugfs_remove_recursive(ipsec->pfile);
+}
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index ec68f38..6ce8604d 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -171,6 +171,8 @@ static int nsim_init(struct net_device *dev)
if (err)
goto err_unreg_dev;
+ nsim_ipsec_init(ns);
+
return 0;
err_unreg_dev:
@@ -186,6 +188,7 @@ static void nsim_uninit(struct net_device *dev)
{
struct netdevsim *ns = netdev_priv(dev);
+ nsim_ipsec_teardown(ns);
nsim_devlink_teardown(ns);
debugfs_remove_recursive(ns->ddir);
nsim_bpf_uninit(ns);
@@ -203,11 +206,15 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct netdevsim *ns = netdev_priv(dev);
+ if (!nsim_ipsec_tx(ns, skb))
+ goto out;
+
u64_stats_update_begin(&ns->syncp);
ns->tx_packets++;
ns->tx_bytes += skb->len;
u64_stats_update_end(&ns->syncp);
+out:
dev_kfree_skb(skb);
return NETDEV_TX_OK;
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 3a8581a..29448e8 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -29,6 +29,27 @@ struct bpf_prog;
struct dentry;
struct nsim_vf_config;
+#define NSIM_IPSEC_MAX_SA_COUNT 33
+#define NSIM_IPSEC_VALID BIT(31)
+
+struct nsim_sa {
+ struct xfrm_state *xs;
+ __be32 ipaddr[4];
+ u32 key[4];
+ u32 salt;
+ bool used;
+ bool crypt;
+ bool rx;
+};
+
+struct nsim_ipsec {
+ struct nsim_sa sa[NSIM_IPSEC_MAX_SA_COUNT];
+ struct dentry *pfile;
+ u32 count;
+ u32 tx;
+ u32 ok;
+};
+
struct netdevsim {
struct net_device *netdev;
@@ -67,6 +88,7 @@ struct netdevsim {
#if IS_ENABLED(CONFIG_NET_DEVLINK)
struct devlink *devlink;
#endif
+ struct nsim_ipsec ipsec;
};
extern struct dentry *nsim_ddir;
@@ -147,6 +169,25 @@ static inline void nsim_devlink_exit(void)
}
#endif
+#if IS_ENABLED(CONFIG_XFRM_OFFLOAD)
+void nsim_ipsec_init(struct netdevsim *ns);
+void nsim_ipsec_teardown(struct netdevsim *ns);
+bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb);
+#else
+static inline void nsim_ipsec_init(struct netdevsim *ns)
+{
+}
+
+static inline void nsim_ipsec_teardown(struct netdevsim *ns)
+{
+}
+
+static inline bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb)
+{
+ return true;
+}
+#endif
+
static inline struct netdevsim *to_nsim(struct device *ptr)
{
return container_of(ptr, struct netdevsim, dev);
--
2.7.4
^ permalink raw reply related
* [PATCH v3 net-next 1/4] selftests: rtnetlink: clear the return code at start of ipsec test
From: Shannon Nelson @ 2018-06-26 17:07 UTC (permalink / raw)
To: davem, netdev, jakub.kicinski; +Cc: anders.roxell, linux-kselftest
In-Reply-To: <1530032875-30482-1-git-send-email-shannon.nelson@oracle.com>
Following the custom from the other functions, clear the global
ret code before starting the test so as to not have previously
failed tests cause us to think this test has failed.
Reported-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
tools/testing/selftests/net/rtnetlink.sh | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index b33a371..261a981 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -522,6 +522,8 @@ kci_test_macsec()
#-------------------------------------------------------------------
kci_test_ipsec()
{
+ ret=0
+
# find an ip address on this machine and make up a destination
srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head -1 | cut -f1 -d/`
net=`echo $srcip | cut -f1-3 -d.`
--
2.7.4
^ permalink raw reply related
* [PATCH v3 net-next 2/4] selftests: rtnetlink: use dummydev as a test device
From: Shannon Nelson @ 2018-06-26 17:07 UTC (permalink / raw)
To: davem, netdev, jakub.kicinski; +Cc: anders.roxell, linux-kselftest
In-Reply-To: <1530032875-30482-1-git-send-email-shannon.nelson@oracle.com>
We really shouldn't mess with local system settings, so let's
use the already created dummy device instead for ipsec testing.
Oh, and let's put the temp file into a proper directory.
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
tools/testing/selftests/net/rtnetlink.sh | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 261a981..15948cf 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -523,21 +523,19 @@ kci_test_macsec()
kci_test_ipsec()
{
ret=0
-
- # find an ip address on this machine and make up a destination
- srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head -1 | cut -f1 -d/`
- net=`echo $srcip | cut -f1-3 -d.`
- base=`echo $srcip | cut -f4 -d.`
- dstip="$net."`expr $base + 1`
-
algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
+ srcip=192.168.123.1
+ dstip=192.168.123.2
+ spi=7
+
+ ip addr add $srcip dev $devdummy
# flush to be sure there's nothing configured
ip x s flush ; ip x p flush
check_err $?
# start the monitor in the background
- tmpfile=`mktemp ipsectestXXX`
+ tmpfile=`mktemp /var/run/ipsectestXXX`
mpid=`(ip x m > $tmpfile & echo $!) 2>/dev/null`
sleep 0.2
@@ -601,6 +599,7 @@ kci_test_ipsec()
check_err $?
ip x p flush
check_err $?
+ ip addr del $srcip/32 dev $devdummy
if [ $ret -ne 0 ]; then
echo "FAIL: ipsec"
--
2.7.4
^ permalink raw reply related
* [PATCH v3 net-next 0/4] Updates for ipsec selftests
From: Shannon Nelson @ 2018-06-26 17:07 UTC (permalink / raw)
To: davem, netdev, jakub.kicinski; +Cc: anders.roxell, linux-kselftest
Fix up the existing ipsec selftest and add tests for
the ipsec offload driver API.
v2: addressed formatting nits in netdevsim from Jakub Kicinski
v3: a couple more nits from Jakub
Shannon Nelson (4):
selftests: rtnetlink: clear the return code at start of ipsec test
selftests: rtnetlink: use dummydev as a test device
netdevsim: add ipsec offload testing
selftests: rtnetlink: add ipsec offload API test
drivers/net/netdevsim/Makefile | 4 +
drivers/net/netdevsim/ipsec.c | 345 +++++++++++++++++++++++++++++++
drivers/net/netdevsim/netdev.c | 7 +
drivers/net/netdevsim/netdevsim.h | 37 ++++
tools/testing/selftests/net/rtnetlink.sh | 132 +++++++++++-
5 files changed, 518 insertions(+), 7 deletions(-)
create mode 100644 drivers/net/netdevsim/ipsec.c
--
2.7.4
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox