* [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path
@ 2014-11-13 22:54 Rick Jones
2014-11-14 1:32 ` David Miller
2014-11-14 2:17 ` Eric Dumazet
0 siblings, 2 replies; 6+ messages in thread
From: Rick Jones @ 2014-11-13 22:54 UTC (permalink / raw)
To: netdev; +Cc: davem
From: Rick Jones <rick.jones2@hp.com>
If icmp_rcv() has successfully processed the incoming ICMP datagram, we
should use consume_skb() rather than kfree_skb() because a hit on the likes
of perf -e skb:kfree_skb is not called-for.
Signed-off-by: Rick Jones <rick.jones2@hp.com>
---
A test system hit with a flood ping hits on perf top -e ksb:kfre_skb before
the change and none after for the normal/success path. The IPv6 path would
be somewhat more ugly. For the time being, just deal with the overlap on
ping_rcv() between the two to avoid a possible double free of an skb.
diff --git a/include/net/ping.h b/include/net/ping.h
index 026479b..f074060 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -82,7 +82,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len);
int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-void ping_rcv(struct sk_buff *skb);
+bool ping_rcv(struct sk_buff *skb);
#ifdef CONFIG_PROC_FS
struct ping_seq_afinfo {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 36b7bfa..b9f3653 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert);
*/
struct icmp_control {
- void (*handler)(struct sk_buff *skb);
+ bool (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};
@@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto)
* ICMP_PARAMETERPROB.
*/
-static void icmp_unreach(struct sk_buff *skb)
+static bool icmp_unreach(struct sk_buff *skb)
{
const struct iphdr *iph;
struct icmphdr *icmph;
@@ -839,10 +839,11 @@ static void icmp_unreach(struct sk_buff *skb)
icmp_socket_deliver(skb, info);
out:
- return;
+ return true;
out_err:
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
- goto out;
+ kfree_skb(skb);
+ return false;
}
@@ -850,17 +851,22 @@ out_err:
* Handle ICMP_REDIRECT.
*/
-static void icmp_redirect(struct sk_buff *skb)
+static bool icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
- return;
+ kfree_skb(skb);
+ return false;
}
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- return;
+ if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
+ /* there aught to be a stat */
+ kfree_skb(skb);
+ return false;
+ }
icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+ return true;
}
/*
@@ -875,7 +881,7 @@ static void icmp_redirect(struct sk_buff *skb)
* See also WRT handling of options once they are done and working.
*/
-static void icmp_echo(struct sk_buff *skb)
+static bool icmp_echo(struct sk_buff *skb)
{
struct net *net;
@@ -891,6 +897,8 @@ static void icmp_echo(struct sk_buff *skb)
icmp_param.head_len = sizeof(struct icmphdr);
icmp_reply(&icmp_param, skb);
}
+ /* should there be an ICMP stat for ignored echos? */
+ return true;
}
/*
@@ -900,7 +908,7 @@ static void icmp_echo(struct sk_buff *skb)
* MUST be accurate to a few minutes.
* MUST be updated at least at 15Hz.
*/
-static void icmp_timestamp(struct sk_buff *skb)
+static bool icmp_timestamp(struct sk_buff *skb)
{
struct timespec tv;
struct icmp_bxm icmp_param;
@@ -927,15 +935,18 @@ static void icmp_timestamp(struct sk_buff *skb)
icmp_param.data_len = 0;
icmp_param.head_len = sizeof(struct icmphdr) + 12;
icmp_reply(&icmp_param, skb);
-out:
- return;
+ return true;
+
out_err:
ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
- goto out;
+ kfree_skb(skb);
+ return false;
}
-static void icmp_discard(struct sk_buff *skb)
+static bool icmp_discard(struct sk_buff *skb)
{
+ /* pretend it was a success */
+ return true;
}
/*
@@ -946,6 +957,7 @@ int icmp_rcv(struct sk_buff *skb)
struct icmphdr *icmph;
struct rtable *rt = skb_rtable(skb);
struct net *net = dev_net(rt->dst.dev);
+ bool success;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
@@ -1012,7 +1024,12 @@ int icmp_rcv(struct sk_buff *skb)
}
}
- icmp_pointers[icmph->type].handler(skb);
+ success = icmp_pointers[icmph->type].handler(skb);
+
+ if (success)
+ consume_skb(skb);
+
+ return 0;
drop:
kfree_skb(skb);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 736236c..7d54eed 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -955,7 +955,7 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
* All we need to do is get the socket.
*/
-void ping_rcv(struct sk_buff *skb)
+bool ping_rcv(struct sk_buff *skb)
{
struct sock *sk;
struct net *net = dev_net(skb->dev);
@@ -974,11 +974,13 @@ void ping_rcv(struct sk_buff *skb)
pr_debug("rcv on socket %p\n", sk);
ping_queue_rcv_skb(sk, skb_get(skb));
sock_put(sk);
- return;
+ return true;
}
pr_debug("no socket, dropping\n");
- /* We're called from icmp_rcv(). kfree_skb() is done there. */
+ /* Do the kfree_skb() here to get a better drop profile */
+ kfree_skb(skb);
+ return false;
}
EXPORT_SYMBOL_GPL(ping_rcv);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 0929340..6c0f805 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -679,6 +679,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
const struct in6_addr *saddr, *daddr;
struct icmp6hdr *hdr;
u8 type;
+ bool success = true;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
@@ -726,7 +727,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
break;
case ICMPV6_ECHO_REPLY:
- ping_rcv(skb);
+ success = ping_rcv(skb);
break;
case ICMPV6_PKT_TOOBIG:
@@ -790,7 +791,15 @@ static int icmpv6_rcv(struct sk_buff *skb)
icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
}
- kfree_skb(skb);
+ /* until the v6 path can be better sorted we may still need
+ * to kfree_sbk() here but want to avoid a double free from
+ * the ping_rcv() path, which shares code with IPv4. assume
+ * success and preserve the status quo behaviour for the rest
+ * of the paths to here
+ */
+ if (success)
+ kfree_skb(skb);
+
return 0;
csum_error:
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path
2014-11-13 22:54 [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path Rick Jones
@ 2014-11-14 1:32 ` David Miller
2014-11-14 2:17 ` Eric Dumazet
1 sibling, 0 replies; 6+ messages in thread
From: David Miller @ 2014-11-14 1:32 UTC (permalink / raw)
To: raj; +Cc: netdev
From: raj@tardy.usa.hp.com (Rick Jones)
Date: Thu, 13 Nov 2014 14:54:57 -0800 (PST)
> + /* until the v6 path can be better sorted we may still need
> + * to kfree_sbk() here but want to avoid a double free from
Typo "kfree_skb()".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path
2014-11-13 22:54 [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path Rick Jones
2014-11-14 1:32 ` David Miller
@ 2014-11-14 2:17 ` Eric Dumazet
2014-11-14 16:16 ` Rick Jones
1 sibling, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2014-11-14 2:17 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev, davem
On Thu, 2014-11-13 at 14:54 -0800, Rick Jones wrote:
> From: Rick Jones <rick.jones2@hp.com>
>
> If icmp_rcv() has successfully processed the incoming ICMP datagram, we
> should use consume_skb() rather than kfree_skb() because a hit on the likes
> of perf -e skb:kfree_skb is not called-for.
>
> Signed-off-by: Rick Jones <rick.jones2@hp.com>
>
> ---
>
> A test system hit with a flood ping hits on perf top -e ksb:kfre_skb before
> the change and none after for the normal/success path. The IPv6 path would
> be somewhat more ugly. For the time being, just deal with the overlap on
> ping_rcv() between the two to avoid a possible double free of an skb.
>
> diff --git a/include/net/ping.h b/include/net/ping.h
> index 026479b..f074060 100644
> --- a/include/net/ping.h
> +++ b/include/net/ping.h
> @@ -82,7 +82,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
> int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
> size_t len);
> int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
> -void ping_rcv(struct sk_buff *skb);
> +bool ping_rcv(struct sk_buff *skb);
>
> #ifdef CONFIG_PROC_FS
> struct ping_seq_afinfo {
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index 36b7bfa..b9f3653 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert);
> */
>
> struct icmp_control {
> - void (*handler)(struct sk_buff *skb);
> + bool (*handler)(struct sk_buff *skb);
> short error; /* This ICMP is classed as an error message */
> };
>
> @@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto)
> * ICMP_PARAMETERPROB.
> */
>
> -static void icmp_unreach(struct sk_buff *skb)
> +static bool icmp_unreach(struct sk_buff *skb)
> {
> const struct iphdr *iph;
> struct icmphdr *icmph;
> @@ -839,10 +839,11 @@ static void icmp_unreach(struct sk_buff *skb)
> icmp_socket_deliver(skb, info);
>
> out:
> - return;
> + return true;
> out_err:
> ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
> - goto out;
> + kfree_skb(skb);
> + return false;
> }
>
>
> @@ -850,17 +851,22 @@ out_err:
> * Handle ICMP_REDIRECT.
> */
>
> -static void icmp_redirect(struct sk_buff *skb)
> +static bool icmp_redirect(struct sk_buff *skb)
> {
> if (skb->len < sizeof(struct iphdr)) {
> ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
> - return;
> + kfree_skb(skb);
> + return false;
> }
>
> - if (!pskb_may_pull(skb, sizeof(struct iphdr)))
> - return;
> + if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
> + /* there aught to be a stat */
> + kfree_skb(skb);
> + return false;
> + }
>
> icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
> + return true;
> }
>
> /*
> @@ -875,7 +881,7 @@ static void icmp_redirect(struct sk_buff *skb)
> * See also WRT handling of options once they are done and working.
> */
>
> -static void icmp_echo(struct sk_buff *skb)
> +static bool icmp_echo(struct sk_buff *skb)
> {
> struct net *net;
>
> @@ -891,6 +897,8 @@ static void icmp_echo(struct sk_buff *skb)
> icmp_param.head_len = sizeof(struct icmphdr);
> icmp_reply(&icmp_param, skb);
> }
> + /* should there be an ICMP stat for ignored echos? */
> + return true;
> }
>
> /*
> @@ -900,7 +908,7 @@ static void icmp_echo(struct sk_buff *skb)
> * MUST be accurate to a few minutes.
> * MUST be updated at least at 15Hz.
> */
> -static void icmp_timestamp(struct sk_buff *skb)
> +static bool icmp_timestamp(struct sk_buff *skb)
> {
> struct timespec tv;
> struct icmp_bxm icmp_param;
> @@ -927,15 +935,18 @@ static void icmp_timestamp(struct sk_buff *skb)
> icmp_param.data_len = 0;
> icmp_param.head_len = sizeof(struct icmphdr) + 12;
> icmp_reply(&icmp_param, skb);
> -out:
> - return;
> + return true;
> +
> out_err:
> ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
> - goto out;
> + kfree_skb(skb);
> + return false;
> }
>
> -static void icmp_discard(struct sk_buff *skb)
> +static bool icmp_discard(struct sk_buff *skb)
> {
> + /* pretend it was a success */
> + return true;
> }
>
> /*
> @@ -946,6 +957,7 @@ int icmp_rcv(struct sk_buff *skb)
> struct icmphdr *icmph;
> struct rtable *rt = skb_rtable(skb);
> struct net *net = dev_net(rt->dst.dev);
> + bool success;
>
> if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
> struct sec_path *sp = skb_sec_path(skb);
> @@ -1012,7 +1024,12 @@ int icmp_rcv(struct sk_buff *skb)
> }
> }
>
> - icmp_pointers[icmph->type].handler(skb);
> + success = icmp_pointers[icmph->type].handler(skb);
> +
> + if (success)
> + consume_skb(skb);
> +
> + return 0;
This looks quite complicated to me.
Why are you adding kfree_skb() everywhere instead of :
bool to_consume = icmp_pointers[icmph->type].handler(skb);
if (ro_consume)
consume_skb(skb);
else
kfree_skb(skb);
>
> drop:
> kfree_skb(skb);
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path
2014-11-14 2:17 ` Eric Dumazet
@ 2014-11-14 16:16 ` Rick Jones
2014-11-14 16:58 ` Eric Dumazet
0 siblings, 1 reply; 6+ messages in thread
From: Rick Jones @ 2014-11-14 16:16 UTC (permalink / raw)
To: Eric Dumazet, Rick Jones; +Cc: netdev, davem
>
> This looks quite complicated to me.
>
> Why are you adding kfree_skb() everywhere instead of :
>
> bool to_consume = icmp_pointers[icmph->type].handler(skb);
> if (ro_consume)
> consume_skb(skb);
> else
> kfree_skb(skb);
I thought the point of the drop profiling was to show where the drops
were happening. Leaving the kfree_skb() up in icmp_rcv() does not
improve showing where the drops happened. That is why I've pushed it
down into the routines called by icmp_rcv().
rick
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path
2014-11-14 16:16 ` Rick Jones
@ 2014-11-14 16:58 ` Eric Dumazet
2014-11-14 18:29 ` Rick Jones
0 siblings, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2014-11-14 16:58 UTC (permalink / raw)
To: Rick Jones; +Cc: Rick Jones, netdev, davem
On Fri, 2014-11-14 at 08:16 -0800, Rick Jones wrote:
> I thought the point of the drop profiling was to show where the drops
> were happening. Leaving the kfree_skb() up in icmp_rcv() does not
> improve showing where the drops happened. That is why I've pushed it
> down into the routines called by icmp_rcv().
OK, but we drop an icmp message, and that really should be enough.
The point is that most normal icmp messages wont be dropped, but
consumed.
I am not sure we want to bloat the kernel for such minor problem.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path
2014-11-14 16:58 ` Eric Dumazet
@ 2014-11-14 18:29 ` Rick Jones
0 siblings, 0 replies; 6+ messages in thread
From: Rick Jones @ 2014-11-14 18:29 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, davem
On 11/14/2014 08:58 AM, Eric Dumazet wrote:
> On Fri, 2014-11-14 at 08:16 -0800, Rick Jones wrote:
>
>> I thought the point of the drop profiling was to show where the drops
>> were happening. Leaving the kfree_skb() up in icmp_rcv() does not
>> improve showing where the drops happened. That is why I've pushed it
>> down into the routines called by icmp_rcv().
>
> OK, but we drop an icmp message, and that really should be enough.
>
> The point is that most normal icmp messages wont be dropped, but
> consumed.
>
> I am not sure we want to bloat the kernel for such minor problem.
I can certainly rework the patch that way, but one thing I have noticed
when the system is the initiator of pings rather than the target, is
that I get (or at least I think I do) drop profile hits in ping_rcv():
# To display the perf.data header info, please use
--header/--header-only options.
#
# Samples: 997K of event 'skb:kfree_skb'
# Event count (approx.): 997789
#
# Children Self Symbol Shared
Object
# ........ ........ ..........................................
...........................
#
100.00% 100.00% [k] kfree_skb
[kernel.kallsyms]
|
|--100.00%-- ping_rcv
| icmp_rcv
| ip_local_deliver_finish
| ip_local_deliver
| ip_rcv_finish
| ip_rcv
| __netif_receive_skb_core
| __netif_receive_skb
| netif_receive_skb_internal
| napi_gro_receive
| e1000_receive_skb
| e1000_clean_rx_irq
| e1000e_poll
| net_rx_action
| __do_softirq
I don't have an explanation for it though. Perhaps it is just confusion
on my part.
That is from:
raj@raj-8510w:~$ sudo ~/net-next/tools/perf/perf record -a -g -e
skb:kfree_skb ping -n -f -q tardy.usa.hp.com -c 1000000
PING tardy.usa.hp.com (16.103.148.51) 56(84) bytes of data.
--- tardy.usa.hp.com ping statistics ---
1000000 packets transmitted, 1000000 received, 0% packet loss, time 65751ms
rtt min/avg/max/mdev = 0.036/0.053/0.170/0.008 ms, ipg/ewma 0.065/0.052 ms
[ perf record: Woken up 1037 times to write data ]
[ perf record: Captured and wrote 259.456 MB perf.data (~11335798 samples) ]
Warning:
Processed 1002854 events and lost 2 chunks!
Check IO/CPU overload!
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2014-11-14 18:29 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-11-13 22:54 [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path Rick Jones
2014-11-14 1:32 ` David Miller
2014-11-14 2:17 ` Eric Dumazet
2014-11-14 16:16 ` Rick Jones
2014-11-14 16:58 ` Eric Dumazet
2014-11-14 18:29 ` Rick Jones
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).