UDP wierdness around skb_copy_and_csum_datagram

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* UDP wierdness around skb_copy_and_csum_datagram_msg()
@ 2016-09-29  0:18 Jay Smith
  2016-09-29  1:24 ` Eric Dumazet
  0 siblings, 1 reply; 8+ messages in thread
From: Jay Smith @ 2016-09-29  0:18 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 2165 bytes --]

I've spent the last week or so trying to track down a recurring
problem I'm seeing with UDP datagram handling.  I'm new to the
internals of the Linux network stack, but it appears to me that
there's a substantial error in recent kernels' handling of UDP
checksum errors.

The behavior I'm seeing:  I have a UDP server that receives lots of
datagrams from many devices on a single port. A small number of those
devices occasionally send packets with bad UDP checksums.  After I
receive one of these bad packets, the next recvmsg() made on the
socket returns data from the bad-checksum packet, but the
source-address and length of the next (good) packet that arrived at
the port.

I believe this issue was introduced between linux 3.18 and 3.19, by a
set of changes to net/core/datagram.c that made
skb_copy_and_csum_datagram_msg() and related functions use the
iov_iter structure to copy data to user buffers.  In the case where
those functions copy a datagram and then conclude that the checksum is
invalid, they don't remove the already-copied data from the user's
iovec; when udp_recvmsg() calls skb_copy_and_csum_datagram_msg() for a
second time, looking at the next datagram on the queue, that second
datagram's data is appended to the first datagram's.  So when
recvmsg() returns to the user, the return value and msg_name reflect
the second datagram, but the first bytes in the user's iovec come from
the first (bad) datagram.

(I've attached a test program that demonstrates the problem.  Note
that it sees correct behavior unless the bad-checksum packet has > 68
bytes of UDP data; otherwise, the packet doesn't make it past the
CHECKSUM_BREAK test, and never enters
skp_copy_and_csum_datagram_msg().)

The fix for UDP seems pretty simple; the iov_iter's iov_offset member
just needs to be set back to zero on a checksum failure.  But it looks
like skb_copy_and_csum_datagram_msg() is also called from tcp_input.c,
where I assume that multiple sk_buffs can be copied-and-csum'd into
the same iov -- if that's right, it seems like iov_iter needs some
additional state to support rolling-back the most recent copy without
losing previous ones.

Any thoughts?

[-- Attachment #2: csumtest.c --]
[-- Type: text/x-csrc, Size: 3275 bytes --]

#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <arpa/inet.h>

#include <netinet/udp.h>

int main(int argc, char **argv)
{
  int ret;

  if (argc < 2) {
    fprintf(stderr, "Usage: csumtest <bytes-in-bad-packet>\n");
    exit(1);
  }

  struct sockaddr_in addr;
  socklen_t addrLen = sizeof(addr);  
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = inet_addr("127.0.0.1");
  addr.sin_port = htons(0);

  int listen = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
  if (listen < 0)
  {
    fprintf(stderr, "failed to create listening socket (err=%d: %s)\n", errno, strerror(errno));
    exit(1);
  }

  ret = bind(listen, (struct sockaddr *) &addr, addrLen);
  if (ret != 0)
  {
    fprintf(stderr, "failed to bind listening socket socket (err=%d: %s)\n", errno, strerror(errno));
    exit(1);
  }

  ret = getsockname(listen, (struct sockaddr *) &addr, &addrLen);
  if (ret != 0)
  {
    fprintf(stderr, "getsockname failed for listening socket socket (err=%d: %s)\n", errno, strerror(errno));
    exit(1);
  }
  else
  {
    printf("listening on port %d\n", ntohs(addr.sin_port));
  }

  // Send a packet with deliberately bad checksum
  char rawBuf[4096];
  memset(rawBuf, 0, 4096);
  struct udphdr *udph = (struct udphdr *) rawBuf;

  // Send as UDP data the string "BAD DATA", repeated as necessary
  // to fill the number of bytes requested on the command-line
  char *data = rawBuf + sizeof(struct udphdr);
  int badBytes = atoi(argv[1]);
  int i;
  for (i = 0; i < badBytes; i += 8)
  {
    strncpy(data + i, "BAD DATA", 8);
  }

  // UDP headers contain a bogus checksum, but are otherwise reasonable
  udph->check = htons(0xbadd);
  udph->source = htons(18);
  udph->dest = addr.sin_port;
  udph->len = htons(sizeof(struct udphdr) + badBytes);

  int raw = socket(AF_INET, SOCK_RAW, IPPROTO_UDP);
  ret = sendto(raw, rawBuf,
	       sizeof(struct udphdr) + badBytes, 0,
	       (struct sockaddr *) &addr, addrLen);
  if (ret < 0)
  {
    fprintf(stderr, "raw send failed (err=%d: %s)\n", errno, strerror(errno));
    exit(1);
  }

  // Send a second, legitimate UDP packet
  int cooked = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
  char *second = "Good data";
  ret = sendto(cooked, second, strlen(second), 0,
	       (struct sockaddr *) &addr, addrLen);
  if (ret < 0)
  {
    fprintf(stderr, "second cooked send failed (err=%d: %s)\n", errno, strerror(errno));
    exit(1);
  }

  // Read what's queued up for us.
  while (1)
  {
    struct msghdr msg;
    struct iovec  iov;
    char readBuf[4096];
    ssize_t bytes;

    memset(readBuf, 0, 4096);

    iov.iov_base = readBuf;
    iov.iov_len = sizeof(readBuf);

    msg.msg_name = NULL;
    msg.msg_namelen = 0;
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_flags = 0;

    bytes = recvmsg(listen, &msg, 0);
    if (bytes < 0)
    {
      fprintf(stderr, "getsockname failed for listening socket socket (err=%d: %s)\n", errno, strerror(errno));
      exit(1);
    }

    printf("recvmsg returned %ld bytes: %s\n", bytes, readBuf);

    if (bytes == 9)
      break;
  }
}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP wierdness around skb_copy_and_csum_datagram_msg()
  2016-09-29  0:18 UDP wierdness around skb_copy_and_csum_datagram_msg() Jay Smith
@ 2016-09-29  1:24 ` Eric Dumazet
  2016-09-29  2:20   ` Jay Smith
  0 siblings, 1 reply; 8+ messages in thread
From: Eric Dumazet @ 2016-09-29  1:24 UTC (permalink / raw)
  To: Jay Smith; +Cc: netdev

On Wed, 2016-09-28 at 17:18 -0700, Jay Smith wrote:
> I've spent the last week or so trying to track down a recurring
> problem I'm seeing with UDP datagram handling.  I'm new to the
> internals of the Linux network stack, but it appears to me that
> there's a substantial error in recent kernels' handling of UDP
> checksum errors.
> 
> The behavior I'm seeing:  I have a UDP server that receives lots of
> datagrams from many devices on a single port. A small number of those
> devices occasionally send packets with bad UDP checksums.  After I
> receive one of these bad packets, the next recvmsg() made on the
> socket returns data from the bad-checksum packet, but the
> source-address and length of the next (good) packet that arrived at
> the port.
> 
> I believe this issue was introduced between linux 3.18 and 3.19, by a
> set of changes to net/core/datagram.c that made
> skb_copy_and_csum_datagram_msg() and related functions use the
> iov_iter structure to copy data to user buffers.  In the case where
> those functions copy a datagram and then conclude that the checksum is
> invalid, they don't remove the already-copied data from the user's
> iovec; when udp_recvmsg() calls skb_copy_and_csum_datagram_msg() for a
> second time, looking at the next datagram on the queue, that second
> datagram's data is appended to the first datagram's.  So when
> recvmsg() returns to the user, the return value and msg_name reflect
> the second datagram, but the first bytes in the user's iovec come from
> the first (bad) datagram.
> 
> (I've attached a test program that demonstrates the problem.  Note
> that it sees correct behavior unless the bad-checksum packet has > 68
> bytes of UDP data; otherwise, the packet doesn't make it past the
> CHECKSUM_BREAK test, and never enters
> skp_copy_and_csum_datagram_msg().)
> 
> The fix for UDP seems pretty simple; the iov_iter's iov_offset member
> just needs to be set back to zero on a checksum failure.  But it looks
> like skb_copy_and_csum_datagram_msg() is also called from tcp_input.c,
> where I assume that multiple sk_buffs can be copied-and-csum'd into
> the same iov -- if that's right, it seems like iov_iter needs some
> additional state to support rolling-back the most recent copy without
> losing previous ones.
> 
> Any thoughts?

Nice catch !

What about clearing iov_offset from UDP (v4 & v6) only ?

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7d96dc2d3d08fa909f247dfbcbd0fc1eeb59862b..928da2fbb3caa6de4d0e1d889c237590f71607ea 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1342,6 +1342,7 @@ csum_copy_err:
 	/* starting over for a new packet, but check if we need to yield */
 	cond_resched();
 	msg->msg_flags &= ~MSG_TRUNC;
+	msg->msg_iter.iov_offset = 0;
 	goto try_again;
 }
 

(similar for ipv6)

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: UDP wierdness around skb_copy_and_csum_datagram_msg()
  2016-09-29  1:24 ` Eric Dumazet
@ 2016-09-29  2:20   ` Jay Smith
  2016-09-29 23:28     ` Christian Lamparter
  0 siblings, 1 reply; 8+ messages in thread
From: Jay Smith @ 2016-09-29  2:20 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Christian Lamparter, Alan Curry, Al Viro

Actually, on a little more searching of this list's archives, I think
that this discussion:  https://patchwork.kernel.org/patch/9260733/ is
about exactly the same issue I've found, except from the TCP side. I'm
cc'ing a few of the participants from that discussion.

So is the patch proposed there (copying and restoring the entire
iov_iter in skb_copy_and_csum_datagram_msg()) being considered as a
fix?  If not, would an alternate one that concealed the
save-and-restore logic inside iov_iter.c be more acceptable?  I'd be
happy to produce whatever's needed, or yield to someone with stronger
feelings about where it should go...

On Wed, Sep 28, 2016 at 6:24 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-09-28 at 17:18 -0700, Jay Smith wrote:
>> I've spent the last week or so trying to track down a recurring
>> problem I'm seeing with UDP datagram handling.  I'm new to the
>> internals of the Linux network stack, but it appears to me that
>> there's a substantial error in recent kernels' handling of UDP
>> checksum errors.
>>
>> The behavior I'm seeing:  I have a UDP server that receives lots of
>> datagrams from many devices on a single port. A small number of those
>> devices occasionally send packets with bad UDP checksums.  After I
>> receive one of these bad packets, the next recvmsg() made on the
>> socket returns data from the bad-checksum packet, but the
>> source-address and length of the next (good) packet that arrived at
>> the port.
>>
>> I believe this issue was introduced between linux 3.18 and 3.19, by a
>> set of changes to net/core/datagram.c that made
>> skb_copy_and_csum_datagram_msg() and related functions use the
>> iov_iter structure to copy data to user buffers.  In the case where
>> those functions copy a datagram and then conclude that the checksum is
>> invalid, they don't remove the already-copied data from the user's
>> iovec; when udp_recvmsg() calls skb_copy_and_csum_datagram_msg() for a
>> second time, looking at the next datagram on the queue, that second
>> datagram's data is appended to the first datagram's.  So when
>> recvmsg() returns to the user, the return value and msg_name reflect
>> the second datagram, but the first bytes in the user's iovec come from
>> the first (bad) datagram.
>>
>> (I've attached a test program that demonstrates the problem.  Note
>> that it sees correct behavior unless the bad-checksum packet has > 68
>> bytes of UDP data; otherwise, the packet doesn't make it past the
>> CHECKSUM_BREAK test, and never enters
>> skp_copy_and_csum_datagram_msg().)
>>
>> The fix for UDP seems pretty simple; the iov_iter's iov_offset member
>> just needs to be set back to zero on a checksum failure.  But it looks
>> like skb_copy_and_csum_datagram_msg() is also called from tcp_input.c,
>> where I assume that multiple sk_buffs can be copied-and-csum'd into
>> the same iov -- if that's right, it seems like iov_iter needs some
>> additional state to support rolling-back the most recent copy without
>> losing previous ones.
>>
>> Any thoughts?
>
> Nice catch !
>
> What about clearing iov_offset from UDP (v4 & v6) only ?
>
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 7d96dc2d3d08fa909f247dfbcbd0fc1eeb59862b..928da2fbb3caa6de4d0e1d889c237590f71607ea 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1342,6 +1342,7 @@ csum_copy_err:
>         /* starting over for a new packet, but check if we need to yield */
>         cond_resched();
>         msg->msg_flags &= ~MSG_TRUNC;
> +       msg->msg_iter.iov_offset = 0;
>         goto try_again;
>  }
>
>
> (similar for ipv6)
>
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP wierdness around skb_copy_and_csum_datagram_msg()
  2016-09-29  2:20   ` Jay Smith
@ 2016-09-29 23:28     ` Christian Lamparter
  2016-09-30  0:06       ` Eric Dumazet
  2016-09-30 17:35       ` Jay Smith
  0 siblings, 2 replies; 8+ messages in thread
From: Christian Lamparter @ 2016-09-29 23:28 UTC (permalink / raw)
  To: Jay Smith, Alan Curry; +Cc: Eric Dumazet, netdev, Al Viro, davem@davemloft.net

On Wednesday, September 28, 2016 7:20:39 PM CEST Jay Smith wrote:
> Actually, on a little more searching of this list's archives, I think
> that this discussion:  https://patchwork.kernel.org/patch/9260733/ is
> about exactly the same issue I've found, except from the TCP side. I'm
> cc'ing a few of the participants from that discussion.
> 
> So is the patch proposed there (copying and restoring the entire
> iov_iter in skb_copy_and_csum_datagram_msg()) being considered as a
> fix?

>From Alan's post:

"My ugly patch fixes this in the most obvious way: make a local copy of
msg->msg_iter before the call to skb_copy_and_csum_datagram(), and copy
it back if the checksum is bad, just before goto csum_error;"

IMHO this meant that the patch is a proof of concept for his problem.

> If not, would an alternate one that concealed the save-and-restore logic
> inside iov_iter.c be more acceptable? I'd be happy to produce whatever's
> needed, or yield to someone with stronger feelings about where it should
> go...
Al Viro identified more inconsistencies within the error-paths that deal
with EFAULT in the whole area (in and around skb_copy_and_csum_datagram()).

As far as I can tell the original discussion about the data corruption
issue went off on a tangent and it is stuck in figuring out "How to handle
the errors in tcp_copy_to_iovec()".

As for fixing the issue: I'm happy to test and review patches. 
The trouble is that nobody seem to be able to produce them...

Regards,
Christian

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP wierdness around skb_copy_and_csum_datagram_msg()
  2016-09-29 23:28     ` Christian Lamparter
@ 2016-09-30  0:06       ` Eric Dumazet
  2016-09-30 17:35       ` Jay Smith
  1 sibling, 0 replies; 8+ messages in thread
From: Eric Dumazet @ 2016-09-30  0:06 UTC (permalink / raw)
  To: Christian Lamparter
  Cc: Jay Smith, Alan Curry, netdev, Al Viro, davem@davemloft.net

On Fri, 2016-09-30 at 01:28 +0200, Christian Lamparter wrote:
> On Wednesday, September 28, 2016 7:20:39 PM CEST Jay Smith wrote:
> > Actually, on a little more searching of this list's archives, I think
> > that this discussion:  https://patchwork.kernel.org/patch/9260733/ is
> > about exactly the same issue I've found, except from the TCP side. I'm
> > cc'ing a few of the participants from that discussion.
> > 
> > So is the patch proposed there (copying and restoring the entire
> > iov_iter in skb_copy_and_csum_datagram_msg()) being considered as a
> > fix?
> 
> From Alan's post:
> 
> "My ugly patch fixes this in the most obvious way: make a local copy of
> msg->msg_iter before the call to skb_copy_and_csum_datagram(), and copy
> it back if the checksum is bad, just before goto csum_error;"
> 
> IMHO this meant that the patch is a proof of concept for his problem.
> 
> > If not, would an alternate one that concealed the save-and-restore logic
> > inside iov_iter.c be more acceptable? I'd be happy to produce whatever's
> > needed, or yield to someone with stronger feelings about where it should
> > go...
> Al Viro identified more inconsistencies within the error-paths that deal
> with EFAULT in the whole area (in and around skb_copy_and_csum_datagram()).
> 
> As far as I can tell the original discussion about the data corruption
> issue went off on a tangent and it is stuck in figuring out "How to handle
> the errors in tcp_copy_to_iovec()".
> 
> As for fixing the issue: I'm happy to test and review patches. 
> The trouble is that nobody seem to be able to produce them...
> 

This is doable with a bit of fault injection I believe.

And "ethtool -K eth0 rx off gro off lro off"  to let the TCP receiver
compute the checksum itself.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP wierdness around skb_copy_and_csum_datagram_msg()
  2016-09-29 23:28     ` Christian Lamparter
  2016-09-30  0:06       ` Eric Dumazet
@ 2016-09-30 17:35       ` Jay Smith
  2016-09-30 18:40         ` Christian Lamparter
  1 sibling, 1 reply; 8+ messages in thread
From: Jay Smith @ 2016-09-30 17:35 UTC (permalink / raw)
  To: Christian Lamparter
  Cc: Jay Smith, Alan Curry, Eric Dumazet, netdev, Al Viro,
	davem@davemloft.net

Christian Lamparter writes:

> On Wednesday, September 28, 2016 7:20:39 PM CEST Jay Smith wrote:
>> Actually, on a little more searching of this list's archives, I think
>> that this discussion:  https://patchwork.kernel.org/patch/9260733/ is
>> about exactly the same issue I've found, except from the TCP side. I'm
>> cc'ing a few of the participants from that discussion.
>> 
>> So is the patch proposed there (copying and restoring the entire
>> iov_iter in skb_copy_and_csum_datagram_msg()) being considered as a
>> fix?
>
> From Alan's post:
>
> "My ugly patch fixes this in the most obvious way: make a local copy of
> msg->msg_iter before the call to skb_copy_and_csum_datagram(), and copy
> it back if the checksum is bad, just before goto csum_error;"
>
> IMHO this meant that the patch is a proof of concept for his problem.

It's also the simplest thing that fixes all of the relevant cases (udp4,
udp6, tcp4).  Basically, the state of the iov_iter (which, if I'm
reading correctly, consists of three elements -- iov_offset, count, and
nr_segs all change values as the iterator moves through the vectors)
needs to be backed-up and restored at exactly the points in datagram.c
where Alan's patch does so.

Whether that should be done with memcpy, as Alan does, or by exposing
some more abstract backup/restore functions from iov_iter.c is a matter
of taste.  I'm happy to accept the call of someone more maintainer-ish
on that.

> Al Viro identified more inconsistencies within the error-paths that deal
> with EFAULT in the whole area (in and around skb_copy_and_csum_datagram()).

Was this in some other thread?  The only other discussion I see of that
function in the "PROBLEM: network data corruption..." thread is around
this patch https://patchwork.kernel.org/patch/9245023/ , which as Al
says was just a diagnostic patch -- it intentionally doesn't handle the
multiple-vector case.

It seems like the EFAULT case in skb_copy_and_csum_datagram() would
indicate that the iov_iter code ran out of room to copy the current
message, even though it's checked for that room at datagram.c:738.
Which I guess is possible -- there could be some non-obvious counting
error in the iov_inter.c macros.  But, at least in the UDP cases, it
wouldn't trigger the same problem as a checksum failure -- the EFAULT
gets returned to the caller in that case, and the buffer isn't meant
to be valid.  It's only in the checksum case that we retry underneath
the udp_recvmsg() covers, and end up returning the supposedly-rejected data.

>
> As for fixing the issue: I'm happy to test and review patches. 
> The trouble is that nobody seem to be able to produce them...

Sorry -- is the trouble you're talking about here that no-one's produced
a patch, or that we don't have a reproduction of the problem?  I don't
think either is true.

The test program I'd attached to my first mail reliably reproduces
the UDP version of the problem.  It's pretty simple: listen on a UDP
port (using loopback, so that there's no hardware csum offload), use a
raw socket to send a datagram with a bad UDP checksum, then send a good
datagram, and then finally read from the socket.  On post-3.19 kernels,
you always get the contents of the bad packet at the start of the user
buffer: 

# bin/csumtestn 69
listening on port 47193
recvmsg returned 9 bytes: BAD DATABAD DATABAD DATABAD DATABAD DATABAD DATABAD DATABAD DATABAD DGood data

After Alan's patch, the good packet's contents are at the start of the
buffer, where they belong:

# bin/csumtestn 69
listening on port 54620
recvmsg returned 9 bytes: Good dataAD DATABAD DATABAD DATABAD DATABAD DATABAD DATABAD DATABAD D

So functionally, I believe that Alan's patch does the trick.  I haven't
actually tested it on UDP6, but a similar test should work there.

Inserting the bad packets deterministically into a TCP connection is
trickier, but I thought in the previous thread that you and Alan both
had wireless hardware configurations that frequently generated checksum
errors, and that Alan's claim was that his patch gave him good TCP data
even in the presence of those checksum errors.  Or do I misunderstand?

(Just to be clear, though, if there is a need for a new patch, for
whatever reason, I'm happy to generate one.)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP wierdness around skb_copy_and_csum_datagram_msg()
  2016-09-30 17:35       ` Jay Smith
@ 2016-09-30 18:40         ` Christian Lamparter
  2016-10-17 17:10           ` Jay Smith
  0 siblings, 1 reply; 8+ messages in thread
From: Christian Lamparter @ 2016-09-30 18:40 UTC (permalink / raw)
  To: Jay Smith
  Cc: Christian Lamparter, Alan Curry, Eric Dumazet, netdev, Al Viro,
	davem@davemloft.net

On Friday, September 30, 2016 10:35:25 AM CEST Jay Smith wrote:
> Christian Lamparter writes:
> > On Wednesday, September 28, 2016 7:20:39 PM CEST Jay Smith wrote:
> >> Actually, on a little more searching of this list's archives, I think
> >> that this discussion:  https://patchwork.kernel.org/patch/9260733/ is
> >> about exactly the same issue I've found, except from the TCP side. I'm
> >> cc'ing a few of the participants from that discussion.
> >> 
> >> So is the patch proposed there (copying and restoring the entire
> >> iov_iter in skb_copy_and_csum_datagram_msg()) being considered as a
> >> fix?
> >
> > From Alan's post:
> >
> > "My ugly patch fixes this in the most obvious way: make a local copy of
> > msg->msg_iter before the call to skb_copy_and_csum_datagram(), and copy
> > it back if the checksum is bad, just before goto csum_error;"
> >
> > IMHO this meant that the patch is a proof of concept for his problem.
> 
> It's also the simplest thing that fixes all of the relevant cases (udp4,
> udp6, tcp4). 
Al Viro noted that tcp needed more work here (tp->ucopy.len and friends).
Trouble is, that this discussion (between Alan, David, me and Eric) was 
offlist at that point... And it doesn't look like anyone involved
wants to repeat what they have already said/written. 

(Al Viro, are you there?)

> [...]
> > As for fixing the issue: I'm happy to test and review patches. 
> > The trouble is that nobody seem to be able to produce them...
> 
> Sorry -- is the trouble you're talking about here that no-one's produced
> a patch, or that we don't have a reproduction of the problem?  I don't
> think either is true.
Reproduction wasn't the issue. Back in August, I posted method 
which used the network emulator (CONFIG_NET_SCH_NETEM) to reproduce
it easily and almost anywhere [0].

(i.e.: run this on the server/router)
# tc qdisc add dev eth0 root netem corrupt 0.1%
 
Alan also wrote a userspace tool that had a select/noselect switch
which would lead to different outcomes... etc. However, in the end
Alan could fix his issue by switching to CCMP(AES WLAN cipher), 
which he reported fixed his corruption issue.

So sadly yes, what I meant was that the patches are missing in action.

Regards,
Christian

[0] <https://lkml.org/lkml/2016/8/3/181>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP wierdness around skb_copy_and_csum_datagram_msg()
  2016-09-30 18:40         ` Christian Lamparter
@ 2016-10-17 17:10           ` Jay Smith
  0 siblings, 0 replies; 8+ messages in thread
From: Jay Smith @ 2016-10-17 17:10 UTC (permalink / raw)
  To: Christian Lamparter
  Cc: Jay Smith, Alan Curry, Eric Dumazet, netdev, Al Viro,
	davem@davemloft.net


Trying to revive this thread.

To review:  skb_copy_and_csum_datagram_msg() pretty clearly doesn't do
the right thing since it started using an iov_iter to copy into the
user's iovec.  In particular, if it encounters a datagram that fails the
checksum, the iov_iter continues to point to the end of the failed
datagram's data, and that data makes it out to user-space.

I'd previously sent a test program that consistenly reproduced the UDP(v4)
symptoms of this problem [0].  I've now also taken Christian's netem
suggestion and written a quick program that sends known data over
loopback TCP from one thread and reads it from another.  It optionally
sets up a netem qdisc that corrupts 1% of packets.  As expected, even
with corruption, tcp delivers correct data to the user on pre-3.19
kernels; on 3.19 and later, long transfers usually see corruptions.
(Source for the TCP test program below.)

I've also tried both test programs with the following patch:

diff --git a/net/core/datagram.c b/net/core/datagram.c
index b7de71f..61da091 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -730,6 +730,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
 {
        __wsum csum;
        int chunk = skb->len - hlen;
+       struct iov_iter saved_iter;
 
        if (!chunk)
                return 0;
@@ -741,11 +742,16 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
                        goto fault;
        } else {
                csum = csum_partial(skb->data, hlen, skb->csum);
+
+               /* save msg_iter state, so we can revert if csum fails. */
+               saved_iter = msg->msg_iter;
                if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
                                               chunk, &csum))
                        goto fault;
-               if (csum_fold(csum))
+               if (csum_fold(csum)) {
+                       msg->msg_iter = saved_iter;
                        goto csum_error;
+               }
                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
                        netdev_rx_csum_fault(skb->dev);
        }


(This is essentially the same fix Alan previously sent [1], except that
it uses struct assignment rather than memcpy'ing the struct iov_iter.)
As expected, both UDP and TCP tests succeed under this fix.

So I've missed whatever conversations happened off-list after Alan's
original report.  But it appears to me that this patch completely
resolves the csum/iov_iter problem for both TCP and UDP; I'm not sure I
see what further problem we'd want to hold the fix off for?

[0] https://www.spinics.net/lists/netdev/msg398026.html
[1] https://patchwork.kernel.org/patch/9260733/


New TCP test program:

#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <pthread.h>

#include <netlink/route/tc.h>
#include <netlink/route/qdisc.h>
#include <netlink/route/qdisc/netem.h>

int bytes = 0;
struct sockaddr_in addr;
socklen_t addrLen = sizeof(addr);  


void *sender(void *ignore)
{
  int send = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
  if (send < 0)
  {
    fprintf(stderr, "failed to create sending socket (err=%d: %s)\n", errno, strerror(errno));
    exit(1);
  }

  int ret = connect(send, (struct sockaddr *) &addr, addrLen);
  if (ret != 0)
  {
    fprintf(stderr, "failed to connect sending socket (err=%d: %s)\n", errno, strerror(errno));
    exit(1);
  }

  int i = 0;
  while (i < bytes)
  {
#define OUT_MESSAGE  "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
    int w = write(send, OUT_MESSAGE, strlen(OUT_MESSAGE));
    if (w < 0)
    {
      fprintf(stdout, "failed to write byte %d\n", i);
      exit(1);
    }
    i += w;
  }

  close(send);
}

/* set a qdisc on lo with corruption_probability 1% (or remove if if on==0) */
void setCorrupt(int on)
{
  struct nl_sock *sock;
  struct nl_cache *cache;
  struct rtnl_qdisc *q;
  struct rtnl_link *link;
  int if_index;

  sock = nl_socket_alloc();
  nl_connect(sock, NETLINK_ROUTE);

  rtnl_link_alloc_cache(sock, AF_UNSPEC, &cache);
  link = rtnl_link_get_by_name(cache, "lo");
  if_index = rtnl_link_get_ifindex(link);

  q = rtnl_qdisc_alloc();
  rtnl_tc_set_ifindex(TC_CAST(q), if_index);
  rtnl_tc_set_parent(TC_CAST(q), TC_H_ROOT);
  rtnl_tc_set_handle(TC_CAST(q), TC_HANDLE(1, 0));
  rtnl_tc_set_kind(TC_CAST(q), "netem");
  rtnl_netem_set_corruption_probability(q, 0xffffffff / 100);
  if (on)
  {
    int ret = rtnl_qdisc_add(sock, q, NLM_F_CREATE);
    if (ret < 0)
      {
	fprintf(stderr, "rtnl_qdisc_add error: %s\n", nl_geterror(ret));
	exit(1);
      }
  }
  else
  {
    int ret = rtnl_qdisc_delete(sock, q);
    if (ret < 0)
      {
	fprintf(stderr, "rtnl_qdisc_del error: %s\n", nl_geterror(ret));
	exit(1);
      }
  }

  rtnl_qdisc_put(q);
  nl_socket_free(sock);
  rtnl_link_put(link);
  nl_cache_put(cache);
}


int main(int argc, char **argv)
{
  if (argc < 2)
  {
    fprintf(stderr, "Usage: tcpcsum <number-of-bytes> [corruption-rate]");
    exit(1);
  }

  bytes = atoi(argv[1]);
  int corrupt = argc > 2;

  if (corrupt)
  {
    setCorrupt(1);
  }

  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = inet_addr("127.0.0.1");
  addr.sin_port = htons(0);

  int l = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
  if (l < 0)
  {
    fprintf(stderr, "failed to create listening socket (err=%d: %s)\n", errno, strerror(errno));
    goto exit;
  }

  int ret = bind(l, (struct sockaddr *) &addr, addrLen);
  if (ret != 0)
  {
    fprintf(stderr, "failed to bind listening socket socket (err=%d: %s)\n", errno, strerror(errno));
    goto exit;
  }

  ret = listen(l, 5);
  if (ret != 0)
  {
    fprintf(stderr, "listen failed (err=%d: %s)\n", errno, strerror(errno));
    goto exit;
  }

  ret = getsockname(l, (struct sockaddr *) &addr, &addrLen);
  if (ret != 0)
  {
    fprintf(stderr, "getsockname failed for listening socket socket (err=%d: %s)\n", errno, strerror(errno));
    goto exit;
  }
  else
  {
    printf("listening on port %d\n", ntohs(addr.sin_port));
  }
  
  // Launch writer thread
  pthread_t t;
  ret = pthread_create(&t, NULL, &sender, NULL);
  if (ret != 0)
  {
    fprintf(stderr, "pthread_create failed: (err=%d: %s)\n", errno, strerror(errno));
    goto exit;
  }

  int recv = accept(l, NULL, NULL);
  if (recv < 0)
  {
    fprintf(stderr, "accept failed: (err=%d: %s)\n", errno, strerror(errno));
    goto exit;
  }
   
#define BUFLEN  16384
  char buf[BUFLEN];
  int total = 0;
  int r = 0;
  while((r = read(recv, buf, BUFLEN)))
  {
    if(r < 0)
    {
      perror("read from socket");
      return 1;
    }

    if (r == 0)
    {
      break;
    }
    
    int i;
    for(i= 0; i < r; i++, total++)
    {
      if (buf[i] != 'a')
      {
	fprintf(stdout, "data corruption found at byte %d: %c\n", total, buf[i]);
	goto exit;
      }
    }
  }

  fprintf(stdout, "read %d bytes without data corruption\n", total);

exit:
  if (corrupt)
  {
    setCorrupt(0);
  }
  exit(0);
}

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-10-17 17:11 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-09-29  0:18 UDP wierdness around skb_copy_and_csum_datagram_msg() Jay Smith
2016-09-29  1:24 ` Eric Dumazet
2016-09-29  2:20   ` Jay Smith
2016-09-29 23:28     ` Christian Lamparter
2016-09-30  0:06       ` Eric Dumazet
2016-09-30 17:35       ` Jay Smith
2016-09-30 18:40         ` Christian Lamparter
2016-10-17 17:10           ` Jay Smith

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).