[PATCH 3/3] tcp: Repair socket queues

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Pavel Emelyanov <xemul@parallels.com>
To: Linux Netdev List <netdev@vger.kernel.org>,
	David Miller <davem@davemloft.net>, Tejun Heo <tj@kernel.org>,
	Eric Dumazet <eric.dumazet@gmail.com>
Subject: [PATCH 3/3] tcp: Repair socket queues
Date: Tue, 06 Mar 2012 13:55:47 +0400	[thread overview]
Message-ID: <4F55DF23.6090105@parallels.com> (raw)
In-Reply-To: <4F55DEDE.1090602@parallels.com>

Reading queues under repair mode is done with recvmsg call.
The queue-under-repair set by TCP_REPAIR_QUEUE option is used
to determine which queue should be read. Thus both send and
receive queue can be read with this.

Caller must pass the MSG_PEEK flag.


Writing to queues is done with sendmsg call and yet again --
the repair-queue option can be used to push data into the
receive queue.

When putting an skb into receive queue a zero tcp header is
appented to its head to address the tcp_hdr(skb)->syn and
the ->fin checks by the (after repair) tcp_recvmsg. (This is
not fully correct, yes, need more work with the ->fin one).


When the repair mode is turned off, the write queue seqs are
updated so that the whole queue is considered to be 'already sent,
waiting for ACKs' (write_seq = snd_nxt <= snd_una). From the
protocol POV the send queue looks like it was sent, but the data
between the write_seq and snd_nxt is lost in the network.

This helps to avoid another sockoption for setting the snd_nxt
sequence. Leaving the whole queue in a 'not yet sent' state (as
it will be after sendmsg-s) will not allow to receive any acks
from the peer since the ack_seq will be after the snd_nxt. Thus
even the ack for the window probe will be dropped and the 
connection will be 'locked' with the zero peer window.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>

---
 net/ipv4/tcp.c        |   90 +++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_output.c |    1 +
 2 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8d9b2bc..eba6696 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -909,6 +909,39 @@ static inline int select_size(const struct sock *sk, bool sg)
 	return tmp;
 }
 
+static int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct sk_buff *skb;
+	struct tcp_skb_cb *cb;
+	struct tcphdr *th;
+
+	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+	if (!skb)
+		goto err;
+
+	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
+	skb_reset_transport_header(skb);
+	memset(th, 0, sizeof(*th));
+
+	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
+		goto err_free;
+
+	cb = TCP_SKB_CB(skb);
+
+	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+	tcp_queue_rcv(sk, skb, sizeof(*th));
+
+	return size;
+
+err_free:
+	kfree_skb(skb);
+err:
+	return -ENOMEM;
+}
+
 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
@@ -930,6 +964,19 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto out_err;
 
+	if (unlikely(tp->repair)) {
+		if (tp->repair_queue == TCP_RECV_QUEUE) {
+			copied = tcp_send_rcvq(sk, msg, size);
+			goto out;
+		}
+
+		err = -EINVAL;
+		if (tp->repair_queue == TCP_NO_QUEUE)
+			goto out_err;
+
+		/* 'common' sending to sendq */
+	}
+
 	/* This should be in poll */
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
@@ -1087,7 +1134,7 @@ new_segment:
 			if ((seglen -= copy) == 0 && iovlen == 0)
 				goto out;
 
-			if (skb->len < max || (flags & MSG_OOB))
+			if (skb->len < max || (flags & MSG_OOB) || tp->repair)
 				continue;
 
 			if (forced_push(tp)) {
@@ -1100,7 +1147,7 @@ new_segment:
 wait_for_sndbuf:
 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-			if (copied)
+			if (copied && !tp->repair)
 				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 
 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
@@ -1111,7 +1158,7 @@ wait_for_memory:
 	}
 
 out:
-	if (copied)
+	if (copied && !tp->repair)
 		tcp_push(sk, flags, mss_now, tp->nonagle);
 	release_sock(sk);
 	return copied;
@@ -1185,6 +1232,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
 	return -EAGAIN;
 }
 
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct sk_buff *skb;
+	int copied = 0, err = 0;
+
+	/* XXX -- need to support SO_PEEK_OFF */
+
+	skb_queue_walk(&sk->sk_write_queue, skb) {
+		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+		if (err)
+			break;
+
+		copied += skb->len;
+	}
+
+	return err ?: copied;
+}
+
 /* Clean up the receive buffer for full frames taken by the user,
  * then send an ACK if necessary.  COPIED is the number of bytes
  * tcp_recvmsg has given to the user so far, it speeds up the
@@ -1430,6 +1495,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (flags & MSG_OOB)
 		goto recv_urg;
 
+	if (unlikely(tp->repair)) {
+		err = -EPERM;
+		if (!(flags & MSG_PEEK))
+			goto out;
+
+		if (tp->repair_queue == TCP_SEND_QUEUE)
+			goto recv_sndq;
+
+		err = -EINVAL;
+		if (tp->repair_queue == TCP_NO_QUEUE)
+			goto out;
+
+		/* 'common' recv queue MSG_PEEK-ing */
+	}
+
 	seq = &tp->copied_seq;
 	if (flags & MSG_PEEK) {
 		peek_seq = tp->copied_seq;
@@ -1780,6 +1860,10 @@ out:
 recv_urg:
 	err = tcp_recv_urg(sk, msg, len, flags);
 	goto out;
+
+recv_sndq:
+	err = tcp_peek_sndq(sk, msg, len);
+	goto out;
 }
 EXPORT_SYMBOL(tcp_recvmsg);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f0525d1..5e58c1a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2796,6 +2796,7 @@ void tcp_send_window_probe(struct sock *sk)
 {
 	if (sk->sk_state == TCP_ESTABLISHED) {
 		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+		tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
 		tcp_xmit_probe_skb(sk, 0);
 	}
 }
-- 
1.5.5.6

next prev parent reply	other threads:[~2012-03-06  9:55 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-03-06  9:54 [RFC][PATCH 0/3] TCP connection repair (v2) Pavel Emelyanov
2012-03-06  9:55 ` [PATCH 1/3] tcp: Move code around Pavel Emelyanov
2012-03-06  9:55 ` [PATCH 2/3] tcp: Initial repair mode Pavel Emelyanov
2012-03-06 13:11   ` Glauber Costa
2012-03-06 20:16     ` David Miller
2012-03-06  9:55 ` Pavel Emelyanov [this message]
2012-03-06 21:14 ` [RFC][PATCH 0/3] TCP connection repair (v2) David Miller
  -- strict thread matches above, loose matches on Subject: below --
2012-03-28 15:36 [PATCH net-next 0/3] TCP connection repair (v3) Pavel Emelyanov
2012-03-28 15:38 ` [PATCH 3/3] tcp: Repair socket queues Pavel Emelyanov
2012-03-29 10:30   ` Li Yu
2012-03-29 10:36     ` Pavel Emelyanov
2012-03-29 10:41       ` Li Yu
2012-03-29 10:41       ` Li Yu

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:8d9b2bc dfblob:eba6696 dfblob:f0525d1 dfblob:5e58c1a )
 OR (
bs:"[PATCH 3/3] tcp: Repair socket queues" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4F55DF23.6090105@parallels.com \
    --to=xemul@parallels.com \
    --cc=davem@davemloft.net \
    --cc=eric.dumazet@gmail.com \
    --cc=netdev@vger.kernel.org \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.