* [PATCH 1/2] tcp: Move code around
2012-02-29 15:13 [RFC][PATCH 0/2] TCP connection repair Pavel Emelyanov
@ 2012-02-29 15:13 ` Pavel Emelyanov
2012-02-29 15:14 ` [PATCH 2/2] tcp: Initial repair mode Pavel Emelyanov
1 sibling, 0 replies; 4+ messages in thread
From: Pavel Emelyanov @ 2012-02-29 15:13 UTC (permalink / raw)
To: Linux Netdev List, Tejun Heo, Eric Dumazet; +Cc: David Miller
This is just the preparation patch, which makes the needed for
TCP repair code ready for use.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
---
include/net/tcp.h | 4 +++
net/ipv4/tcp_input.c | 68 ++++++++++++++++++++++++++++---------------------
net/ipv4/tcp_output.c | 10 ++++++-
3 files changed, 51 insertions(+), 31 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6b2acfc..547a8fb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -434,6 +434,8 @@ extern struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct request_values *rvp);
extern int tcp_disconnect(struct sock *sk, int flags);
+void tcp_connect_init(struct sock *sk);
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
/* From syncookies.c */
extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
@@ -608,6 +610,8 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp)
*/
extern u32 __tcp_select_window(struct sock *sk);
+void tcp_send_window_probe(struct sock *sk);
+
/* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot
* of jiffies in the buffer control blocks below. We decided
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53c8ce4..1122cce 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5514,6 +5514,44 @@ discard:
}
EXPORT_SYMBOL(tcp_rcv_established);
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (skb != NULL)
+ security_inet_conn_established(sk, skb);
+
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ tcp_init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
@@ -5646,36 +5684,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
}
smp_mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
-
- security_inet_conn_established(sk, skb);
-
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- tcp_init_metrics(sk);
-
- tcp_init_congestion_control(sk);
- /* Prevent spurious tcp_cwnd_restart() on first data
- * packet.
- */
- tp->lsndtime = tcp_time_stamp;
-
- tcp_init_buffer_space(sk);
-
- if (sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
-
- if (!tp->rx_opt.snd_wscale)
- __tcp_fast_path_on(tp, tp->snd_wnd);
- else
- tp->pred_flags = 0;
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
+ tcp_finish_connect(sk, skb);
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 364784a..e90d62d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2561,7 +2561,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
EXPORT_SYMBOL(tcp_make_synack);
/* Do all connect socket setups that can be done AF independent. */
-static void tcp_connect_init(struct sock *sk)
+void tcp_connect_init(struct sock *sk)
{
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2619,6 +2619,7 @@ static void tcp_connect_init(struct sock *sk)
tp->rcv_nxt = 0;
tp->rcv_wup = 0;
tp->copied_seq = 0;
+ tp->snd_nxt = tp->write_seq;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
inet_csk(sk)->icsk_retransmits = 0;
@@ -2641,7 +2642,6 @@ int tcp_connect(struct sock *sk)
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
- tp->snd_nxt = tp->write_seq;
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
TCP_ECN_send_syn(sk, buff);
@@ -2790,6 +2790,11 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
+void tcp_send_window_probe(struct sock *sk)
+{
+ tcp_xmit_probe_skb(sk, 0);
+}
+
/* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk)
{
--
1.5.5.6
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH 2/2] tcp: Initial repair mode
2012-02-29 15:13 [RFC][PATCH 0/2] TCP connection repair Pavel Emelyanov
2012-02-29 15:13 ` [PATCH 1/2] tcp: Move code around Pavel Emelyanov
@ 2012-02-29 15:14 ` Pavel Emelyanov
2012-03-05 2:42 ` David Miller
1 sibling, 1 reply; 4+ messages in thread
From: Pavel Emelyanov @ 2012-02-29 15:14 UTC (permalink / raw)
To: Linux Netdev List, Tejun Heo, Eric Dumazet; +Cc: David Miller
This includes (according the the previous description):
* TCP_REPAIR sockoption
* Sequences sockoptions
* Ability to forcibly bind a socket to a port
* Immediate connect modification
* Silent close modification
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
---
include/linux/tcp.h | 6 ++++-
net/ipv4/inet_connection_sock.c | 3 ++
net/ipv4/tcp.c | 43 ++++++++++++++++++++++++++++++++++++++-
net/ipv4/tcp_ipv4.c | 19 ++++++++++++++--
net/ipv4/tcp_output.c | 1 -
5 files changed, 66 insertions(+), 6 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 115389e..0b2e01c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -106,6 +106,9 @@ enum {
#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/
#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */
#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */
+#define TCP_REPAIR 19 /* TCP sock is under repair right now */
+#define TCP_WRITE_SEQ 20
+#define TCP_RCV_NXT 21
/* for TCP_INFO socket option */
#define TCPI_OPT_TIMESTAMPS 1
@@ -353,7 +356,8 @@ struct tcp_sock {
u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack */
- unused : 2;
+ repair : 1,
+ unused : 1;
/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19d66ce..92788af 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -172,6 +172,9 @@ have_snum:
goto tb_not_found;
tb_found:
if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse == 2)
+ goto success;
+
if (tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
smallest_size == -1) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 22ef5f9..768306d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1932,7 +1932,9 @@ void tcp_close(struct sock *sk, long timeout)
* advertise a zero window, then kill -9 the FTP client, wheee...
* Note: timeout is always zero in such a case.
*/
- if (data_was_unread) {
+ if (tcp_sk(sk)->repair) {
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
@@ -2071,6 +2073,8 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
inet_csk_listen_stop(sk);
+ } else if (unlikely(tp->repair)) {
+ sk->sk_err = ECONNABORTED;
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -2294,6 +2298,33 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
tp->thin_dupack = val;
break;
+ case TCP_REPAIR:
+ if (!capable(CAP_SYS_ADMIN))
+ err = -EPERM;
+ else if (val < 0 || val > 1)
+ err = -EINVAL;
+ else {
+ tp->repair = val;
+ sk->sk_reuse = (val << 1);
+ if (val == 0)
+ tcp_send_window_probe(sk);
+ }
+ break;
+
+ case TCP_WRITE_SEQ:
+ if (!tp->repair)
+ err = -EPERM;
+ else
+ tp->write_seq = val;
+ break;
+
+ case TCP_RCV_NXT:
+ if (!tp->repair)
+ err = -EPERM;
+ else
+ tp->copied_seq = tp->rcv_nxt = val;
+ break;
+
case TCP_CORK:
/* When set indicates to always queue non-full frames.
* Later the user clears this option and we transmit
@@ -2629,6 +2658,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->thin_dupack;
break;
+ case TCP_REPAIR:
+ val = tp->repair;
+ break;
+
+ case TCP_WRITE_SEQ:
+ val = tp->write_seq;
+ break;
+
+ case TCP_RCV_NXT:
+ val = tp->rcv_nxt;
+ break;
+
case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout);
break;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 94abee8..6118486 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -137,6 +137,14 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+static int tcp_repair_connect(struct sock *sk)
+{
+ tcp_connect_init(sk);
+ tcp_finish_connect(sk, NULL);
+
+ return 0;
+}
+
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -195,7 +203,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
- tp->write_seq = 0;
+ if (!tp->repair)
+ tp->write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &&
@@ -246,7 +255,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
- if (!tp->write_seq)
+ if (!tp->write_seq && !tp->repair)
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
@@ -254,7 +263,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies;
- err = tcp_connect(sk);
+ if (likely(!tp->repair))
+ err = tcp_connect(sk);
+ else
+ err = tcp_repair_connect(sk);
+
rt = NULL;
if (err)
goto failure;
--
1.5.5.6
^ permalink raw reply related [flat|nested] 4+ messages in thread