Netdev List
 help / color / mirror / Atom feed
* [PATCH RFC] Allow tcp_parse_options to consult dst entry
From: Gilad Ben-Yossef @ 2009-10-20 15:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256052161-14156-2-git-send-email-gilad@codefidence.com>

We need tcp_parse_options to be aware of dst_entry to 
take into account per dst_entry TCP options settings

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/net/tcp.h        |    3 ++-
 net/ipv4/syncookies.c    |   27 ++++++++++++++-------------
 net/ipv4/tcp_input.c     |    9 ++++++---
 net/ipv4/tcp_ipv4.c      |   19 ++++++++++---------
 net/ipv4/tcp_minisocks.c |    7 +++++--
 net/ipv6/syncookies.c    |   28 +++++++++++++++-------------
 net/ipv6/tcp_ipv6.c      |    3 ++-
 7 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 03a49c7..740d09b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -409,7 +409,8 @@ extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 
 extern void			tcp_parse_options(struct sk_buff *skb,
 						  struct tcp_options_received *opt_rx,
-						  int estab);
+						  int estab,
+						  struct dst_entry *dst);
 
 extern u8			*tcp_parse_md5sig_option(struct tcphdr *th);
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a6e0e07..4990dd4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -276,13 +276,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
 
-	/* check for timestamp cookie support */
-	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, 0);
-
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
-
 	ret = NULL;
 	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
 	if (!req)
@@ -298,12 +291,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->loc_addr		= ip_hdr(skb)->daddr;
 	ireq->rmt_addr		= ip_hdr(skb)->saddr;
 	ireq->ecn_ok		= 0;
-	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
-	ireq->sack_ok		= tcp_opt.sack_ok;
-	ireq->wscale_ok		= tcp_opt.wscale_ok;
-	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
-	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -351,6 +338,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(skb, &tcp_opt, 0, &rt->u.dst);
+
+	if (tcp_opt.saw_tstamp)
+		cookie_check_timestamp(&tcp_opt);
+
+	ireq->snd_wscale        = tcp_opt.snd_wscale;
+	ireq->rcv_wscale        = tcp_opt.rcv_wscale;
+	ireq->sack_ok           = tcp_opt.sack_ok;
+	ireq->wscale_ok         = tcp_opt.wscale_ok;
+	ireq->tstamp_ok         = tcp_opt.saw_tstamp;
+	req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+
 	/* Try to redo what tcp_v4_send_synack did. */
 	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d86784b..d502f49 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3698,12 +3698,14 @@ old_ack:
  * the fast version below fails.
  */
 void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
-		       int estab)
+		       int estab,  struct dst_entry *dst)
 {
 	unsigned char *ptr;
 	struct tcphdr *th = tcp_hdr(skb);
 	int length = (th->doff * 4) - sizeof(struct tcphdr);
 
+	BUG_ON(!estab && !dst);
+
 	ptr = (unsigned char *)(th + 1);
 	opt_rx->saw_tstamp = 0;
 
@@ -3820,7 +3822,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
 		if (tcp_parse_aligned_timestamp(tp, th))
 			return 1;
 	}
-	tcp_parse_options(skb, &tp->rx_opt, 1);
+	tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
 	return 1;
 }
 
@@ -5364,8 +5366,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int saved_clamp = tp->rx_opt.mss_clamp;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	tcp_parse_options(skb, &tp->rx_opt, 0);
+	tcp_parse_options(skb, &tp->rx_opt, 0, dst);
 
 	if (th->ack) {
 		/* rfc793:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b..1cb0ec4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1256,11 +1256,18 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 #endif
 
+	ireq = inet_rsk(req);
+	ireq->loc_addr = daddr;
+	ireq->rmt_addr = saddr;
+	ireq->no_srccheck = inet_sk(sk)->transparent;
+	ireq->opt = tcp_v4_save_options(sk, skb);
+
+	dst = inet_csk_route_req(sk, req);
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = 536;
 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
 
-	tcp_parse_options(skb, &tmp_opt, 0);
+	tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
@@ -1269,14 +1276,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	ireq = inet_rsk(req);
-	ireq->loc_addr = daddr;
-	ireq->rmt_addr = saddr;
-	ireq->no_srccheck = inet_sk(sk)->transparent;
-	ireq->opt = tcp_v4_save_options(sk, skb);
-
 	if (security_inet_conn_request(sk, skb, req))
-		goto drop_and_free;
+		goto drop_and_release;
 
 	if (!want_cookie)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
@@ -1301,7 +1302,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet_csk_route_req(sk, req)) != NULL &&
+		    dst != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 		    peer->v4daddr == saddr) {
 			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index c49a550..70ff955 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -101,7 +101,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	int paws_reject = 0;
 
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, 1);
+		tcp_parse_options(skb, &tmp_opt, 1, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
@@ -499,10 +499,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	int paws_reject = 0;
 	struct tcp_options_received tmp_opt;
 	struct sock *child;
+	struct dst_entry *dst = inet_csk_route_req(sk, req);
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
-		tcp_parse_options(skb, &tmp_opt, 0);
+		tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
@@ -515,6 +516,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	dst_release(dst);
+
 	/* Check for pure retransmitted SYN. */
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
 	    flg == TCP_FLAG_SYN &&
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 6b6ae91..6ece408 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -184,13 +184,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
 
-	/* check for timestamp cookie support */
-	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, 0);
-
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
-
 	ret = NULL;
 	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
 	if (!req)
@@ -224,12 +217,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	req->expires = 0UL;
 	req->retrans = 0;
 	ireq->ecn_ok		= 0;
-	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
-	ireq->sack_ok		= tcp_opt.sack_ok;
-	ireq->wscale_ok		= tcp_opt.wscale_ok;
-	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
-	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->rcv_isn = ntohl(th->seq) - 1;
 	treq->snt_isn = cookie;
 
@@ -264,6 +251,21 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 			goto out_free;
 	}
 
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(skb, &tcp_opt, 0, dst);
+
+	if (tcp_opt.saw_tstamp)
+		cookie_check_timestamp(&tcp_opt);
+
+	req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+
+	ireq->snd_wscale        = tcp_opt.snd_wscale;
+	ireq->rcv_wscale        = tcp_opt.rcv_wscale;
+	ireq->sack_ok           = tcp_opt.sack_ok;
+	ireq->wscale_ok         = tcp_opt.wscale_ok;
+	ireq->tstamp_ok         = tcp_opt.saw_tstamp;
+
 	req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
 	tcp_select_initial_window(tcp_full_space(sk), req->mss,
 				  &req->rcv_wnd, &req->window_clamp,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 21d100b..2eebab5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1165,6 +1165,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct request_sock *req = NULL;
 	__u32 isn = TCP_SKB_CB(skb)->when;
+	struct dst_entry *dst = __sk_dst_get(sk);
 #ifdef CONFIG_SYN_COOKIES
 	int want_cookie = 0;
 #else
@@ -1203,7 +1204,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
 
-	tcp_parse_options(skb, &tmp_opt, 0);
+	tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH RFC] Allow disabling of DSACK TCP option per route
From: Gilad Ben-Yossef @ 2009-10-20 15:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256052161-14156-7-git-send-email-gilad@codefidence.com>

Add and use no DSCAK bit in the features field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    1 +
 net/ipv4/tcp_input.c      |    8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6784b34..e78b60c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -381,6 +381,7 @@ enum
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 #define RTAX_FEATURE_NO_WSCALE	0x00000010
+#define RTAX_FEATURE_NO_DSACK	0x00000020
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4f5e914..4262da5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4080,8 +4080,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
 static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+	if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+	    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 		int mib_idx;
 
 		if (before(seq, tp->rcv_nxt))
@@ -4110,13 +4112,15 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
 static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
 		tcp_enter_quickack_mode(sk);
 
-		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+		if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+		    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
 			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH RFC] Document future removal of sysctl_tcp_* options
From: Gilad Ben-Yossef @ 2009-10-20 15:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256052161-14156-8-git-send-email-gilad@codefidence.com>

No need for global kill switches if we have per route entry controls.
Wait a year before removing in case someone is using this.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>

---
 Documentation/feature-removal-schedule.txt |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 89a47b5..60db855 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,18 @@ be removed from this file.
 
 ---------------------------
 
+What:	sysctl_tcp_sack, sysctl_tcp_timestamps, sysctl_tcp_window_scaling,
+	sysctl_tcp_dsack
+When:	October 2010
+
+Why:	These options can now be set on a per route basis via the
+	RTAX_FEATURE_NO_SACK, RTAX_FEATURE_NO_TSTAMP, RTAX_FEATURE_NO_WSCALE,
+	and RTAX_FEATURE_NO_DSACK route feature options.
+
+Who:	Gilad Ben-Yossef <gilad@codefidence.com>
+
+---------------------------
+
 What:	PRISM54
 When:	2.6.34
 
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH RFC] Only parse time stamp TCP option in time wait sock
From: Gilad Ben-Yossef @ 2009-10-20 15:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef, Yony Amit
In-Reply-To: <1256052161-14156-1-git-send-email-gilad@codefidence.com>

A time wait socket is established - we already know if time stamp
option is called for or not.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>

---
 net/ipv4/tcp_minisocks.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 624c3c9..c49a550 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -100,9 +100,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	struct tcp_options_received tmp_opt;
 	int paws_reject = 0;
 
-	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, 0);
+		tcp_parse_options(skb, &tmp_opt, 1);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH RFC] Add the no SACK route option feature
From: Gilad Ben-Yossef @ 2009-10-20 15:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256052161-14156-4-git-send-email-gilad@codefidence.com>

Implement querying and acting upon the no sack bit in the features
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    2 +-
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    4 +++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index adf2068..9c802a6 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -377,7 +377,7 @@ enum
 #define RTAX_MAX (__RTAX_MAX - 1)
 
 #define RTAX_FEATURE_ECN	0x00000001
-#define RTAX_FEATURE_SACK	0x00000002
+#define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_TIMESTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d502f49..b14f780 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3763,7 +3763,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_SACK_PERM:
 				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
-				    !estab && sysctl_tcp_sack) {
+				    !estab && sysctl_tcp_sack &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_SACK)) {
 					opt_rx->sack_ok = 1;
 					tcp_sack_reset(opt_rx);
 				}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a..64db8dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -464,6 +464,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 				struct tcp_md5sig_key **md5) {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned size = 0;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -498,7 +499,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
 	}
-	if (likely(sysctl_tcp_sack)) {
+	if (likely(sysctl_tcp_sack &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_SACK))) {
 		opts->options |= OPTION_SACK_ADVERTISE;
 		if (unlikely(!(OPTION_TS & opts->options)))
 			size += TCPOLEN_SACKPERM_ALIGNED;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH RFC] Allow disabling TCP timestamp options per route
From: Gilad Ben-Yossef @ 2009-10-20 15:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256052161-14156-5-git-send-email-gilad@codefidence.com>

Implement querying and acting upon the no timestamp bit in the feature 
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    2 +-
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    8 ++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9c802a6..2ab8c75 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -378,7 +378,7 @@ enum
 
 #define RTAX_FEATURE_ECN	0x00000001
 #define RTAX_FEATURE_NO_SACK	0x00000002
-#define RTAX_FEATURE_TIMESTAMP	0x00000004
+#define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
 struct rta_session
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b14f780..d2f9742 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3755,7 +3755,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			case TCPOPT_TIMESTAMP:
 				if ((opsize == TCPOLEN_TIMESTAMP) &&
 				    ((estab && opt_rx->tstamp_ok) ||
-				     (!estab && sysctl_tcp_timestamps))) {
+				     (!estab && sysctl_tcp_timestamps &&
+				      !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) {
 					opt_rx->saw_tstamp = 1;
 					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
 					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 64db8dd..8f30c18 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -488,7 +488,9 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	size += TCPOLEN_MSS_ALIGNED;
 
-	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+	if (likely(sysctl_tcp_timestamps &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) &&
+		   *md5 == NULL)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = TCP_SKB_CB(skb)->when;
 		opts->tsecr = tp->rx_opt.ts_recent;
@@ -2317,7 +2319,9 @@ static void tcp_connect_init(struct sock *sk)
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
 	 */
 	tp->tcp_header_len = sizeof(struct tcphdr) +
-		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+		(sysctl_tcp_timestamps &&
+		(!dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) ?
+		  TCPOLEN_TSTAMP_ALIGNED : 0));
 
 #ifdef CONFIG_TCP_MD5SIG
 	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH RFC] Add dst_feature to query route entry features
From: Gilad Ben-Yossef @ 2009-10-20 15:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256052161-14156-3-git-send-email-gilad@codefidence.com>

Adding an accessor to existing  dst_entry feautres field and
refactor the only supported feature (allfrag) to use it.


Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/net/dst.h |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 5a900dd..b562be3 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -111,6 +111,12 @@ dst_metric(const struct dst_entry *dst, int metric)
 	return dst->metrics[metric-1];
 }
 
+static inline u32
+dst_feature(const struct dst_entry *dst, u32 feature)
+{
+	return dst_metric(dst, RTAX_FEATURES) & feature;
+}
+
 static inline u32 dst_mtu(const struct dst_entry *dst)
 {
 	u32 mtu = dst_metric(dst, RTAX_MTU);
@@ -136,7 +142,7 @@ static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
 static inline u32
 dst_allfrag(const struct dst_entry *dst)
 {
-	int ret = dst_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
+	int ret = dst_feature(dst,  RTAX_FEATURE_ALLFRAG);
 	/* Yes, _exactly_. This is paranoia. */
 	barrier();
 	return ret;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH RFC] Allow to turn off TCP window scale opt per route
From: Gilad Ben-Yossef @ 2009-10-20 15:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256052161-14156-6-git-send-email-gilad@codefidence.com>

Add and use no window scale bit in the features field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>
---
 include/linux/rtnetlink.h |    1 +
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    6 ++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2ab8c75..6784b34 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -380,6 +380,7 @@ enum
 #define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
+#define RTAX_FEATURE_NO_WSCALE	0x00000010
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d2f9742..4f5e914 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3739,7 +3739,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_WINDOW:
 				if (opsize == TCPOLEN_WINDOW && th->syn &&
-				    !estab && sysctl_tcp_window_scaling) {
+				    !estab && sysctl_tcp_window_scaling &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) {
 					__u8 snd_wscale = *(__u8 *)ptr;
 					opt_rx->wscale_ok = 1;
 					if (snd_wscale > 14) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8f30c18..ff60a21 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -496,7 +496,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
-	if (likely(sysctl_tcp_window_scaling)) {
+	if (likely(sysctl_tcp_window_scaling &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) {
 		opts->ws = tp->rx_opt.rcv_wscale;
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
@@ -2347,7 +2348,8 @@ static void tcp_connect_init(struct sock *sk)
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
-				  sysctl_tcp_window_scaling,
+				  (sysctl_tcp_window_scaling &&
+				   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)),
 				  &rcv_wscale);
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
-- 
1.5.6.3


^ permalink raw reply related

* Re: [PATCH RFC] Per route TCP options
From: Eric Dumazet @ 2009-10-20 15:44 UTC (permalink / raw)
  To: Gilad Ben-Yossef; +Cc: netdev, ori
In-Reply-To: <1256052161-14156-1-git-send-email-gilad@codefidence.com>

Gilad Ben-Yossef a écrit :
> Turn the global sysctls allowing disabling of TCP SACK, DSCAK,
> time stamp and window scale into per route entry feature options,
> laying the ground to future removal of the relevant global sysctls.
> 
> You really only want to disable SACK, DSACK, time stamp or window
> scale if you've got a piece of broken networking equipment somewhere 
> as a stop gap until you can bring a big enough hammer to deal with
> the broken network equipment. It doesn't make sense to "punish" the
> entire connections going through the machine to destinations not 
> related to the broken equipment.
> 
> This is doubly true when you're dealing with network containers
> used to isolate several virtual domains.
> 
> Per route options implemented in free bits in the features route
> entry property, which in some cases were reserved by name for these
> options, so this does not inflate any structure and I expect that
> when the apropriate global sysctls will be removed the overall code
> base will be smaller.
> 
> Tested on x86 using Qemu/KVM.  
> 
> Will send the matching patch to iproute2 if/when this is ACKed or
> if someone wants to test this.
> 
> Patchset based on original work by Ori Finkelman and Yoni Amit 
> from ComSleep Ltd.
> 
> Gilad Ben-Yossef (8):
>   Only parse time stamp TCP option in time wait sock
>   Allow tcp_parse_options to consult dst entry
>   Infrastructure for querying route entry features
>   Add the no SACK route option feature
>   Allow disabling TCP timestamp options per route
>   Allow to turn off TCP window scale opt per route
>   Allow disabling of DSACK TCP option per route
>   Document future removal of sysctl_tcp_* options
> 

Interesting... But you should give numbers to your patches so that we know their order

You could also do the ECN part for consistency (ie RTAX_FEATURE_ECN -> RTAX_FEATURE_NO_ECN)

And please post iproute2 patches as well :)

Thanks

^ permalink raw reply

* Re: [PATCH RFC] Per route TCP options
From: Gilad Ben-Yossef @ 2009-10-20 16:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, ori
In-Reply-To: <4ADDDAFB.5040600@gmail.com>

Eric Dumazet wrote:

> Gilad Ben-Yossef a écrit :
>   
>> Turn the global sysctls allowing disabling of TCP SACK, DSCAK,
>> time stamp and window scale into per route entry feature options,
>> laying the ground to future removal of the relevant global sysctls.
>>
>> ...
> Interesting... But you should give numbers to your patches so that we know their order
>   
Will do.
> You could also do the ECN part for consistency (ie RTAX_FEATURE_ECN -> RTAX_FEATURE_NO_ECN)
>   
That and  sysctl_tcp_mtu_probing are on my todo list, assuming the 
general direction is accepted, of course.

Specifically, I couldn't understand why sysctl_tcp_ecn is documented to 
be a boolean value, but is initialized to 2 and queried with if 
(sysctl_tcp_ecn == 1) so I decided to let it be until I figure it out... 
;-)
> And please post iproute2 patches as well :)
>   
Will do. Note that the patch I have still has an ugly bit that needs 
addressing - it sets the features by name, but right now only displays 
numeric values to the, for "show". This of course will be fixed, but it 
does work for testing.

Thanks,
Gilad

-- 
Gilad Ben-Yossef
Chief Coffee Drinker & CTO
Codefidence Ltd.

Web:   http://codefidence.com
Cell:  +972-52-8260388
Skype: gilad_codefidence
Tel:   +972-8-9316883 ext. 201
Fax:   +972-8-9316884
Email: gilad@codefidence.com

Check out our Open Source technology and training blog - http://tuxology.net

	"Sorry cannot parse this, its too long to be true  :)"
	  -- Eric Dumazet on netdev mailing list


^ permalink raw reply

* [PATCH TESTING] Add support for configuring route entry features
From: Gilad Ben-Yossef @ 2009-10-20 16:22 UTC (permalink / raw)
  To: netdev; +Cc: ori, ilpo.jarvinen, Gilad Ben-Yossef

This is needed to test my previous per route entry TCP options patch.

Feature display is still numeric, so this is not the final version yet.

To use: add "features nows nots nosack nodsack to a route entry.

Based loosley on original patch by Ilpo Järvinen.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>

CC: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>

---
 include/linux/rtnetlink.h |   10 ++++++----
 ip/iproute.c              |   45 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 63d1c69..35964d4 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -376,10 +376,12 @@ enum
 
 #define RTAX_MAX (__RTAX_MAX - 1)
 
-#define RTAX_FEATURE_ECN	0x00000001
-#define RTAX_FEATURE_SACK	0x00000002
-#define RTAX_FEATURE_TIMESTAMP	0x00000004
-#define RTAX_FEATURE_ALLFRAG	0x00000008
+#define RTAX_FEATURE_ECN        0x00000001
+#define RTAX_FEATURE_NO_SACK    0x00000002
+#define RTAX_FEATURE_NO_TSTAMP  0x00000004
+#define RTAX_FEATURE_ALLFRAG    0x00000008
+#define RTAX_FEATURE_NO_WSCALE  0x00000010
+#define RTAX_FEATURE_NO_DSACK   0x00000020
 
 struct rta_session
 {
diff --git a/ip/iproute.c b/ip/iproute.c
index aeea93d..9dcc0e0 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -54,6 +54,22 @@ static const char *mx_names[RTAX_MAX+1] = {
 	[RTAX_FEATURES] = "features",
 	[RTAX_RTO_MIN]	= "rto_min",
 };
+
+struct valname {
+	unsigned int	val;
+	const char	*name;
+};
+
+static const struct valname features[] = {
+	{ RTAX_FEATURE_NO_SACK, "nosack" },
+	{ RTAX_FEATURE_NO_TSTAMP, "notimestamps" },
+	{ RTAX_FEATURE_NO_TSTAMP, "nots" },
+	{ RTAX_FEATURE_NO_WSCALE, "nowindowscale" },
+	{ RTAX_FEATURE_NO_WSCALE, "nows" },
+	{ RTAX_FEATURE_NO_DSACK, "nodsack" },
+	
+};
+
 static void usage(void) __attribute__((noreturn));
 
 static void usage(void)
@@ -75,7 +91,8 @@ static void usage(void)
 	fprintf(stderr, "           [ rtt TIME ] [ rttvar TIME ] [reordering NUMBER ]\n");
 	fprintf(stderr, "           [ window NUMBER] [ cwnd NUMBER ] [ initcwnd NUMBER ]\n");
 	fprintf(stderr, "           [ ssthresh NUMBER ] [ realms REALM ] [ src ADDRESS ]\n");
-	fprintf(stderr, "           [ rto_min TIME ] [ hoplimit NUMBER ] \n");
+	fprintf(stderr, "           [ rto_min TIME ] [ hoplimit NUMBER ]\n"); 
+	fprintf(stderr, "           [ features DISABLED_FEATURES ]\n");
 	fprintf(stderr, "TYPE := [ unicast | local | broadcast | multicast | throw |\n");
 	fprintf(stderr, "          unreachable | prohibit | blackhole | nat ]\n");
 	fprintf(stderr, "TABLE_ID := [ local | main | default | all | NUMBER ]\n");
@@ -85,6 +102,8 @@ static void usage(void)
 	fprintf(stderr, "NHFLAGS := [ onlink | pervasive ]\n");
 	fprintf(stderr, "RTPROTO := [ kernel | boot | static | NUMBER ]\n");
 	fprintf(stderr, "TIME := NUMBER[s|ms|us|ns|j]\n");
+	fprintf(stderr, "DISABLED_FEATURES := sack | timestamps | ts | ecn | frto |\n");
+	fprintf(stderr, "                     [ DISABLED_FEATURES ]\n");
 	exit(-1);
 }
 
@@ -877,6 +896,30 @@ int iproute_modify(int cmd, unsigned flags, int argc, char **argv)
 			if (get_unsigned(&win, *argv, 0))
 				invarg("\"ssthresh\" value is invalid\n", *argv);
 			rta_addattr32(mxrta, sizeof(mxbuf), RTAX_SSTHRESH, win);
+		} else if (matches(*argv, "features") == 0) {
+			int j;
+			unsigned int f = 0;
+			NEXT_ARG();
+			while (1) {
+				for (j = 0; j < ARRAY_SIZE(features); j++) {
+					if (strcmp(*argv, features[j].name) == 0) {
+						f |= features[j].val;
+						if (!NEXT_ARG_OK())
+							goto feat_out;
+						NEXT_ARG();
+						break;
+					}
+				}
+				if (j == ARRAY_SIZE(features)) {
+					if (f)
+						PREV_ARG();
+					break;
+				}
+			}
+feat_out:
+			if (!f)
+				invarg("\"features\" list is invalid\n", *argv);
+			rta_addattr32(mxrta, sizeof(mxbuf), RTAX_FEATURES, f);
 		} else if (matches(*argv, "realms") == 0) {
 			__u32 realm;
 			NEXT_ARG();
-- 
1.5.6.3


^ permalink raw reply related

* Re: [PATCH RFC] Per route TCP options
From: Rick Jones @ 2009-10-20 16:26 UTC (permalink / raw)
  To: Gilad Ben-Yossef; +Cc: netdev, ori
In-Reply-To: <1256052161-14156-1-git-send-email-gilad@codefidence.com>

Gilad Ben-Yossef wrote:
> Turn the global sysctls allowing disabling of TCP SACK, DSCAK,
> time stamp and window scale into per route entry feature options,
> laying the ground to future removal of the relevant global sysctls.
> 
> You really only want to disable SACK, DSACK, time stamp or window
> scale if you've got a piece of broken networking equipment somewhere 
> as a stop gap until you can bring a big enough hammer to deal with
> the broken network equipment. It doesn't make sense to "punish" the
> entire connections going through the machine to destinations not 
> related to the broken equipment.

Is it really only the case that those options get disabled for broken networking 
equipment?  Does this presage making all TCP options per-route only?

rick jones

^ permalink raw reply

* Re: Policy routing + route "via" gives a strange behavior
From: Atis Elsts @ 2009-10-20 16:48 UTC (permalink / raw)
  To: Guido Trotter; +Cc: netdev
In-Reply-To: <20091020132820.GA3159@gg.studio.tixteam.net>

On Tuesday 20 October 2009 16:28:20 you wrote:
> This is also refused unless a route like the one before appears in the
> default table, even though it does appear in table 100. Is this the right
> behavior, and if yes, why? 

I guess what you describe is too infrequent use case for anyone to really 
care. Connected and link scoped routes are usually not added to policy 
routing tables :) Can you explain more for what kind of setup this is needed?

This "issue" could be solved by using routing table in the FIB lookup done in 
fib_check_nh(). However, doing that would break a lot more setups than it 
would "fix".
For example, if you had these rules
  from all to 1.2.3.4 fwmark 0x64 lookup 100
  from all fwmark 0x64 unreachable
then adding policy route to table 100 would fail unless nexthop 1.2.3.4 was 
used...

Anyway, you can achieve what you wish by using the "onlink" option, e.g.:
  ip route add table 100 default dev eth1 via 192.168.5.254 onlink

Atis

^ permalink raw reply

* Re: Policy routing + route "via" gives a strange behavior
From: Guido Trotter @ 2009-10-20 17:23 UTC (permalink / raw)
  To: Atis Elsts; +Cc: netdev
In-Reply-To: <200910201948.39778.atis@mikrotik.com>

On Tue, Oct 20, 2009 at 07:48:39PM +0300, Atis Elsts wrote:

Hi,

Thanks for your explanation/help!

> > This is also refused unless a route like the one before appears in the
> > default table, even though it does appear in table 100. Is this the right
> > behavior, and if yes, why? 
> 
> I guess what you describe is too infrequent use case for anyone to really 
> care. Connected and link scoped routes are usually not added to policy 
> routing tables :) Can you explain more for what kind of setup this is needed?
> 

What I'm using it to is to force virtual machine traffic from a host to be
routed to a different interface (a gre interface, in my case). I set up a rule
that traffic from the guests' network (or from their interfaces) looks up a
different routing table, and in that table I set a default gateway so that any
traffic the instance would do is sent via that gateway.

> This "issue" could be solved by using routing table in the FIB lookup done in 
> fib_check_nh(). However, doing that would break a lot more setups than it 
> would "fix".
> For example, if you had these rules
>   from all to 1.2.3.4 fwmark 0x64 lookup 100
>   from all fwmark 0x64 unreachable
> then adding policy route to table 100 would fail unless nexthop 1.2.3.4 was 
> used...
> 
Not sure I follow you here, sorry! :/ How would it behave differently depending
on the rules used? If in table 100 I had something like:

ip route add 1.2.3.4 dev eth1 via 2.3.4.5

Then the traffic looking up 100 (which according to the rules is to 1.2.3.4, but
could be to something else as well) must be routed via 2.3.4.5.
Of course then in table 100 I need a route to 2.3.4.5, or I need one in the
default table (which is the only one that gets checked now).

> Anyway, you can achieve what you wish by using the "onlink" option, e.g.:
>   ip route add table 100 default dev eth1 via 192.168.5.254 onlink

I will try this, thanks! What I was doing now was something like:
ip route add 192.168.5.254/32 dev eth1 # no table 100
ip route add table 100 default dev eth1 via 192.168.5.254
ip route del 192.168.5.254/32 dev eth1

But it was kind of a nasty workaround. Although it was working :)

Thanks,

Guido


^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Ben Greear @ 2009-10-20 17:37 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: NetDev, robert
In-Reply-To: <4ADD41F5.5080707@candelatech.com>

On 10/19/2009 09:52 PM, Ben Greear wrote:
> Eric Dumazet wrote:
>> Ben Greear a écrit :
>>> I'm having strange issues when running pktgen on 10G interfaces while
>>> also running
>>> pktgen on mac-vlans on that interface, when the mac-vlan pktgen threads
>>> are on a different
>>> CPU.


I think I found the problem.  First, lockdep was not the issue, and mac-vlans
were properly setting up the lockdep keys.  I would have expected lockdep to
figure out I was trying to lock a non-valid lock, but maybe something else
kept that from happening.

Second:  I think the problem can only happen on my code tree because I
added code to allow mac-vlans to return NETDEV_TX_BUSY
when a hacked varient of dev_queue_xmit decided it could not immediately
transmit a packet.  Without my change, a packet would have to be created fresh
in this scenario, so it would not hit the bug.

However, I think pktgen might still need a similar fix because other drivers or
logic might also change the skb tx-queue map.

Here is the problem, or at least one of them:

pktgen tries to xmit, but gets NETDEV_TX_BUSY.  During the xmit attempt, the
skb queue map was changed to that of the underlying device, which was 4.  Note
that mac-vlans have only a single tx queue.
pktgen will retry this skb, but it never resets the skb queue back to 0.
This means that it will soon be accessing txq[4], which is corrupting
memory.  Things rapidly decline from here!

Here is a patch for comment, in case the pktgen folks would like to
apply something similar:

@@ -3991,11 +4001,26 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev, u64 now)
                 }
         }

-       if (!pkt_dev->skb) {
+       if ((!pkt_dev->skb) || (pkt_dev->clone_count <= 1)) {
+               /** If clone count is low, that might be because device is a layered
+                * virtual device, like mac-vlan.  In that case, the queue-map may be
+                * changed while transmitting out the lower levels, so we need to
+                * reset this here so we don't accidentally use a bogus queue.
+                */
+       reset_queue_map:
                 set_cur_queue_map(pkt_dev);
                 queue_map = pkt_dev->cur_queue_map;
         } else {
                 queue_map = skb_get_queue_mapping(pkt_dev->skb);
+               if (unlikely(queue_map >= odev->num_tx_queues)) {
+                       static int do_once = 1;
+                       if (do_once) {
+                               printk("pktgen ERROR:  queue_map range error, queue_map: %i  num_tx_queues: %i  iface: %s\n",
+                                      queue_map, odev->num_tx_queues, odev->name);
+                               WARN_ON(1);
+                       }
+                       goto reset_queue_map;
+               }
         }

         txq = netdev_get_tx_queue(odev, queue_map);


Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Eric Dumazet @ 2009-10-20 17:44 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev, robert
In-Reply-To: <4ADDF560.1020509@candelatech.com>

Ben Greear a écrit :
> On 10/19/2009 09:52 PM, Ben Greear wrote:
>> Eric Dumazet wrote:
>>> Ben Greear a écrit :
>>>> I'm having strange issues when running pktgen on 10G interfaces while
>>>> also running
>>>> pktgen on mac-vlans on that interface, when the mac-vlan pktgen threads
>>>> are on a different
>>>> CPU.
> 
> 
> I think I found the problem.  First, lockdep was not the issue, and
> mac-vlans
> were properly setting up the lockdep keys.  I would have expected
> lockdep to
> figure out I was trying to lock a non-valid lock, but maybe something else
> kept that from happening.
> 
> Second:  I think the problem can only happen on my code tree because I
> added code to allow mac-vlans to return NETDEV_TX_BUSY
> when a hacked varient of dev_queue_xmit decided it could not immediately
> transmit a packet.  Without my change, a packet would have to be created
> fresh
> in this scenario, so it would not hit the bug.
> 
> However, I think pktgen might still need a similar fix because other
> drivers or
> logic might also change the skb tx-queue map.
> 
> Here is the problem, or at least one of them:
> 
> pktgen tries to xmit, but gets NETDEV_TX_BUSY.  During the xmit attempt,
> the
> skb queue map was changed to that of the underlying device, which was
> 4.  Note
> that mac-vlans have only a single tx queue.

Thats not true since commit 2c11455321f37da6fe6cc36353149f9ac9183334
Date:   Thu Sep 3 00:11:45 2009 +0000
(macvlan: add multiqueue capability )

    macvlan devices are currently not multi-queue capable.

    We can do that defining rtnl_link_ops method,
    get_tx_queues(), called from rtnl_create_link()

    This new method gets num_tx_queues/real_num_tx_queues
    from lower device.

    macvlan_get_tx_queues() is a copy of vlan_get_tx_queues().

    Because macvlan_start_xmit() has to update netdev_queue
    stats only (and not dev->stats), I chose to change
    tx_errors/tx_aborted_errors accounting to tx_dropped,
    since netdev_queue structure doesnt define tx_errors /
    tx_aborted_errors.


> pktgen will retry this skb, but it never resets the skb queue back to 0.
> This means that it will soon be accessing txq[4], which is corrupting
> memory.  Things rapidly decline from here!

Something is really wrong on your kernel :)

> 
> Here is a patch for comment, in case the pktgen folks would like to
> apply something similar:
> 
> @@ -3991,11 +4001,26 @@ static void pktgen_xmit(struct pktgen_dev
> *pkt_dev, u64 now)
>                 }
>         }
> 
> -       if (!pkt_dev->skb) {
> +       if ((!pkt_dev->skb) || (pkt_dev->clone_count <= 1)) {
> +               /** If clone count is low, that might be because device
> is a layered
> +                * virtual device, like mac-vlan.  In that case, the
> queue-map may be
> +                * changed while transmitting out the lower levels, so
> we need to
> +                * reset this here so we don't accidentally use a bogus
> queue.
> +                */
> +       reset_queue_map:
>                 set_cur_queue_map(pkt_dev);
>                 queue_map = pkt_dev->cur_queue_map;
>         } else {
>                 queue_map = skb_get_queue_mapping(pkt_dev->skb);
> +               if (unlikely(queue_map >= odev->num_tx_queues)) {
> +                       static int do_once = 1;
> +                       if (do_once) {
> +                               printk("pktgen ERROR:  queue_map range
> error, queue_map: %i  num_tx_queues: %i  iface: %s\n",
> +                                      queue_map, odev->num_tx_queues,
> odev->name);
> +                               WARN_ON(1);
> +                       }
> +                       goto reset_queue_map;
> +               }
>         }
> 
>         txq = netdev_get_tx_queue(odev, queue_map);

Please try last kernel before posting patches :)

^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Ben Greear @ 2009-10-20 17:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: NetDev, robert
In-Reply-To: <4ADDF6E5.4070509@gmail.com>

On 10/20/2009 10:44 AM, Eric Dumazet wrote:

>
> Please try last kernel before posting patches :)

I should have looked at the latest code, but even if mac-vlan is no longer
an issue, I suspect it may still be possible to get into this case with
other virtual devices because pktgen plays tricks by cloning pkts.

That said, I have no proof, so no further arguments from me.

Thanks for pointing out the new mac-vlan patch.

Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* RE: [patch 0/3] KS8851 updates for -rc5
From: Doong, Ping @ 2009-10-20 17:53 UTC (permalink / raw)
  To: Ben Dooks, netdev; +Cc: Li, Charles
In-Reply-To: <20091020094902.274646871@fluff.org.uk>

Hi Ben,

I'm sorry. I am busy on other project, and have got the change to verify
your new patch on our side. I will do it soon.

Thank you,
Ping

-----Original Message-----
From: Ben Dooks [mailto:ben@simtec.co.uk] 
Sent: Tuesday, October 20, 2009 2:49 AM
To: netdev@vger.kernel.org
Cc: Doong, Ping
Subject: [patch 0/3] KS8851 updates for -rc5

Patches for some minor problems with the KS8851 SPI network driver
for inclusion into the kernel. This set is based on -rc5.

^ permalink raw reply

* [PATCH] net: Fix checks on unsigned in w90p910_ether_probe()
From: Roel Kluin @ 2009-10-20 18:14 UTC (permalink / raw)
  To: netdev, Andrew Morton, mcuos.com, davem

ether->txirq and ->rxirq are unsigned

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
---
diff --git a/drivers/net/arm/w90p910_ether.c b/drivers/net/arm/w90p910_ether.c
index 25e2627..503c2ce 100644
--- a/drivers/net/arm/w90p910_ether.c
+++ b/drivers/net/arm/w90p910_ether.c
@@ -981,7 +981,7 @@ static int __devinit w90p910_ether_probe(struct platform_device *pdev)
 {
 	struct w90p910_ether *ether;
 	struct net_device *dev;
-	int error;
+	int error, ret;
 
 	dev = alloc_etherdev(sizeof(struct w90p910_ether));
 	if (!dev)
@@ -1010,17 +1010,19 @@ static int __devinit w90p910_ether_probe(struct platform_device *pdev)
 		goto failed_free_mem;
 	}
 
-	ether->txirq = platform_get_irq(pdev, 0);
-	if (ether->txirq < 0) {
+	ret = platform_get_irq(pdev, 0);
+	ether->txirq = ret;
+	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to get ether tx irq\n");
-		error = -ENXIO;
+		error = ret;
 		goto failed_free_io;
 	}
 
-	ether->rxirq = platform_get_irq(pdev, 1);
-	if (ether->rxirq < 0) {
+	ret = platform_get_irq(pdev, 1);
+	ether->rxirq = ret;
+	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to get ether rx irq\n");
-		error = -ENXIO;
+		error = ret;
 		goto failed_free_txirq;
 	}
 

^ permalink raw reply related

* Re: pktgen and spin_lock_bh in xmit path
From: Eric Dumazet @ 2009-10-20 18:35 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADDF948.1050208@candelatech.com>

Ben Greear a écrit :

> I should have looked at the latest code, but even if mac-vlan is no longer
> an issue, I suspect it may still be possible to get into this case with
> other virtual devices because pktgen plays tricks by cloning pkts.

Sure, you may be right.

> 
> That said, I have no proof, so no further arguments from me.
> 
> Thanks for pointing out the new mac-vlan patch.
> 

fill_packet() does the mapping setting, so if a device/tunnel transmit change it,
we might have a problem for cloned pktgen packets.

Hmm... dev_pick_tx() logic might be the problem...

u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
        u32 hash;

        if (skb_rx_queue_recorded(skb)) {
                hash = skb_get_rx_queue(skb);
                while (unlikely(hash >= dev->real_num_tx_queues))
                        hash -= dev->real_num_tx_queues;
                return hash;
        }

        if (skb->sk && skb->sk->sk_hash)
                hash = skb->sk->sk_hash;
        else
                hash = skb->protocol;

        hash = jhash_1word(hash, skb_tx_hashrnd);

        return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
EXPORT_SYMBOL(skb_tx_hash);

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                                        struct sk_buff *skb)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        u16 queue_index = 0;

        if (ops->ndo_select_queue)
                queue_index = ops->ndo_select_queue(dev, skb);
        else if (dev->real_num_tx_queues > 1)
                queue_index = skb_tx_hash(dev, skb);

        skb_set_queue_mapping(skb, queue_index);
        return netdev_get_tx_queue(dev, queue_index);
}




So if pktgen does a skb_set_queue_mapping(skb, 0);
dev_pick_tx() will call skb_tx_hash().
skb_tx_hash() will consider mapping is not set and will
generate a new one based on hash value.


David, we are changing skb->mapping while transmitting it...

So yes, it can break pktgen if its skbs are cloned.

Maybe pktgen should call skb_record_rx_queue() instead of skb_set_queue_mapping()

Or maybe we should avoid this +/- 1 thing we do in skb_record_rx_queue(), since
we use same skb field for recording rx_queue or tx_queue :(


^ permalink raw reply

* Re: [PATCH RFC] Per route TCP options
From: Ilpo Järvinen @ 2009-10-20 18:53 UTC (permalink / raw)
  To: Gilad Ben-Yossef; +Cc: Eric Dumazet, Netdev, ori
In-Reply-To: <4ADDE132.3010408@codefidence.com>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1052 bytes --]

On Tue, 20 Oct 2009, Gilad Ben-Yossef wrote:

> Eric Dumazet wrote:
> 
> > Gilad Ben-Yossef a écrit :
> >   
> > > Turn the global sysctls allowing disabling of TCP SACK, DSCAK,
> > > time stamp and window scale into per route entry feature options,
> > > laying the ground to future removal of the relevant global sysctls.
> > >
> > > ...
> > Interesting... But you should give numbers to your patches so that we know
> > their order
> >   
> Will do.
> > You could also do the ECN part for consistency (ie RTAX_FEATURE_ECN ->
> > RTAX_FEATURE_NO_ECN)
> >   
> That and  sysctl_tcp_mtu_probing are on my todo list, assuming the general
> direction is accepted, of course.
> 
> Specifically, I couldn't understand why sysctl_tcp_ecn is documented to be a
> boolean value, but is initialized to 2 and queried with if (sysctl_tcp_ecn ==
> 1) so I decided to let it be until I figure it out... ;-)

Ah, I didn't notice that "- BOOLEAN" there so I modified only the 
description (some blindness to caps perhaps :-)), did you perhaps read
it fully?

-- 
 i.

^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Eric Dumazet @ 2009-10-20 18:54 UTC (permalink / raw)
  Cc: Ben Greear, NetDev, robert, David S. Miller
In-Reply-To: <4ADE0306.6060101@gmail.com>

Eric Dumazet a écrit :

> David, we are changing skb->mapping while transmitting it...
> 
> So yes, it can break pktgen if its skbs are cloned.
> 
> Maybe pktgen should call skb_record_rx_queue() instead of skb_set_queue_mapping()
> 
> Or maybe we should avoid this +/- 1 thing we do in skb_record_rx_queue(), since
> we use same skb field for recording rx_queue or tx_queue :(
> 

A possible fix would be :

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 86acdba..bbe4b2d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2554,7 +2554,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	__be16 *vlan_encapsulated_proto = NULL;  /* packet type ID field (or len) for VLAN tag */
 	__be16 *svlan_tci = NULL;                /* Encapsulates priority and SVLAN ID */
 	__be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
-	u16 queue_map;
 
 	if (pkt_dev->nr_labels)
 		protocol = htons(ETH_P_MPLS_UC);
@@ -2565,7 +2564,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	/* Update any of the values, used when we're incrementing various
 	 * fields.
 	 */
-	queue_map = pkt_dev->cur_queue_map;
 	mod_cur_headers(pkt_dev);
 
 	datalen = (odev->hard_header_len + 16) & ~0xf;
@@ -2605,7 +2603,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	skb->network_header = skb->tail;
 	skb->transport_header = skb->network_header + sizeof(struct iphdr);
 	skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr));
-	skb_set_queue_mapping(skb, queue_map);
 	iph = ip_hdr(skb);
 	udph = udp_hdr(skb);
 
@@ -2896,7 +2893,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	__be16 *vlan_encapsulated_proto = NULL;  /* packet type ID field (or len) for VLAN tag */
 	__be16 *svlan_tci = NULL;                /* Encapsulates priority and SVLAN ID */
 	__be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
-	u16 queue_map;
 
 	if (pkt_dev->nr_labels)
 		protocol = htons(ETH_P_MPLS_UC);
@@ -2907,7 +2903,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	/* Update any of the values, used when we're incrementing various
 	 * fields.
 	 */
-	queue_map = pkt_dev->cur_queue_map;
 	mod_cur_headers(pkt_dev);
 
 	skb = __netdev_alloc_skb(odev,
@@ -2946,7 +2941,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	skb->network_header = skb->tail;
 	skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
 	skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr));
-	skb_set_queue_mapping(skb, queue_map);
 	iph = ipv6_hdr(skb);
 	udph = udp_hdr(skb);
 
@@ -3437,7 +3431,13 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 	if (pkt_dev->delay && pkt_dev->last_ok)
 		spin(pkt_dev, pkt_dev->next_tx);
 
-	queue_map = skb_get_queue_mapping(pkt_dev->skb);
+	queue_map = pkt_dev->cur_queue_map;
+	/*
+	 * tells skb_tx_hash() to use this tx queue.
+	 * We should reset skb->mapping before each xmit() because
+	 * xmit() might change it.
+	 */
+	skb_record_rx_queue(pkt_dev->skb, queue_map);
 	txq = netdev_get_tx_queue(odev, queue_map);
 
 	__netif_tx_lock_bh(txq);




^ permalink raw reply related

* Re: kernel panic in latest vanilla stable, while using nameif with "alive" pppoe interfaces
From: Cyrill Gorcunov @ 2009-10-20 19:08 UTC (permalink / raw)
  To: Denys Fedoryschenko
  Cc: Michal Ostrowski, Eric Dumazet, netdev, linux-ppp, paulus,
	mostrows
In-Reply-To: <200910201720.00473.denys@visp.net.lb>

[Denys Fedoryschenko - Tue, Oct 20, 2009 at 05:20:00PM +0300]
| It panics almost immediately on boot(even on old operations  that was stable, 
| seems on first pppoe customer login attempt), i will rebuild kernel and if 
| interesting will try to get panic message.
| 
...

Just to update status of the issue. The key moment is that pppoe_flush_dev
may be called asynchronously (especially via sysfs on dev entry, for example
we retrieve mtu of device and while we at it other process may update mtu
via sysfs). So I'm returning pppoe_hash_lock back which should eliminate a
number of lock complains and make locking scheme easier. All-in-one: I'm
working on it. Just need some more time.

	-- Cyrill

^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Ben Greear @ 2009-10-20 20:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE0770.8060708@gmail.com>

On 10/20/2009 11:54 AM, Eric Dumazet wrote:
> Eric Dumazet a écrit :
>
>> David, we are changing skb->mapping while transmitting it...
>>
>> So yes, it can break pktgen if its skbs are cloned.
>>
>> Maybe pktgen should call skb_record_rx_queue() instead of skb_set_queue_mapping()
>>
>> Or maybe we should avoid this +/- 1 thing we do in skb_record_rx_queue(), since
>> we use same skb field for recording rx_queue or tx_queue :(
>>
>
> A possible fix would be :

I tried your patch.  It's not crashing, but the link is bouncing around:

Oct 20 13:13:34 localhost kernel: ixgbe 0000:05:00.0: master disable timed out
Oct 20 13:13:35 localhost kernel: ixgbe 0000:05:00.0: master disable timed out
Oct 20 13:13:35 localhost kernel: ixgbe 0000:05:00.0: master disable timed out
...

Oct 20 13:14:06 localhost kernel: ixgbe 0000:05:00.0: master disable timed out
Oct 20 13:14:06 localhost kernel: ixgbe: eth9 NIC Link is Down
Oct 20 13:14:06 localhost kernel: ixgbe: eth9 NIC Link is Up 10 Gbps, Flow Control: RX/TX
Oct 20 13:14:09 localhost kernel: ixgbe: eth8 NIC Link is Up 10 Gbps, Flow Control: RX/TX
Oct 20 13:14:12 localhost kernel: ixgbe 0000:05:00.1: master disable timed out
Oct 20 13:14:15 localhost kernel: ixgbe: eth9 NIC Link is Up 10 Gbps, Flow Control: RX/TX
Oct 20 13:14:19 localhost kernel: ixgbe: eth9 NIC Link is Down
Oct 20 13:14:19 localhost kernel: ixgbe 0000:05:00.0: master disable timed out


I'm not sure if this is another weirdness in my system or something
else...

Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* [PATCH 2/4] [RFC] Add c/r support for connected INET sockets
From: Dan Smith @ 2009-10-20 21:06 UTC (permalink / raw)
  To: containers; +Cc: netdev, Oren Laadan, John Dykstra
In-Reply-To: <1256072803-3518-1-git-send-email-danms@us.ibm.com>

This patch adds basic support for C/R of open INET sockets.  I think that
all the important bits of the TCP and ICSK socket structures is saved,
but I think there is still some additional IPv6 stuff that needs to be
handled.

With this patch applied, the following script can be used to demonstrate
the functionality:

  https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html

It shows that this enables migration of a sendmail process with open
connections from one machine to another without dropping.

We still need comments from the netdev people about what sort of sanity
checking we need to do on the values in the ckpt_hdr_socket_inet
structure on restart.

Note that this still doesn't address lingering sockets yet.

Changes in v2:
 - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
   structure instead of saving them separately
 - Fix 'sock' naming in sock_cptrst()
 - Don't take the queue lock before skb_queue_tail() since it is
   done for us
 - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
   flag is specified on sys_restart()
 - Pull the implementation of the list of listening sockets back into
   this patch
 - Fix dangling printk
 - Add some comments around the parent/child restore logic

Cc: netdev@vger.kernel.org
Cc: Oren Laadan <orenl@librato.com>
Cc: John Dykstra <jdykstra72@gmail.com>
Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 checkpoint/sys.c                 |    4 +
 include/linux/checkpoint.h       |    5 +-
 include/linux/checkpoint_hdr.h   |   97 ++++++++++++++
 include/linux/checkpoint_types.h |    2 +
 net/checkpoint.c                 |   23 ++--
 net/ipv4/checkpoint.c            |  263 +++++++++++++++++++++++++++++++++++++-
 6 files changed, 379 insertions(+), 15 deletions(-)

diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 260a1ee..df00973 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -221,6 +221,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 
 	kfree(ctx->pids_arr);
 
+	sock_listening_list_free(&ctx->listen_sockets);
+
 	kfree(ctx);
 }
 
@@ -249,6 +251,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	spin_lock_init(&ctx->lock);
 #endif
 
+	INIT_LIST_HEAD(&ctx->listen_sockets);
+
 	err = -EBADF;
 	ctx->file = fget(fd);
 	if (!ctx->file)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 1da0b04..73d1677 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -19,6 +19,7 @@
 #define RESTART_TASKSELF	0x1
 #define RESTART_FROZEN		0x2
 #define RESTART_GHOST		0x4
+#define RESTART_SOCK_LISTENONLY 0x8
 
 #ifdef __KERNEL__
 #ifdef CONFIG_CHECKPOINT
@@ -48,7 +49,8 @@
 #define RESTART_USER_FLAGS  \
 	(RESTART_TASKSELF | \
 	 RESTART_FROZEN | \
-	 RESTART_GHOST)
+	 RESTART_GHOST | \
+	 RESTART_SOCK_LISTENONLY)
 
 extern int walk_task_subtree(struct task_struct *task,
 			     int (*func)(struct task_struct *, void *),
@@ -102,6 +104,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct sockaddr *rem, unsigned *rem_len);
 void sock_restore_header_info(struct sk_buff *skb,
 			      struct ckpt_hdr_socket_buffer *h);
+void sock_listening_list_free(struct list_head *head);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 3e6cab1..0c10657 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -20,6 +20,7 @@
 #include <linux/socket.h>
 #include <linux/un.h>
 #include <linux/in.h>
+#include <linux/in6.h>
 #else
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -569,6 +570,102 @@ struct ckpt_hdr_socket_unix {
 
 struct ckpt_hdr_socket_inet {
 	struct ckpt_hdr h;
+	__u32 daddr;
+	__u32 rcv_saddr;
+	__u32 saddr;
+	__u16 dport;
+	__u16 num;
+	__u16 sport;
+	__s16 uc_ttl;
+	__u16 cmsg_flags;
+
+	struct {
+		__u64 timeout;
+		__u32 ato;
+		__u32 lrcvtime;
+		__u16 last_seg_size;
+		__u16 rcv_mss;
+		__u8 pending;
+		__u8 quick;
+		__u8 pingpong;
+		__u8 blocked;
+	} icsk_ack __attribute__ ((aligned(8)));
+
+	/* FIXME: Skipped opt, tos, multicast, cork settings */
+
+	struct {
+		__u64 last_synq_overflow;
+
+		__u32 rcv_nxt;
+		__u32 copied_seq;
+		__u32 rcv_wup;
+		__u32 snd_nxt;
+		__u32 snd_una;
+		__u32 snd_sml;
+		__u32 rcv_tstamp;
+		__u32 lsndtime;
+
+		__u32 snd_wl1;
+		__u32 snd_wnd;
+		__u32 max_window;
+		__u32 mss_cache;
+		__u32 window_clamp;
+		__u32 rcv_ssthresh;
+		__u32 frto_highmark;
+
+		__u32 srtt;
+		__u32 mdev;
+		__u32 mdev_max;
+		__u32 rttvar;
+		__u32 rtt_seq;
+
+		__u32 packets_out;
+		__u32 retrans_out;
+
+		__u32 snd_up;
+		__u32 rcv_wnd;
+		__u32 write_seq;
+		__u32 pushed_seq;
+		__u32 lost_out;
+		__u32 sacked_out;
+		__u32 fackets_out;
+		__u32 tso_deferred;
+		__u32 bytes_acked;
+
+		__s32 lost_cnt_hint;
+		__u32 retransmit_high;
+
+		__u32 lost_retrans_low;
+
+		__u32 prior_ssthresh;
+		__u32 high_seq;
+
+		__u32 retrans_stamp;
+		__u32 undo_marker;
+		__s32 undo_retrans;
+		__u32 total_retrans;
+
+		__u32 urg_seq;
+		__u32 keepalive_time;
+		__u32 keepalive_intvl;
+
+		__u16 urg_data;
+		__u16 advmss;
+		__u8 frto_counter;
+		__u8 nonagle;
+
+		__u8 ecn_flags;
+		__u8 reordering;
+
+		__u8 keepalive_probes;
+	} tcp __attribute__ ((aligned(8)));
+
+	struct {
+		struct in6_addr saddr;
+		struct in6_addr rcv_saddr;
+		struct in6_addr daddr;
+	} inet6 __attribute__ ((aligned(8)));
+
 	__u32 laddr_len;
 	__u32 raddr_len;
 	struct sockaddr_in laddr;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index fa57cdc..91c141b 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -65,6 +65,8 @@ struct ckpt_ctx {
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
 
+	struct list_head listen_sockets;/* listening parent sockets */
+
 	/* [multi-process checkpoint] */
 	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
 	int nr_tasks;                   /* size of tasks array */
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 5ed2724..3e7574d 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -122,6 +122,7 @@ void sock_restore_header_info(struct sk_buff *skb,
 
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
 				struct sk_buff_head *queue,
+				uint16_t family,
 				int dst_objref)
 {
 	struct sk_buff *skb;
@@ -130,11 +131,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 		struct ckpt_hdr_socket_buffer *h;
 		int ret = 0;
 
-		/* FIXME: This could be a false positive for non-unix
-		 *        buffers, so add a type check here in the
-		 *        future
-		 */
-		if (UNIXCB(skb).fp) {
+		if ((family == AF_UNIX) && UNIXCB(skb).fp) {
 			ckpt_write_err(ctx, "TE", "af_unix: pass fd", -EBUSY);
 			return -EBUSY;
 		}
@@ -174,6 +171,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 
 static int sock_write_buffers(struct ckpt_ctx *ctx,
 			      struct sk_buff_head *queue,
+			      uint16_t family,
 			      int dst_objref)
 {
 	struct ckpt_hdr_socket_queue *h;
@@ -193,7 +191,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
 	h->skb_count = ret;
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (!ret)
-		ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
+		ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
 
  out:
 	ckpt_hdr_put(ctx, h);
@@ -215,12 +213,14 @@ int sock_deferred_write_buffers(void *data)
 		return dst_objref;
 	}
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write recv buffers: %i\n", ret);
 	if (ret < 0)
 		return ret;
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write send buffers: %i\n", ret);
 
 	return ret;
@@ -745,10 +745,9 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
 		goto err;
 
 	if ((h->sock_common.family == AF_INET) &&
-	    (h->sock.state != TCP_LISTEN)) {
-		/* Temporary hack to enable restore of TCP_LISTEN sockets
-		 * while forcing anything else to a closed state
-		 */
+	    (h->sock.state != TCP_LISTEN) &&
+	    (ctx->uflags & RESTART_SOCK_LISTENONLY)) {
+		ckpt_debug("Forcing open socket closed\n");
 		sock->sk->sk_state = TCP_CLOSE;
 		sock->state = SS_UNCONNECTED;
 	}
diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
index 9cbbf5e..5913652 100644
--- a/net/ipv4/checkpoint.c
+++ b/net/ipv4/checkpoint.c
@@ -17,6 +17,7 @@
 #include <linux/deferqueue.h>
 #include <net/tcp_states.h>
 #include <net/tcp.h>
+#include <net/ipv6.h>
 
 struct dq_sock {
 	struct ckpt_ctx *ctx;
@@ -28,6 +29,233 @@ struct dq_buffers {
 	struct sock *sk;
 };
 
+struct listen_item {
+	struct sock *sk;
+	struct list_head list;
+};
+
+void sock_listening_list_free(struct list_head *head)
+{
+	struct listen_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, head, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+}
+
+static int sock_listening_list_add(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->sk = sk;
+	list_add(&item->list, &ctx->listen_sockets);
+
+	return 0;
+}
+
+static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	list_for_each_entry(item, &ctx->listen_sockets, list) {
+		if (inet_sk(sk)->sport == inet_sk(item->sk)->sport)
+			return item->sk;
+	}
+
+	return NULL;
+}
+
+static int sock_hash_parent(void *data)
+{
+	struct dq_sock *dq = (struct dq_sock *)data;
+	struct sock *parent;
+
+	ckpt_debug("INET post-restart hash\n");
+
+	dq->sk->sk_prot->hash(dq->sk);
+
+	/* If there is a listening socket with the same source port,
+	 * then become a child of that socket [we are the result of an
+	 * accept()].  Otherwise hash ourselves directly in [we are
+	 * the result of a connect()]
+	 */
+
+	parent = sock_get_parent(dq->ctx, dq->sk);
+	if (parent) {
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+		local_bh_disable();
+		__inet_inherit_port(parent, dq->sk);
+		local_bh_enable();
+	} else {
+		inet_sk(dq->sk)->num = 0;
+		inet_hash_connect(&tcp_death_row, dq->sk);
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+	}
+
+	return 0;
+}
+
+static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
+{
+	struct dq_sock dq;
+
+	dq.sk = sock;
+	dq.ctx = ctx;
+
+	return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
+			      sock_hash_parent, NULL);
+}
+
+static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
+				struct tcp_sock *sk,
+				struct ckpt_hdr_socket_inet *hh,
+				int op)
+{
+	CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
+	CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
+	CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
+	CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
+	CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
+	CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
+	CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
+	CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
+
+	CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
+	CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
+	CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
+	CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
+	CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
+	CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
+	CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
+	CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
+	CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
+	CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
+
+	CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
+	CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
+	CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
+	CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
+	CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
+
+	CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
+	CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
+
+	CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
+	CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
+	CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
+	CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
+
+	CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
+
+	CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
+	CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
+	CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
+	CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
+	CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
+	CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
+	CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
+	CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
+
+	CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
+	CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
+
+	CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
+
+	CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
+	CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
+
+	CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
+	CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
+	CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
+	CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
+
+	CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
+	CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
+	CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
+
+	return 0;
+}
+
+static int sock_inet_restore_addrs(struct inet_sock *inet,
+				   struct ckpt_hdr_socket_inet *hh)
+{
+	inet->daddr = hh->raddr.sin_addr.s_addr;
+	inet->saddr = hh->laddr.sin_addr.s_addr;
+	inet->rcv_saddr = inet->saddr;
+
+	inet->dport = hh->raddr.sin_port;
+	inet->sport = hh->laddr.sin_port;
+
+	return 0;
+}
+
+static int sock_inet_cptrst(struct ckpt_ctx *ctx,
+			    struct sock *sk,
+			    struct ckpt_hdr_socket_inet *hh,
+			    int op)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ret;
+
+	if (op == CKPT_CPT) {
+		CKPT_COPY(op, hh->daddr, inet->daddr);
+		CKPT_COPY(op, hh->rcv_saddr, inet->rcv_saddr);
+		CKPT_COPY(op, hh->dport, inet->dport);
+		CKPT_COPY(op, hh->saddr, inet->saddr);
+		CKPT_COPY(op, hh->sport, inet->sport);
+	} else {
+		ret = sock_inet_restore_addrs(inet, hh);
+		if (ret)
+			return ret;
+	}
+
+	CKPT_COPY(op, hh->num, inet->num);
+	CKPT_COPY(op, hh->uc_ttl, inet->uc_ttl);
+	CKPT_COPY(op, hh->cmsg_flags, inet->cmsg_flags);
+
+	CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
+	CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
+	CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
+	CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
+	CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
+	CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
+	CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
+	CKPT_COPY(op,
+		  hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
+	CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
+
+	if (sk->sk_protocol == IPPROTO_TCP)
+		ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sk), hh, op);
+	else if (sk->sk_protocol == IPPROTO_UDP)
+		ret = 0;
+	else {
+		ckpt_write_err(ctx, "T", "unknown socket protocol %d",
+			       sk->sk_protocol);
+		ret = -EINVAL;
+	}
+
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *inet6 = inet6_sk(sk);
+		if (op == CKPT_CPT) {
+			ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
+			ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
+			ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
+		} else {
+			ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
+			ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
+			ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
+		}
+	}
+
+	return ret;
+}
+
 int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 {
 	struct ckpt_hdr_socket_inet *in;
@@ -43,6 +271,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 	if (ret)
 		goto out;
 
+	ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
+	if (ret < 0)
+		goto out;
+
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
  out:
 	ckpt_hdr_put(ctx, in);
@@ -87,9 +319,9 @@ static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
 	if (ret < 0)
 		goto out;
 
-	spin_lock(&queue->lock);
+	sock_restore_header_info(skb, h);
+
 	skb_queue_tail(queue, skb);
-	spin_unlock(&queue->lock);
  out:
 	ckpt_hdr_put(ctx, h);
 
@@ -209,8 +441,35 @@ int inet_restore(struct ckpt_ctx *ctx,
 			ckpt_debug("inet listen: %i\n", ret);
 			if (ret < 0)
 				goto out;
+
+			/* We are a listening socket, so add ourselves
+			 * to the list of parent sockets.  This will
+			 * allow our children to find us later and
+			 * link up
+			 */
+
+			ret = sock_listening_list_add(ctx, sock->sk);
+			if (ret < 0)
+				goto out;
 		}
 	} else {
+		ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
+		if (ret)
+			goto out;
+
+		if ((h->sock.state == TCP_ESTABLISHED) &&
+		    (h->sock.protocol == IPPROTO_TCP)) {
+			/* A connected socket that was spawned from an
+			 * accept() needs to be hashed with its parent
+			 * listening socket in order to receive
+			 * traffic on the original port.  Since we may
+			 * not have restarted the parent yet, we defer
+			 * this until later when we know we have all
+			 * the listening sockets accounted for.
+			 */
+			ret = sock_defer_hash(ctx, sock->sk);
+		}
+
 		if (!sock_flag(sock->sk, SOCK_DEAD))
 			ret = inet_defer_restore_buffers(ctx, sock->sk);
 	}
-- 
1.6.2.5


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox