Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3 2/7] Allow tcp_parse_options to consult dst entry
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

From: Gilad Ben-Yossef <gby@watson.codefidence.com>

We need tcp_parse_options to be aware of dst_entry to
take into account per dst_entry TCP options settings

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/net/tcp.h        |    3 ++-
 net/ipv4/syncookies.c    |   27 ++++++++++++++-------------
 net/ipv4/tcp_input.c     |    9 ++++++---
 net/ipv4/tcp_ipv4.c      |   21 ++++++++++++---------
 net/ipv4/tcp_minisocks.c |    7 +++++--
 net/ipv6/syncookies.c    |   28 +++++++++++++++-------------
 net/ipv6/tcp_ipv6.c      |    3 ++-
 7 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 03a49c7..740d09b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -409,7 +409,8 @@ extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 
 extern void			tcp_parse_options(struct sk_buff *skb,
 						  struct tcp_options_received *opt_rx,
-						  int estab);
+						  int estab,
+						  struct dst_entry *dst);
 
 extern u8			*tcp_parse_md5sig_option(struct tcphdr *th);
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a6e0e07..4990dd4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -276,13 +276,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
 
-	/* check for timestamp cookie support */
-	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, 0);
-
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
-
 	ret = NULL;
 	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
 	if (!req)
@@ -298,12 +291,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->loc_addr		= ip_hdr(skb)->daddr;
 	ireq->rmt_addr		= ip_hdr(skb)->saddr;
 	ireq->ecn_ok		= 0;
-	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
-	ireq->sack_ok		= tcp_opt.sack_ok;
-	ireq->wscale_ok		= tcp_opt.wscale_ok;
-	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
-	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -351,6 +338,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(skb, &tcp_opt, 0, &rt->u.dst);
+
+	if (tcp_opt.saw_tstamp)
+		cookie_check_timestamp(&tcp_opt);
+
+	ireq->snd_wscale        = tcp_opt.snd_wscale;
+	ireq->rcv_wscale        = tcp_opt.rcv_wscale;
+	ireq->sack_ok           = tcp_opt.sack_ok;
+	ireq->wscale_ok         = tcp_opt.wscale_ok;
+	ireq->tstamp_ok         = tcp_opt.saw_tstamp;
+	req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+
 	/* Try to redo what tcp_v4_send_synack did. */
 	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d86784b..d502f49 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3698,12 +3698,14 @@ old_ack:
  * the fast version below fails.
  */
 void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
-		       int estab)
+		       int estab,  struct dst_entry *dst)
 {
 	unsigned char *ptr;
 	struct tcphdr *th = tcp_hdr(skb);
 	int length = (th->doff * 4) - sizeof(struct tcphdr);
 
+	BUG_ON(!estab && !dst);
+
 	ptr = (unsigned char *)(th + 1);
 	opt_rx->saw_tstamp = 0;
 
@@ -3820,7 +3822,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
 		if (tcp_parse_aligned_timestamp(tp, th))
 			return 1;
 	}
-	tcp_parse_options(skb, &tp->rx_opt, 1);
+	tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
 	return 1;
 }
 
@@ -5364,8 +5366,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int saved_clamp = tp->rx_opt.mss_clamp;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	tcp_parse_options(skb, &tp->rx_opt, 0);
+	tcp_parse_options(skb, &tp->rx_opt, 0, dst);
 
 	if (th->ack) {
 		/* rfc793:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b..1d611e3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1256,11 +1256,21 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 #endif
 
+	ireq = inet_rsk(req);
+	ireq->loc_addr = daddr;
+	ireq->rmt_addr = saddr;
+	ireq->no_srccheck = inet_sk(sk)->transparent;
+	ireq->opt = tcp_v4_save_options(sk, skb);
+
+	dst = inet_csk_route_req(sk, req);
+	if(!dst)
+		goto drop_and_free;
+
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = 536;
 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
 
-	tcp_parse_options(skb, &tmp_opt, 0);
+	tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
@@ -1269,14 +1279,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	ireq = inet_rsk(req);
-	ireq->loc_addr = daddr;
-	ireq->rmt_addr = saddr;
-	ireq->no_srccheck = inet_sk(sk)->transparent;
-	ireq->opt = tcp_v4_save_options(sk, skb);
-
 	if (security_inet_conn_request(sk, skb, req))
-		goto drop_and_free;
+		goto drop_and_release;
 
 	if (!want_cookie)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
@@ -1301,7 +1305,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet_csk_route_req(sk, req)) != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 		    peer->v4daddr == saddr) {
 			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index c49a550..70ff955 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -101,7 +101,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	int paws_reject = 0;
 
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, 1);
+		tcp_parse_options(skb, &tmp_opt, 1, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
@@ -499,10 +499,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	int paws_reject = 0;
 	struct tcp_options_received tmp_opt;
 	struct sock *child;
+	struct dst_entry *dst = inet_csk_route_req(sk, req);
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
-		tcp_parse_options(skb, &tmp_opt, 0);
+		tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
@@ -515,6 +516,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	dst_release(dst);
+
 	/* Check for pure retransmitted SYN. */
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
 	    flg == TCP_FLAG_SYN &&
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 6b6ae91..6ece408 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -184,13 +184,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
 
-	/* check for timestamp cookie support */
-	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, 0);
-
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
-
 	ret = NULL;
 	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
 	if (!req)
@@ -224,12 +217,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	req->expires = 0UL;
 	req->retrans = 0;
 	ireq->ecn_ok		= 0;
-	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
-	ireq->sack_ok		= tcp_opt.sack_ok;
-	ireq->wscale_ok		= tcp_opt.wscale_ok;
-	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
-	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->rcv_isn = ntohl(th->seq) - 1;
 	treq->snt_isn = cookie;
 
@@ -264,6 +251,21 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 			goto out_free;
 	}
 
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(skb, &tcp_opt, 0, dst);
+
+	if (tcp_opt.saw_tstamp)
+		cookie_check_timestamp(&tcp_opt);
+
+	req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+
+	ireq->snd_wscale        = tcp_opt.snd_wscale;
+	ireq->rcv_wscale        = tcp_opt.rcv_wscale;
+	ireq->sack_ok           = tcp_opt.sack_ok;
+	ireq->wscale_ok         = tcp_opt.wscale_ok;
+	ireq->tstamp_ok         = tcp_opt.saw_tstamp;
+
 	req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
 	tcp_select_initial_window(tcp_full_space(sk), req->mss,
 				  &req->rcv_wnd, &req->window_clamp,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 21d100b..2eebab5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1165,6 +1165,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct request_sock *req = NULL;
 	__u32 isn = TCP_SKB_CB(skb)->when;
+	struct dst_entry *dst = __sk_dst_get(sk);
 #ifdef CONFIG_SYN_COOKIES
 	int want_cookie = 0;
 #else
@@ -1203,7 +1204,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
 
-	tcp_parse_options(skb, &tmp_opt, 0);
+	tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 3/7] Add dst_feature to query route entry features
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Adding an accessor to existing  dst_entry feautres field and
refactor the only supported feature (allfrag) to use it.


Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/net/dst.h |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 5a900dd..b562be3 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -111,6 +111,12 @@ dst_metric(const struct dst_entry *dst, int metric)
 	return dst->metrics[metric-1];
 }
 
+static inline u32
+dst_feature(const struct dst_entry *dst, u32 feature)
+{
+	return dst_metric(dst, RTAX_FEATURES) & feature;
+}
+
 static inline u32 dst_mtu(const struct dst_entry *dst)
 {
 	u32 mtu = dst_metric(dst, RTAX_MTU);
@@ -136,7 +142,7 @@ static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
 static inline u32
 dst_allfrag(const struct dst_entry *dst)
 {
-	int ret = dst_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
+	int ret = dst_feature(dst,  RTAX_FEATURE_ALLFRAG);
 	/* Yes, _exactly_. This is paranoia. */
 	barrier();
 	return ret;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 5/7] Allow disabling TCP timestamp options per route
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Implement querying and acting upon the no timestamp bit in the feature 
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    2 +-
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    8 ++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9c802a6..2ab8c75 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -378,7 +378,7 @@ enum
 
 #define RTAX_FEATURE_ECN	0x00000001
 #define RTAX_FEATURE_NO_SACK	0x00000002
-#define RTAX_FEATURE_TIMESTAMP	0x00000004
+#define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
 struct rta_session
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b14f780..d2f9742 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3755,7 +3755,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			case TCPOPT_TIMESTAMP:
 				if ((opsize == TCPOLEN_TIMESTAMP) &&
 				    ((estab && opt_rx->tstamp_ok) ||
-				     (!estab && sysctl_tcp_timestamps))) {
+				     (!estab && sysctl_tcp_timestamps &&
+				      !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) {
 					opt_rx->saw_tstamp = 1;
 					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
 					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 64db8dd..8f30c18 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -488,7 +488,9 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	size += TCPOLEN_MSS_ALIGNED;
 
-	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+	if (likely(sysctl_tcp_timestamps &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) &&
+		   *md5 == NULL)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = TCP_SKB_CB(skb)->when;
 		opts->tsecr = tp->rx_opt.ts_recent;
@@ -2317,7 +2319,9 @@ static void tcp_connect_init(struct sock *sk)
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
 	 */
 	tp->tcp_header_len = sizeof(struct tcphdr) +
-		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+		(sysctl_tcp_timestamps &&
+		(!dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) ?
+		  TCPOLEN_TSTAMP_ALIGNED : 0));
 
 #ifdef CONFIG_TCP_MD5SIG
 	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 1/7] Only parse time stamp TCP option in time wait sock
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef, Yony Amit
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Since we only use tcp_parse_options here to check for the exietence
of TCP timestamp option in the header, it is better to call with
the "established" flag on.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>

---
 net/ipv4/tcp_minisocks.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 624c3c9..c49a550 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -100,9 +100,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	struct tcp_options_received tmp_opt;
 	int paws_reject = 0;
 
-	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, 0);
+		tcp_parse_options(skb, &tmp_opt, 1);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 7/7] Allow disabling of DSACK TCP option per route
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Add and use no DSCAK bit in the features field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    1 +
 net/ipv4/tcp_input.c      |    8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6784b34..e78b60c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -381,6 +381,7 @@ enum
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 #define RTAX_FEATURE_NO_WSCALE	0x00000010
+#define RTAX_FEATURE_NO_DSACK	0x00000020
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4f5e914..4262da5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4080,8 +4080,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
 static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+	if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+	    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 		int mib_idx;
 
 		if (before(seq, tp->rcv_nxt))
@@ -4110,13 +4112,15 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
 static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
 		tcp_enter_quickack_mode(sk);
 
-		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+		if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+		    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
 			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 4/7] Add the no SACK route option feature
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Implement querying and acting upon the no sack bit in the features
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    2 +-
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    4 +++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index adf2068..9c802a6 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -377,7 +377,7 @@ enum
 #define RTAX_MAX (__RTAX_MAX - 1)
 
 #define RTAX_FEATURE_ECN	0x00000001
-#define RTAX_FEATURE_SACK	0x00000002
+#define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_TIMESTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d502f49..b14f780 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3763,7 +3763,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_SACK_PERM:
 				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
-				    !estab && sysctl_tcp_sack) {
+				    !estab && sysctl_tcp_sack &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_SACK)) {
 					opt_rx->sack_ok = 1;
 					tcp_sack_reset(opt_rx);
 				}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a..64db8dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -464,6 +464,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 				struct tcp_md5sig_key **md5) {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned size = 0;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -498,7 +499,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
 	}
-	if (likely(sysctl_tcp_sack)) {
+	if (likely(sysctl_tcp_sack &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_SACK))) {
 		opts->options |= OPTION_SACK_ADVERTISE;
 		if (unlikely(!(OPTION_TS & opts->options)))
 			size += TCPOLEN_SACKPERM_ALIGNED;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 6/7] Allow to turn off TCP window scale opt per route
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Add and use no window scale bit in the features field.

Note that this is not the same as setting a window scale of 0 
as would happen with window limit on route.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>
---
 include/linux/rtnetlink.h |    1 +
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    6 ++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2ab8c75..6784b34 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -380,6 +380,7 @@ enum
 #define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
+#define RTAX_FEATURE_NO_WSCALE	0x00000010
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d2f9742..4f5e914 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3739,7 +3739,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_WINDOW:
 				if (opsize == TCPOLEN_WINDOW && th->syn &&
-				    !estab && sysctl_tcp_window_scaling) {
+				    !estab && sysctl_tcp_window_scaling &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) {
 					__u8 snd_wscale = *(__u8 *)ptr;
 					opt_rx->wscale_ok = 1;
 					if (snd_wscale > 14) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8f30c18..ff60a21 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -496,7 +496,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
-	if (likely(sysctl_tcp_window_scaling)) {
+	if (likely(sysctl_tcp_window_scaling &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) {
 		opts->ws = tp->rx_opt.rcv_wscale;
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
@@ -2347,7 +2348,8 @@ static void tcp_connect_init(struct sock *sk)
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
-				  sysctl_tcp_window_scaling,
+				  (sysctl_tcp_window_scaling &&
+				   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)),
 				  &rcv_wscale);
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 0/7] Per route TCP options
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef

Third iteration of patch to allow disablng of TCP SACK, DSCAK, 
time stamp and window scale TCP options on a per route basis, now
with 100% less remote DoS opportunities (thank you Ilpo for 
spotting it ;-)

You usualy want to disable SACK, DSACK, time stamp or window
scale if you've got a piece of broken networking equipment somewhere 
as a stop gap until you can bring a big enough hammer to deal with
the broken network equipment. It doesn't make sense to "punish" the
entire connections going through the machine to destinations not 
related to the broken equipment.

This is doubly true when you're dealing with network containers
used to isolate several virtual domains.

Per route options implemented in free bits in the features route
entry property, which in some cases were reserved by name for these
options, so this does not inflate any structure.

Global sysctl based kill switches for these options are still
preserved, as some people seems to want them, so behaviour
is default to on, unless switched off either globaly or on
per route basis.

Tested on x86 using Qemu/KVM.  

Working but crude matching patch to iproute2 sent earlier to the list.

Patchset based on original work by Ori Finkelman and Yony Amit 
from ComSleep Ltd.

Gilad Ben-Yossef (7):
  Only parse time stamp TCP option in time wait sock
  Allow tcp_parse_options to consult dst entry
  Infrastructure for querying route entry features
  Add the no SACK route option feature
  Allow disabling TCP timestamp options per route
  Allow to turn off TCP window scale opt per route
  Allow disabling of DSACK TCP option per route

 include/linux/rtnetlink.h |    6 ++++--
 include/net/dst.h         |    8 +++++++-
 include/net/tcp.h         |    3 ++-
 net/ipv4/syncookies.c     |   27 ++++++++++++++-------------
 net/ipv4/tcp_input.c      |   26 ++++++++++++++++++--------
 net/ipv4/tcp_ipv4.c       |   21 ++++++++++++---------
 net/ipv4/tcp_minisocks.c  |    8 +++++---
 net/ipv4/tcp_output.c     |   18 +++++++++++++-----
 net/ipv6/syncookies.c     |   28 +++++++++++++++-------------
 net/ipv6/tcp_ipv6.c       |    3 ++-
 10 files changed, 92 insertions(+), 56 deletions(-)

^ permalink raw reply

* Re: [PATCH v2 8/8] Document future removal of sysctl_tcp_* options
From: Gilad Ben-Yossef @ 2009-10-26  8:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Bill Fink, William Allen Simpson, netdev, Ilpo Järvinen
In-Reply-To: <4AE52DBD.3030805@gmail.com>

Eric Dumazet wrote:

> Bill Fink a écrit :
>   
>> On Sun, 25 Oct 2009, Gilad Ben-Yossef wrote:
>>
>>     
>>> Eric Dumazet wrote:
>>>
>>>
>>> I still think having a global kill switch and per route options better 
>>> (basically use the exiting patch but not retire the global kill 
>>> switch|), but if you must Hgow about we leave the global sysctl as they 
>>> are and just have a two bit route option:
>>>
>>> 0 Use global default
>>> 1 Off
>>> 2 On
>>>
>>> It's kind of funny, because this is what the original patch from 
>>> Comsleep does and I thought it needlessly complicates things.
>>>
>>> So, what do you say - which will it be?
>>>       
>> I personally feel the 2-bit settings are overkill.  What i think
>> makes the most sense is for the global options to act as they always
>> have in the absence of any route specific settings, and for any
>> route specific settings to override the related global settings.
>> This is both simple and maintains backward compatibility.
>>     
>
> Backward compatibility is important, very important, if not the most
> important thing. Then usability comes.
>   
I tend to agree.
> I know some busy servers where adding/changing a single route makes them
> go crazy (because of ip route flush cache)
>
> So if a route is overriding a global conf, and the admin wants to make an
> emergency change during peak hours, he should do it by a global setting,
> or he wont use at all this new stuff, and stay conservative.
>
> Alternative would be to not trigger the flush of cache when changing
> features flags.
>
>   
OK. It really sounds like we should go with my first suggestion: global 
sysctl based kill switches, just as we have now and in addition, the 
ability to kill TCP options per route. The TCP option will be used if 
and only if both kill switches (global and per route) are not set.

What we achieve is:

1. Global kill switches work exactly as they do now, whether you use the 
new per route options or not, so backwards compatible.

2. In addition, if the global kill switch is not in effect, you can also 
kill the options on a per route basis.

I'm going to send third version of the patch to this effect, minus the 
new remote DoS possibility that Ilpo pointed out and leaving the global 
sysctl kill switches be.

If you like it, please ACK ;-)

Thanks,
Gilad



-- 
Gilad Ben-Yossef
Chief Coffee Drinker & CTO
Codefidence Ltd.

Web:   http://codefidence.com
Cell:  +972-52-8260388
Skype: gilad_codefidence
Tel:   +972-8-9316883 ext. 201
Fax:   +972-8-9316884
Email: gilad@codefidence.com

Check out our Open Source technology and training blog - http://tuxology.net

	"Linux is Ir. Ir, of course, is a form of hypereviscerated Reiyk."
		-- Marc Volovic, linux-il, 14 Dec 2000


^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Johannes Berg @ 2009-10-26  7:58 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Tilman Schmidt, David Miller, hidave.darkstar, linux-kernel, tglx,
	linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <20091026075438.GB6187@ff.dom.local>

[-- Attachment #1: Type: text/plain, Size: 765 bytes --]

On Mon, 2009-10-26 at 07:54 +0000, Jarek Poplawski wrote:

> > No, I wrote that I didn't know. I suppose now that I looked at it I do
> > know, and only disabling preemption is required.
> 
> But netif_rx has preemption disabled most of the time (by hardirqs
> disabling). So maybe disabling preemption isn't the main reason here
> either?

Not for netpoll though, which may or may not be relevant (if I were to
venture a guess I'd say it isn't and it disables preemption to be able
to do the softirq thing)

However, I lost track now of why we're discussing this.

Basically it boils down to using netif_rx() when in (soft)irq, and
netif_rx_ni() when in process context. That could just be an
optimisation, but it's a very valid one.

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Jarek Poplawski @ 2009-10-26  7:54 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Tilman Schmidt, David Miller, hidave.darkstar, linux-kernel, tglx,
	linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <1256543054.28230.6.camel@johannes.local>

On Mon, Oct 26, 2009 at 08:44:14AM +0100, Johannes Berg wrote:
> On Mon, 2009-10-26 at 07:41 +0000, Jarek Poplawski wrote:
> 
> > > netif_rx_ni() disables preemption.
> > 
> > You wrote earlier:
> > 
> > > [...] the networking layer needs to have
> > > packets handed to it with softirqs disabled.
> > 
> > How disabling preemption can fix something which needs softirqs
> > disabled? Could you be more precise?
> 
> No, I wrote that I didn't know. I suppose now that I looked at it I do
> know, and only disabling preemption is required.

But netif_rx has preemption disabled most of the time (by hardirqs
disabling). So maybe disabling preemption isn't the main reason here
either?

Jarek P.

^ permalink raw reply

* Re: [PATCH next-next-2.6] netdev: better dev_name_hash
From: Eric Dumazet @ 2009-10-26  7:48 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Octavian Purdila, netdev
In-Reply-To: <20091025233016.6860d9c7@nehalam>

Stephen Hemminger a écrit :
> I overkilled this with more functions and compared filenames as well.
> 
> 
> genarated names (dummyNNNN)
> Algorithm             Time (us)     Ratio       Max   StdDev
> kr_hash                  277925   152408.6   468448 543.19
> string_hash31            329356     5859.4    16042  44.18
> SuperFastHash            324570     4885.9    10502   2.29
> djb2                     327908     5608.5    15210  38.08
> string_hash17            326769     4883.6     9896   0.76
> full_name_hash           343196    63921.0   140628 343.62
> jhash_string             463801     4883.8    10085   1.02
> sdbm                     398587     9801.7    29634  99.18
> 
> filesystem names
> Algorithm             Time (us)     Ratio       Max   StdDev
> kr_hash                  278840   152314.9   468717 543.01
> string_hash31            331206     5802.1    16004  42.87
> SuperFastHash            325938     4887.5    13528   2.88
> djb2                     330621     5607.1    15333  38.05
> string_hash17            331181     4884.9    13274   1.78
> full_name_hash           347312    63972.2   141336 343.77
> jhash_string             466799     4885.2    13275   1.92
> sdbm                     403691     9771.7    29629  98.88
> 
> Ratio is the average number of buckets examined when scanning
> the whole set of names.
> 
> 
> 1) Increased hash buckets to 1024 which seems reasonable if we are
>    going to test that many names.
> 2) Increased name size to 256 so that longer filenames could be
>    checked and name blocks were not in same cache line
> 
> * SuperFastHash is too big to put inline
> 
> 

Thanks Stephen

1) dcache hash is very big on average machines.

2) dcache : We hash last component, against its parent, acting as a base.
   Your hashtest program considers the name as a single entity, giving different
   hash distribution.

3) netdev names are special, since we have only one parent, and
 smaller hash table.

4) jhash is not that expensive, but it might be because of huge working set of your
test program : strings are not in cpu caches and speed is mostly driven by ram bandwidth.

But current full_name_hash() seems a pretty bad choice !

^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Johannes Berg @ 2009-10-26  7:44 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Tilman Schmidt, David Miller, hidave.darkstar, linux-kernel, tglx,
	linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <20091026074126.GA6187@ff.dom.local>

[-- Attachment #1: Type: text/plain, Size: 474 bytes --]

On Mon, 2009-10-26 at 07:41 +0000, Jarek Poplawski wrote:

> > netif_rx_ni() disables preemption.
> 
> You wrote earlier:
> 
> > [...] the networking layer needs to have
> > packets handed to it with softirqs disabled.
> 
> How disabling preemption can fix something which needs softirqs
> disabled? Could you be more precise?

No, I wrote that I didn't know. I suppose now that I looked at it I do
know, and only disabling preemption is required.

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Jarek Poplawski @ 2009-10-26  7:41 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Tilman Schmidt, David Miller, hidave.darkstar, linux-kernel, tglx,
	linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <1256309191.12174.51.camel@johannes.local>

On Fri, Oct 23, 2009 at 04:46:31PM +0200, Johannes Berg wrote:
> On Fri, 2009-10-23 at 16:39 +0200, Tilman Schmidt wrote:
> 
> > Strange. Then what are the two separate functions netif_rx() and
> > netif_rx_ni() for?
> 
> netif_rx_ni() disables preemption.

You wrote earlier:

> [...] the networking layer needs to have
> packets handed to it with softirqs disabled.

How disabling preemption can fix something which needs softirqs
disabled? Could you be more precise?

> > > This really should be obvious. You're fixing the warning at the source
> > > of the warning, rather than the source of the problem.
> > 
> > Good idea. So please do tell us where the source of the problem is.
> 
> You use netif_rx_ni() instead of netif_rx() at whatever place that
> causes this problem.

This isn't a very precise description either.

Jarek P.

^ permalink raw reply

* Re: [PATCH 1/5] page allocator: Always wake kswapd when restarting an allocation attempt after direct reclaim failed
From: David Rientjes @ 2009-10-26  7:10 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Mel Gorman, Frans Pop, Jiri Kosina, Sven Geggus,
	Karol Lewandowski, Tobias Oetiker, Rafael J. Wysocki,
	David Miller, Reinette Chatre, Kalle Valo, Mohamed Abbas,
	Jens Axboe, John W. Linville, Pekka Enberg,
	Bartlomiej Zolnierkiewicz, Greg Kroah-Hartman,
	Stephan von Krawczynski, Kernel Testers List, netdev,
	linux-kernel
In-Reply-To: <20091026100019.2F4A.A69D9226@jp.fujitsu.com>

On Mon, 26 Oct 2009, KOSAKI Motohiro wrote:

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index bf72055..5a27896 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1899,6 +1899,12 @@ rebalance:
>  	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
>  		/* Wait for some write requests to complete then retry */
>  		congestion_wait(BLK_RW_ASYNC, HZ/50);
> +
> +		/*
> +		 * While we wait congestion wait, Amount of free memory can
> +		 * be changed dramatically. Thus, we kick kswapd again.
> +		 */
> +		wake_all_kswapd(order, zonelist, high_zoneidx);
>  		goto rebalance;
>  	}
>  

We're blocking to finish writeback of the directly reclaimed memory, why 
do we need to wake kswapd afterwards?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 1/4] Adds random ect generation to tfrc-sp sender side
From: Gerrit Renker @ 2009-10-26  6:55 UTC (permalink / raw)
  To: Ivo Calado; +Cc: dccp, netdev
In-Reply-To: <cb00fa210910210615v409d09c6p34cd6a48c48196f5@mail.gmail.com>

| >    That is, at the moment both the sender and receiver side of the ECN Nonce
| >    sum verification are placeholders which currently have no effect.
| >
| 
| Okay, then the implementation would be useless now.
I was not suggesting to throw away the patches, we can keep them for
later use.

They are a good starting basis once it makes sense to work with ECN.
Or how can we test ECN if we are not sure that the other parts work
as they are supposed to?

| >  3) Starting an implementation throws up further questions that need to
| >    be addressed, both the basis and the extension need to be verified.
| >
| > I would like to suggest to implement the basis, that is CCID-4 with ECN
| > (using plain ECT(0)), test with that until it works satisfactorily, and
| > then continue adding measures such as the ECN Nonce verification.
| >
| 
| Okay. But, when would be good to at least include random ECT
| generation? When DCCP ECN code will get fixed? Is there any work on
| this?
| 
I asked this on netdev earlier, there was not much enthusiasm.

The issue is that ECN belongs both to the network and the transport layer.
This network layer is in inet_ecn.h, outside of DCCP.

I believe that the changes would not be too hard to do, by changing the
macros. But it requires working with the people on netdev, in particular
to ensure it does not break something in the TCP/SCTP subsystems (both
also use ECN, and then there are also raw sockets).

I also have an interest in resolving this, due to the ugliness at the
moment for enabling ECN on IPv6.

RFC 2884, written by one of the Linux ECN developers, describes early
IPv4 ECN evaluation. It seems that initially it was planned to only 
support ECN over Ipv4, parts of the code may be still from that time.

We can pursue this in parallel to the other issues. Ideally this would be
resolved at the time the other parts of CCID-4 are ready for testing.

| > In summary, I would like to suggest to remove the ECN verification for
| > the moment and focus on the "basic" issues first.
| >
| > Would you be ok with that?
| >
| 
| Yes, we'll keep the ECN verification code here at our git until the
| scenario is ready.
| 
I was going to suggest to put them onto a webpage, such as yours, or on 
www.erg.abdn.ac.uk/users/gerrit/dccp there is also still some space.

Can you please elaborate how to keep your git tree and the one on
eden-feed synchronized? At the moment I have not made any changes other
than the ones I emailed you about. Is there a way of keeping both trees
in synch without running into versioning difficulties?

(The simplest way I can think of is to keep the patches in a separate
 set, or to spawn a subtree which contains the ECN patches on top of the
 CCID-4 tree.)

| > (Also for later) I wonder how to do the sums, with RFC 3168
| > ECT(0) = 0x2 => !0x2 = 0
| > ECT(1) = 0x1 => !0x1 = 0
| >
| 
| I don't understand. Can you try to explain it? Or cite RFC section
| that address it?

The values are from figure 1, page 7, the expressions evaluate both as 0:

void main(void)
{
	printf("!0x2 = %d, !0x1 = %d\n", !0x2, !0x1);
}

^ permalink raw reply

* Re: [PATCH next-next-2.6] netdev: better dev_name_hash
From: Stephen Hemminger @ 2009-10-26  6:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Octavian Purdila, netdev
In-Reply-To: <4AE4C1FA.7000002@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1536 bytes --]

I overkilled this with more functions and compared filenames as well.


genarated names (dummyNNNN)
Algorithm             Time (us)     Ratio       Max   StdDev
kr_hash                  277925   152408.6   468448 543.19
string_hash31            329356     5859.4    16042  44.18
SuperFastHash            324570     4885.9    10502   2.29
djb2                     327908     5608.5    15210  38.08
string_hash17            326769     4883.6     9896   0.76
full_name_hash           343196    63921.0   140628 343.62
jhash_string             463801     4883.8    10085   1.02
sdbm                     398587     9801.7    29634  99.18

filesystem names
Algorithm             Time (us)     Ratio       Max   StdDev
kr_hash                  278840   152314.9   468717 543.01
string_hash31            331206     5802.1    16004  42.87
SuperFastHash            325938     4887.5    13528   2.88
djb2                     330621     5607.1    15333  38.05
string_hash17            331181     4884.9    13274   1.78
full_name_hash           347312    63972.2   141336 343.77
jhash_string             466799     4885.2    13275   1.92
sdbm                     403691     9771.7    29629  98.88

Ratio is the average number of buckets examined when scanning
the whole set of names.


1) Increased hash buckets to 1024 which seems reasonable if we are
   going to test that many names.
2) Increased name size to 256 so that longer filenames could be
   checked and name blocks were not in same cache line

* SuperFastHash is too big to put inline


[-- Attachment #2: hashtest.c --]
[-- Type: text/x-c++src, Size: 6872 bytes --]

#include <stdio.h>
#include <stdint.h>
#include <malloc.h>
#include <sys/types.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>

static unsigned int
kr_hash(const unsigned char *str, unsigned int len)
{
	unsigned int hash = 0;

        while (len--)
	    hash += *str++;

	return hash;
}

static unsigned int
sdbm(const unsigned char *name, unsigned int len)
{
        unsigned long hash = 0;

        while (len--)
            hash = *name++ + (hash << 6) + (hash << 16) - hash;

        return hash;
}

#define init_name_hash()                0

/* partial hash update function. Assume roughly 4 bits per character */
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
        return (prevhash + (c << 4) + (c >> 4)) * 11;
}

/* Compute the hash for a name string. */
static unsigned int
full_name_hash(const unsigned char *name, unsigned int len)
{
        unsigned long hash = 0;
        while (len--)
                hash = partial_name_hash(*name++, hash);
        return hash;
}

static unsigned int
djb2(const unsigned char *str, unsigned int len)
{
	unsigned long hash = 5381;

        while (len--)
		hash = ((hash << 5) + hash) + *str++; /* hash * 33 + c */

        return hash;
}

static unsigned int
string_hash31(const unsigned char *name, unsigned int len)
{
        unsigned hash = 0;
        int i;

        for (i = 0; i < len; ++i)
                hash = 31 * hash + name[i];
        return hash;
}

static unsigned int
string_hash17(const unsigned char *name, unsigned int len)
{
        unsigned hash = 0;
        int i;

        for (i = 0; i < len; ++i)
                hash = 17 * hash + name[i];
        return hash;
}

/* will need to change to unaligned_access() */
static inline uint16_t get16bits(const unsigned char *data)
{
	return *(uint16_t *) data;
}

static inline unsigned int
SuperFastHash (const unsigned char * data, unsigned int len)
{
	uint32_t hash = len, tmp;
	int rem;

	rem = len & 3;
	len >>= 2;

	/* Main loop */
	for (;len > 0; len--) {
		hash  += get16bits (data);
		tmp    = (get16bits (data+2) << 11) ^ hash;
		hash   = (hash << 16) ^ tmp;
		data  += 2*sizeof (uint16_t);
		hash  += hash >> 11;
	}

	/* Handle end cases */
	switch (rem) {
        case 3: hash += get16bits (data);
                hash ^= hash << 16;
                hash ^= data[sizeof (uint16_t)] << 18;
                hash += hash >> 11;
                break;
        case 2: hash += get16bits (data);
                hash ^= hash << 11;
                hash += hash >> 17;
                break;
        case 1: hash += *data;
                hash ^= hash << 10;
                hash += hash >> 1;
	}

	/* Force "avalanching" of final 127 bits */
	hash ^= hash << 3;
	hash += hash >> 5;
	hash ^= hash << 4;
	hash += hash >> 17;
	hash ^= hash << 25;
	hash += hash >> 6;

	return hash;
}


/* NOTE: Arguments are modified. */
#define __jhash_mix(a, b, c) \
{ \
  a -= b; a -= c; a ^= (c>>13); \
  b -= c; b -= a; b ^= (a<<8); \
  c -= a; c -= b; c ^= (b>>13); \
  a -= b; a -= c; a ^= (c>>12);  \
  b -= c; b -= a; b ^= (a<<16); \
  c -= a; c -= b; c ^= (b>>5); \
  a -= b; a -= c; a ^= (c>>3);  \
  b -= c; b -= a; b ^= (a<<10); \
  c -= a; c -= b; c ^= (b>>15); \
}

/* The golden ration: an arbitrary value */
#define JHASH_GOLDEN_RATIO	0x9e3779b9

/* The most generic version, hashes an arbitrary sequence
 * of bytes.  No alignment or length assumptions are made about
 * the input key.
 */
static inline uint32_t jhash(const void *key, uint32_t length, uint32_t initval)
{
	uint32_t a, b, c, len;
	const uint8_t *k = key;

	len = length;
	a = b = JHASH_GOLDEN_RATIO;
	c = initval;

	while (len >= 12) {
		a += (k[0] +((uint32_t)k[1]<<8) +((uint32_t)k[2]<<16) +((uint32_t)k[3]<<24));
		b += (k[4] +((uint32_t)k[5]<<8) +((uint32_t)k[6]<<16) +((uint32_t)k[7]<<24));
		c += (k[8] +((uint32_t)k[9]<<8) +((uint32_t)k[10]<<16)+((uint32_t)k[11]<<24));

		__jhash_mix(a,b,c);

		k += 12;
		len -= 12;
	}

	c += length;
	switch (len) {
	case 11: c += ((uint32_t)k[10]<<24);
	case 10: c += ((uint32_t)k[9]<<16);
	case 9 : c += ((uint32_t)k[8]<<8);
	case 8 : b += ((uint32_t)k[7]<<24);
	case 7 : b += ((uint32_t)k[6]<<16);
	case 6 : b += ((uint32_t)k[5]<<8);
	case 5 : b += k[4];
	case 4 : a += ((uint32_t)k[3]<<24);
	case 3 : a += ((uint32_t)k[2]<<16);
	case 2 : a += ((uint32_t)k[1]<<8);
	case 1 : a += k[0];
	};

	__jhash_mix(a,b,c);

	return c;
}

static unsigned int
jhash_string(const unsigned char *name, unsigned int len)
{
	return jhash(name, len, 0);
}

#define TESTSIZE	10000000ul
#define HASHBITS	10
#define HASHSZ		(1<<HASHBITS)
#define HASHMSK		(HASHSZ-1)
#define IFNAMSIZ	256

static char *names[TESTSIZE];

static void generate_names(void)
{
	unsigned int i;
	char *buf = malloc(TESTSIZE * IFNAMSIZ);

	printf("\ngenarated names (dummyNNNN)\n");
	for (i = 0; i < TESTSIZE; i++) {
		snprintf(buf, IFNAMSIZ, "dummy%d", i);
		names[i] = buf;
		buf += IFNAMSIZ;
	}
}

static inline void measure(const char *name,  
			   unsigned int (*hash)(const unsigned char *, unsigned int))
{
	unsigned int i;
	struct timeval t0, t1;
	unsigned int dist[HASHSZ] = { 0 };
	unsigned long long ratio = 0;
	long us;
	unsigned long m = 0;
	const double avg = TESTSIZE / HASHSZ;
	double std = 0;

	gettimeofday(&t0, NULL);
	
	for (i = 0; i < TESTSIZE; i++) {
		const unsigned char *str = (const unsigned char *) names[i];
		unsigned int h = hash(str, strlen(names[i]));
		++dist[h & HASHMSK];
	}

	gettimeofday(&t1, NULL);
	us = (t1.tv_sec - t0.tv_sec) * 1000000;
	us += (t1.tv_usec - t0.tv_usec);

	for (i = 0; i < HASHSZ; i++) {
		unsigned int n = dist[i];
		unsigned long long s;
		double delta = (n - avg);

		s = n;
		s *= (1 + n);
		ratio += s / 2;
		if (n > m) m = n;
		
		std += delta * delta;
	}

	printf("%-20s %10ld %10.1f %8ld %6.2f\n", name, us, 
	       (double) ratio / (double)TESTSIZE, m, sqrt(std / TESTSIZE));

}
#define MEASURE(func)	measure(#func, &func)


static void filesystem_names(void)
{
	FILE *f = fopen("/tmp/names", "r");
	if (!f) { perror("/tmp/names"); exit(1); }
	unsigned int i = 0;
	char line[IFNAMSIZ];

	printf("\nfilesystem names\n");
	while (fgets(line, sizeof line, f) != NULL) {
		strncpy(names[i], line, IFNAMSIZ);
		if (++i == TESTSIZE)
			break;
	}
	fclose(f);

}


int main()
{

	generate_names();

	printf("Algorithm             Time (us)     Ratio       Max   StdDev\n");
	MEASURE(kr_hash);
	MEASURE(string_hash31);
	MEASURE(SuperFastHash);
	MEASURE(djb2);
	MEASURE(string_hash17);
	MEASURE(full_name_hash);
	MEASURE(jhash_string);
	MEASURE(sdbm);

	filesystem_names();

	printf("Algorithm             Time (us)     Ratio       Max   StdDev\n");
	MEASURE(kr_hash);
	MEASURE(string_hash31);
	MEASURE(SuperFastHash);
	MEASURE(djb2);
	MEASURE(string_hash17);
	MEASURE(full_name_hash);
	MEASURE(jhash_string);
	MEASURE(sdbm);

        return 0;
}

^ permalink raw reply

* Re: [PATCH next-next-2.6] netdev: better dev_name_hash
From: Eric Dumazet @ 2009-10-26  5:28 UTC (permalink / raw)
  To: Hagen Paul Pfeifer; +Cc: Octavian Purdila, netdev
In-Reply-To: <20091025224153.GB20987@nuttenaction>

Hagen Paul Pfeifer a écrit :
> * Octavian Purdila | 2009-10-25 23:55:32 [+0200]:
> 
>> My results shows that new17 is better or very close to jhash2. And I think its 
>> lighter then jhash too.
> 
> If new17 is very close to jhash/jhash2 then the cycles comes into play.
> Anyway, there is already a very potent hash interface in form of jhash{,2}.
> 
> +1 for jhash2
> 

In fact, new10 should be the 'perfect' hash for the "eth%d"
netdev use, not jhash (way more expensive in cpu cycles BTW)

Most linux machines in the world have less than 10 interfaces, jhash
would be really overkill.


Thanks


[PATCH net-next-2.6] netdev: better dev_name_hash

Octavian Purdila pointed out that the current dev_name_hash is 
not very good at spreading entries when a large number of interfaces
of the same type (e.g. ethXXXXX) are used.

Here is hash distribution of 16000 "dummy%d" devices :

full_name_hash[] = {
0, 0, 0, 375, 0, 0, 562, 0, 0, 0, 6, 0, 0, 0, 0, 56,
0, 0, 0, 374, 0, 0, 562, 0, 0, 0, 5, 0, 0, 0, 0, 57,
0, 0, 0, 375, 0, 0, 562, 0, 0, 0, 6, 1, 0, 0, 0, 56,
0, 0, 0, 375, 0, 0, 563, 0, 0, 0, 5, 1, 0, 0, 0, 56,
0, 0, 0, 375, 0, 0, 563, 0, 0, 0, 6, 1, 0, 0, 0, 56,
0, 0, 0, 375, 0, 0, 562, 0, 0, 0, 6, 0, 0, 0, 0, 57,
0, 0, 0, 375, 0, 0, 563, 0, 0, 0, 5, 0, 0, 0, 0, 56,
0, 0, 0, 376, 0, 0, 563, 0, 0, 0, 6, 1, 0, 0, 0, 56,
0, 0, 0, 375, 0, 0, 562, 0, 0, 0, 5, 1, 0, 0, 0, 57,
0, 0, 0, 374, 0, 0, 563, 0, 0, 0, 6, 1, 0, 0, 0, 56,
0, 0, 0, 375, 0, 0, 563, 0, 0, 0, 6, 1, 0, 0, 0, 56,
0, 0, 0, 376, 0, 0, 562, 0, 0, 0, 6, 0, 0, 0, 0, 56,
0, 0, 0, 375, 0, 0, 562, 0, 0, 0, 5, 0, 0, 0, 0, 56,
0, 0, 0, 375, 0, 0, 562, 0, 0, 0, 5, 1, 0, 0, 0, 56,
0, 0, 0, 375, 0, 0, 563, 0, 0, 0, 6, 1, 0, 0, 0, 57,
0, 0, 0, 375, 0, 0, 563, 0, 0, 0, 6, 1, 0, 0, 0, 56,
};

Instead of using full_name_hash(), netdev should use a hash suited
to its typical uses, which are a common substring followed by a base 10 number.

new hash distribution :

string_hash10[] = {
62, 63, 61, 60, 61, 63, 61, 62, 64, 62, 61, 62, 62, 60, 60, 61,
61, 59, 60, 63, 61, 60, 62, 63, 62, 60, 60, 60, 59, 60, 61, 59,
58, 61, 61, 60, 60, 61, 61, 58, 58, 59, 58, 57, 58, 59, 58, 59,
60, 60, 59, 61, 63, 61, 60, 60, 62, 61, 60, 61, 61, 60, 61, 62,
61, 62, 63, 63, 62, 62, 64, 64, 61, 62, 63, 62, 62, 63, 64, 64,
64, 64, 64, 62, 64, 65, 62, 62, 63, 63, 62, 62, 63, 64, 62, 62,
64, 62, 63, 65, 64, 63, 63, 64, 64, 63, 63, 67, 65, 64, 66, 66,
66, 66, 66, 65, 64, 63, 65, 63, 63, 66, 66, 64, 64, 65, 65, 64,
63, 66, 64, 64, 65, 65, 63, 64, 65, 63, 62, 61, 64, 61, 61, 63,
65, 64, 63, 64, 62, 62, 62, 64, 61, 61, 63, 63, 63, 63, 65, 64,
62, 61, 63, 61, 61, 62, 61, 61, 62, 63, 62, 62, 63, 66, 62, 61,
62, 62, 62, 61, 62, 61, 61, 61, 64, 62, 63, 65, 63, 63, 63, 64,
62, 60, 60, 63, 61, 61, 63, 62, 63, 65, 65, 63, 62, 63, 65, 62,
62, 64, 63, 63, 65, 66, 65, 64, 64, 65, 63, 64, 66, 63, 63, 65,
66, 64, 63, 64, 66, 64, 63, 65, 63, 64, 66, 66, 64, 63, 64, 64,
62, 62, 64, 61, 60, 62, 63, 62, 61, 62, 63, 61, 62, 63, 60, 59,
}; 


Based on a previous patch from Octavian Purdila

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/dev.c |   15 ++++++++++++++-
 1 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index fa88dcd..e192068 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -196,9 +196,22 @@ EXPORT_SYMBOL(dev_base_lock);
 #define NETDEV_HASHBITS	8
 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 
+/*
+ * Because of "eth%d" patterns, following hash is giving good distribution
+ */
+static inline unsigned int string_hash10(const char *name, unsigned int len)
+{
+	unsigned int i, hash = 0;
+
+	for (i = 0; i < len; i++)
+		hash = hash * 10 + name[i];
+
+	return hash;
+}
+
 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 {
-	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+	unsigned hash = string_hash10(name, strnlen(name, IFNAMSIZ));
 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 }
 

^ permalink raw reply related

* Re: [PATCH v2 8/8] Document future removal of sysctl_tcp_* options
From: Eric Dumazet @ 2009-10-26  5:03 UTC (permalink / raw)
  To: Bill Fink; +Cc: Gilad Ben-Yossef, William Allen Simpson, netdev
In-Reply-To: <20091025202114.152b94b8.billfink@mindspring.com>

Bill Fink a écrit :
> On Sun, 25 Oct 2009, Gilad Ben-Yossef wrote:
> 
>> Eric Dumazet wrote:
>>
>>> Bill Fink a écrit :
>>>
>>>   
>>>> And as mentioned previously, the global options can be quite useful
>>>> in certain test scenarios.  I also agree the per route settings are
>>>> a very useful addition.  I think the global and per route settings
>>>> are complementary and shouldn't be thought of as in conflict with
>>>> one another.
>>>>     
>>> Absolutely, global setting is a must when an admin wants a quick path.
>>>
>>> The more flexible would be to have two bits per route, plus
>>> 2 bits on the global configuration.
>>>
>>> global conf:
>>> 00 : timestamps OFF, unless a route setting is not 00
>>> 01 : timestamps ON, unless a route setting is not 00
>>> 10 : Force timestamps OFF, ignore route settings (emergency sysadmin request)
>>> 11 : Force timestamps ON, ignore route settings 
>>>
>>> Route settings (used *only* if global setting is 0Y)
>>> 00 : global conf is used
>>> 01 : Force timestamps being OFF for this route
>>> 10 : Force timestamps being ON for this route
>>> 11 : complement global conf
>> Hey, I have no issue to re-spin the patch with this suggestion, if you 
>> truly think this is valuable, but would you please consider the 
>> nightmare of having to just explain this to someone?
>>
>> It sounds to me way too complicated for what it does.
>>
>> I still think having a global kill switch and per route options better 
>> (basically use the exiting patch but not retire the global kill 
>> switch|), but if you must Hgow about we leave the global sysctl as they 
>> are and just have a two bit route option:
>>
>> 0 Use global default
>> 1 Off
>> 2 On
>>
>> It's kind of funny, because this is what the original patch from 
>> Comsleep does and I thought it needlessly complicates things.
>>
>> So, what do you say - which will it be?
> 
> I personally feel the 2-bit settings are overkill.  What i think
> makes the most sense is for the global options to act as they always
> have in the absence of any route specific settings, and for any
> route specific settings to override the related global settings.
> This is both simple and maintains backward compatibility.

Backward compatibility is important, very important, if not the most
important thing. Then usability comes.

I know some busy servers where adding/changing a single route makes them
go crazy (because of ip route flush cache)

So if a route is overriding a global conf, and the admin wants to make an
emergency change during peak hours, he should do it by a global setting,
or he wont use at all this new stuff, and stay conservative.

Alternative would be to not trigger the flush of cache when changing
features flags.

^ permalink raw reply

* Re: [PATCH next-next-2.6] netdev: better dev_name_hash
From: Stephen Hemminger @ 2009-10-26  4:43 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: netdev
In-Reply-To: <200910252158.53921.opurdila@ixiacom.com>

On Sun, 25 Oct 2009 21:58:53 +0200
Octavian Purdila <opurdila@ixiacom.com> wrote:

> 
> The current dev_name_hash is not very good at spreading entries when a
> large number of interfaces of the same type (e.g. ethXXXXX) are used.
> 
> Here are some performance numbers for creating 16000 dummy interfaces with
> and without the patch (with per device sysctl entries disabled)
> 
>     With patch                      Without patch
> 
>     real    0m 2.27s                real    0m 4.32s
>     user    0m 0.00s                user    0m 0.00s
>     sys     0m 1.13s                sys     0m 2.16s
> 
> Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
> ---
>  net/core/dev.c |    8 +++++++-
>  1 files changed, 7 insertions(+), 1 deletions(-)

My $.02 would for fixing full name hash because other usages would
benefit as well.  Inventing special case for network devices seems
unnecessary.

-- 

^ permalink raw reply

* Re: [RFC] make per interface sysctl entries configurable
From: Stephen Hemminger @ 2009-10-26  4:31 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: Eric Dumazet, Benjamin LaHaise, Cosmin Ratiu, netdev
In-Reply-To: <200910251954.49700.opurdila@ixiacom.com>

On Sun, 25 Oct 2009 19:54:49 +0200
Octavian Purdila <opurdila@ixiacom.com> wrote:

> 
> RFC patches are attached.
> 
> Another possible approach: add an interface flag and use it to decide whether 
> we want per interface sysctl entries or not.
> 
> Benchmarks for creating 1000 interface (with the ndst module previously posted 
> on the list, ppc750 @800Mhz machine):
> 
> - without the patches:
> 
> real    4m 38.27s
> user    0m 0.00s
> sys     2m 18.90s
> 
> - with the patches:
> 
> real    0m 0.10s
> user    0m 0.00s
> sys     0m 0.05s
> 
> Thanks,
> tavi

I would rather optimize the algorithm than give up and make it
not available. It should be possible to do better by just using some
better programming.

-- 

^ permalink raw reply

* Re: VLAN and ARP failure on tg3 drivers
From: Gertjan Hofman @ 2009-10-26  4:30 UTC (permalink / raw)
  To: Matt Carlson; +Cc: netdev@vger.kernel.org, Eric Dumazet, Benny Amorsen

Dear Matt, Eric, Benny,

Sorry about the slow response to your fast replies. I think Benny is correct, the 'problem' lies in the fact that we were using a VLAN ID of 0, without knowing its special significance. User error.

I tested it with other VLAN id's (>0) and it appears to work fine. We are not entirely sure we understand  why it used to work with VLAN ID 0 on the Broadcom chips and still does with a number of different cards (with >2.6.27 kernels).  What is the 'correct' behaviour for this incorrect usage ? When a PC returns the ARP response to the machine with the BroadCom card, it will have the destination address of the VLAN device, but presumably the VLAN ID tag set to zero.  Hmmm. I can live with not knowing the answer I guess.


Thanks again,

Gertjan



 

--- On Fri, 10/23/09, Matt Carlson <mcarlson@broadcom.com> wrote:

> From: Matt Carlson <mcarlson@broadcom.com>
> Subject: Re: VLAN and ARP failure on tg3 drivers
> To: "Gertjan Hofman" <gertjan_hofman@yahoo.com>
> Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>
> Date: Friday, October 23, 2009, 3:35 PM
> On Thu, Oct 22, 2009 at 09:52:42PM
> -0700, Gertjan Hofman wrote:
> > Dear Kernel developers,
> > 
> > A couple of weeks ago we tried to migrate from a
> 2.6.24? kernel to a 2.6.29 kernel and noticed our VLAN
> application no longer works.? The problem is easy to
> replicate:
> > 
> > 1. connect 2 PC's with a cross-over cable
> > 2. set up a fixed IP address to both PC's? (say
> 192.168.0.[1,2])
> > 3. create a vlan:? vconfig? add eth0 0.
> > 4. set IP addresses on the VLAN devices? (say
> 192.168.1.[1,2])
> > 5. try ping one machine from the other.
> > 
> > I tried to dig into the problem by using un-patched
> kernel.org kernels with Ubuntu .config files.? Kernels up to
> 2.6.26 work fine, kernels after and including 2.6.27 fail.
> The problem is that ARP messages are being dropped. If the
> ARP table is updated by hand on each machine, the
> communication across the VLAN works fine.
> > 
> > At first I thought the kernel VLAN code was the
> problem (we had an earlier issue with a regression in
> 2.6.24) but it looks like the problem is actually with the
> tg3 driver.? Our system uses Broadcom ethernet chips. I
> tried the same experiments with combination of boards that
> have Broadcom and none-Broadcom and the only time I see it
> fail is with the tg3? driver loaded.
> > 
> > Snooping with WireShark shows that a ARP request from
> the non-Broadcom machine is seen and even answered, but
> never appears back on the network. If the Broadcom machine
> orginates the ARP message, it never arrives at the
> destination. I tried lowering the size of the MTU to 1492 as
> well as giving each VLAN device a different MAC. No deal.
> > 
> > I tried to look at tg3 patch changes from 2.6.26 to
> 2.6.27 but I am not familiar enough with the Git system to
> extract the appropiate changes.? I am a bit surprised that I
> am not seeing any references to this on the web, the
> combination of >2.6.27 kernels, Broadcom and VLAN cant be
> that uncommon.
> > 
> > I would be happy to provide more information and to
> try tests if any one can suggest them.
> > 
> > Sincerely,
> > 
> > Gertjan
> 
> I don't see any reason why your setup should fail, but it
> doesn't hurt
> to gather more info about the problem.
> 
> What device are you experiencing this problem with? 
> Is management
> firmware enabled?  (`ethtool -i ethx`)
> 
>






      

^ permalink raw reply

* Re: PATCH 23/10]Optimize the upload speed for PPP connection.
From: Franko Fang @ 2009-10-26  2:44 UTC (permalink / raw)
  To: David Miller, william.allen.simpson
  Cc: netdev, linux-kernel, zihan, greg, haegar
In-Reply-To: <20091024.064603.249036129.davem@davemloft.net>

Thanks your advice.

But generally, PAGE_SIZE is 4096, whether it is too large or not?

If PAGE_SIZE is really appropriate, then I can resubmit the patch.

Thanks very much.
----- Original Message ----- 
From: "David Miller" <davem@davemloft.net>
To: <william.allen.simpson@gmail.com>
Cc: <huananhu@huawei.com>; <netdev@vger.kernel.org>; <linux-kernel@vger.kernel.org>; <zihan@huawei.com>; <greg@kroah.com>; <haegar@sdinet.de>
Sent: Saturday, October 24, 2009 9:46 PM
Subject: Re: PATCH 23/10]Optimize the upload speed for PPP connection.


> From: William Allen Simpson <william.allen.simpson@gmail.com>
> Date: Fri, 23 Oct 2009 07:46:08 -0400
> 
>> Concur.  I'd go further than that, my code usually made room for at
>> least
>> a full MTU (MRU) with HDLC escaping.  To minimize context switches,
>> that
>> should be 3014 ((1500 MRU + 2 FCS + 4 header) * 2 escapes + 2 flags).
>> 
>> Even in the old days, when memory was tight, context switches and
>> interrupt
>> time were more expensive, too.  PPP is supposed to scale to OC-192.
> 
> Actually I'd like to see ->obuf allocated externally and then
> make it simply PAGE_SIZE.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* linux-next: manual merge of the net tree with the i2c tree
From: Stephen Rothwell @ 2009-10-26  2:37 UTC (permalink / raw)
  To: David Miller, netdev
  Cc: linux-next, linux-kernel, Mika Kuoppala, Jean Delvare,
	Ben Hutchings

Hi all,

Today's linux-next merge of the net tree got a conflict in
drivers/net/sfc/sfe4001.c between commit
3f7c0648f727a6d5baf6117653e4001dc877b90b ("i2c: Prevent priority
inversion on top of bus lock") from the i2c tree and commit
c9597d4f89565b6562bd3026adbe6eac6c317f47 ("sfc: Merge sfe4001.c into
falcon_boards.c") from the net tree.

I have applied the following merge fixup patch (after removing
drivers/net/sfc/sfe4001.c) and can carry it as necessary.

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 26 Oct 2009 12:24:55 +1100
Subject: [PATCH] net: merge fixup for drivers/net/sfc/falcon_boards.c

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 drivers/net/sfc/falcon_boards.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/sfc/falcon_boards.c b/drivers/net/sfc/falcon_boards.c
index 99f7372..788b336 100644
--- a/drivers/net/sfc/falcon_boards.c
+++ b/drivers/net/sfc/falcon_boards.c
@@ -327,7 +327,7 @@ static int sfn4111t_reset(struct efx_nic *efx)
 	efx_oword_t reg;
 
 	/* GPIO 3 and the GPIO register are shared with I2C, so block that */
-	mutex_lock(&efx->i2c_adap.bus_lock);
+	rt_mutex_lock(&efx->i2c_adap.bus_lock);
 
 	/* Pull RST_N (GPIO 2) low then let it up again, setting the
 	 * FLASH_CFG_1 strap (GPIO 3) appropriately.  Only change the
@@ -343,7 +343,7 @@ static int sfn4111t_reset(struct efx_nic *efx)
 	efx_writeo(efx, &reg, FR_AB_GPIO_CTL);
 	msleep(1);
 
-	mutex_unlock(&efx->i2c_adap.bus_lock);
+	rt_mutex_unlock(&efx->i2c_adap.bus_lock);
 
 	ssleep(1);
 	return 0;
-- 
1.6.5

^ permalink raw reply related

* Re: [PATCH] virtio-net: fix data corruption with OOM
From: Rusty Russell @ 2009-10-26  1:41 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, kvm, virtualization
In-Reply-To: <20091025170340.GA22099@redhat.com>

On Mon, 26 Oct 2009 03:33:40 am Michael S. Tsirkin wrote:
> virtio net used to unlink skbs from send queues on error,
> but ever since 48925e372f04f5e35fec6269127c62b2c71ab794
> we do not do this. This causes guest data corruption and crashes
> with vhost since net core can requeue the skb or free it without
> it being taken off the list.
> 
> This patch fixes this by queueing the skb after successfull
> transmit.

I originally thought that this was racy: as soon as we do add_buf, we need to
make sure we're ready for the callback (for virtio_pci, it's ->kick, but we
shouldn't rely on that).

So a comment would be nice.  How's this?

Subject: virtio-net: fix data corruption with OOM
Date: Sun, 25 Oct 2009 19:03:40 +0200
From: "Michael S. Tsirkin" <mst@redhat.com>

virtio net used to unlink skbs from send queues on error,
but ever since 48925e372f04f5e35fec6269127c62b2c71ab794
we do not do this. This causes guest data corruption and crashes
with vhost since net core can requeue the skb or free it without
it being taken off the list.

This patch fixes this by queueing the skb after successful
transmit.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (+ comment)
---

Rusty, here's a fix for another data corrupter I saw.
This fixes a regression from 2.6.31, so definitely
2.6.32 I think. Comments?

 drivers/net/virtio_net.c |    8 +++++---
 1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -516,8 +516,7 @@ again:
 	/* Free up any pending old buffers before queueing new ones. */
 	free_old_xmit_skbs(vi);

-	/* Put new one in send queue and do transmit */
-	__skb_queue_head(&vi->send, skb);
+	/* Try to transmit */
 	capacity = xmit_skb(vi, skb);

 	/* This can happen with OOM and indirect buffers. */
@@ -531,8 +530,17 @@ again:
 		}
 		return NETDEV_TX_BUSY;
 	}
+	vi->svq->vq_ops->kick(vi->svq);

-	vi->svq->vq_ops->kick(vi->svq);
+	/*
+	 * Put new one in send queue.  You'd expect we'd need this before
+	 * xmit_skb calls add_buf(), since the callback can be triggered
+	 * immediately after that.  But since the callback just triggers
+	 * another call back here, normal network xmit locking prevents the
+	 * race.
+	 */
+	__skb_queue_head(&vi->send, skb);
+
 	/* Don't wait up for transmitted skbs to be freed. */
 	skb_orphan(skb);
 	nf_reset(skb);

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox