* [PATCH 02/16] tcp: Abstract back handling peer aliveness test into helper function.
From: David Miller @ 2012-07-10 15:07 UTC (permalink / raw)
To: netdev
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_ipv4.c | 2 +-
net/ipv4/tcp_metrics.c | 10 ++++++++++
net/ipv6/tcp_ipv6.c | 2 +-
4 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 98ca797..5478356 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -389,6 +389,7 @@ extern void tcp_enter_loss(struct sock *sk, int how);
extern void tcp_clear_retrans(struct tcp_sock *tp);
extern void tcp_update_metrics(struct sock *sk);
extern void tcp_init_metrics(struct sock *sk);
+extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
extern void tcp_disable_fack(struct tcp_sock *tp);
extern void tcp_close(struct sock *sk, long timeout);
extern void tcp_init_sock(struct sock *sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 64568fa..e9312a8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1405,7 +1405,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
- (!dst || !dst_metric(dst, RTAX_RTT))) {
+ !tcp_peer_is_proven(req, dst)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 2793ecf..9afe703 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1,7 +1,9 @@
+#include <linux/module.h>
#include <linux/cache.h>
#include <linux/tcp.h>
#include <net/inet_connection_sock.h>
+#include <net/request_sock.h>
#include <net/sock.h>
#include <net/dst.h>
#include <net/tcp.h>
@@ -190,3 +192,11 @@ reset:
tp->snd_cwnd = tcp_init_cwnd(tp, dst);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
+
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
+{
+ if (!dst)
+ return false;
+ return dst_metric(dst, RTAX_RTT) ? true : false;
+}
+EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6cc67ed..75d1795 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1177,7 +1177,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
- (!dst || !dst_metric(dst, RTAX_RTT))) {
+ !tcp_peer_is_proven(req, dst)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
--
1.7.10.4
^ permalink raw reply related
* [PATCH 01/16] tcp: Move dynamic metrics handling into separate file.
From: David Miller @ 2012-07-10 15:07 UTC (permalink / raw)
To: netdev
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/tcp.h | 4 +
net/ipv4/Makefile | 2 +-
net/ipv4/tcp_input.c | 188 +----------------------------------------------
net/ipv4/tcp_metrics.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 199 insertions(+), 187 deletions(-)
create mode 100644 net/ipv4/tcp_metrics.c
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 53fb7d8..98ca797 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,6 +388,8 @@ extern void tcp_enter_frto(struct sock *sk);
extern void tcp_enter_loss(struct sock *sk, int how);
extern void tcp_clear_retrans(struct tcp_sock *tp);
extern void tcp_update_metrics(struct sock *sk);
+extern void tcp_init_metrics(struct sock *sk);
+extern void tcp_disable_fack(struct tcp_sock *tp);
extern void tcp_close(struct sock *sk, long timeout);
extern void tcp_init_sock(struct sock *sk);
extern unsigned int tcp_poll(struct file * file, struct socket *sock,
@@ -556,6 +558,8 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
return (tp->srtt >> 3) + tp->rttvar;
}
+extern void tcp_set_rto(struct sock *sk);
+
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3b..5a23e8b 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o \
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ca0d0e7..055ac49 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
-int sysctl_tcp_nometrics_save __read_mostly;
int sysctl_tcp_thin_dupack __read_mostly;
@@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static inline void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk)
tcp_bound_rto(sk);
}
-/* Save metrics learned by this TCP session.
- This function is called only, when TCP finishes successfully
- i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
- */
-void tcp_update_metrics(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (sysctl_tcp_nometrics_save)
- return;
-
- if (dst && (dst->flags & DST_HOST)) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- int m;
- unsigned long rtt;
-
- dst_confirm(dst);
-
- if (icsk->icsk_backoff || !tp->srtt) {
- /* This session failed to estimate rtt. Why?
- * Probably, no packets returned in time.
- * Reset our results.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT)))
- dst_metric_set(dst, RTAX_RTT, 0);
- return;
- }
-
- rtt = dst_metric_rtt(dst, RTAX_RTT);
- m = rtt - tp->srtt;
-
- /* If newly calculated rtt larger than stored one,
- * store new one. Otherwise, use EWMA. Remember,
- * rtt overestimation is always better than underestimation.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT))) {
- if (m <= 0)
- set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
- else
- set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
- }
-
- if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
- unsigned long var;
- if (m < 0)
- m = -m;
-
- /* Scale deviation to rttvar fixed point */
- m >>= 1;
- if (m < tp->mdev)
- m = tp->mdev;
-
- var = dst_metric_rtt(dst, RTAX_RTTVAR);
- if (m >= var)
- var = m;
- else
- var -= (var - m) >> 2;
-
- set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
- }
-
- if (tcp_in_initial_slowstart(tp)) {
- /* Slow start still did not finish. */
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
- if (!dst_metric_locked(dst, RTAX_CWND) &&
- tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
- icsk->icsk_ca_state == TCP_CA_Open) {
- /* Cong. avoidance phase, cwnd is reliable. */
- if (!dst_metric_locked(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH,
- max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND,
- (dst_metric(dst, RTAX_CWND) +
- tp->snd_cwnd) >> 1);
- } else {
- /* Else slow start did not finish, cwnd is non-sense,
- ssthresh may be also invalid.
- */
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND,
- (dst_metric(dst, RTAX_CWND) +
- tp->snd_ssthresh) >> 1);
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
- }
-
- if (!dst_metric_locked(dst, RTAX_REORDERING)) {
- if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
- tp->reordering != sysctl_tcp_reordering)
- dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
- }
- }
-}
-
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
@@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
* Packet counting of FACK is based on in-order assumptions, therefore TCP
* disables it when reordering is detected
*/
-static void tcp_disable_fack(struct tcp_sock *tp)
+void tcp_disable_fack(struct tcp_sock *tp)
{
/* RFC3517 uses different metric in lost marker => reset on change */
if (tcp_is_fack(tp))
@@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
}
-/* Initialize metrics on socket. */
-
-static void tcp_init_metrics(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst == NULL)
- goto reset;
-
- dst_confirm(dst);
-
- if (dst_metric_locked(dst, RTAX_CWND))
- tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
- if (dst_metric(dst, RTAX_SSTHRESH)) {
- tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- } else {
- /* ssthresh may have been reduced unnecessarily during.
- * 3WHS. Restore it back to its initial default.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- }
- if (dst_metric(dst, RTAX_REORDERING) &&
- tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
- tcp_disable_fack(tp);
- tcp_disable_early_retrans(tp);
- tp->reordering = dst_metric(dst, RTAX_REORDERING);
- }
-
- if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
- goto reset;
-
- /* Initial rtt is determined from SYN,SYN-ACK.
- * The segment is small and rtt may appear much
- * less than real one. Use per-dst memory
- * to make it more realistic.
- *
- * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal circumstances sending small
- * packets force peer to delay ACKs and calculation is correct too.
- * The algorithm is adaptive and, provided we follow specs, it
- * NEVER underestimate RTT. BUT! If peer tries to make some clever
- * tricks sort of "quick acks" for time long enough to decrease RTT
- * to low value, and then abruptly stops to do it and starts to delay
- * ACKs, wait for troubles.
- */
- if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
- tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
- tp->rtt_seq = tp->snd_nxt;
- }
- if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
- tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
- tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
- }
- tcp_set_rto(sk);
-reset:
- if (tp->srtt == 0) {
- /* RFC6298: 5.7 We've failed to get a valid RTT sample from
- * 3WHS. This is most likely due to retransmission,
- * including spurious one. Reset the RTO back to 3secs
- * from the more aggressive 1sec to avoid more spurious
- * retransmission.
- */
- tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
- }
- /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
- * retransmitted. In light of RFC6298 more aggressive 1sec
- * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
- * retransmission has occurred.
- */
- if (tp->total_retrans > 1)
- tp->snd_cwnd = 1;
- else
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
static void tcp_update_reordering(struct sock *sk, const int metric,
const int ts)
{
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 0000000..2793ecf
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,192 @@
+#include <linux/cache.h>
+#include <linux/tcp.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_nometrics_save __read_mostly;
+
+/* Save metrics learned by this TCP session. This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (sysctl_tcp_nometrics_save)
+ return;
+
+ if (dst && (dst->flags & DST_HOST)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ int m;
+ unsigned long rtt;
+
+ dst_confirm(dst);
+
+ if (icsk->icsk_backoff || !tp->srtt) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time.
+ * Reset our results.
+ */
+ if (!(dst_metric_locked(dst, RTAX_RTT)))
+ dst_metric_set(dst, RTAX_RTT, 0);
+ return;
+ }
+
+ rtt = dst_metric_rtt(dst, RTAX_RTT);
+ m = rtt - tp->srtt;
+
+ /* If newly calculated rtt larger than stored one,
+ * store new one. Otherwise, use EWMA. Remember,
+ * rtt overestimation is always better than underestimation.
+ */
+ if (!(dst_metric_locked(dst, RTAX_RTT))) {
+ if (m <= 0)
+ set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
+ else
+ set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
+ }
+
+ if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
+ unsigned long var;
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev)
+ m = tp->mdev;
+
+ var = dst_metric_rtt(dst, RTAX_RTTVAR);
+ if (m >= var)
+ var = m;
+ else
+ var -= (var - m) >> 2;
+
+ set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
+ }
+
+ if (tcp_in_initial_slowstart(tp)) {
+ /* Slow start still did not finish. */
+ if (dst_metric(dst, RTAX_SSTHRESH) &&
+ !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+ (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
+ dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
+ if (!dst_metric_locked(dst, RTAX_CWND) &&
+ tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
+ dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
+ } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ icsk->icsk_ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!dst_metric_locked(dst, RTAX_SSTHRESH))
+ dst_metric_set(dst, RTAX_SSTHRESH,
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+ if (!dst_metric_locked(dst, RTAX_CWND))
+ dst_metric_set(dst, RTAX_CWND,
+ (dst_metric(dst, RTAX_CWND) +
+ tp->snd_cwnd) >> 1);
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ ssthresh may be also invalid.
+ */
+ if (!dst_metric_locked(dst, RTAX_CWND))
+ dst_metric_set(dst, RTAX_CWND,
+ (dst_metric(dst, RTAX_CWND) +
+ tp->snd_ssthresh) >> 1);
+ if (dst_metric(dst, RTAX_SSTHRESH) &&
+ !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+ tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
+ dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
+ }
+
+ if (!dst_metric_locked(dst, RTAX_REORDERING)) {
+ if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+ dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
+ }
+ }
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst == NULL)
+ goto reset;
+
+ dst_confirm(dst);
+
+ if (dst_metric_locked(dst, RTAX_CWND))
+ tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
+ if (dst_metric(dst, RTAX_SSTHRESH)) {
+ tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ } else {
+ /* ssthresh may have been reduced unnecessarily during.
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ }
+ if (dst_metric(dst, RTAX_REORDERING) &&
+ tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+ tcp_disable_fack(tp);
+ tcp_disable_early_retrans(tp);
+ tp->reordering = dst_metric(dst, RTAX_REORDERING);
+ }
+
+ if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
+ goto reset;
+
+ /* Initial rtt is determined from SYN,SYN-ACK.
+ * The segment is small and rtt may appear much
+ * less than real one. Use per-dst memory
+ * to make it more realistic.
+ *
+ * A bit of theory. RTT is time passed after "normal" sized packet
+ * is sent until it is ACKed. In normal circumstances sending small
+ * packets force peer to delay ACKs and calculation is correct too.
+ * The algorithm is adaptive and, provided we follow specs, it
+ * NEVER underestimate RTT. BUT! If peer tries to make some clever
+ * tricks sort of "quick acks" for time long enough to decrease RTT
+ * to low value, and then abruptly stops to do it and starts to delay
+ * ACKs, wait for troubles.
+ */
+ if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
+ tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
+ tp->rtt_seq = tp->snd_nxt;
+ }
+ if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
+ tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
+ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+ }
+ tcp_set_rto(sk);
+reset:
+ if (tp->srtt == 0) {
+ /* RFC6298: 5.7 We've failed to get a valid RTT sample from
+ * 3WHS. This is most likely due to retransmission,
+ * including spurious one. Reset the RTO back to 3secs
+ * from the more aggressive 1sec to avoid more spurious
+ * retransmission.
+ */
+ tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+ }
+ /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+ * retransmitted. In light of RFC6298 more aggressive 1sec
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+ * retransmission has occurred.
+ */
+ if (tp->total_retrans > 1)
+ tp->snd_cwnd = 1;
+ else
+ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
--
1.7.10.4
^ permalink raw reply related
* [PATCH 0/16] Metrics restructuring.
From: David Miller @ 2012-07-10 15:07 UTC (permalink / raw)
To: netdev
This patch series works towards the goal of minimizing the amount
of things that can change in an ipv4 route.
In a regime where the routing cache is removed, route changes will
lead to cloning in the FIB tables or similar.
The largest trigger of route metrics writes, TCP, now has its own
cache of dynamic metric state. The timewait timestamps are stored
there now as well.
As a result of that, pre-cowing metrics is no longer necessary,
and therefore FLOWI_FLAG_PRECOW_METRICS is removed.
Redirect and PMTU handling is moved back into the ipv4 routes. I'm
sorry for all the headaches that trying to do this in the inetpeer has
caused; it was the wrong approach for sure.
Since metrics become read-only for ipv4 we no longer need the inetpeer
hung off of the ipv4 routes either. So those disappear too.
Also, timewait sockets no longer need to hold onto an inetpeer either.
After this series, we still have some details to resolve wrt. PMTU and
redirects for a route-cache-less system:
1) With just the plain route cache removal, PMTU will continue to
work mostly fine. This is because of how the local route users
call down into the PMTU update code with the route they already
hold.
However, if we wish to cache pre-computed routes in fib_info
nexthops (which we want for performance), then we need to add
route cloning for PMTU events.
2) Redirects require more work. First, redirects must be changed to
be handled like PMTU, wherein we call down into the sockets and
other entities, and then they call back into the routing code with
the route they were using.
So we'll be adding an ->update_nexthop() method alongside
->update_pmtu().
And then, like for PMTU, we'll need cloning support once we start
caching routes in the fib_info nexthops.
But that's it, we can completely pull the trigger and remove the
routing cache with minimal disruptions.
As it is, this patch series alone helps a lot of things. For one,
routing cache entry creation should be a lot faster, because we no
longer do inetpeer lookups (even to check if an entry exists).
This patch series also opens the door for non-DST_HOST ipv4 routes,
because nothing fundamentally cares about rt->rt_dst any more. It
can be removed with the base routing cache removal patch. In fact,
that was the primary goal of this patch series.
Signed-off-by: David S. Miller <davem@davemloft.net>
^ permalink raw reply
* [PATCH] ipv6: fix RTPROT_RA markup of RA routes w/nexthops
From: Denis Ovsienko @ 2012-07-10 14:45 UTC (permalink / raw)
To: netdev
In-Reply-To: <20120709.144002.6394436211445151.davem@davemloft.net>
From: Denis Ovsienko <infrastation@yandex.ru>
Userspace implementations of network routing protocols sometimes need to
tell RA-originated IPv6 routes from other kernel routes to make proper
routing decisions. This makes most sense for RA routes with nexthops,
namely, default routes and Route Information routes.
The intended means of preserving RA route origin in a netlink message is
through indicating RTPROT_RA as protocol code. Function rt6_fill_node()
tried to do that for default routes, but its test condition was
wrong. This change is modeled after the original mailing list posting
by Jeff Haran. It fixes the test condition for default route case and
sets the same behaviour for Route Information case (both types use
nexthops). Handling of the 3rd RA route type, Prefix Information, is
left unchanged, as it stands for interface connected routes (without
nexthops).
Signed-off-by: Denis Ovsienko <infrastation@yandex.ru>
---
net/ipv6/route.c | 10 ++++++----
1 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 999a982..238b1ee 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2440,10 +2440,12 @@ static int rt6_fill_node(struct net *net,
rtm->rtm_protocol = rt->rt6i_protocol;
if (rt->rt6i_flags & RTF_DYNAMIC)
rtm->rtm_protocol = RTPROT_REDIRECT;
- else if (rt->rt6i_flags & RTF_ADDRCONF)
- rtm->rtm_protocol = RTPROT_KERNEL;
- else if (rt->rt6i_flags & RTF_DEFAULT)
- rtm->rtm_protocol = RTPROT_RA;
+ else if (rt->rt6i_flags & RTF_ADDRCONF) {
+ if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
+ rtm->rtm_protocol = RTPROT_RA;
+ else
+ rtm->rtm_protocol = RTPROT_KERNEL;
+ }
if (rt->rt6i_flags & RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
--
1.7.7.6
^ permalink raw reply related
* Re: TCP transmit performance regression
From: Eric Dumazet @ 2012-07-10 14:45 UTC (permalink / raw)
To: Ming Lei; +Cc: Network Development, David Miller
In-Reply-To: <CACVXFVPmec5_3Up9qM4iA90Xua9J_E-aRT-2g7Hu7TR4zRKQtA@mail.gmail.com>
On Tue, 2012-07-10 at 22:22 +0800, Ming Lei wrote:
> Looks single page allocation won't put too much pressure on MM, that is
> why I suggested to avoid copy if the skb buffer size is less or equal one
> page. Anyway, unnecessary copy will increase computation and consume power.
AFAIK this long thread started with drivers/net/usb/smsc95xx.c using
32KB buffers, that's order-3 pages, not 'single page'
Definitely very wrong. You can try to claim the contrary, it won't be
wise.
^ permalink raw reply
* Re: [net-next patch v2] bnx2x: Add run-time CNIC support
From: David Miller @ 2012-07-10 14:40 UTC (permalink / raw)
To: eilong; +Cc: meravs, netdev, dmitry
In-Reply-To: <1341931085.27035.13.camel@lb-tlvb-eilong.il.broadcom.com>
From: "Eilon Greenstein" <eilong@broadcom.com>
Date: Tue, 10 Jul 2012 17:38:05 +0300
> On Tue, 2012-07-10 at 06:06 -0700, David Miller wrote:
>> Why can't you turn CNIC off at the start, and if a CNIC user actually
>> arrives and is activated, reset the entire chip and put it into CNIC
>> mode?
>
> Since the CNIC mode should not change under traffic, and since it is a
> shared HW attribute, we need to consider a scenario on which one
> interface is loaded and running in L2 only mode, and then on another
> interface the CNIC is required, but enabling it will affect the first
> interface that is already running.
If CNIC is off, the only effect for other users of the chips is
perhaps some dropped packets. This is acceptable.
That is why I said you should only go from CNIC off to CNIC on,
and never the other way around.
I have yet to see a justification why you cannot implement this
properly, and my patience is completely exhausted on this issue.
^ permalink raw reply
* Re: [net-next patch v2] bnx2x: Add run-time CNIC support
From: Eilon Greenstein @ 2012-07-10 14:38 UTC (permalink / raw)
To: David Miller; +Cc: meravs, netdev, dmitry
In-Reply-To: <20120710.060616.2081630953053267615.davem@davemloft.net>
On Tue, 2012-07-10 at 06:06 -0700, David Miller wrote:
> From: "Eilon Greenstein" <eilong@broadcom.com>
> Date: Tue, 10 Jul 2012 15:41:29 +0300
>
> > OK. Since it blocks the ability to add SR-IOV support, is it acceptable
> > to submit it as constant enabled for PF and disabled for VF (SR-IOV)?
>
> You're not describing to me why you guys are turning on features like
> the CNIC mode before you necessarily have any users of that feature.
The chips controlled by the bnx2x have shared HW - there is more than
one port using the same HW. So changing shared HW configuration affects
more than one interface and therefore should not be done in runtime -
one of the requirements is that operations on one interface will not
affect any other interface.
> Why can't you turn CNIC off at the start, and if a CNIC user actually
> arrives and is activated, reset the entire chip and put it into CNIC
> mode?
Since the CNIC mode should not change under traffic, and since it is a
shared HW attribute, we need to consider a scenario in which one
interface is loaded and running in L2 only mode, and then on another
interface the CNIC is required, but enabling it will affect the first
interface that is already running.
> And if CNIC being on is such a latency killer, why in the world
> haven't you done things more reasonably like that from the very
> beginning?
>
> Why are you making it so that lower latency with your chips is only
> available to a group of users who are effectively statistically
> insignificant?
The chip latency is advertised with the CNIC support for customers.
However, some of them have full control over the environment and do not
care about offloaded storage and they were able to optimize it by
removing the CNIC completely. This is somewhat similar to customers that
do not want the other port and we tweak the device nvram to completely
shutdown one port and by that save some power - most customers that use
only one port cannot benefit from this additional power saving of
completely disabling that port. Most of our customers are OEMs that sell
machines that might run Linux, and they want to allow users to use iSCSI
and FCoE - and we are not enabling this extra optimization for those -
simply because the HW was designed to be a converged NIC and this L2
only optimization was added later for those special cases.
This patch is removing the ifdefs from all over the bnx2x and placing
the equivalent ifdef about the Kconfig in one location - this will allow
adding support for the SR-IOV that cannot support the CNIC alongside
with PF that does support it.
^ permalink raw reply
* Re: TCP transmit performance regression
From: Ming Lei @ 2012-07-10 14:22 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Network Development, David Miller
In-Reply-To: <1341928931.3265.5263.camel@edumazet-glaptop>
On Tue, Jul 10, 2012 at 10:02 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> I am kind of annoyed you sent on netdev a copy of a _private_ mail.
I am sure that your reply which includes below is not from a private mail:
Only because skbs were fat (8KB allocated/truesize, for a single
1500 bytes frame)
>
> Next time, make sure you dont do that without my consent.
OK
> On Tue, 2012-07-10 at 21:37 +0800, Ming Lei wrote:
>
>> Could you explain why the truesize of SKB is 8KB for single
>> 1500bytes frame?
>>
>
> Because the driver uses skb_alloc(4096) for example ?
>
> I don't know, you don't tell us the driver.
>
>
> Goal is to have skb->head points to a 2048 bytes area, so truesize
> should be 2048 + sizeof(sk_buff) (including struct shared_info)
>
>> I observed it is 2560bytes for RX SKBs inside asix_rx_fixup with
>> rx_urb_size of 2048 on beagle-xm.
>>
>
> Thats because using 2048 bytes for the urb buffer (excluding
> shared_info) means you need :
>
> 2048 + sizeof(struct shared_info) + sizeof(sk_buff) = 2560
>
> In fact 2048 + sizeof(struct shared_info) means a full 4096 area is
> used.
>
> You have 2560 on recent kernels because the way netdev_alloc_frag()
> works.
>
> That's why copybreak can actually save RAM. Since it is adding a copy,
> we try to use it only on slow devices.
It looks like a single-page allocation won't put too much pressure on MM, which
is why I suggested avoiding the copy if the skb buffer size is less than or equal
to one page. Anyway, an unnecessary copy will increase computation and consume power.
Thanks,
--
Ming Lei
^ permalink raw reply
* Re: TCP transmit performance regression
From: Eric Dumazet @ 2012-07-10 14:02 UTC (permalink / raw)
To: Ming Lei; +Cc: Network Development, David Miller
In-Reply-To: <CACVXFVM-fhQJX+EFNoFSnuVAjqfRM7OjzbQML7H+tMEjxG7Rug@mail.gmail.com>
I am kind of annoyed you sent on netdev a copy of a _private_ mail.
Next time, make sure you dont do that without my consent.
On Tue, 2012-07-10 at 21:37 +0800, Ming Lei wrote:
> Could you explain why the truesize of SKB is 8KB for single
> 1500bytes frame?
>
Because the driver uses skb_alloc(4096) for example ?
I don't know, you don't tell us the driver.
The goal is to have skb->head point to a 2048-byte area, so truesize
should be 2048 + sizeof(sk_buff) (including struct shared_info)
> I observed it is 2560bytes for RX SKBs inside asix_rx_fixup with
> rx_urb_size of 2048 on beagle-xm.
>
Thats because using 2048 bytes for the urb buffer (excluding
shared_info) means you need :
2048 + sizeof(struct shared_info) + sizeof(sk_buff) = 2560
In fact 2048 + sizeof(struct shared_info) means a full 4096 area is
used.
You have 2560 on recent kernels because the way netdev_alloc_frag()
works.
That's why copybreak can actually save RAM. Since it is adding a copy,
we try to use it only on slow devices.
^ permalink raw reply
* Re: [PATCH] bridge: fix endian
From: RongQing Li @ 2012-07-10 14:00 UTC (permalink / raw)
To: devendra.aaru; +Cc: netdev, yoshfuji
In-Reply-To: <CAHdPZaP-eokSqBPWdxBg7fA-ETqvo0ivqG67KyJr3s6qb1xKBA@mail.gmail.com>
2012/7/10, devendra.aaru <devendra.aaru@gmail.com>:
> As you are doing the same change to the drivers in drivers/net/*** i
> think a patchset would be better.
>
> but that's just upto you. ;-)
>
> Thanks,
If I can find others afterward, I will change them on a patchset.
Thanks
-Roy
^ permalink raw reply
* Re: TCP transmit performance regression
From: Ming Lei @ 2012-07-10 13:37 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Network Development, David Miller
In-Reply-To: <1341908908.3265.4508.camel@edumazet-glaptop>
On Tue, Jul 10, 2012 at 4:28 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2012-07-10 at 15:22 +0800, Ming Lei wrote:
>
>> Kernel stack size is 8KB or more, so could you find process creation failure
>> in your ChromeBooks machine at the same time?
>
> I believe you mix a lot of things.
>
> Have you ever heard of sockets limits ?
>
> All available ram on a machine is not for whoever wants it, thanks God.
>
> No : TCP stack was dropping frames, because of socket limits.
>
> Only because skbs were fat (8KB allocated/truesize, for a single 1500
> bytes frame)
Could you explain why the truesize of SKB is 8KB for single
1500bytes frame?
I observed it is 2560bytes for RX SKBs inside asix_rx_fixup with
rx_urb_size of 2048 on beagle-xm.
>
> If the application is fast and reads skbs as soon as they arrive, no problem is
> detected.
>
> But if the application is slow, or a TCP packet is lost on the network,
> many packets are queued into the ofo queue. And eventually not enough room is
> available -> we drop incoming frames, and the sender has to retransmit them.
>
> So instead of loading your web pages as fast as possible, you have to
> wait for retransmits.
>
> So you see nothing at all, no kernel logs, no failed memory attempts.
>
> Only its slower than necessary
>
>
>
Thanks,
--
Ming Lei
^ permalink raw reply
* Re: [RFC PATCH] tcp: limit data skbs in qdisc layer
From: Lin Ming @ 2012-07-10 13:28 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, ycheng, dave.taht, netdev, codel, therbert,
mattmathis, nanditad, ncardwell, andrewmcgr
In-Reply-To: <1341845722.3265.3065.camel@edumazet-glaptop>
On Mon, Jul 9, 2012 at 10:55 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2012-07-09 at 00:08 -0700, David Miller wrote:
>
>> I'm suspicious and anticipate that 10G will need more queueing than
>> you are able to get away with tg3 at 1G speeds. But it is an exciting
>> idea nonetheless :-)
>
> There is a fundamental problem calling any xmit function from skb
> destructor.
>
> skb destructor can be called while qdisc lock is taken, so we can
> deadlock trying to reacquire it.
>
> One such path is the dev_deactivate_queue() -> qdisc_reset() ->
> qdisc_reset_queue(), but also any dropped skbs in qdisc.
>
> So I should only do this stuff from a separate context, for example a
> tasklet or timer.
>
> Alternative would be to use dev_kfree_skb_irq() for all dropped skbs in
> qdisc layer.
Hi Eric,
Maybe a bit off topic ...
Could you share how to test qdisc related change?
Assume I'm testing qdisc performance, for example, codel qdisc,
then how to setup the test environment?
Do you use some network simulator, for example, using the special netem qdisc to
simulate slow network/packet loss/network delay, etc?
Thanks,
Lin Ming
^ permalink raw reply
* Re: [PATCH 4 0/4] Add ability to set defaultless network device MAC addresses to deterministic computed locally administered values
From: Steven Rostedt @ 2012-07-10 13:08 UTC (permalink / raw)
To: "Andy Green (林安廸)"
Cc: Florian Fainelli, linux-arm-kernel, linux-omap, s-jan, arnd,
patches, tony, netdev, linux-kernel
In-Reply-To: <4FFC2712.9020208@warmcat.com>
On Tue, 2012-07-10 at 20:58 +0800, "Andy Green (林安廸)" wrote:
> On 10/07/12 20:37, the mail apparently from Florian Fainelli included:
>
> Why should Ubuntu, Fedora etc stink up their OSes with Panda-specific
> workarounds? And Panda is not the only device with this issue.
Actually I think you just answered your own question ;-)
Anyway, I don't think an initrd solution is the best. Yeah, it's fine
for a work around, but then I need to go and screw with the initrd if it
doesn't have support for the board. If the network card already has a
MAC address, why should the kernel give it another *random* one?
This isn't a complex patch set, where the complexity should be put into
userspace. And it makes it very convenient for people that just want the
board to boot so they can test it. I'm not developing any SoC or BSP,
I'm just using it to make sure my kernel changes can also be implemented
for ARM.
-- Steve
^ permalink raw reply
* Re: [net-next patch v2] bnx2x: Add run-time CNIC support
From: David Miller @ 2012-07-10 13:06 UTC (permalink / raw)
To: eilong; +Cc: meravs, netdev, dmitry
In-Reply-To: <1341924089.27035.7.camel@lb-tlvb-eilong.il.broadcom.com>
From: "Eilon Greenstein" <eilong@broadcom.com>
Date: Tue, 10 Jul 2012 15:41:29 +0300
> OK. Since it blocks the ability to add SR-IOV support, is it acceptable
> to submit it as constant enabled for PF and disabled for VF (SR-IOV)?
You're not describing to me why you guys are turning on features like
the CNIC mode before you necessarily have any users of that feature.
Why can't you turn CNIC off at the start, and if a CNIC user actually
arrives and is activated, reset the entire chip and put it into CNIC
mode?
And if CNIC being on is such a latency killer, why in the world
haven't you done things more reasonably like that from the very
beginning?
Why are you making it so that lower latency with your chips is only
available to a group of users who are effectively statistically
insignificant?
^ permalink raw reply
* Re: [GIT PULL net] IPVS
From: Pablo Neira Ayuso @ 2012-07-10 13:05 UTC (permalink / raw)
To: Simon Horman
Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer
In-Reply-To: <20120710092002.GE12776@verge.net.au>
Hi Simon,
On Tue, Jul 10, 2012 at 06:20:03PM +0900, Simon Horman wrote:
> On Mon, Apr 30, 2012 at 11:27:22AM +0200, Pablo Neira Ayuso wrote:
> > On Fri, Apr 27, 2012 at 09:53:54AM +0900, Simon Horman wrote:
> > > Hi Pablo,
> > >
> > > please consider the following 5 changes for 3.4, they are all bug fixes.
> > > I would also like these changes considered for stable.
> >
> > Please, ping me again once these have hit Linus tree to ask for
> > -stable submission.
>
> Sorry for letting this slip through the cracks.
>
> Please consider the following commits which are in Linus's tree for stable.
> Or I can submit them directly if that is easier.
>
> There are 7 patches listed below. The first 5 were the patches in this
> pull request. The last two were patches in a git pull request
> a few days earlier.
That's fine, I can make it, but you have to include which stable
releases each patch should be applied to, e.g. patch 1 to releases 3.4 and 3.2.
I think -stable maintainers will ask for that.
> commit 8537de8a7ab6681cc72fb0411ab1ba7fdba62dd0
> Author: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Thu Apr 26 07:47:44 2012 +0200
>
> ipvs: kernel oops - do_ip_vs_get_ctl
>
> Change order of init so netns init is ready
> when register ioctl and netlink.
>
> Ver2
> Whitespace fixes and __init added.
>
> Reported-by: "Ryan O'Hara" <rohara@redhat.com>
> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Acked-by: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Simon Horman <horms@verge.net.au>
>
> commit 582b8e3eadaec77788c1aa188081a8d5059c42a6
> Author: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Thu Apr 26 09:45:35 2012 +0200
>
> ipvs: take care of return value from protocol init_netns
>
> ip_vs_create_timeout_table() can return NULL
> All functions protocol init_netns is affected of this patch.
>
> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Acked-by: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Simon Horman <horms@verge.net.au>
>
> commit 4b984cd50bc1b6d492175cd77bfabb78e76ffa67
> Author: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Thu Apr 26 09:45:34 2012 +0200
>
> ipvs: null check of net->ipvs in lblc(r) shedulers
>
> Avoid crash when registering shedulers after
> the IPVS core initialization for netns fails. Do this by
> checking for present core (net->ipvs).
>
> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Acked-by: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Simon Horman <horms@verge.net.au>
>
> commit 39f618b4fd95ae243d940ec64c961009c74e3333
> Author: Julian Anastasov <ja@ssi.bg>
> Date: Wed Apr 25 00:29:58 2012 +0300
>
> ipvs: reset ipvs pointer in netns
>
> Make sure net->ipvs is reset on netns cleanup or failed
> initialization. It is needed for IPVS applications to know that
> IPVS core is not loaded in netns.
>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Signed-off-by: Simon Horman <horms@verge.net.au>
>
> commit 8d08d71ce59438a6ef06be5db07966e0c144b74e
> Author: Julian Anastasov <ja@ssi.bg>
> Date: Wed Apr 25 00:29:59 2012 +0300
>
> ipvs: add check in ftp for initialized core
>
> Avoid crash when registering ip_vs_ftp after
> the IPVS core initialization for netns fails. Do this by
> checking for present core (net->ipvs).
>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Signed-off-by: Simon Horman <horms@verge.net.au>
>
> commit 8f9b9a2fad47af27e14b037395e03cd8278d96d7
> Author: Julian Anastasov <ja@ssi.bg>
> Date: Fri Apr 13 18:08:43 2012 +0300
>
> ipvs: fix crash in ip_vs_control_net_cleanup on unload
>
> commit 14e405461e664b777e2a5636e10b2ebf36a686ec (2.6.39)
> ("Add __ip_vs_control_{init,cleanup}_sysctl()")
> introduced regression due to wrong __net_init for
> __ip_vs_control_cleanup_sysctl. This leads to crash when
> the ip_vs module is unloaded.
>
> Fix it by changing __net_init to __net_exit for
> the function that is already renamed to ip_vs_control_net_cleanup_sysctl.
>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
> Signed-off-by: Simon Horman <horms@verge.net.au>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
>
> commit 7118c07a844d367560ee91adb2071bde2fabcdbf
> Author: Sasha Levin <levinsasha928@gmail.com>
> Date: Sat Apr 14 12:37:46 2012 -0400
>
> ipvs: Verify that IP_VS protocol has been registered
>
> The registration of a protocol might fail, there were no checks
> and all registrations were assumed to be correct. This lead to
> NULL ptr dereferences when apps tried registering.
>
> For example:
>
> [ 1293.226051] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
> [ 1293.227038] IP: [<ffffffff822aacb0>] tcp_register_app+0x60/0xb0
> [ 1293.227038] PGD 391de067 PUD 6c20b067 PMD 0
> [ 1293.227038] Oops: 0000 [#1] PREEMPT SMP
> [ 1293.227038] CPU 1
> [ 1293.227038] Pid: 19609, comm: trinity Tainted: G W 3.4.0-rc1-next-20120405-sasha-dirty #57
> [ 1293.227038] RIP: 0010:[<ffffffff822aacb0>] [<ffffffff822aacb0>] tcp_register_app+0x60/0xb0
> [ 1293.227038] RSP: 0018:ffff880038c1dd18 EFLAGS: 00010286
> [ 1293.227038] RAX: ffffffffffffffc0 RBX: 0000000000001500 RCX: 0000000000010000
> [ 1293.227038] RDX: 0000000000000000 RSI: ffff88003a2d5888 RDI: 0000000000000282
> [ 1293.227038] RBP: ffff880038c1dd48 R08: 0000000000000000 R09: 0000000000000000
> [ 1293.227038] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003a2d5668
> [ 1293.227038] R13: ffff88003a2d5988 R14: ffff8800696a8ff8 R15: 0000000000000000
> [ 1293.227038] FS: 00007f01930d9700(0000) GS:ffff88007ce00000(0000) knlGS:0000000000000000
> [ 1293.227038] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 1293.227038] CR2: 0000000000000018 CR3: 0000000065dfc000 CR4: 00000000000406e0
> [ 1293.227038] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 1293.227038] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [ 1293.227038] Process trinity (pid: 19609, threadinfo ffff880038c1c000, task ffff88002dc73000)
> [ 1293.227038] Stack:
> [ 1293.227038] ffff880038c1dd48 00000000fffffff4 ffff8800696aada0 ffff8800694f5580
> [ 1293.227038] ffffffff8369f1e0 0000000000001500 ffff880038c1dd98 ffffffff822a716b
> [ 1293.227038] 0000000000000000 ffff8800696a8ff8 0000000000000015 ffff8800694f5580
> [ 1293.227038] Call Trace:
> [ 1293.227038] [<ffffffff822a716b>] ip_vs_app_inc_new+0xdb/0x180
> [ 1293.227038] [<ffffffff822a7258>] register_ip_vs_app_inc+0x48/0x70
> [ 1293.227038] [<ffffffff822b2fea>] __ip_vs_ftp_init+0xba/0x140
> [ 1293.227038] [<ffffffff821c9060>] ops_init+0x80/0x90
> [ 1293.227038] [<ffffffff821c90cb>] setup_net+0x5b/0xe0
> [ 1293.227038] [<ffffffff821c9416>] copy_net_ns+0x76/0x100
> [ 1293.227038] [<ffffffff810dc92b>] create_new_namespaces+0xfb/0x190
> [ 1293.227038] [<ffffffff810dca21>] unshare_nsproxy_namespaces+0x61/0x80
> [ 1293.227038] [<ffffffff810afd1f>] sys_unshare+0xff/0x290
> [ 1293.227038] [<ffffffff8187622e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
> [ 1293.227038] [<ffffffff82665539>] system_call_fastpath+0x16/0x1b
> [ 1293.227038] Code: 89 c7 e8 34 91 3b 00 89 de 66 c1 ee 04 31 de 83 e6 0f 48 83 c6 22 48 c1 e6 04 4a 8b 14 26 49 8d 34 34 48 8d 42 c0 48 39 d6 74 13 <66> 39 58 58 74 22 48 8b 48 40 48 8d 41 c0 48 39 ce 75 ed 49 8d
> [ 1293.227038] RIP [<ffffffff822aacb0>] tcp_register_app+0x60/0xb0
> [ 1293.227038] RSP <ffff880038c1dd18>
> [ 1293.227038] CR2: 0000000000000018
> [ 1293.379284] ---[ end trace 364ab40c7011a009 ]---
> [ 1293.381182] Kernel panic - not syncing: Fatal exception in interrupt
>
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> Acked-by: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Simon Horman <horms@verge.net.au>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
>
^ permalink raw reply
* Re: getting warn once around skb_try_coalesce
From: Eric Dumazet @ 2012-07-10 13:01 UTC (permalink / raw)
To: Shlomo Pongartz
Cc: Or Gerlitz, David Miller, netdev@vger.kernel.org, Erez Shitrit
In-Reply-To: <4FFC2191.1040001@mellanox.com>
On Tue, 2012-07-10 at 15:35 +0300, Shlomo Pongartz wrote:
> I've applied the patch and there are no more warnings. Thanks.
> Can you please elaborate on this issue which was there from day one and
> AFAIK never manifested itself.
two problems :
1) truesize underestimation
Well, I posted at least 50 patches related to various skb->truesize
mismatches in the past year.
skb->truesize is/should_be the true size of skb, that is the memory
allocated for sk_buff, skb->head and all fragments
Check commit e1ac50f64691de9a (bnx2x: fix skb truesize underestimation)
for a similar fix done on bnx2x
It's very important to do so to avoid OOM.
If you account a few bytes per fragment but allocate PAGE_SIZE bytes, it's
pretty easy to allocate far more memory than allowed by various
socket/tcp/udp/... limits, and exhaust kernel memory.
commit 924a4c7d2e962b (myri10ge: fix truesize underestimation)
commit 7b8b59617ead5acc (igbvf: fix truesize underestimation)
I probably missed your driver because it was not on drivers/net tree but
drivers/infiniband
2) Not enough tailroom
Invisible but performance suffers, because IP/TCP will need
to call pskb_may_pull() and the expensive __pskb_pull_tail().
So each incoming IP packet needs at least one pskb_expand_head().
I'll send an official patch, but I believe I should refine the tailroom
allocation like that :
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 5c1bc99..f10221f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -123,7 +123,7 @@ static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
skb_frag_size_set(frag, size);
skb->data_len += size;
- skb->truesize += size;
+ skb->truesize += PAGE_SIZE;
} else
skb_put(skb, length);
@@ -156,14 +156,18 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct sk_buff *skb;
int buf_size;
+ int tailroom;
u64 *mapping;
- if (ipoib_ud_need_sg(priv->max_ib_mtu))
+ if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
buf_size = IPOIB_UD_HEAD_SIZE;
- else
+ tailroom = 128; /* reserve some tailroom for IP/TCP headers */
+ } else {
buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+ tailroom = 0;
+ }
- skb = dev_alloc_skb(buf_size + 4);
+ skb = dev_alloc_skb(buf_size + tailroom + 4);
if (unlikely(!skb))
return NULL;
^ permalink raw reply related
* Re: [PATCH 4 0/4] Add ability to set defaultless network device MAC addresses to deterministic computed locally administered values
From: "Andy Green (林安廸)" @ 2012-07-10 12:58 UTC (permalink / raw)
To: Florian Fainelli
Cc: linux-arm-kernel, linux-omap, s-jan, arnd, patches, tony, netdev,
linux-kernel, rostedt
In-Reply-To: <201207101437.54877.florian@openwrt.org>
On 10/07/12 20:37, the mail apparently from Florian Fainelli included:
Hi -
> Le jeudi 05 juillet 2012 04:44:33, Andy Green a écrit :
>> The following series adds some code to generate legal, locally administered
>> MAC addresses from OMAP4 CPU Die ID fuse data, and then adds a helper at
>> net/ethernet taking care of accepting device path / MAC mapping
>> registrations and running a notifier to enforce the requested MAC when the
>> matching network device turns up.
>
> This looks like something you can solve by user-space entirely. Expose the
That might seem so from a openwrt perspective, where you custom cook the
whole userland thing per-device, but it ain't so from a generic rootfs
perspective.
Why should Ubuntu, Fedora etc stink up their OSes with Panda-specific
workarounds? And Panda is not the only device with this issue.
-Andy
^ permalink raw reply
* Re: [net-next patch v2] bnx2x: Add run-time CNIC support
From: Eilon Greenstein @ 2012-07-10 12:41 UTC (permalink / raw)
To: David Miller; +Cc: meravs, netdev, dmitry
In-Reply-To: <20120710.053724.1002197670026212780.davem@davemloft.net>
On Tue, 2012-07-10 at 05:37 -0700, David Miller wrote:
> From: "Eilon Greenstein" <eilong@broadcom.com>
> Date: Tue, 10 Jul 2012 15:33:54 +0300
>
> > On Tue, 2012-07-10 at 05:21 -0700, David Miller wrote:
> >> Make it really dynamic, and properly configurable at run time, so
> >> people don't have to go through hoops to get the "advantages" you
> >> speak so highly of.
> >
> > This is possible for the resources, but not for the latency - we cannot
> > change the HW mode once traffic started to run. Why is that so bad to
> > support Kconfig as a working mode like we did thus far? We are using it
> > specifically for users that wants to optimize the kernel, so Kconfig
> > does not sound that bad in that context.
>
> Sure you can find a way to make this work, you just really aren't
> trying hard enough.
>
> The current situation is a huge and gross hack. I'm not letting you
> continue spreading this disease.
>
> Implement this properly, I really mean it.
>
OK. Since it blocks the ability to add SR-IOV support, is it acceptable
to submit it as constant enabled for PF and disabled for VF (SR-IOV)?
^ permalink raw reply
* Re: [PATCH 4 0/4] Add ability to set defaultless network device MAC addresses to deterministic computed locally administered values
From: Florian Fainelli @ 2012-07-10 12:37 UTC (permalink / raw)
To: linux-arm-kernel
Cc: Andy Green, linux-omap, s-jan, arnd, patches, tony, netdev,
linux-kernel, rostedt
In-Reply-To: <20120705024259.26317.16000.stgit@build.warmcat.com>
Hi,
Le jeudi 05 juillet 2012 04:44:33, Andy Green a écrit :
> The following series adds some code to generate legal, locally administered
> MAC addresses from OMAP4 CPU Die ID fuse data, and then adds a helper at
> net/ethernet taking care of accepting device path / MAC mapping
> registrations and running a notifier to enforce the requested MAC when the
> matching network device turns up.
This looks like something you can solve by user-space entirely. Expose the
OMAP4 CPU Die ID using a sysfs attribute, and let user-space manage the MAC
address pool.
If you tell me you want to use this for nfsroot booting, what prevents you
from using an initramfs, assign a valid MAC to your interface and switch over
your nfsroot once the interface setup is done?
>
> On PandaBoard / ES, two devices have no board-level MAC either assigned by
> the manufacturer or stored on the board, the last patch in the series adds
> these device paths and gets them set when the network device is registered.
>
> Lastly for convenient testing, there's a little patch on
> omap2plus_defconfig that will get Ethernet and WLAN up on Pandaboard.
>
> The patches are against today's linux-omap.
>
> Thanks to Tony Lindgren and Arnd Bergmann for comments leading to the
> helper in net/ethernet.
>
> ---
>
> Andy Green (4):
> OMAP: add cpu id register to MAC address helper
> NET ethernet introduce mac_platform helper
> OMAP4 PANDA register ethernet and wlan for automatic mac allocation
> config test config extending omap2plus with wl12xx etc
>
>
> arch/arm/configs/omap2plus_defconfig | 35 +++----
> arch/arm/mach-omap2/Kconfig | 1
> arch/arm/mach-omap2/board-omap4panda.c | 30 ++++++
> arch/arm/mach-omap2/id.c | 39 ++++++++
> arch/arm/mach-omap2/include/mach/id.h | 1
> include/net/mac-platform.h | 39 ++++++++
> net/Kconfig | 5 +
> net/ethernet/Makefile | 3 +
> net/ethernet/mac-platform.c | 151
> ++++++++++++++++++++++++++++++++ 9 files changed, 282 insertions(+), 22
> deletions(-)
> create mode 100644 include/net/mac-platform.h
> create mode 100644 net/ethernet/mac-platform.c
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
--
Florian
^ permalink raw reply
* Re: getting warn once around skb_try_coalesce
From: Shlomo Pongartz @ 2012-07-10 12:35 UTC (permalink / raw)
To: Eric Dumazet
Cc: Or Gerlitz, David Miller, netdev@vger.kernel.org, Erez Shitrit
In-Reply-To: <1341919328.3265.4871.camel@edumazet-glaptop>
On 7/10/2012 2:22 PM, Eric Dumazet wrote:
> On Tue, 2012-07-10 at 13:14 +0200, Eric Dumazet wrote:
>> On Tue, 2012-07-10 at 12:18 +0200, Eric Dumazet wrote:
>>> On Tue, 2012-07-10 at 12:54 +0300, Or Gerlitz wrote:
>>>> Hi Dave, Eric,
>>>>
>>>> Another trace that I see here with net-next is this one-time warning. I
>>>> get it always
>>>> on the passive side of TCP, something that seems related to GRO, it
>>>> happens only with
>>>> IPoIB, not with mlx4_en and igb (when igb get to work on net-next...)
>>>>
>>>> The latest commit in this area is bad43ca8325f493dcaa0896c2f036276af059c7e
>>>> "net: introduce skb_try_coalesce()" from Eric.
>>>>
>>>> Or.
>>>>
>>>> -----------[ cut here ]------------
>>>> WARNING: at net/core/skbuff.c:3413 skb_try_coalesce+0x1f8/0x31d()
>>> This warning catches skb truesize offenders; most probably it's a driver
>>> issue.
>>>
>> By the way, this driver does not allocate enough tailroom in skbs, so IP/TCP
>> stacks need to reallocate skb head to pull IP/TCP headers. That's not
>> efficient.
>>
>> I suggest using following patch :
> And of course we also can fix the truesize bug.
> (Not sure it will fix the warning, but worth trying)
>
> Since this driver allocates a full page, it must use the PAGE_SIZE, not
> the used part in the fragment
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 5c1bc99..e611a924 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -123,7 +123,7 @@ static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
>
> skb_frag_size_set(frag, size);
> skb->data_len += size;
> - skb->truesize += size;
> + skb->truesize += PAGE_SIZE;
> } else
> skb_put(skb, length);
>
> @@ -159,7 +159,7 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
> u64 *mapping;
>
> if (ipoib_ud_need_sg(priv->max_ib_mtu))
> - buf_size = IPOIB_UD_HEAD_SIZE;
> + buf_size = IPOIB_UD_HEAD_SIZE + 128; /* reserve some tailroom for IP/TCP headers */
> else
> buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
>
>
>
>
> .
>
Hi,
I've applied the patch and there are no more warnings. Thanks.
Can you please elaborate on this issue which was there from day one and
AFAIK never manifested itself.
Best regards,
S.P.
^ permalink raw reply
* Re: [net-next patch v2] bnx2x: Add run-time CNIC support
From: David Miller @ 2012-07-10 12:37 UTC (permalink / raw)
To: eilong; +Cc: meravs, netdev, dmitry
In-Reply-To: <1341923634.27035.6.camel@lb-tlvb-eilong.il.broadcom.com>
From: "Eilon Greenstein" <eilong@broadcom.com>
Date: Tue, 10 Jul 2012 15:33:54 +0300
> On Tue, 2012-07-10 at 05:21 -0700, David Miller wrote:
>> Make it really dynamic, and properly configurable at run time, so
>> people don't have to go through hoops to get the "advantages" you
>> speak so highly of.
>
> This is possible for the resources, but not for the latency - we cannot
> change the HW mode once traffic started to run. Why is that so bad to
> support Kconfig as a working mode like we did thus far? We are using it
> specifically for users that wants to optimize the kernel, so Kconfig
> does not sound that bad in that context.
Sure you can find a way to make this work, you just really aren't
trying hard enough.
The current situation is a huge and gross hack. I'm not letting you
continue spreading this disease.
Implement this properly, I really mean it.
^ permalink raw reply
* Re: [net-next patch v2] bnx2x: Add run-time CNIC support
From: Eilon Greenstein @ 2012-07-10 12:33 UTC (permalink / raw)
To: David Miller; +Cc: meravs, netdev, dmitry
In-Reply-To: <20120710.052149.792836375666491854.davem@davemloft.net>
On Tue, 2012-07-10 at 05:21 -0700, David Miller wrote:
> From: "Merav Sicron" <meravs@broadcom.com>
> Date: Tue, 10 Jul 2012 15:17:00 +0300
>
> > There are still two advantages in disabling CNIC in bnx2x: Saving
> > resources (MSI-X vector and memory) as well as reducing some latency.
>
> But, nobody does this. No end user can do this easily, this
> is therefore of zero value to him.
Most do not, but I'm aware of two customers that play with their own
kernel that do that - they can play with the driver and tweak it to
disable this mode manually, but that is similar to supporting something
outside the tree.
> > While it is true that distributions enable the CNIC Kconfig option, some
> > users that care about resources and latency compile a kernel without it.
>
> This, therefore, results in a terrible user experience.
We are using the Kconfig since it is meant for advanced users that
customize their kernel to their needs.
> > Can you please re-consider this patch?
>
> Absolutely not.
>
> Make it really dynamic, and properly configurable at run time, so
> people don't have to go through hoops to get the "advantages" you
> speak so highly of.
This is possible for the resources, but not for the latency - we cannot
change the HW mode once traffic started to run. Why is that so bad to
support Kconfig as a working mode like we did thus far? We are using it
specifically for users that want to optimize the kernel, so Kconfig
does not sound that bad in that context.
^ permalink raw reply
* Re: [PATCH net-next 3/9] IB/ipoib: Add support for acting as VIF
From: Eric Dumazet @ 2012-07-10 12:26 UTC (permalink / raw)
To: Or Gerlitz; +Cc: davem, roland, netdev, ali, sean.hefty, Erez Shitrit
In-Reply-To: <1341922569-4118-4-git-send-email-ogerlitz@mellanox.com>
On Tue, 2012-07-10 at 15:16 +0300, Or Gerlitz wrote:
> From: Erez Shitrit <erezsh@mellanox.co.il>
>
> When IPoIB interface acts as a VIF for an eIPoIB interface, it uses
> the skb cb storage area on the RX flow, to place information which
> can be of use to the upper layer device.
>
> One such usage example, is when an eIPoIB interface needs to generate
> a source mac for incoming Ethernet frames.
>
> The IPoIB code checks the VIF private flag on the RX path, and according
> to the value of the flag prepares the skb CB data, etc.
>
...
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 5c1bc99..da28799 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -300,7 +300,13 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
> likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
> skb->ip_summed = CHECKSUM_UNNECESSARY;
>
> - napi_gro_receive(&priv->napi, skb);
> + /* if handler is registered on top of ipoib, set skb oob data */
> + if (dev->priv_flags & IFF_EIPOIB_VIF) {
> + set_skb_oob_cb_data(skb, wc, &priv->napi);
> + /* the registered handler will take care of the skb */
> + netif_receive_skb(skb);
> + } else
> + napi_gro_receive(&priv->napi, skb);
skb->cb[] can be destroyed in netif_receive_skb() /
__netif_receive_skb()
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
...
#endif
^ permalink raw reply
* Re: [net-next patch v2] bnx2x: Add run-time CNIC support
From: David Miller @ 2012-07-10 12:21 UTC (permalink / raw)
To: meravs; +Cc: netdev, eilong, dmitry
In-Reply-To: <1341922620.27284.16.camel@lb-tlvb-meravs.il.broadcom.com>
From: "Merav Sicron" <meravs@broadcom.com>
Date: Tue, 10 Jul 2012 15:17:00 +0300
> There are still two advantages in disabling CNIC in bnx2x: Saving
> resources (MSI-X vector and memory) as well as reducing some latency.
But, nobody does this. No end user can do this easily, this
is therefore of zero value to him.
> While it is true that distributions enable the CNIC Kconfig option, some
> users that care about resources and latency compile a kernel without it.
This, therefore, results in a terrible user experience.
> Can you please re-consider this patch?
Absolutely not.
Make it really dynamic, and properly configurable at run time, so
people don't have to go through hoops to get the "advantages" you
speak so highly of.
^ permalink raw reply
* Re: getting warn once around skb_try_coalesce
From: Or Gerlitz @ 2012-07-10 12:19 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, netdev@vger.kernel.org, Shlomo Pongratz,
Erez Shitrit
In-Reply-To: <1341919328.3265.4871.camel@edumazet-glaptop>
On 7/10/2012 2:22 PM, Eric Dumazet wrote:
> And of course we also can fix the truesize bug. (Not sure it will fix
> the warning, but worth trying)
thanks, we will give that a try and let you know
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox