* [PATCH net-next 1/3] Implementation of RFC 4898 Extended TCP Statistics (Web10G)
From: rapier @ 2014-12-16 17:50 UTC
To: netdev
This patch provides the kernel instrumentation set. While it compiles
and runs on its own, it has no control or management interface; those
capabilities are provided by the next patch in this series.
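For illustration, an instrumentation point after this patch looks like the
sketch below. The function is hypothetical and only demonstrates usage; the
macros and the update helper are the real ones declared in
include/net/tcp_estats.h:

    /* Hypothetical call site, shown only to illustrate the macro API.
     * Both macros compile away entirely without CONFIG_TCP_ESTATS and
     * cost a single static_key branch while stats are disabled.
     */
    static void tcp_example_event(struct sock *sk)
    {
            struct tcp_sock *tp = tcp_sk(sk);

            /* bump a single RFC 4898 counter in the stack table */
            TCP_ESTATS_VAR_INC(tp, stack_table, SlowStart);

            /* run one of the heavier update helpers */
            TCP_ESTATS_UPDATE(tp, tcp_estats_update_mss(tp));
    }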
---
include/linux/tcp.h | 8 +
include/net/tcp.h | 1 +
include/net/tcp_estats.h | 376 +++++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/tcp.h | 6 +-
net/ipv4/tcp.c | 21 ++-
net/ipv4/tcp_cong.c | 3 +
net/ipv4/tcp_htcp.c | 1 +
net/ipv4/tcp_input.c | 116 +++++++++++++--
net/ipv4/tcp_ipv4.c | 10 ++
net/ipv4/tcp_output.c | 61 +++++++-
net/ipv4/tcp_timer.c | 3 +
net/ipv6/tcp_ipv6.c | 7 +
12 files changed, 592 insertions(+), 21 deletions(-)
create mode 100644 include/net/tcp_estats.h
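Not part of the patch itself: a minimal sketch of the intended userspace
consumer. Assuming a build in which struct tcp_info exposes the new
tcpi_estats_cid field (note the CONFIG_TCP_ESTATS guard in the
include/uapi/linux/tcp.h hunk below), the id can be read through the
existing TCP_INFO getsockopt; get_estats_cid() is a hypothetical helper:

    /* Sketch only. Returns the estats connection id for a TCP socket,
     * 0 if no stats entry has been assigned yet (matching the
     * tcp_get_info() change below), or -1 if getsockopt() fails.
     */
    #include <string.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    static int get_estats_cid(int sockfd)
    {
            struct tcp_info info;
            socklen_t len = sizeof(info);

            memset(&info, 0, sizeof(info));
            if (getsockopt(sockfd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
                    return -1;
            return (int)info.tcpi_estats_cid;
    }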
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 67309ec..8758360 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -126,6 +126,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
return (struct tcp_request_sock *)req;
}
+#ifdef CONFIG_TCP_ESTATS
+struct tcp_estats;
+#endif
+
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
@@ -309,6 +313,10 @@ struct tcp_sock {
struct tcp_md5sig_info __rcu *md5sig_info;
#endif
+#ifdef CONFIG_TCP_ESTATS
+ struct tcp_estats *tcp_stats;
+#endif
+
/* TCP fastopen related information */
struct tcp_fastopen_request *fastopen_req;
/* fastopen_rsk points to request_sock that resulted in this big
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf..9f7e31e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -43,6 +43,7 @@
#include <net/tcp_states.h>
#include <net/inet_ecn.h>
#include <net/dst.h>
+#include <net/tcp_estats.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
diff --git a/include/net/tcp_estats.h b/include/net/tcp_estats.h
new file mode 100644
index 0000000..ff6000e
--- /dev/null
+++ b/include/net/tcp_estats.h
@@ -0,0 +1,376 @@
+/*
+ * include/net/tcp_estats.h
+ *
+ * Implementation of TCP Extended Statistics MIB (RFC 4898)
+ *
+ * Authors:
+ * John Estabrook <jsestabrook@gmail.com>
+ * Andrew K. Adams <akadams@psc.edu>
+ * Kevin Hogan <kwabena@google.com>
+ * Dominic Hamon <dma@stripysock.com>
+ * John Heffner <johnwheffner@gmail.com>
+ *
+ * The Web10Gig project. See http://www.web10gig.org
+ *
+ * Copyright © 2011, Pittsburgh Supercomputing Center (PSC).
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _TCP_ESTATS_H
+#define _TCP_ESTATS_H
+
+#include <net/sock.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/jump_label.h>
+#include <linux/spinlock.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
+
+/* defines number of seconds that stats persist after connection ends */
+#define TCP_ESTATS_PERSIST_DELAY_SECS 5
+
+enum tcp_estats_sndlim_states {
+ TCP_ESTATS_SNDLIM_NONE = -1,
+ TCP_ESTATS_SNDLIM_SENDER,
+ TCP_ESTATS_SNDLIM_CWND,
+ TCP_ESTATS_SNDLIM_RWIN,
+ TCP_ESTATS_SNDLIM_STARTUP,
+ TCP_ESTATS_SNDLIM_TSODEFER,
+ TCP_ESTATS_SNDLIM_PACE,
+ TCP_ESTATS_SNDLIM_NSTATES /* Keep at end */
+};
+
+enum tcp_estats_addrtype {
+ TCP_ESTATS_ADDRTYPE_IPV4 = 1,
+ TCP_ESTATS_ADDRTYPE_IPV6 = 2
+};
+
+enum tcp_estats_softerror_reason {
+ TCP_ESTATS_SOFTERROR_BELOW_DATA_WINDOW = 1,
+ TCP_ESTATS_SOFTERROR_ABOVE_DATA_WINDOW = 2,
+ TCP_ESTATS_SOFTERROR_BELOW_ACK_WINDOW = 3,
+ TCP_ESTATS_SOFTERROR_ABOVE_ACK_WINDOW = 4,
+ TCP_ESTATS_SOFTERROR_BELOW_TS_WINDOW = 5,
+ TCP_ESTATS_SOFTERROR_ABOVE_TS_WINDOW = 6,
+ TCP_ESTATS_SOFTERROR_DATA_CHECKSUM = 7,
+ TCP_ESTATS_SOFTERROR_OTHER = 8,
+};
+
+#define TCP_ESTATS_INACTIVE 2
+#define TCP_ESTATS_ACTIVE 1
+
+#define TCP_ESTATS_TABLEMASK_INACTIVE 0x00
+#define TCP_ESTATS_TABLEMASK_ACTIVE 0x01
+#define TCP_ESTATS_TABLEMASK_PERF 0x02
+#define TCP_ESTATS_TABLEMASK_PATH 0x04
+#define TCP_ESTATS_TABLEMASK_STACK 0x08
+#define TCP_ESTATS_TABLEMASK_APP 0x10
+#define TCP_ESTATS_TABLEMASK_EXTRAS 0x40
+
+#ifdef CONFIG_TCP_ESTATS
+
+extern struct static_key tcp_estats_enabled;
+
+#define TCP_ESTATS_CHECK(tp, table, expr) \
+ do { \
+ if (static_key_false(&tcp_estats_enabled)) { \
+ if (likely((tp)->tcp_stats) && \
+ likely((tp)->tcp_stats->tables.table)) { \
+ (expr); \
+ } \
+ } \
+ } while (0)
+
+#define TCP_ESTATS_VAR_INC(tp, table, var) \
+ TCP_ESTATS_CHECK(tp, table, ++((tp)->tcp_stats->tables.table->var))
+#define TCP_ESTATS_VAR_DEC(tp, table, var) \
+ TCP_ESTATS_CHECK(tp, table, --((tp)->tcp_stats->tables.table->var))
+#define TCP_ESTATS_VAR_ADD(tp, table, var, val) \
+ TCP_ESTATS_CHECK(tp, table, \
+ ((tp)->tcp_stats->tables.table->var) += (val))
+#define TCP_ESTATS_VAR_SET(tp, table, var, val) \
+ TCP_ESTATS_CHECK(tp, table, \
+ ((tp)->tcp_stats->tables.table->var) = (val))
+#define TCP_ESTATS_UPDATE(tp, func) \
+ do { \
+ if (static_key_false(&tcp_estats_enabled)) { \
+ if (likely((tp)->tcp_stats)) { \
+ (func); \
+ } \
+ } \
+ } while (0)
+
+/*
+ * Variables that can be read and written directly.
+ *
+ * Contains all variables from RFC 4898. Commented fields are
+ * either not implemented (only StartTimeStamp
+ * remains unimplemented in this release) or have
+ * handlers and do not need struct storage.
+ */
+struct tcp_estats_connection_table {
+ u32 AddressType;
+ union { struct in_addr addr; struct in6_addr addr6; } LocalAddress;
+ union { struct in_addr addr; struct in6_addr addr6; } RemAddress;
+ u16 LocalPort;
+ u16 RemPort;
+};
+
+struct tcp_estats_perf_table {
+ u32 SegsOut;
+ u32 DataSegsOut;
+ u64 DataOctetsOut;
+ u32 SegsRetrans;
+ u32 OctetsRetrans;
+ u32 SegsIn;
+ u32 DataSegsIn;
+ u64 DataOctetsIn;
+ /* ElapsedSecs */
+ /* ElapsedMicroSecs */
+ /* StartTimeStamp */
+ /* CurMSS */
+ /* PipeSize */
+ u32 MaxPipeSize;
+ /* SmoothedRTT */
+ /* CurRTO */
+ u32 CongSignals;
+ /* CurCwnd */
+ /* CurSsthresh */
+ u32 Timeouts;
+ /* CurRwinSent */
+ u32 MaxRwinSent;
+ u32 ZeroRwinSent;
+ /* CurRwinRcvd */
+ u32 MaxRwinRcvd;
+ u32 ZeroRwinRcvd;
+ /* SndLimTransRwin */
+ /* SndLimTransCwnd */
+ /* SndLimTransSnd */
+ /* SndLimTimeRwin */
+ /* SndLimTimeCwnd */
+ /* SndLimTimeSnd */
+ u32 snd_lim_trans[TCP_ESTATS_SNDLIM_NSTATES];
+ u32 snd_lim_time[TCP_ESTATS_SNDLIM_NSTATES];
+};
+
+struct tcp_estats_path_table {
+ /* RetranThresh */
+ u32 NonRecovDAEpisodes;
+ u32 SumOctetsReordered;
+ u32 NonRecovDA;
+ u32 SampleRTT;
+ /* RTTVar */
+ u32 MaxRTT;
+ u32 MinRTT;
+ u64 SumRTT;
+ u32 CountRTT;
+ u32 MaxRTO;
+ u32 MinRTO;
+ u8 IpTtl;
+ u8 IpTosIn;
+ /* IpTosOut */
+ u32 PreCongSumCwnd;
+ u32 PreCongSumRTT;
+ u32 PostCongSumRTT;
+ u32 PostCongCountRTT;
+ u32 ECNsignals;
+ u32 DupAckEpisodes;
+ /* RcvRTT */
+ u32 DupAcksOut;
+ u32 CERcvd;
+ u32 ECESent;
+};
+
+struct tcp_estats_stack_table {
+ u32 ActiveOpen;
+ /* MSSSent */
+ /* MSSRcvd */
+ /* WinScaleSent */
+ /* WinScaleRcvd */
+ /* TimeStamps */
+ /* ECN */
+ /* WillSendSACK */
+ /* WillUseSACK */
+ /* State */
+ /* Nagle */
+ u32 MaxSsCwnd;
+ u32 MaxCaCwnd;
+ u32 MaxSsthresh;
+ u32 MinSsthresh;
+ /* InRecovery */
+ u32 DupAcksIn;
+ u32 SpuriousFrDetected;
+ u32 SpuriousRtoDetected;
+ u32 SoftErrors;
+ u32 SoftErrorReason;
+ u32 SlowStart;
+ u32 CongAvoid;
+ u32 OtherReductions;
+ u32 CongOverCount;
+ u32 FastRetran;
+ u32 SubsequentTimeouts;
+ /* CurTimeoutCount */
+ u32 AbruptTimeouts;
+ u32 SACKsRcvd;
+ u32 SACKBlocksRcvd;
+ u32 SendStall;
+ u32 DSACKDups;
+ u32 MaxMSS;
+ u32 MinMSS;
+ u32 SndInitial;
+ u32 RecInitial;
+ /* CurRetxQueue */
+ /* MaxRetxQueue */
+ /* CurReasmQueue */
+ u32 MaxReasmQueue;
+ u32 EarlyRetrans;
+ u32 EarlyRetransDelay;
+};
+
+struct tcp_estats_app_table {
+ /* SndUna */
+ /* SndNxt */
+ u32 SndMax;
+ u64 ThruOctetsAcked;
+ /* RcvNxt */
+ u64 ThruOctetsReceived;
+ /* CurAppWQueue */
+ u32 MaxAppWQueue;
+ /* CurAppRQueue */
+ u32 MaxAppRQueue;
+};
+
+/*
+ currently, no backing store is needed for tuning elements in
+ web10g - they are all read or written to directly in other
+ data structures (such as the socket)
+*/
+
+struct tcp_estats_extras_table {
+ /* OtherReductionsCV */
+ u32 OtherReductionsCM;
+ u32 Priority;
+};
+
+struct tcp_estats_tables {
+ struct tcp_estats_connection_table *connection_table;
+ struct tcp_estats_perf_table *perf_table;
+ struct tcp_estats_path_table *path_table;
+ struct tcp_estats_stack_table *stack_table;
+ struct tcp_estats_app_table *app_table;
+ struct tcp_estats_extras_table *extras_table;
+};
+
+struct tcp_estats {
+ int tcpe_cid; /* idr map id */
+
+ struct sock *sk;
+ kuid_t uid;
+ kgid_t gid;
+ int ids;
+
+ atomic_t users;
+
+ enum tcp_estats_sndlim_states limstate;
+ ktime_t limstate_ts;
+#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+ ktime_t start_ts;
+ ktime_t current_ts;
+#else
+ unsigned long start_ts;
+ unsigned long current_ts;
+#endif
+ struct timeval start_tv;
+
+ int queued;
+ struct work_struct create_notify;
+ struct work_struct establish_notify;
+ struct delayed_work destroy_notify;
+
+ struct tcp_estats_tables tables;
+
+ struct rcu_head rcu;
+};
+
+extern struct idr tcp_estats_idr;
+
+extern int tcp_estats_wq_enabled;
+extern struct workqueue_struct *tcp_estats_wq;
+extern void (*create_notify_func)(struct work_struct *work);
+extern void (*establish_notify_func)(struct work_struct *work);
+extern void (*destroy_notify_func)(struct work_struct *work);
+
+extern unsigned long persist_delay;
+extern spinlock_t tcp_estats_idr_lock;
+
+/* For the TCP code */
+extern int tcp_estats_create(struct sock *sk, enum tcp_estats_addrtype t,
+ int active);
+extern void tcp_estats_destroy(struct sock *sk);
+extern void tcp_estats_establish(struct sock *sk);
+extern void tcp_estats_free(struct rcu_head *rcu);
+
+extern void tcp_estats_update_snd_nxt(struct tcp_sock *tp);
+extern void tcp_estats_update_acked(struct tcp_sock *tp, u32 ack);
+extern void tcp_estats_update_rtt(struct sock *sk, unsigned long rtt_sample);
+extern void tcp_estats_update_timeout(struct sock *sk);
+extern void tcp_estats_update_mss(struct tcp_sock *tp);
+extern void tcp_estats_update_rwin_rcvd(struct tcp_sock *tp);
+extern void tcp_estats_update_sndlim(struct tcp_sock *tp,
+ enum tcp_estats_sndlim_states why);
+extern void tcp_estats_update_rcvd(struct tcp_sock *tp, u32 seq);
+extern void tcp_estats_update_rwin_sent(struct tcp_sock *tp);
+extern void tcp_estats_update_congestion(struct tcp_sock *tp);
+extern void tcp_estats_update_post_congestion(struct tcp_sock *tp);
+extern void tcp_estats_update_segsend(struct sock *sk, int pcount,
+ u32 seq, u32 end_seq, int flags);
+extern void tcp_estats_update_segrecv(struct tcp_sock *tp, struct sk_buff *skb);
+extern void tcp_estats_update_finish_segrecv(struct tcp_sock *tp);
+extern void tcp_estats_update_writeq(struct sock *sk);
+extern void tcp_estats_update_recvq(struct sock *sk);
+
+extern void tcp_estats_init(void);
+
+static inline void tcp_estats_use(struct tcp_estats *stats)
+{
+ atomic_inc(&stats->users);
+}
+
+static inline int tcp_estats_use_if_valid(struct tcp_estats *stats)
+{
+ return atomic_inc_not_zero(&stats->users);
+}
+
+static inline void tcp_estats_unuse(struct tcp_estats *stats)
+{
+ if (atomic_dec_and_test(&stats->users)) {
+ sock_put(stats->sk);
+ stats->sk = NULL;
+ call_rcu(&stats->rcu, tcp_estats_free);
+ }
+}
+
+#else /* !CONFIG_TCP_ESTATS */
+
+#define tcp_estats_enabled (0)
+
+#define TCP_ESTATS_VAR_INC(tp, table, var) do {} while (0)
+#define TCP_ESTATS_VAR_DEC(tp, table, var) do {} while (0)
+#define TCP_ESTATS_VAR_ADD(tp, table, var, val) do {} while (0)
+#define TCP_ESTATS_VAR_SET(tp, table, var, val) do {} while (0)
+#define TCP_ESTATS_UPDATE(tp, func) do {} while (0)
+
+static inline void tcp_estats_init(void) { }
+static inline void tcp_estats_establish(struct sock *sk) { }
+static inline void tcp_estats_create(struct sock *sk,
+ enum tcp_estats_addrtype t,
+ int active) { }
+static inline void tcp_estats_destroy(struct sock *sk) { }
+
+#endif /* CONFIG_TCP_ESTATS */
+
+#endif /* _TCP_ESTATS_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 3b97183..5dae043 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -186,9 +186,13 @@ struct tcp_info {
__u32 tcpi_rcv_space;
__u32 tcpi_total_retrans;
-
__u64 tcpi_pacing_rate;
__u64 tcpi_max_pacing_rate;
+
+#ifdef CONFIG_TCP_ESTATS
+ /* RFC 4898 extended stats Info */
+ __u32 tcpi_estats_cid;
+#endif
};
/* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3075723..698dbb7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -418,6 +418,10 @@ void tcp_init_sock(struct sock *sk)
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+#ifdef CONFIG_TCP_ESTATS
+ tp->tcp_stats = NULL;
+#endif
+
local_bh_disable();
sock_update_memcg(sk);
sk_sockets_allocated_inc(sk);
@@ -972,6 +976,9 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
+ if (copied)
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_writeq(sk));
+
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
@@ -1264,9 +1271,11 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
- if (copied)
+ if (copied) {
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_writeq(sk));
+ }
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
@@ -1658,6 +1667,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
}
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk));
+
/* Well, if we have backlog, try to process it now yet. */
if (copied >= target && !sk->sk_backlog.tail)
@@ -2684,6 +2695,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
sk->sk_pacing_rate : ~0ULL;
info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
sk->sk_max_pacing_rate : ~0ULL;
+
+#ifdef CONFIG_TCP_ESTATS
+ info->tcpi_estats_cid = (tp->tcp_stats && tp->tcp_stats->tcpe_cid > 0)
+ ? tp->tcp_stats->tcpe_cid : 0;
+#endif
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3101,6 +3117,9 @@ void __init tcp_init(void)
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
tcp_metrics_init();
+
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
+ tcp_estats_init();
+
tcp_tasklet_init();
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0d..e93929d 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -295,6 +295,8 @@ void tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
u32 cwnd = tp->snd_cwnd + acked;
+ TCP_ESTATS_VAR_INC(tp, stack_table, SlowStart);
+
if (cwnd > tp->snd_ssthresh)
cwnd = tp->snd_ssthresh + 1;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
@@ -304,6 +306,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
{
+ TCP_ESTATS_VAR_INC(tp, stack_table, CongAvoid);
if (tp->snd_cwnd_cnt >= w) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 58469ff..5facb4c 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -251,6 +251,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tp->snd_cwnd_cnt += ca->pkts_acked;
ca->pkts_acked = 1;
+ TCP_ESTATS_VAR_INC(tp, stack_table, CongAvoid);
}
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 075ab4d..8f0601b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -77,8 +77,10 @@
#include <linux/errqueue.h>
int sysctl_tcp_timestamps __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_sack);
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_max_reordering __read_mostly = 300;
@@ -231,13 +233,15 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
tcp_enter_quickack_mode((struct sock *)tp);
break;
case INET_ECN_CE:
+ TCP_ESTATS_VAR_INC(tp, path_table, CERcvd);
if (tcp_ca_needs_ecn((struct sock *)tp))
tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
-
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode((struct sock *)tp);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ } else {
+ TCP_ESTATS_VAR_INC(tp, path_table, ECESent);
}
tp->ecn_flags |= TCP_ECN_SEEN;
break;
@@ -1104,6 +1108,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
dup_sack = true;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+ TCP_ESTATS_VAR_INC(tp, stack_table, DSACKDups);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
@@ -1114,6 +1119,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
+ TCP_ESTATS_VAR_INC(tp, stack_table, DSACKDups);
}
}
@@ -1653,6 +1659,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
state.reord = tp->packets_out;
state.rtt_us = -1L;
+ TCP_ESTATS_VAR_INC(tp, stack_table, SACKsRcvd);
+ TCP_ESTATS_VAR_ADD(tp, stack_table, SACKBlocksRcvd, num_sacks);
+
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
tp->fackets_out = 0;
@@ -1928,6 +1937,8 @@ void tcp_enter_loss(struct sock *sk)
bool new_recovery = false;
bool is_reneg; /* is receiver reneging on SACKs? */
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp));
+
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
!after(tp->high_seq, tp->snd_una) ||
@@ -2200,8 +2211,12 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
*/
if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
(tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
- !tcp_may_send_now(sk))
- return !tcp_pause_early_retransmit(sk, flag);
+ !tcp_may_send_now(sk)) {
+ int early_retrans = !tcp_pause_early_retransmit(sk, flag);
+ if (early_retrans)
+ TCP_ESTATS_VAR_INC(tp, stack_table, EarlyRetrans);
+ return early_retrans;
+ }
return false;
}
@@ -2299,9 +2314,15 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
*/
static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
{
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + tcp_max_burst(tp));
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ u32 pkts = tcp_packets_in_flight(tp) + tcp_max_burst(tp);
+
+ if (pkts < tp->snd_cwnd) {
+ tp->snd_cwnd = pkts;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ TCP_ESTATS_VAR_INC(tp, stack_table, OtherReductions);
+ TCP_ESTATS_VAR_INC(tp, extras_table, OtherReductionsCM);
+ }
}
/* Nothing was retransmitted or returned timestamp is less
@@ -2402,6 +2423,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
tcp_ecn_withdraw_cwr(tp);
+ TCP_ESTATS_VAR_INC(tp, stack_table, CongOverCount);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
@@ -2428,10 +2450,15 @@ static bool tcp_try_undo_recovery(struct sock *sk)
*/
DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
tcp_undo_cwnd_reduction(sk, false);
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
mib_idx = LINUX_MIB_TCPLOSSUNDO;
- else
+ TCP_ESTATS_VAR_INC(tp, stack_table,
+ SpuriousRtoDetected);
+ } else {
mib_idx = LINUX_MIB_TCPFULLUNDO;
+ TCP_ESTATS_VAR_INC(tp, stack_table,
+ SpuriousFrDetected);
+ }
NET_INC_STATS_BH(sock_net(sk), mib_idx);
}
@@ -2472,9 +2499,12 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
DBGUNDO(sk, "partial loss");
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
- if (frto_undo)
+ if (frto_undo) {
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
+ TCP_ESTATS_VAR_INC(tp, stack_table,
+ SpuriousRtoDetected);
+ }
inet_csk(sk)->icsk_retransmits = 0;
if (frto_undo || tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
@@ -2555,6 +2585,7 @@ void tcp_enter_cwr(struct sock *sk)
tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
}
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp));
}
static void tcp_try_keep_open(struct sock *sk)
@@ -2580,8 +2611,10 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
- if (flag & FLAG_ECE)
+ if (flag & FLAG_ECE) {
tcp_enter_cwr(sk);
+ TCP_ESTATS_VAR_INC(tp, path_table, ECNsignals);
+ }
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
@@ -2826,6 +2859,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
break;
+ case TCP_CA_Disorder:
+ TCP_ESTATS_VAR_INC(tp, path_table, NonRecovDAEpisodes);
+ break;
+
case TCP_CA_Recovery:
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
@@ -2870,6 +2907,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
+
+ if (icsk->icsk_ca_state == TCP_CA_Disorder)
+ TCP_ESTATS_VAR_INC(tp, path_table, NonRecovDA);
+
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag, prior_unsacked);
return;
@@ -2889,6 +2930,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
/* Otherwise enter Recovery state */
tcp_enter_recovery(sk, (flag & FLAG_ECE));
fast_rexmit = 1;
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp));
+ TCP_ESTATS_VAR_INC(tp, stack_table, FastRetran);
}
if (do_lost)
@@ -2928,6 +2971,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
+ TCP_ESTATS_UPDATE(tcp_sk(sk), tcp_estats_update_rtt(sk, seq_rtt_us));
/* RFC6298: only reset backoff on valid RTT measurement. */
inet_csk(sk)->icsk_backoff = 0;
@@ -3007,6 +3051,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
if (!tp->do_early_retrans)
return;
+ TCP_ESTATS_VAR_INC(tp, stack_table, EarlyRetransDelay);
tcp_enter_recovery(sk, false);
tcp_update_scoreboard(sk, 1);
tcp_xmit_retransmit_queue(sk);
@@ -3310,9 +3355,11 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_rwin_rcvd(tp));
}
}
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_acked(tp, ack));
tp->snd_una = ack;
return flag;
@@ -3410,6 +3457,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
+ int prior_state = icsk->icsk_ca_state;
long sack_rtt_us = -1L;
/* We very likely will need to access write queue head. */
@@ -3419,6 +3467,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* then we can probably ignore it.
*/
if (before(ack, prior_snd_una)) {
+ TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+ TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+ TCP_ESTATS_SOFTERROR_BELOW_ACK_WINDOW);
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - tp->max_window)) {
tcp_send_challenge_ack(sk);
@@ -3430,8 +3481,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* If the ack includes data we haven't sent yet, discard
* this segment (RFC793 Section 3.9).
*/
- if (after(ack, tp->snd_nxt))
+ if (after(ack, tp->snd_nxt)) {
+ TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+ TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+ TCP_ESTATS_SOFTERROR_ABOVE_ACK_WINDOW);
goto invalid_ack;
+ }
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
@@ -3439,6 +3494,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
+ if (icsk->icsk_ca_state == TCP_CA_Disorder)
+ TCP_ESTATS_VAR_ADD(tp, path_table, SumOctetsReordered,
+ ack - prior_snd_una);
icsk->icsk_retransmits = 0;
}
@@ -3456,6 +3514,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_acked(tp, ack));
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
@@ -3510,6 +3569,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
+ if (icsk->icsk_ca_state == TCP_CA_Open &&
+ prior_state >= TCP_CA_CWR)
+ TCP_ESTATS_UPDATE(tp,
+ tcp_estats_update_post_congestion(tp));
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
@@ -4177,7 +4240,9 @@ static void tcp_ofo_queue(struct sock *sk)
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, tp->rcv_nxt));
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+
if (!eaten)
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4232,6 +4297,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk));
+ TCP_ESTATS_VAR_INC(tp, path_table, DupAcksOut);
+
skb1 = skb_peek_tail(&tp->out_of_order_queue);
if (!skb1) {
/* Initial out of order segment, build 1 SACK. */
@@ -4242,6 +4310,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->end_seq;
}
__skb_queue_head(&tp->out_of_order_queue, skb);
+ TCP_ESTATS_VAR_INC(tp, path_table, DupAckEpisodes);
goto end;
}
@@ -4438,6 +4507,9 @@ queue_and_out:
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
+ TCP_ESTATS_UPDATE(
+ tp,
+ tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq));
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (skb->len)
tcp_event_data_recv(sk, skb);
@@ -4459,6 +4531,8 @@ queue_and_out:
tcp_fast_path_check(sk);
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk));
+
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
@@ -4990,6 +5064,9 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+ TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+ TCP_ESTATS_SOFTERROR_BELOW_TS_WINDOW);
tcp_send_dupack(sk, skb);
goto discard;
}
@@ -5004,6 +5081,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* an acknowledgment should be sent in reply (unless the RST
* bit is set, if so drop the segment and return)".
*/
+ TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+ TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+ before(TCP_SKB_CB(skb)->end_seq, tp->rcv_wup) ?
+ TCP_ESTATS_SOFTERROR_BELOW_DATA_WINDOW :
+ TCP_ESTATS_SOFTERROR_ABOVE_DATA_WINDOW);
if (!th->rst) {
if (th->syn)
goto syn_challenge;
@@ -5152,6 +5234,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
return;
} else { /* Header too small */
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+ TCP_ESTATS_VAR_SET(tp, stack_table,
+ SoftErrorReason,
+ TCP_ESTATS_SOFTERROR_OTHER);
goto discard;
}
} else {
@@ -5178,6 +5264,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq));
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
@@ -5204,10 +5291,12 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq));
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
&fragstolen);
}
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk));
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
@@ -5260,6 +5349,9 @@ step5:
csum_error:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+ TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+ TCP_ESTATS_SOFTERROR_DATA_CHECKSUM);
discard:
__kfree_skb(skb);
@@ -5459,6 +5551,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
smp_mb();
tcp_finish_connect(sk, skb);
+ tcp_estats_establish(sk);
if ((tp->syn_fastopen || tp->syn_data) &&
tcp_rcv_fastopen_synack(sk, skb, &foc))
@@ -5685,6 +5778,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
+ tcp_estats_establish(sk);
/* Note, that this wakeup is only for marginal crossed SYN case.
* Passively open sockets are not waked up, because
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7..9c85a54 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1310,6 +1310,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (!newsk)
goto exit_nonewsk;
+ tcp_estats_create(newsk, TCP_ESTATS_ADDRTYPE_IPV4, TCP_ESTATS_INACTIVE);
+
newsk->sk_gso_type = SKB_GSO_TCPV4;
inet_sk_rx_dst_set(newsk, skb);
@@ -1670,6 +1672,8 @@ process:
skb->dev = NULL;
bh_lock_sock_nested(sk);
+ TCP_ESTATS_UPDATE(
+ tcp_sk(sk), tcp_estats_update_segrecv(tcp_sk(sk), skb));
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1680,6 +1684,8 @@ process:
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
+ TCP_ESTATS_UPDATE(
+ tcp_sk(sk), tcp_estats_update_finish_segrecv(tcp_sk(sk)));
bh_unlock_sock(sk);
sock_put(sk);
@@ -1809,6 +1815,8 @@ static int tcp_v4_init_sock(struct sock *sk)
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif
+ tcp_estats_create(sk, TCP_ESTATS_ADDRTYPE_IPV4, TCP_ESTATS_ACTIVE);
+
return 0;
}
@@ -1842,6 +1850,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
+ tcp_estats_destroy(sk);
+
BUG_ON(tp->fastopen_rsk != NULL);
/* If socket is aborted during connect operation */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f18262..145b4f2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -80,6 +80,7 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
tcp_advance_send_head(sk, skb);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_snd_nxt(tp));
tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
@@ -292,6 +293,7 @@ static u16 tcp_select_window(struct sock *sk)
}
tp->rcv_wnd = new_win;
tp->rcv_wup = tp->rcv_nxt;
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_rwin_sent(tp));
/* Make sure we do not exceed the maximum possible
* scaled window.
@@ -905,6 +907,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
+#ifdef CONFIG_TCP_ESTATS
+ __u32 seq;
+ __u32 end_seq;
+ int tcp_flags;
+ int pcount;
+#endif
BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -1008,6 +1016,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
+#ifdef CONFIG_TCP_ESTATS
+ /* If the skb isn't cloned, we can't reference it after
+ * calling queue_xmit, so copy everything we need here. */
+ pcount = tcp_skb_pcount(skb);
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
+ tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
+#endif
+
/* OK, its time to fill skb_shinfo(skb)->gso_segs */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
@@ -1020,10 +1037,17 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+ if (likely(!err)) {
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_segsend(sk, pcount,
+ seq, end_seq,
+ tcp_flags));
+ }
+
if (likely(err <= 0))
return err;
tcp_enter_cwr(sk);
+ TCP_ESTATS_VAR_INC(tp, stack_table, SendStall);
return net_xmit_eval(err);
}
@@ -1398,6 +1422,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
if (icsk->icsk_mtup.enabled)
mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
tp->mss_cache = mss_now;
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_mss(tp));
return mss_now;
}
@@ -1670,11 +1695,13 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
tcp_init_tso_segs(sk, skb, cur_mss);
if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
- return 0;
+ return -TCP_ESTATS_SNDLIM_SENDER;
cwnd_quota = tcp_cwnd_test(tp, skb);
- if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
- cwnd_quota = 0;
+ if (!cwnd_quota)
+ return -TCP_ESTATS_SNDLIM_CWND;
+ if (!tcp_snd_wnd_test(tp, skb, cur_mss))
+ return -TCP_ESTATS_SNDLIM_RWIN;
return cwnd_quota;
}
@@ -1688,7 +1715,7 @@ bool tcp_may_send_now(struct sock *sk)
return skb &&
tcp_snd_test(sk, skb, tcp_current_mss(sk),
(tcp_skb_is_last(sk, skb) ?
- tp->nonagle : TCP_NAGLE_PUSH));
+ tp->nonagle : TCP_NAGLE_PUSH)) > 0;
}
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1978,6 +2005,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
+ int why = TCP_ESTATS_SNDLIM_SENDER;
bool is_cwnd_limited = false;
u32 max_segs;
@@ -2008,6 +2036,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
+ why = TCP_ESTATS_SNDLIM_CWND;
is_cwnd_limited = true;
if (push_one == 2)
/* Force out a loss probe pkt. */
@@ -2016,19 +2045,24 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+ why = TCP_ESTATS_SNDLIM_RWIN;
break;
-
+ }
+
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
+ /* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
break;
} else {
if (!push_one &&
tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
- max_segs))
+ max_segs)) {
+ why = TCP_ESTATS_SNDLIM_TSODEFER;
break;
+ }
}
limit = mss_now;
@@ -2041,6 +2075,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+ /* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
break;
/* TCP Small Queues :
@@ -2064,10 +2099,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
*/
smp_mb__after_atomic();
if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ /* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
break;
}
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
+ /* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
break;
repair:
@@ -2080,9 +2117,12 @@ repair:
sent_pkts += tcp_skb_pcount(skb);
if (push_one)
+ /* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
break;
}
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_sndlim(tp, why));
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -3148,11 +3188,16 @@ int tcp_connect(struct sock *sk)
*/
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
- TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+
+ TCP_ESTATS_VAR_SET(tp, stack_table, SndInitial, tp->write_seq);
+ TCP_ESTATS_VAR_SET(tp, app_table, SndMax, tp->write_seq);
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_snd_nxt(tp));
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
+
return 0;
}
EXPORT_SYMBOL(tcp_connect);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 1829c7f..0f6f1f4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -477,6 +477,9 @@ out_reset_timer:
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+
+ TCP_ESTATS_UPDATE(tp, tcp_estats_update_timeout(sk));
+
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
__sk_dst_reset(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5ff8780..db1f88f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1131,6 +1131,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (newsk == NULL)
goto out_nonewsk;
+ tcp_estats_create(newsk, TCP_ESTATS_ADDRTYPE_IPV6, TCP_ESTATS_INACTIVE);
+
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
* count here, tcp_create_openreq_child now does this for us, see the
@@ -1463,6 +1465,8 @@ process:
skb->dev = NULL;
bh_lock_sock_nested(sk);
+ TCP_ESTATS_UPDATE(
+ tcp_sk(sk), tcp_estats_update_segrecv(tcp_sk(sk), skb));
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1473,6 +1477,8 @@ process:
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
+ TCP_ESTATS_UPDATE(
+ tcp_sk(sk), tcp_estats_update_finish_segrecv(tcp_sk(sk)));
bh_unlock_sock(sk);
sock_put(sk);
@@ -1661,6 +1667,7 @@ static int tcp_v6_init_sock(struct sock *sk)
#ifdef CONFIG_TCP_MD5SIG
tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
#endif
+ tcp_estats_create(sk, TCP_ESTATS_ADDRTYPE_IPV6, TCP_ESTATS_ACTIVE);
return 0;
}
--
1.9.3
* Re: [PATCH net-next 1/3] Implementation of RFC 4898 Extended TCP Statistics (Web10G)
From: Bjørn Mork @ 2014-12-17 11:01 UTC
To: rapier; +Cc: netdev
rapier <rapier@psc.edu> writes:
> + * The Web10Gig project. See http://www.web10gig.org
URL is already outdated?
Bjørn
* Re: [PATCH net-next 1/3] Implementation of RFC 4898 Extended TCP Statistics (Web10G)
From: Bryan @ 2014-12-17 16:19 UTC
To: Bjørn Mork; +Cc: netdev
On 12/17/2014 6:01 AM, Bjørn Mork wrote:
>> + * The Web10Gig project. See http://www.web10gig.org
> URL is already outdated?
That's a typo. Please see http://www.web10g.org
* Re: [PATCH net-next 1/3] Implementation of RFC 4898 Extended TCP Statistics (Web10G)
From: rapier @ 2014-12-17 17:00 UTC
To: Bjørn Mork; +Cc: netdev
On 12/17/14 6:01 AM, Bjørn Mork wrote:
> rapier <rapier@psc.edu> writes:
>
>> + * The Web10Gig project. See http://www.web10gig.org
>
> URL is already outdated?
My apologies. The correct site is http://www.web10g.org. My fingers
moved faster than my brain on that one.