* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
From: Vlad Yasevich @ 2012-07-18 21:23 UTC (permalink / raw)
To: Neil Horman; +Cc: netdev, Sridhar Samudrala, David S. Miller, linux-sctp
In-Reply-To: <1342634466-17930-1-git-send-email-nhorman@tuxdriver.com>
On 07/18/2012 02:01 PM, Neil Horman wrote:
> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters. While its possible to
> implement a faster failover on multihomed sctp associations, its not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.
>
> Instead, lets implement the new ietf quick failover draft found here:
> http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
>
> This will let the sctp stack identify transports that have had a small number of
> errors, and avoid using them quickly until their reliability can be
> re-established. I've tested this out on two virt guests connected via multiple
> isolated virt networks and believe its in compliance with the above draft and
> works well.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Vlad Yasevich <vyasevich@gmail.com>
> CC: Sridhar Samudrala <sri@us.ibm.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: linux-sctp@vger.kernel.org
>
> ---
> Change notes:
>
> V2)
> - Added socket option API from section 6.1 of the specification, as per
> request from Vlad. Adding this socket option allows us to alter both the path
> maximum retransmit value and the path partial failure threshold for each
> transport and the association as a whole.
>
> - Added a per transport pf_retrans value, and initialized it from the
> association value. This makes each transport independently configurable as per
> the socket option above, and prevents changes in the sysctl from bleeding into
> an already created association.
> ---
> Documentation/networking/ip-sysctl.txt | 14 +++++
> include/net/sctp/constants.h | 1 +
> include/net/sctp/structs.h | 11 +++-
> include/net/sctp/user.h | 11 ++++
> net/sctp/associola.c | 36 ++++++++++--
> net/sctp/outqueue.c | 6 +-
> net/sctp/sm_sideeffect.c | 33 ++++++++++-
> net/sctp/socket.c | 96 ++++++++++++++++++++++++++++++++
> net/sctp/sysctl.c | 9 +++
> net/sctp/transport.c | 4 +-
> 10 files changed, 206 insertions(+), 15 deletions(-)
>
[ snip ]
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index b3b8a8d..dfffece 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3470,6 +3470,52 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
> }
>
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to alter the partially failed threshold for one or all
> + * transports in an association. See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
> + char __user *optval,
> + unsigned int optlen)
> +{
> + struct sctp_paddrthlds val;
> + struct sctp_transport *trans;
> + struct sctp_association *asoc;
> +
> + if (optlen < sizeof(struct sctp_paddrthlds))
> + return -EINVAL;
> + if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
> + optlen))
> + return -EFAULT;
What if optlen is bigger? You going to trash the stack.
> +
> + if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> + asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> + if (!asoc)
> + return -ENOENT;
> + list_for_each_entry(trans, &asoc->peer.transport_addr_list,
> + transports) {
> + trans->pathmaxrxt = val.spt_pathmaxrxt;
> + trans->pf_retrans = val.spt_pathpfthld;
You want to make sure that the values aren't 0. Otherwise, you'll set
the pathmaxrxt to 0 and that would be bad.
> + }
> +
> + asoc->pf_retrans = val.spt_pathpfthld;
> + asoc->pathmaxrxt = val.spt_pathmaxrxt;
Ditto.
> + } else {
> + trans = sctp_addr_id2transport(sk, &val.spt_address,
> + val.spt_assoc_id);
> + if (!trans)
> + return -ENOENT;
> +
> + trans->pathmaxrxt = val.spt_pathmaxrxt;
> + trans->pf_retrans = val.spt_pathpfthld;
Ditto.
> + }
> +
> + return 0;
> +}
> +
> /* API 6.2 setsockopt(), getsockopt()
> *
> * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -3619,6 +3665,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
> case SCTP_AUTO_ASCONF:
> retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
> break;
> + case SCTP_PEER_ADDR_THLDS:
> + retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
> + break;
> default:
> retval = -ENOPROTOOPT;
> break;
> @@ -5490,6 +5539,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
> return 0;
> }
>
> +/*
> + * SCTP_PEER_ADDR_THLDS
> + *
> + * This option allows us to fetch the partially failed threshold for one or all
> + * transports in an association. See Section 6.1 of:
> + * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
> + */
> +static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> + char __user *optval,
> + int optlen)
> +{
> + struct sctp_paddrthlds val;
> + struct sctp_transport *trans;
> + struct sctp_association *asoc;
> +
> + if (optlen < sizeof(struct sctp_paddrthlds))
> + return -EINVAL;
> + if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
> + return -EFAULT;
Again, trashing the stack if optlen and optval are bigger.
-vlad
> +
> + if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
> + val.spt_assoc_id);
> + asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> + if (!asoc)
> + return -ENOENT;
> +
> + val.spt_pathpfthld = asoc->pf_retrans;
> + val.spt_pathmaxrxt = asoc->pathmaxrxt;
> + } else {
> + trans = sctp_addr_id2transport(sk, &val.spt_address,
> + val.spt_assoc_id);
> + if (!trans)
> + return -ENOENT;
> +
> + val.spt_pathmaxrxt = trans->pathmaxrxt;
> + val.spt_pathpfthld = trans->pf_retrans;
> + }
> +
> + if (copy_to_user(optval, &val, optlen))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
> char __user *optval, int __user *optlen)
> {
> @@ -5628,6 +5721,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
> case SCTP_AUTO_ASCONF:
> retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
> break;
> + case SCTP_PEER_ADDR_THLDS:
> + retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
> + break;
> default:
> retval = -ENOPROTOOPT;
> break;
> diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
> index e5fe639..2b2bfe9 100644
> --- a/net/sctp/sysctl.c
> +++ b/net/sctp/sysctl.c
> @@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
> .extra2 = &int_max
> },
> {
> + .procname = "pf_retrans",
> + .data = &sctp_pf_retrans,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec_minmax,
> + .extra1 = &zero,
> + .extra2 = &int_max
> + },
> + {
> .procname = "max_init_retransmits",
> .data = &sctp_max_retrans_init,
> .maxlen = sizeof(int),
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index b026ba0..194d0f3 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
>
> /* Initialize the default path max_retrans. */
> peer->pathmaxrxt = sctp_max_retrans_path;
> + peer->pf_retrans = sctp_pf_retrans;
>
> INIT_LIST_HEAD(&peer->transmitted);
> INIT_LIST_HEAD(&peer->send_ready);
> @@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
> {
> unsigned long timeout;
> timeout = t->rto + sctp_jitter(t->rto);
> - if (t->state != SCTP_UNCONFIRMED)
> + if ((t->state != SCTP_UNCONFIRMED) &&
> + (t->state != SCTP_PF))
> timeout += t->hbinterval;
> timeout += jiffies;
> return timeout;
>
^ permalink raw reply
* Re: [PATCH net-next 0/4] net/mlx4_en: Add accelerated RFS support
From: Or Gerlitz @ 2012-07-18 21:22 UTC (permalink / raw)
To: David Miller; +Cc: roland, netdev, oren, yevgenyp, amirv
In-Reply-To: <20120718.114158.2161516212050365274.davem@davemloft.net>
On Wed, Jul 18, 2012 at 9:41 PM, David Miller <davem@davemloft.net> wrote:
> Please use CONFIG_RFS_ACCEL consistently to protect this feature
> in your driver sources.
>
> Using CPU_RMAP in a few places is inconsistent, and not what other
> drivers do.
We're indeed using CONFIG_RFS_ACCEL across the place, and only in two
places which deal directly with rmap use CPU_RMAP. The latter is
selected by RFS_ACCEL, but if you feel this is inconsistent, will
change to use only one define.
Or.
^ permalink raw reply
* Re: [PATCH v2 1/7] net-tcp: Fast Open base
From: Eric Dumazet @ 2012-07-18 21:16 UTC (permalink / raw)
To: Yuchung Cheng; +Cc: davem, hkchu, edumazet, ncardwell, sivasankar, netdev
In-Reply-To: <1342645307-17772-2-git-send-email-ycheng@google.com>
On Wed, 2012-07-18 at 14:01 -0700, Yuchung Cheng wrote:
> This patch impelements the common code for both the client and server.
>
> 1. TCP Fast Open option processing. Since Fast Open does not have an
> option number assigned by IANA yet, it shares the experiment option
> code 254 by implementing draft-ietf-tcpm-experimental-options
> with a 16 bits magic number 0xF989. This enables global experiments
> without clashing the scarce(2) experimental options available for TCP.
>
> When the draft status becomes standard (hopefully), the client should
> switch to the new option number assigned while the server supports
> both numbers for transistion.
>
> 2. The new sysctl tcp_fastopen
>
> 3. A place holder init function
>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> ---
Acked-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply
* Re: [PATCH v2 2/7] net-tcp: Fast Open client - cookie cache
From: Eric Dumazet @ 2012-07-18 21:16 UTC (permalink / raw)
To: Yuchung Cheng; +Cc: davem, hkchu, edumazet, ncardwell, sivasankar, netdev
In-Reply-To: <1342645307-17772-3-git-send-email-ycheng@google.com>
On Wed, 2012-07-18 at 14:01 -0700, Yuchung Cheng wrote:
> Add Fast Open metrics in tcp metrics cache: the basic ones are MSS and
> the cookies. Later patch will cache more to handle unfriendly middleboxes.
>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> ---
> include/net/tcp.h | 4 ++++
> net/ipv4/tcp_metrics.c | 41 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 45 insertions(+), 0 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 5aed371..e601da1 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -405,6 +405,10 @@ extern void tcp_metrics_init(void);
> extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
> extern bool tcp_remember_stamp(struct sock *sk);
> extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
> +extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
> + struct tcp_fastopen_cookie *cookie);
> +extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
> + struct tcp_fastopen_cookie *cookie);
> extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
> extern void tcp_disable_fack(struct tcp_sock *tp);
> extern void tcp_close(struct sock *sk, long timeout);
> diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
> index 1a115b6..b498954 100644
> --- a/net/ipv4/tcp_metrics.c
> +++ b/net/ipv4/tcp_metrics.c
> @@ -30,6 +30,11 @@ enum tcp_metric_index {
> TCP_METRIC_MAX,
> };
>
> +struct tcp_fastopen_metrics {
> + u16 mss;
> + struct tcp_fastopen_cookie cookie;
> +};
> +
> struct tcp_metrics_block {
> struct tcp_metrics_block __rcu *tcpm_next;
> struct inetpeer_addr tcpm_addr;
> @@ -38,6 +43,7 @@ struct tcp_metrics_block {
> u32 tcpm_ts_stamp;
> u32 tcpm_lock;
> u32 tcpm_vals[TCP_METRIC_MAX];
> + struct tcp_fastopen_metrics tcpm_fastopen;
> };
>
> static bool tcp_metric_locked(struct tcp_metrics_block *tm,
> @@ -118,6 +124,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
> tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
> tm->tcpm_ts = 0;
> tm->tcpm_ts_stamp = 0;
> + tm->tcpm_fastopen.mss = 0;
> + tm->tcpm_fastopen.cookie.len = 0;
> }
>
> static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
> @@ -633,6 +641,39 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
> return ret;
> }
>
> +void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
> + struct tcp_fastopen_cookie *cookie)
> +{
> + struct tcp_metrics_block *tm;
> +
> + rcu_read_lock();
> + tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
> + if (tm) {
> + struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
> + if (tfom->mss)
> + *mss = tfom->mss;
> + *cookie = tfom->cookie;
> + }
> + rcu_read_unlock();
> +}
> +
> +
> +void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
> + struct tcp_fastopen_cookie *cookie)
> +{
> + struct tcp_metrics_block *tm;
> +
> + rcu_read_lock();
> + tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
> + if (tm) {
> + struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
> + tfom->mss = mss;
> + if (cookie->len > 0)
> + tfom->cookie = *cookie;
> + }
> + rcu_read_unlock();
> +}
> +
Hmm, this rcu_read_lock() in cache_set() gives a false sense of
security ;)
I suggest using a seqlock instead ?
^ permalink raw reply
* Re: [PATCH] SUNRPC: Prevent kernel stack corruption on long values of flush
From: J. Bruce Fields @ 2012-07-18 21:08 UTC (permalink / raw)
To: Jim Rees
Cc: Sasha Levin, Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA,
davem-fT/PcQaiUtIeIZ0/mPfg9Q, davej-H+wXaHxf7aLQT0dZR+AlfA,
linux-nfs-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20120718200049.GA17964-63aXycvo3TyHXe+LvDLADg@public.gmane.org>
On Wed, Jul 18, 2012 at 04:00:49PM -0400, Jim Rees wrote:
> J. Bruce Fields wrote:
>
> On Tue, Jul 17, 2012 at 12:01:26AM +0200, Sasha Levin wrote:
> > The buffer size in read_flush() is too small for the longest possible values
> > for it. This can lead to a kernel stack corruption:
>
> Thanks!
>
> >
> > diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
> > index 2afd2a8..f86d95e 100644
> > --- a/net/sunrpc/cache.c
> > +++ b/net/sunrpc/cache.c
> > @@ -1409,11 +1409,11 @@ static ssize_t read_flush(struct file *file, char __user *buf,
> > size_t count, loff_t *ppos,
> > struct cache_detail *cd)
> > {
> > - char tbuf[20];
> > + char tbuf[22];
>
> I wonder how common this sort of calculation is in the kernel? It might
> provide some peace of mind to be able to write this something like
>
> char tbuf[MAXLEN_BASE10_UL + 2] /* + 2 for final "\n\0" */
>
> You could use something like:
>
> char tbuf[sizeof (unsigned long) * 24 / 10 + 1 + 2]; /* + 2 for final "\n\0" */
>
> since there are roughly 10 bits for every 3 decimal digits.
So we could do something like this. OK, I'm not sure I care enough.
--b.
diff --git a/include/linux/string.h b/include/linux/string.h
index e033564..ed34180 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -126,6 +126,10 @@ extern void argv_free(char **argv);
extern bool sysfs_streq(const char *s1, const char *s2);
extern int strtobool(const char *s, bool *res);
+/* length of the decimal representation of an unsigned integer. Just an
+ * approximation, but it's right for types of size 1 to 36 bytes: */
+#define base10len(i) (sizeof(i) * 24 / 10 + 1)
+
#ifdef CONFIG_BINARY_PRINTF
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args);
int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 2afd2a8..1dcd2b3 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1409,7 +1409,7 @@ static ssize_t read_flush(struct file *file, char __user *buf,
size_t count, loff_t *ppos,
struct cache_detail *cd)
{
- char tbuf[20];
+ char tbuf[base10len(unsigned long) + 2];
unsigned long p = *ppos;
size_t len;
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH v2 5/7] net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)
From: Yuchung Cheng @ 2012-07-18 21:01 UTC (permalink / raw)
To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, ycheng, netdev
In-Reply-To: <1342645307-17772-1-git-send-email-ycheng@google.com>
sendmsg() (or sendto()) with MSG_FASTOPEN is a combo of connect(2)
and write(2). The application should replace connect() with it to
send data in the opening SYN packet.
For blocking socket, sendmsg() blocks until all the data are buffered
locally and the handshake is completed like connect() call. It
returns similar errno like connect() if the TCP handshake fails.
For non-blocking socket, it returns the number of bytes queued (and
transmitted in the SYN-data packet) if cookie is available. If cookie
is not available, it transmits a data-less SYN packet with Fast Open
cookie request option and returns -EINPROGRESS like connect().
Using MSG_FASTOPEN on connecting or connected socket will result in
simlar errno like repeating connect() calls. Therefore the application
should only use this flag on new sockets.
The buffer size of sendmsg() is independent of the MSS of the connection.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
Documentation/networking/ip-sysctl.txt | 11 ++++++
include/linux/socket.h | 1 +
include/net/inet_common.h | 6 ++-
include/net/tcp.h | 3 ++
net/ipv4/af_inet.c | 19 +++++++---
net/ipv4/tcp.c | 61 +++++++++++++++++++++++++++++---
net/ipv4/tcp_ipv4.c | 3 ++
7 files changed, 92 insertions(+), 12 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e1e0215..03964e0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -468,6 +468,17 @@ tcp_syncookies - BOOLEAN
SYN flood warnings in logs not being really flooded, your server
is seriously misconfigured.
+tcp_fastopen - INTEGER
+ Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data
+ in the opening SYN packet. To use this feature, the client application
+ must not use connect(). Instead, it should use sendmsg() or sendto()
+ with MSG_FASTOPEN flag which performs a TCP handshake automatically.
+
+ The values (bitmap) are:
+ 1: Enables sending data in the opening SYN on the client
+
+ Default: 0
+
tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 255. Default value
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 25d6322..90297db 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -276,6 +276,7 @@ struct ucred {
#else
#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */
#endif
+#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
/* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 22fac98..2340087 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -14,9 +14,11 @@ struct sockaddr;
struct socket;
extern int inet_release(struct socket *sock);
-extern int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
+extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags);
-extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+extern int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags);
+extern int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags);
extern int inet_accept(struct socket *sock, struct socket *newsock, int flags);
extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 867557b..c025810 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -212,6 +212,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
/* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */
#define TCP_INIT_CWND 10
+/* Bit Flags for sysctl_tcp_fastopen */
+#define TFO_CLIENT_ENABLE 1
+
extern struct inet_timewait_death_row tcp_death_row;
/* sysctl variables for tcp */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6ef67b7..c05ec41 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -590,8 +590,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
@@ -600,8 +600,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (addr_len < sizeof(uaddr->sa_family))
return -EINVAL;
- lock_sock(sk);
-
if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -664,7 +662,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTED;
err = 0;
out:
- release_sock(sk);
return err;
sock_error:
@@ -674,6 +671,18 @@ sock_error:
sock->state = SS_DISCONNECTING;
goto out;
}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ int err;
+
+ lock_sock(sock->sk);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ release_sock(sock->sk);
+ return err;
+}
EXPORT_SYMBOL(inet_stream_connect);
/*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4252cd8..581ecf0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
#include <linux/slab.h>
#include <net/icmp.h>
+#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
@@ -982,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg)
return tmp;
}
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+ if (tp->fastopen_req != NULL) {
+ kfree(tp->fastopen_req);
+ tp->fastopen_req = NULL;
+ }
+}
+
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err, flags;
+
+ if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+ return -EOPNOTSUPP;
+ if (tp->fastopen_req != NULL)
+ return -EALREADY; /* Another Fast Open is in progress */
+
+ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+ sk->sk_allocation);
+ if (unlikely(tp->fastopen_req == NULL))
+ return -ENOBUFS;
+ tp->fastopen_req->data = msg;
+
+ flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+ err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, flags);
+ *size = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ return err;
+}
+
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int iovlen, flags, err, copied;
- int mss_now = 0, size_goal;
+ int iovlen, flags, err, copied = 0;
+ int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
bool sg;
long timeo;
lock_sock(sk);
flags = msg->msg_flags;
+ if (flags & MSG_FASTOPEN) {
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
+ if (err == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (err)
+ goto out_err;
+ offset = copied_syn;
+ }
+
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/* Wait for a connection to finish. */
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
- goto out_err;
+ goto do_error;
if (unlikely(tp->repair)) {
if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1037,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
unsigned char __user *from = iov->iov_base;
iov++;
+ if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
+ if (offset >= seglen) {
+ offset -= seglen;
+ continue;
+ }
+ seglen -= offset;
+ from += offset;
+ offset = 0;
+ }
while (seglen > 0) {
int copy = 0;
@@ -1199,7 +1250,7 @@ out:
if (copied && likely(!tp->repair))
tcp_push(sk, flags, mss_now, tp->nonagle);
release_sock(sk);
- return copied;
+ return copied + copied_syn;
do_fault:
if (!skb->len) {
@@ -1212,7 +1263,7 @@ do_fault:
}
do_error:
- if (copied)
+ if (copied + copied_syn)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6b1fe40..f70f844 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1952,6 +1952,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
tp->cookie_values = NULL;
}
+ /* If socket is aborted during connect operation */
+ tcp_free_fastopen_req(tp);
+
sk_sockets_allocated_dec(sk);
sock_release_memcg(sk);
}
--
1.7.7.3
^ permalink raw reply related
* [PATCH v2 6/7] net-tcp: Fast Open client - detecting SYN-data drops
From: Yuchung Cheng @ 2012-07-18 21:01 UTC (permalink / raw)
To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, ycheng, netdev
In-Reply-To: <1342645307-17772-1-git-send-email-ycheng@google.com>
On paths with firewalls dropping SYN with data or experimental TCP options,
Fast Open connections will have experience SYN timeout and bad performance.
The solution is to track such incidents in the cookie cache and disables
Fast Open temporarily.
Since only the original SYN includes data and/or Fast Open option, the
SYN-ACK has some tell-tale sign (tcp_rcv_fastopen_synack()) to detect
such drops. If a path has recurring Fast Open SYN drops, Fast Open is
disabled for 2^(recurring_losses) minutes starting from four minutes up to
roughly one and half day. sendmsg with MSG_FASTOPEN flag will succeed but
it behaves as connect() then write().
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
include/net/tcp.h | 6 ++++--
net/ipv4/tcp_input.c | 10 +++++++++-
net/ipv4/tcp_metrics.c | 16 +++++++++++++---
net/ipv4/tcp_output.c | 13 +++++++++++--
4 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c025810..e07878d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -409,9 +409,11 @@ extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
extern bool tcp_remember_stamp(struct sock *sk);
extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
- struct tcp_fastopen_cookie *cookie);
+ struct tcp_fastopen_cookie *cookie,
+ int *syn_loss, unsigned long *last_syn_loss);
extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
- struct tcp_fastopen_cookie *cookie);
+ struct tcp_fastopen_cookie *cookie,
+ bool syn_lost);
extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
extern void tcp_disable_fack(struct tcp_sock *tp);
extern void tcp_close(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 38b6a81..c49a4fc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5652,6 +5652,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *data = tcp_write_queue_head(sk);
u16 mss = tp->rx_opt.mss_clamp;
+ bool syn_drop;
if (mss == tp->rx_opt.user_mss) {
struct tcp_options_received opt;
@@ -5664,7 +5665,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
mss = opt.mss_clamp;
}
- tcp_fastopen_cache_set(sk, mss, cookie);
+ /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
+ * the remote receives only the retransmitted (regular) SYNs: either
+ * the original SYN-data or the corresponding SYN-ACK is lost.
+ */
+ syn_drop = (cookie->len <= 0 && data &&
+ inet_csk(sk)->icsk_retransmits);
+
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
if (data) { /* Retransmit unacked data in SYN */
tcp_retransmit_skb(sk, data);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index b498954..47e2c31 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -32,6 +32,8 @@ enum tcp_metric_index {
struct tcp_fastopen_metrics {
u16 mss;
+ u16 syn_loss:10; /* Recurring Fast Open SYN losses */
+ unsigned long last_syn_loss; /* Last Fast Open SYN loss */
struct tcp_fastopen_cookie cookie;
};
@@ -125,6 +127,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
tm->tcpm_ts = 0;
tm->tcpm_ts_stamp = 0;
tm->tcpm_fastopen.mss = 0;
+ tm->tcpm_fastopen.syn_loss = 0;
tm->tcpm_fastopen.cookie.len = 0;
}
@@ -642,7 +645,8 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
}
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
- struct tcp_fastopen_cookie *cookie)
+ struct tcp_fastopen_cookie *cookie,
+ int *syn_loss, unsigned long *last_syn_loss)
{
struct tcp_metrics_block *tm;
@@ -653,13 +657,14 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
if (tfom->mss)
*mss = tfom->mss;
*cookie = tfom->cookie;
+ *syn_loss = tfom->syn_loss;
+ *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
}
rcu_read_unlock();
}
-
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
- struct tcp_fastopen_cookie *cookie)
+ struct tcp_fastopen_cookie *cookie, bool syn_lost)
{
struct tcp_metrics_block *tm;
@@ -670,6 +675,11 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
tfom->mss = mss;
if (cookie->len > 0)
tfom->cookie = *cookie;
+ if (syn_lost) {
+ ++tfom->syn_loss;
+ tfom->last_syn_loss = jiffies;
+ } else
+ tfom->syn_loss = 0;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8869328..c5cfd5e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2860,10 +2860,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int space, i, err = 0, iovlen = fo->data->msg_iovlen;
+ int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
struct sk_buff *syn_data = NULL, *data;
+ unsigned long last_syn_loss = 0;
+
+ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+ &syn_loss, &last_syn_loss);
+ /* Recurring FO SYN losses: revert to regular handshake temporarily */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ fo->cookie.len = -1;
+ goto fallback;
+ }
- tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie);
if (fo->cookie.len <= 0)
goto fallback;
--
1.7.7.3
^ permalink raw reply related
* [PATCH v2 4/7] net-tcp: Fast Open client - receiving SYN-ACK
From: Yuchung Cheng @ 2012-07-18 21:01 UTC (permalink / raw)
To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, ycheng, netdev
In-Reply-To: <1342645307-17772-1-git-send-email-ycheng@google.com>
On receiving the SYN-ACK after SYN-data, the client needs to
a) update the cached MSS and cookie (if included in SYN-ACK)
b) retransmit the data not yet acknowledged by the SYN-ACK in the final ACK of
the handshake.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
net/ipv4/tcp_input.c | 40 +++++++++++++++++++++++++++++++++++-----
1 files changed, 35 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a06bb89..38b6a81 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5646,6 +5646,34 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
}
}
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *data = tcp_write_queue_head(sk);
+ u16 mss = tp->rx_opt.mss_clamp;
+
+ if (mss == tp->rx_opt.user_mss) {
+ struct tcp_options_received opt;
+ const u8 *hash_location;
+
+ /* Get original SYNACK MSS value if user MSS sets mss_clamp */
+ tcp_clear_options(&opt);
+ opt.user_mss = opt.mss_clamp = 0;
+ tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
+ mss = opt.mss_clamp;
+ }
+
+ tcp_fastopen_cache_set(sk, mss, cookie);
+
+ if (data) { /* Retransmit unacked data in SYN */
+ tcp_retransmit_skb(sk, data);
+ tcp_rearm_rto(sk);
+ return true;
+ }
+ return false;
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
@@ -5653,9 +5681,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_cookie_values *cvp = tp->cookie_values;
+ struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
- tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL);
+ tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
if (th->ack) {
/* rfc793:
@@ -5665,11 +5694,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
* a reset (unless the RST bit is set, if so drop
* the segment and return)"
- *
- * We do not send data with SYN, so that RFC-correct
- * test reduces to:
*/
- if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
goto reset_and_undo;
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5781,6 +5808,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_finish_connect(sk, skb);
+ if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc))
+ return -1;
+
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
icsk->icsk_ack.pingpong) {
--
1.7.7.3
^ permalink raw reply related
* [PATCH v2 7/7] net-tcp: Fast Open client - cookie-less mode
From: Yuchung Cheng @ 2012-07-18 21:01 UTC (permalink / raw)
To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, ycheng, netdev
In-Reply-To: <1342645307-17772-1-git-send-email-ycheng@google.com>
In trusted networks, e.g., intranet, data-center, the client does not
need to use Fast Open cookie to mitigate DoS attacks. In cookie-less
mode, sendmsg() with MSG_FASTOPEN flag will send SYN-data regardless
of cookie availability.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
Documentation/networking/ip-sysctl.txt | 2 ++
include/linux/tcp.h | 1 +
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 8 ++++++--
net/ipv4/tcp_output.c | 6 +++++-
5 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 03964e0..5f3ef7f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -476,6 +476,8 @@ tcp_fastopen - INTEGER
The values (bitmap) are:
1: Enables sending data in the opening SYN on the client
+ 5: Enables sending data in the opening SYN on the client regardless
+ of cookie availability.
Default: 0
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1edf96a..9febfb6 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -387,6 +387,7 @@ struct tcp_sock {
u8 repair_queue;
u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */
early_retrans_delayed:1, /* Delayed ER timer installed */
+ syn_data:1, /* SYN includes data */
syn_fastopen:1; /* SYN includes Fast Open option */
/* RTT measurement */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e07878d..bc7c134 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -214,6 +214,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Bit Flags for sysctl_tcp_fastopen */
#define TFO_CLIENT_ENABLE 1
+#define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */
extern struct inet_timewait_death_row tcp_death_row;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c49a4fc..e67d685 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5650,7 +5650,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
struct tcp_fastopen_cookie *cookie)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *data = tcp_write_queue_head(sk);
+ struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
u16 mss = tp->rx_opt.mss_clamp;
bool syn_drop;
@@ -5665,6 +5665,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
mss = opt.mss_clamp;
}
+ if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
+ cookie->len = -1;
+
/* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
* the remote receives only the retransmitted (regular) SYNs: either
* the original SYN-data or the corresponding SYN-ACK is lost.
@@ -5816,7 +5819,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_finish_connect(sk, skb);
- if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc))
+ if ((tp->syn_fastopen || tp->syn_data) &&
+ tcp_rcv_fastopen_synack(sk, skb, &foc))
return -1;
if (sk->sk_write_pending ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c5cfd5e..27a32ac 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2864,6 +2864,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
struct sk_buff *syn_data = NULL, *data;
unsigned long last_syn_loss = 0;
+ tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
&syn_loss, &last_syn_loss);
/* Recurring FO SYN losses: revert to regular handshake temporarily */
@@ -2873,7 +2874,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
goto fallback;
}
- if (fo->cookie.len <= 0)
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+ fo->cookie.len = -1;
+ else if (fo->cookie.len <= 0)
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
@@ -2916,6 +2919,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
fo->copied = data->len;
if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+ tp->syn_data = (fo->copied > 0);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
goto done;
}
--
1.7.7.3
^ permalink raw reply related
* [PATCH v2 1/7] net-tcp: Fast Open base
From: Yuchung Cheng @ 2012-07-18 21:01 UTC (permalink / raw)
To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, ycheng, netdev
In-Reply-To: <1342645307-17772-1-git-send-email-ycheng@google.com>
This patch impelements the common code for both the client and server.
1. TCP Fast Open option processing. Since Fast Open does not have an
option number assigned by IANA yet, it shares the experiment option
code 254 by implementing draft-ietf-tcpm-experimental-options
with a 16 bits magic number 0xF989. This enables global experiments
without clashing the scarce(2) experimental options available for TCP.
When the draft status becomes standard (hopefully), the client should
switch to the new option number assigned while the server supports
both numbers for transistion.
2. The new sysctl tcp_fastopen
3. A place holder init function
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
include/linux/tcp.h | 10 ++++++++++
include/net/tcp.h | 9 ++++++++-
net/ipv4/Makefile | 2 +-
net/ipv4/syncookies.c | 2 +-
net/ipv4/sysctl_net_ipv4.c | 7 +++++++
net/ipv4/tcp_fastopen.c | 11 +++++++++++
net/ipv4/tcp_input.c | 26 ++++++++++++++++++++++----
net/ipv4/tcp_ipv4.c | 2 +-
net/ipv4/tcp_minisocks.c | 4 ++--
net/ipv4/tcp_output.c | 25 +++++++++++++++++++++----
net/ipv6/syncookies.c | 2 +-
net/ipv6/tcp_ipv6.c | 2 +-
12 files changed, 86 insertions(+), 16 deletions(-)
create mode 100644 net/ipv4/tcp_fastopen.c
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1888169..12948f5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -243,6 +243,16 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
return (tcp_hdr(skb)->doff - 5) * 4;
}
+/* TCP Fast Open */
+#define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
+#define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
+
+/* TCP Fast Open Cookie as stored in memory */
+struct tcp_fastopen_cookie {
+ s8 len;
+ u8 val[TCP_FASTOPEN_COOKIE_MAX];
+};
+
/* This defines a selective acknowledgement block. */
struct tcp_sack_block_wire {
__be32 start_seq;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 85c5090..5aed371 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -170,6 +170,11 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
#define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
#define TCPOPT_COOKIE 253 /* Cookie extension (experimental) */
+#define TCPOPT_EXP 254 /* Experimental */
+/* Magic number to be after the option value for sharing TCP
+ * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
+ */
+#define TCPOPT_FASTOPEN_MAGIC 0xF989
/*
* TCP option lengths
@@ -180,6 +185,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_MD5SIG 18
+#define TCPOLEN_EXP_FASTOPEN_BASE 4
#define TCPOLEN_COOKIE_BASE 2 /* Cookie-less header extension */
#define TCPOLEN_COOKIE_PAIR 3 /* Cookie pair header extension */
#define TCPOLEN_COOKIE_MIN (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN)
@@ -222,6 +228,7 @@ extern int sysctl_tcp_retries1;
extern int sysctl_tcp_retries2;
extern int sysctl_tcp_orphan_retries;
extern int sysctl_tcp_syncookies;
+extern int sysctl_tcp_fastopen;
extern int sysctl_tcp_retrans_collapse;
extern int sysctl_tcp_stdurg;
extern int sysctl_tcp_rfc1337;
@@ -418,7 +425,7 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int nonblock, int flags, int *addr_len);
extern void tcp_parse_options(const struct sk_buff *skb,
struct tcp_options_received *opt_rx, const u8 **hvpp,
- int estab);
+ int estab, struct tcp_fastopen_cookie *foc);
extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
/*
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a677d80..ae2ccf2 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o tcp_metrics.o \
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eab2a7f..650e152 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3f6a1e7..5840c32 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -367,6 +367,13 @@ static struct ctl_table ipv4_table[] = {
},
#endif
{
+ .procname = "tcp_fastopen",
+ .data = &sysctl_tcp_fastopen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_tw_recycle",
.data = &tcp_death_row.sysctl_tw_recycle,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 0000000..a7f729c
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,11 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+int sysctl_tcp_fastopen;
+
+static int __init tcp_fastopen_init(void)
+{
+ return 0;
+}
+
+late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fdd49f1..a06bb89 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3732,7 +3732,8 @@ old_ack:
* the fast version below fails.
*/
void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
- const u8 **hvpp, int estab)
+ const u8 **hvpp, int estab,
+ struct tcp_fastopen_cookie *foc)
{
const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
@@ -3839,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
break;
}
break;
- }
+ case TCPOPT_EXP:
+ /* Fast Open option shares code 254 using a
+ * 16 bits magic number. It's valid only in
+ * SYN or SYN-ACK with an even size.
+ */
+ if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
+ get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
+ foc == NULL || !th->syn || (opsize & 1))
+ break;
+ foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
+ if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
+ foc->len <= TCP_FASTOPEN_COOKIE_MAX)
+ memcpy(foc->val, ptr + 2, foc->len);
+ else if (foc->len != 0)
+ foc->len = -1;
+ break;
+
+ }
ptr += opsize-2;
length -= opsize;
}
@@ -3882,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
if (tcp_parse_aligned_timestamp(tp, th))
return true;
}
- tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
+ tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
return true;
}
@@ -5637,7 +5655,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_cookie_values *cvp = tp->cookie_values;
int saved_clamp = tp->rx_opt.mss_clamp;
- tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL);
if (th->ack) {
/* rfc793:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9caf5c..6b1fe40 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1307,7 +1307,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
if (tmp_opt.cookie_plus > 0 &&
tmp_opt.saw_tstamp &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index c66f2ed..5912ac3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -97,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -534,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 15a7c7b..4849be7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -385,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
#define OPTION_COOKIE_EXTENSION (1 << 4)
+#define OPTION_FAST_OPEN_COOKIE (1 << 8)
struct tcp_out_options {
- u8 options; /* bit field of OPTION_* */
+ u16 options; /* bit field of OPTION_* */
+ u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
- u16 mss; /* 0 to disable */
- __u32 tsval, tsecr; /* need to include OPTION_TS */
__u8 *hash_location; /* temporary pointer, overloaded */
+ __u32 tsval, tsecr; /* need to include OPTION_TS */
+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
};
/* The sysctl int routines are generic, so check consistency here.
@@ -442,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired)
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
struct tcp_out_options *opts)
{
- u8 options = opts->options; /* mungable copy */
+ u16 options = opts->options; /* mungable copy */
/* Having both authentication and cookies for security is redundant,
* and there's certainly not enough room. Instead, the cookie-less
@@ -564,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
tp->rx_opt.dsack = 0;
}
+
+ if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+ struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+
+ *ptr++ = htonl((TCPOPT_EXP << 24) |
+ ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+
+ memcpy(ptr, foc->val, foc->len);
+ if ((foc->len & 3) == 2) {
+ u8 *align = ((u8 *)ptr) + foc->len;
+ align[0] = align[1] = TCPOPT_NOP;
+ }
+ ptr += (foc->len + 3) >> 2;
+ }
}
/* Compute TCP options for SYN packets. This is not the final
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7bf3cc4..bb46061 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -177,7 +177,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
goto out;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c9dabdd..0302ec3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1033,7 +1033,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
if (tmp_opt.cookie_plus > 0 &&
tmp_opt.saw_tstamp &&
--
1.7.7.3
^ permalink raw reply related
* [PATCH v2 3/7] net-tcp: Fast Open client - sending SYN-data
From: Yuchung Cheng @ 2012-07-18 21:01 UTC (permalink / raw)
To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, ycheng, netdev
In-Reply-To: <1342645307-17772-1-git-send-email-ycheng@google.com>
This patch implements sending SYN-data in tcp_connect(). The data is
from tcp_sendmsg() with flag MSG_FASTOPEN (implemented in a later patch).
The length of the cookie in tcp_fastopen_req, init'd to 0, controls the
type of the SYN. If the cookie is not cached (len==0), the host sends
data-less SYN with Fast Open cookie request option to solicit a cookie
from the remote. If cookie is not available (len > 0), the host sends
a SYN-data with Fast Open cookie option. If cookie length is negative,
the SYN will not include any Fast Open option (for fall back operations).
To deal with middleboxes that may drop SYN with data or experimental TCP
option, the SYN-data is only sent once. SYN retransmits do not include
data or Fast Open options. The connection will fall back to regular TCP
handshake.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
include/linux/snmp.h | 1 +
include/linux/tcp.h | 6 ++-
include/net/tcp.h | 9 ++++
net/ipv4/af_inet.c | 7 +++
net/ipv4/proc.c | 1 +
net/ipv4/tcp_output.c | 115 +++++++++++++++++++++++++++++++++++++++++++++----
6 files changed, 129 insertions(+), 10 deletions(-)
diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index e5fcbd0..00bc189 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -238,6 +238,7 @@ enum
LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */
LINUX_MIB_TCPCHALLENGEACK, /* TCPChallengeACK */
LINUX_MIB_TCPSYNCHALLENGE, /* TCPSYNChallenge */
+ LINUX_MIB_TCPFASTOPENACTIVE, /* TCPFastOpenActive */
__LINUX_MIB_MAX
};
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 12948f5..1edf96a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -386,7 +386,8 @@ struct tcp_sock {
unused : 1;
u8 repair_queue;
u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */
- early_retrans_delayed:1; /* Delayed ER timer installed */
+ early_retrans_delayed:1, /* Delayed ER timer installed */
+ syn_fastopen:1; /* SYN includes Fast Open option */
/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
@@ -500,6 +501,9 @@ struct tcp_sock {
struct tcp_md5sig_info __rcu *md5sig_info;
#endif
+/* TCP fastopen related information */
+ struct tcp_fastopen_request *fastopen_req;
+
/* When the cookie options are generated and exchanged, then this
* object holds a reference to them (cookie_values->kref). Also
* contains related tcp_cookie_transactions fields.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e601da1..867557b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1289,6 +1289,15 @@ extern int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff
extern int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
const struct tcp_md5sig_key *key);
+struct tcp_fastopen_request {
+ /* Fast Open cookie. Size 0 means a cookie request */
+ struct tcp_fastopen_cookie cookie;
+ struct msghdr *data; /* data in MSG_FASTOPEN */
+ u16 copied; /* queued in tcp_connect() */
+};
+
+void tcp_free_fastopen_req(struct tcp_sock *tp);
+
/* write queue abstraction */
static inline void tcp_write_queue_purge(struct sock *sk)
{
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 07a02f6..6ef67b7 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -558,9 +558,14 @@ EXPORT_SYMBOL(inet_dgram_connect);
static long inet_wait_for_connect(struct sock *sk, long timeo)
{
+ const bool write = (sk->sk_protocol == IPPROTO_TCP) &&
+ tcp_sk(sk)->fastopen_req &&
+ tcp_sk(sk)->fastopen_req->data;
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (write)
+ sk->sk_write_pending++;
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -576,6 +581,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
finish_wait(sk_sleep(sk), &wait);
+ if (write)
+ sk->sk_write_pending--;
return timeo;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2a5240b..957acd1 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -262,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+ SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4849be7..8869328 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -596,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
tcp_cookie_size_check(cvp->cookie_desired) :
0;
+ struct tcp_fastopen_request *fastopen = tp->fastopen_req;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -636,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
+ if (fastopen && fastopen->cookie.len >= 0) {
+ u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = &fastopen->cookie;
+ remaining -= need;
+ tp->syn_fastopen = 1;
+ }
+ }
/* Note that timestamps are required by the specification.
*
* Odd numbers of bytes are prohibited by the specification, ensuring
@@ -2824,6 +2835,96 @@ void tcp_connect_init(struct sock *sk)
tcp_clear_retrans(tp);
}
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ tcb->end_seq += skb->len;
+ skb_header_release(skb);
+ __tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+ tp->write_seq = tcb->end_seq;
+ tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_request *fo = tp->fastopen_req;
+ int space, i, err = 0, iovlen = fo->data->msg_iovlen;
+ struct sk_buff *syn_data = NULL, *data;
+
+ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie);
+ if (fo->cookie.len <= 0)
+ goto fallback;
+
+ /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+ * user-MSS. Reserve maximum option space for middleboxes that add
+ * private TCP options. The cost is reduced data space in SYN :(
+ */
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+ tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ MAX_TCP_OPTION_SPACE;
+
+ syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+ sk->sk_allocation);
+ if (syn_data == NULL)
+ goto fallback;
+
+ for (i = 0; i < iovlen && syn_data->len < space; ++i) {
+ struct iovec *iov = &fo->data->msg_iov[i];
+ unsigned char __user *from = iov->iov_base;
+ int len = iov->iov_len;
+
+ if (syn_data->len + len > space)
+ len = space - syn_data->len;
+ else if (i + 1 == iovlen)
+ /* No more data pending in inet_wait_for_connect() */
+ fo->data = NULL;
+
+ if (skb_add_data(syn_data, from, len))
+ goto fallback;
+ }
+
+ /* Queue a data-only packet after the regular SYN for retransmission */
+ data = pskb_copy(syn_data, sk->sk_allocation);
+ if (data == NULL)
+ goto fallback;
+ TCP_SKB_CB(data)->seq++;
+ TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
+ TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
+ tcp_connect_queue_skb(sk, data);
+ fo->copied = data->len;
+
+ if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ goto done;
+ }
+ syn_data = NULL;
+
+fallback:
+ /* Send a regular SYN with Fast Open cookie request option */
+ if (fo->cookie.len > 0)
+ fo->cookie.len = 0;
+ err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+ if (err)
+ tp->syn_fastopen = 0;
+ kfree_skb(syn_data);
+done:
+ fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
+ return err;
+}
+
/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
@@ -2841,17 +2942,13 @@ int tcp_connect(struct sock *sk)
skb_reserve(buff, MAX_TCP_HEADER);
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+ tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tcp_connect_queue_skb(sk, buff);
TCP_ECN_send_syn(sk, buff);
- /* Send it off. */
- TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tp->retrans_stamp = TCP_SKB_CB(buff)->when;
- skb_header_release(buff);
- __tcp_add_write_queue_tail(sk, buff);
- sk->sk_wmem_queued += buff->truesize;
- sk_mem_charge(sk, buff->truesize);
- tp->packets_out += tcp_skb_pcount(buff);
- err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+ /* Send off SYN; include data in Fast Open. */
+ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
if (err == -ECONNREFUSED)
return err;
--
1.7.7.3
^ permalink raw reply related
* [PATCH v2 2/7] net-tcp: Fast Open client - cookie cache
From: Yuchung Cheng @ 2012-07-18 21:01 UTC (permalink / raw)
To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, ycheng, netdev
In-Reply-To: <1342645307-17772-1-git-send-email-ycheng@google.com>
Add Fast Open metrics in tcp metrics cache: the basic ones are MSS and
the cookies. Later patch will cache more to handle unfriendly middleboxes.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
include/net/tcp.h | 4 ++++
net/ipv4/tcp_metrics.c | 41 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 45 insertions(+), 0 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5aed371..e601da1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -405,6 +405,10 @@ extern void tcp_metrics_init(void);
extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
extern bool tcp_remember_stamp(struct sock *sk);
extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
+extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie);
+extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie);
extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
extern void tcp_disable_fack(struct tcp_sock *tp);
extern void tcp_close(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 1a115b6..b498954 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -30,6 +30,11 @@ enum tcp_metric_index {
TCP_METRIC_MAX,
};
+struct tcp_fastopen_metrics {
+ u16 mss;
+ struct tcp_fastopen_cookie cookie;
+};
+
struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next;
struct inetpeer_addr tcpm_addr;
@@ -38,6 +43,7 @@ struct tcp_metrics_block {
u32 tcpm_ts_stamp;
u32 tcpm_lock;
u32 tcpm_vals[TCP_METRIC_MAX];
+ struct tcp_fastopen_metrics tcpm_fastopen;
};
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
@@ -118,6 +124,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
tm->tcpm_ts = 0;
tm->tcpm_ts_stamp = 0;
+ tm->tcpm_fastopen.mss = 0;
+ tm->tcpm_fastopen.cookie.len = 0;
}
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
@@ -633,6 +641,39 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
return ret;
}
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+ if (tfom->mss)
+ *mss = tfom->mss;
+ *cookie = tfom->cookie;
+ }
+ rcu_read_unlock();
+}
+
+
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+ tfom->mss = mss;
+ if (cookie->len > 0)
+ tfom->cookie = *cookie;
+ }
+ rcu_read_unlock();
+}
+
static unsigned long tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
--
1.7.7.3
^ permalink raw reply related
* [PATCH v2 0/7] TCP Fast Open client
From: Yuchung Cheng @ 2012-07-18 21:01 UTC (permalink / raw)
To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, ycheng, netdev
ChangeLog since v1:
- Reduce tons of code by storing Fast Open stats in the TCP metrics :)
- Clarify the purpose of using an experimental option in patch 1/7
This patch series implement the client functionality of TCP Fast Open.
TCP Fast Open (TFO) allows data to be carried in the SYN and SYN-ACK
packets and consumed by the receiving end during the initial connection
handshake, thus providing a saving of up to one full round trip time (RTT)
compared to standard TCP requiring a three-way handshake (3WHS) to
complete before data can be exchanged.
The protocol change is detailed in the IETF internet draft at
http://www.ietf.org/id/draft-ietf-tcpm-fastopen-00.txt . The research
paper (http://conferences.sigcomm.org/co-next/2011/papers/1569470463.pdf)
studied the performance impact of HTTP using Fast Open, based on this
Linux implementation and the Chrome browser.
To use Fast Open, the client application (active SYN sender) must
replace connect() socket call with sendmsg() or sendto() with the new
MSG_FASTOPEN flag. If the server supports Fast Open the data exchange
starts at TCP handshake. Otherwise the connection will automatically
fall back to conventional TCP.
Yuchung Cheng (7):
net-tcp: Fast Open base
net-tcp: Fast Open client - cookie cache
net-tcp: Fast Open client - sending SYN-data
net-tcp: Fast Open client - receiving SYN-ACK
net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)
net-tcp: Fast Open client - detecting SYN-data drops
net-tcp: Fast Open client - cookie-less mode
Documentation/networking/ip-sysctl.txt | 13 +++
include/linux/snmp.h | 1 +
include/linux/socket.h | 1 +
include/linux/tcp.h | 17 ++++-
include/net/inet_common.h | 6 +-
include/net/tcp.h | 28 ++++++-
net/ipv4/Makefile | 2 +-
net/ipv4/af_inet.c | 26 +++++-
net/ipv4/proc.c | 1 +
net/ipv4/syncookies.c | 2 +-
net/ipv4/sysctl_net_ipv4.c | 7 ++
net/ipv4/tcp.c | 61 ++++++++++++-
net/ipv4/tcp_fastopen.c | 11 +++
net/ipv4/tcp_input.c | 76 ++++++++++++++--
net/ipv4/tcp_ipv4.c | 5 +-
net/ipv4/tcp_metrics.c | 51 +++++++++++
net/ipv4/tcp_minisocks.c | 4 +-
net/ipv4/tcp_output.c | 153 +++++++++++++++++++++++++++++---
net/ipv6/syncookies.c | 2 +-
net/ipv6/tcp_ipv6.c | 2 +-
20 files changed, 427 insertions(+), 42 deletions(-)
create mode 100644 net/ipv4/tcp_fastopen.c
--
1.7.7.3
^ permalink raw reply
* Re: [PATCH] SUNRPC: Prevent kernel stack corruption on long values of flush
From: Jim Rees @ 2012-07-18 20:55 UTC (permalink / raw)
To: Dave Jones, J. Bruce Fields, Sasha Levin,
Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA,
davem-fT/PcQaiUtIeIZ0/mPfg9Q, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20120718203304.GA18540-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Dave Jones wrote:
On Wed, Jul 18, 2012 at 04:00:49PM -0400, Jim Rees wrote:
> You could use something like:
>
> char tbuf[sizeof (unsigned long) * 24 / 10 + 1 + 2]; /* + 2 for final "\n\0" */
>
> since there are roughly 10 bits for every 3 decimal digits.
>
> But I'm obviously confused, because I don't understand why tbuf needs to be
> any more than 10 + 2.
Unsigned long isn't necessarily 32 bits.
On 64-bit systems %lu can be up to 18446744073709551615
Thanks. You caught me thinking "Intel." How embarrassing.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH 09/15] ipv4: Cache output routes in fib_info nexthops.
From: David Miller @ 2012-07-18 20:49 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
In-Reply-To: <1342644032.2626.3651.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 18 Jul 2012 22:40:32 +0200
> On Wed, 2012-07-18 at 13:36 -0700, David Miller wrote:
>
>> Do you really think it's faster/better in this case? I did consider
>> it.
>
> No, but code is smaller ;)
Ok, I'm convinced :)
^ permalink raw reply
* Re: [PATCH 09/15] ipv4: Cache output routes in fib_info nexthops.
From: Eric Dumazet @ 2012-07-18 20:40 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20120718.133647.1948216070418950586.davem@davemloft.net>
On Wed, 2012-07-18 at 13:36 -0700, David Miller wrote:
> Do you really think it's faster/better in this case? I did consider
> it.
No, but code is smaller ;)
^ permalink raw reply
* Re: [PATCH 09/15] ipv4: Cache output routes in fib_info nexthops.
From: David Miller @ 2012-07-18 20:36 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
In-Reply-To: <1342643659.2626.3636.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 18 Jul 2012 22:34:19 +0200
> On Wed, 2012-07-18 at 11:24 -0700, David Miller wrote:
>> If we have an output route that lacks nexthop exceptions, we can cache
>> it in the FIB info nexthop.
>>
>> Such routes will have DST_HOST cleared because such routes refer to a
>> family of destinations, rather than just one.
>
>
>> - const struct fib_result *res,
>> +static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
>> +{
>> + static DEFINE_SPINLOCK(fib_cache_lock);
>> + struct rtable **p = &nh->nh_rth_output;
>> +
>> + if (*p)
>> + return;
>> +
>> + spin_lock_bh(&fib_cache_lock);
>> + if (!*p) {
>> + *p = rt;
>> + dst_clone(&rt->dst);
>> + }
>> + spin_unlock_bh(&fib_cache_lock);
>> +}
>> +
>
> This probably should use cmpxchg()
Do you really think it's faster/better in this case? I did consider
it.
^ permalink raw reply
* Re: [PATCH 09/15] ipv4: Cache output routes in fib_info nexthops.
From: Eric Dumazet @ 2012-07-18 20:34 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20120718.112404.1910372180742347127.davem@davemloft.net>
On Wed, 2012-07-18 at 11:24 -0700, David Miller wrote:
> If we have an output route that lacks nexthop exceptions, we can cache
> it in the FIB info nexthop.
>
> Such routes will have DST_HOST cleared because such routes refer to a
> family of destinations, rather than just one.
> - const struct fib_result *res,
> +static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
> +{
> + static DEFINE_SPINLOCK(fib_cache_lock);
> + struct rtable **p = &nh->nh_rth_output;
> +
> + if (*p)
> + return;
> +
> + spin_lock_bh(&fib_cache_lock);
> + if (!*p) {
> + *p = rt;
> + dst_clone(&rt->dst);
> + }
> + spin_unlock_bh(&fib_cache_lock);
> +}
> +
This probably should use cmpxchg()
static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
struct rtable **p = &nh->nh_rth_output;
if (*p)
return;
if (cmpxchg(p, NULL, rt) == NULL)
dst_clone(&rt->dst);
}
^ permalink raw reply
* Re: [PATCH] SUNRPC: Prevent kernel stack corruption on long values of flush
From: Dave Jones @ 2012-07-18 20:33 UTC (permalink / raw)
To: Jim Rees
Cc: J. Bruce Fields, Sasha Levin,
Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA,
davem-fT/PcQaiUtIeIZ0/mPfg9Q, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20120718200049.GA17964-63aXycvo3TyHXe+LvDLADg@public.gmane.org>
On Wed, Jul 18, 2012 at 04:00:49PM -0400, Jim Rees wrote:
> You could use something like:
>
> char tbuf[sizeof (unsigned long) * 24 / 10 + 1 + 2]; /* + 2 for final "\n\0" */
>
> since there are roughly 10 bits for every 3 decimal digits.
>
> But I'm obviously confused, because I don't understand why tbuf needs to be
> any more than 10 + 2.
Unsigned long isn't necessarily 32 bits.
On 64-bit systems %lu can be up to 18446744073709551615
Dave
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH] net: Statically initialize init_net.dev_base_head
From: David Miller @ 2012-07-18 20:32 UTC (permalink / raw)
To: eric.dumazet; +Cc: mark.d.rustad, netdev, gaofeng, nhorman
In-Reply-To: <1342638809.2626.3451.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 18 Jul 2012 21:13:29 +0200
> On Wed, 2012-07-18 at 12:06 -0700, Mark Rustad wrote:
>> This change eliminates an initialization-order hazard most
>> recently seen when netprio_cgroup is built into the kernel.
>>
>> With thanks to Eric Dumazet for catching a bug.
>>
>> Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
>> ---
>>
>> net/core/dev.c | 3 ++-
>> net/core/net_namespace.c | 4 +++-
>> 2 files changed, 5 insertions(+), 2 deletions(-)
>
> Acked-by: Eric Dumazet <edumazet@google.com>
Applied, thanks everyone.
^ permalink raw reply
* Re: [PATCH] net: Statically initialize init_net.dev_base_head
From: David Miller @ 2012-07-18 20:32 UTC (permalink / raw)
To: john.r.fastabend; +Cc: nhorman, mark.d.rustad, netdev, gaofeng, eric.dumazet
In-Reply-To: <50071D11.7080207@intel.com>
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 18 Jul 2012 13:31:13 -0700
> On 7/18/2012 1:21 PM, Neil Horman wrote:
>> On Wed, Jul 18, 2012 at 01:20:10PM -0700, David Miller wrote:
>>> From: Neil Horman <nhorman@tuxdriver.com>
>>> Date: Wed, 18 Jul 2012 16:11:49 -0400
>>>
>>>> On Wed, Jul 18, 2012 at 12:06:07PM -0700, Mark Rustad wrote:
>>>>> This change eliminates an initialization-order hazard most
>>>>> recently seen when netprio_cgroup is built into the kernel.
>>>>>
>>>>> With thanks to Eric Dumazet for catching a bug.
>>>>>
>>>>> Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
>>> ...
>>>> I think dave was going to take John Fastabends patch from earlier
>>>> today, but
>>>> this works just as well. Long term I'm going to look into delaying
>>>> initzlization for cgroups, as it creates a strange initialization
>>>> state when you
>>>> have a module_init routine registered.
>>>
>>> Neil, any particular preference between John's and Mark's version
>>> of the fix?
>>>
>> I think they're both perfectly good. If I had to choose I'd say
>> Marks, just
>> because its done by initializing data, rather than adding more code to
>> run every
>> time we create a cgroup.
>>
>> Neil
>>
>
> Fine by me if we take this version instead.
I think that's what I'll do, sorry for all the trouble John :)
^ permalink raw reply
* [net-next 9/9] ixgbe: Cleanup holes in flags after removing several of them
From: Jeff Kirsher @ 2012-07-18 20:31 UTC (permalink / raw)
To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1342643516-2696-1-git-send-email-jeffrey.t.kirsher@intel.com>
From: Alexander Duyck <alexander.h.duyck@intel.com>
This change is just meant to defragment the flags as there are several hole
that have been introduced since several features, or the flags for them,
have been removed.
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe.h | 50 +++++++++++++++---------------
1 file changed, 25 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 4ca10e6..f7f6fe2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -433,33 +433,33 @@ struct ixgbe_adapter {
* thus the additional *_CAPABLE flags.
*/
u32 flags;
-#define IXGBE_FLAG_MSI_CAPABLE (u32)(1 << 1)
-#define IXGBE_FLAG_MSI_ENABLED (u32)(1 << 2)
-#define IXGBE_FLAG_MSIX_CAPABLE (u32)(1 << 3)
-#define IXGBE_FLAG_MSIX_ENABLED (u32)(1 << 4)
-#define IXGBE_FLAG_RX_1BUF_CAPABLE (u32)(1 << 6)
-#define IXGBE_FLAG_RX_PS_CAPABLE (u32)(1 << 7)
-#define IXGBE_FLAG_RX_PS_ENABLED (u32)(1 << 8)
-#define IXGBE_FLAG_IN_NETPOLL (u32)(1 << 9)
-#define IXGBE_FLAG_DCA_ENABLED (u32)(1 << 10)
-#define IXGBE_FLAG_DCA_CAPABLE (u32)(1 << 11)
-#define IXGBE_FLAG_IMIR_ENABLED (u32)(1 << 12)
-#define IXGBE_FLAG_MQ_CAPABLE (u32)(1 << 13)
-#define IXGBE_FLAG_DCB_ENABLED (u32)(1 << 14)
-#define IXGBE_FLAG_VMDQ_CAPABLE (u32)(1 << 18)
-#define IXGBE_FLAG_VMDQ_ENABLED (u32)(1 << 19)
-#define IXGBE_FLAG_FAN_FAIL_CAPABLE (u32)(1 << 20)
-#define IXGBE_FLAG_NEED_LINK_UPDATE (u32)(1 << 22)
-#define IXGBE_FLAG_NEED_LINK_CONFIG (u32)(1 << 23)
-#define IXGBE_FLAG_FDIR_HASH_CAPABLE (u32)(1 << 24)
-#define IXGBE_FLAG_FDIR_PERFECT_CAPABLE (u32)(1 << 25)
-#define IXGBE_FLAG_FCOE_CAPABLE (u32)(1 << 26)
-#define IXGBE_FLAG_FCOE_ENABLED (u32)(1 << 27)
-#define IXGBE_FLAG_SRIOV_CAPABLE (u32)(1 << 28)
-#define IXGBE_FLAG_SRIOV_ENABLED (u32)(1 << 29)
+#define IXGBE_FLAG_MSI_CAPABLE (u32)(1 << 0)
+#define IXGBE_FLAG_MSI_ENABLED (u32)(1 << 1)
+#define IXGBE_FLAG_MSIX_CAPABLE (u32)(1 << 2)
+#define IXGBE_FLAG_MSIX_ENABLED (u32)(1 << 3)
+#define IXGBE_FLAG_RX_1BUF_CAPABLE (u32)(1 << 4)
+#define IXGBE_FLAG_RX_PS_CAPABLE (u32)(1 << 5)
+#define IXGBE_FLAG_RX_PS_ENABLED (u32)(1 << 6)
+#define IXGBE_FLAG_IN_NETPOLL (u32)(1 << 7)
+#define IXGBE_FLAG_DCA_ENABLED (u32)(1 << 8)
+#define IXGBE_FLAG_DCA_CAPABLE (u32)(1 << 9)
+#define IXGBE_FLAG_IMIR_ENABLED (u32)(1 << 10)
+#define IXGBE_FLAG_MQ_CAPABLE (u32)(1 << 11)
+#define IXGBE_FLAG_DCB_ENABLED (u32)(1 << 12)
+#define IXGBE_FLAG_VMDQ_CAPABLE (u32)(1 << 13)
+#define IXGBE_FLAG_VMDQ_ENABLED (u32)(1 << 14)
+#define IXGBE_FLAG_FAN_FAIL_CAPABLE (u32)(1 << 15)
+#define IXGBE_FLAG_NEED_LINK_UPDATE (u32)(1 << 16)
+#define IXGBE_FLAG_NEED_LINK_CONFIG (u32)(1 << 17)
+#define IXGBE_FLAG_FDIR_HASH_CAPABLE (u32)(1 << 18)
+#define IXGBE_FLAG_FDIR_PERFECT_CAPABLE (u32)(1 << 19)
+#define IXGBE_FLAG_FCOE_CAPABLE (u32)(1 << 20)
+#define IXGBE_FLAG_FCOE_ENABLED (u32)(1 << 21)
+#define IXGBE_FLAG_SRIOV_CAPABLE (u32)(1 << 22)
+#define IXGBE_FLAG_SRIOV_ENABLED (u32)(1 << 23)
u32 flags2;
-#define IXGBE_FLAG2_RSC_CAPABLE (u32)(1)
+#define IXGBE_FLAG2_RSC_CAPABLE (u32)(1 << 0)
#define IXGBE_FLAG2_RSC_ENABLED (u32)(1 << 1)
#define IXGBE_FLAG2_TEMP_SENSOR_CAPABLE (u32)(1 << 2)
#define IXGBE_FLAG2_TEMP_SENSOR_EVENT (u32)(1 << 3)
--
1.7.10.4
^ permalink raw reply related
* [net-next 7/9] ixgbe: Add support for SR-IOV w/ DCB or RSS
From: Jeff Kirsher @ 2012-07-18 20:31 UTC (permalink / raw)
To: davem
Cc: Alexander Duyck, netdev, gospo, sassmann, Greg Rose,
John Fastabend, Jeff Kirsher
In-Reply-To: <1342643516-2696-1-git-send-email-jeffrey.t.kirsher@intel.com>
From: Alexander Duyck <alexander.h.duyck@intel.com>
This change essentially makes it so that we can enable almost all of the
features all at once. This patch allows for the combination of SR-IOV,
DCB, and FCoE in the case of the x540. It also beefs up the SR-IOV by
adding support for RSS to the PF.
The testing matrix gets to be very complex for this patch as there are a
number of different features and subsets for queueing options. I tried to
narrow these down a bit by restricting the PF to only supporting 4TC DCB
when it is enabled in addition to SR-IOV.
Cc: Greg Rose <gregory.v.rose@intel.com>
Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Tested-by: Ross Brattain <ross.b.brattain@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe.h | 4 +
drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 377 ++++++++++++++++++++++--
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 37 ++-
drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 52 +++-
4 files changed, 423 insertions(+), 47 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 5a75a9c..67743aa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -284,6 +284,10 @@ struct ixgbe_ring_feature {
u16 offset; /* offset to start of feature */
} ____cacheline_internodealigned_in_smp;
+#define IXGBE_82599_VMDQ_8Q_MASK 0x78
+#define IXGBE_82599_VMDQ_4Q_MASK 0x7C
+#define IXGBE_82599_VMDQ_2Q_MASK 0x7E
+
/*
* FCoE requires that all Rx buffers be over 2200 bytes in length. Since
* this is twice the size of a half page we need to double the page order
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index 4c3822f..676e93f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -29,6 +29,83 @@
#include "ixgbe_sriov.h"
#ifdef CONFIG_IXGBE_DCB
+/**
+ * ixgbe_cache_ring_dcb_sriov - Descriptor ring to register mapping for SR-IOV
+ * @adapter: board private structure to initialize
+ *
+ * Cache the descriptor ring offsets for SR-IOV to the assigned rings. It
+ * will also try to cache the proper offsets if RSS/FCoE are enabled along
+ * with VMDq.
+ *
+ **/
+static bool ixgbe_cache_ring_dcb_sriov(struct ixgbe_adapter *adapter)
+{
+#ifdef IXGBE_FCOE
+ struct ixgbe_ring_feature *fcoe = &adapter->ring_feature[RING_F_FCOE];
+#endif /* IXGBE_FCOE */
+ struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ];
+ int i;
+ u16 reg_idx;
+ u8 tcs = netdev_get_num_tc(adapter->netdev);
+
+ /* verify we have DCB queueing enabled before proceeding */
+ if (tcs <= 1)
+ return false;
+
+ /* verify we have VMDq enabled before proceeding */
+ if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
+ return false;
+
+ /* start at VMDq register offset for SR-IOV enabled setups */
+ reg_idx = vmdq->offset * __ALIGN_MASK(1, ~vmdq->mask);
+ for (i = 0; i < adapter->num_rx_queues; i++, reg_idx++) {
+ /* If we are greater than indices move to next pool */
+ if ((reg_idx & ~vmdq->mask) >= tcs)
+ reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask);
+ adapter->rx_ring[i]->reg_idx = reg_idx;
+ }
+
+ reg_idx = vmdq->offset * __ALIGN_MASK(1, ~vmdq->mask);
+ for (i = 0; i < adapter->num_tx_queues; i++, reg_idx++) {
+ /* If we are greater than indices move to next pool */
+ if ((reg_idx & ~vmdq->mask) >= tcs)
+ reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask);
+ adapter->tx_ring[i]->reg_idx = reg_idx;
+ }
+
+#ifdef IXGBE_FCOE
+ /* nothing to do if FCoE is disabled */
+ if (!(adapter->flags & IXGBE_FLAG_FCOE_ENABLED))
+ return true;
+
+ /* The work is already done if the FCoE ring is shared */
+ if (fcoe->offset < tcs)
+ return true;
+
+ /* The FCoE rings exist separately, we need to move their reg_idx */
+ if (fcoe->indices) {
+ u16 queues_per_pool = __ALIGN_MASK(1, ~vmdq->mask);
+ u8 fcoe_tc = ixgbe_fcoe_get_tc(adapter);
+
+ reg_idx = (vmdq->offset + vmdq->indices) * queues_per_pool;
+ for (i = fcoe->offset; i < adapter->num_rx_queues; i++) {
+ reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask) + fcoe_tc;
+ adapter->rx_ring[i]->reg_idx = reg_idx;
+ reg_idx++;
+ }
+
+ reg_idx = (vmdq->offset + vmdq->indices) * queues_per_pool;
+ for (i = fcoe->offset; i < adapter->num_tx_queues; i++) {
+ reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask) + fcoe_tc;
+ adapter->tx_ring[i]->reg_idx = reg_idx;
+ reg_idx++;
+ }
+ }
+
+#endif /* IXGBE_FCOE */
+ return true;
+}
+
/* ixgbe_get_first_reg_idx - Return first register index associated with ring */
static void ixgbe_get_first_reg_idx(struct ixgbe_adapter *adapter, u8 tc,
unsigned int *tx, unsigned int *rx)
@@ -120,14 +197,61 @@ static bool ixgbe_cache_ring_dcb(struct ixgbe_adapter *adapter)
* no other mapping is used.
*
*/
-static inline bool ixgbe_cache_ring_sriov(struct ixgbe_adapter *adapter)
+static bool ixgbe_cache_ring_sriov(struct ixgbe_adapter *adapter)
{
- adapter->rx_ring[0]->reg_idx = adapter->num_vfs * 2;
- adapter->tx_ring[0]->reg_idx = adapter->num_vfs * 2;
- if (adapter->num_vfs)
- return true;
- else
+#ifdef IXGBE_FCOE
+ struct ixgbe_ring_feature *fcoe = &adapter->ring_feature[RING_F_FCOE];
+#endif /* IXGBE_FCOE */
+ struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ];
+ struct ixgbe_ring_feature *rss = &adapter->ring_feature[RING_F_RSS];
+ int i;
+ u16 reg_idx;
+
+ /* only proceed if VMDq is enabled */
+ if (!(adapter->flags & IXGBE_FLAG_VMDQ_ENABLED))
return false;
+
+ /* start at VMDq register offset for SR-IOV enabled setups */
+ reg_idx = vmdq->offset * __ALIGN_MASK(1, ~vmdq->mask);
+ for (i = 0; i < adapter->num_rx_queues; i++, reg_idx++) {
+#ifdef IXGBE_FCOE
+ /* Allow first FCoE queue to be mapped as RSS */
+ if (fcoe->offset && (i > fcoe->offset))
+ break;
+#endif
+ /* If we are greater than indices move to next pool */
+ if ((reg_idx & ~vmdq->mask) >= rss->indices)
+ reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask);
+ adapter->rx_ring[i]->reg_idx = reg_idx;
+ }
+
+#ifdef IXGBE_FCOE
+ /* FCoE uses a linear block of queues so just assigning 1:1 */
+ for (; i < adapter->num_rx_queues; i++, reg_idx++)
+ adapter->rx_ring[i]->reg_idx = reg_idx;
+
+#endif
+ reg_idx = vmdq->offset * __ALIGN_MASK(1, ~vmdq->mask);
+ for (i = 0; i < adapter->num_tx_queues; i++, reg_idx++) {
+#ifdef IXGBE_FCOE
+ /* Allow first FCoE queue to be mapped as RSS */
+ if (fcoe->offset && (i > fcoe->offset))
+ break;
+#endif
+ /* If we are greater than indices move to next pool */
+ if ((reg_idx & rss->mask) >= rss->indices)
+ reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask);
+ adapter->tx_ring[i]->reg_idx = reg_idx;
+ }
+
+#ifdef IXGBE_FCOE
+ /* FCoE uses a linear block of queues so just assigning 1:1 */
+ for (; i < adapter->num_tx_queues; i++, reg_idx++)
+ adapter->tx_ring[i]->reg_idx = reg_idx;
+
+#endif
+
+ return true;
}
/**
@@ -169,30 +293,20 @@ static void ixgbe_cache_ring_register(struct ixgbe_adapter *adapter)
adapter->rx_ring[0]->reg_idx = 0;
adapter->tx_ring[0]->reg_idx = 0;
- if (ixgbe_cache_ring_sriov(adapter))
+#ifdef CONFIG_IXGBE_DCB
+ if (ixgbe_cache_ring_dcb_sriov(adapter))
return;
-#ifdef CONFIG_IXGBE_DCB
if (ixgbe_cache_ring_dcb(adapter))
return;
+
#endif
+ if (ixgbe_cache_ring_sriov(adapter))
+ return;
ixgbe_cache_ring_rss(adapter);
}
-/**
- * ixgbe_set_sriov_queues - Allocate queues for IOV use
- * @adapter: board private structure to initialize
- *
- * IOV doesn't actually use anything, so just NAK the
- * request for now and let the other queue routines
- * figure out what to do.
- */
-static inline bool ixgbe_set_sriov_queues(struct ixgbe_adapter *adapter)
-{
- return false;
-}
-
#define IXGBE_RSS_16Q_MASK 0xF
#define IXGBE_RSS_8Q_MASK 0x7
#define IXGBE_RSS_4Q_MASK 0x3
@@ -200,6 +314,109 @@ static inline bool ixgbe_set_sriov_queues(struct ixgbe_adapter *adapter)
#define IXGBE_RSS_DISABLED_MASK 0x0
#ifdef CONFIG_IXGBE_DCB
+/**
+ * ixgbe_set_dcb_sriov_queues: Allocate queues for SR-IOV devices w/ DCB
+ * @adapter: board private structure to initialize
+ *
+ * When SR-IOV (Single Root IO Virtualiztion) is enabled, allocate queues
+ * and VM pools where appropriate. Also assign queues based on DCB
+ * priorities and map accordingly..
+ *
+ **/
+static bool ixgbe_set_dcb_sriov_queues(struct ixgbe_adapter *adapter)
+{
+ int i;
+ u16 vmdq_i = adapter->ring_feature[RING_F_VMDQ].limit;
+ u16 vmdq_m = 0;
+#ifdef IXGBE_FCOE
+ u16 fcoe_i = 0;
+#endif
+ u8 tcs = netdev_get_num_tc(adapter->netdev);
+
+ /* verify we have DCB queueing enabled before proceeding */
+ if (tcs <= 1)
+ return false;
+
+ /* verify we have VMDq enabled before proceeding */
+ if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
+ return false;
+
+ /* Add starting offset to total pool count */
+ vmdq_i += adapter->ring_feature[RING_F_VMDQ].offset;
+
+ /* 16 pools w/ 8 TC per pool */
+ if (tcs > 4) {
+ vmdq_i = min_t(u16, vmdq_i, 16);
+ vmdq_m = IXGBE_82599_VMDQ_8Q_MASK;
+ /* 32 pools w/ 4 TC per pool */
+ } else {
+ vmdq_i = min_t(u16, vmdq_i, 32);
+ vmdq_m = IXGBE_82599_VMDQ_4Q_MASK;
+ }
+
+#ifdef IXGBE_FCOE
+ /* queues in the remaining pools are available for FCoE */
+ fcoe_i = (128 / __ALIGN_MASK(1, ~vmdq_m)) - vmdq_i;
+
+#endif
+ /* remove the starting offset from the pool count */
+ vmdq_i -= adapter->ring_feature[RING_F_VMDQ].offset;
+
+ /* save features for later use */
+ adapter->ring_feature[RING_F_VMDQ].indices = vmdq_i;
+ adapter->ring_feature[RING_F_VMDQ].mask = vmdq_m;
+
+ /*
+ * We do not support DCB, VMDq, and RSS all simultaneously
+ * so we will disable RSS since it is the lowest priority
+ */
+ adapter->ring_feature[RING_F_RSS].indices = 1;
+ adapter->ring_feature[RING_F_RSS].mask = IXGBE_RSS_DISABLED_MASK;
+
+ adapter->num_rx_pools = vmdq_i;
+ adapter->num_rx_queues_per_pool = tcs;
+
+ adapter->num_tx_queues = vmdq_i * tcs;
+ adapter->num_rx_queues = vmdq_i * tcs;
+
+#ifdef IXGBE_FCOE
+ if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) {
+ struct ixgbe_ring_feature *fcoe;
+
+ fcoe = &adapter->ring_feature[RING_F_FCOE];
+
+ /* limit ourselves based on feature limits */
+ fcoe_i = min_t(u16, fcoe_i, num_online_cpus());
+ fcoe_i = min_t(u16, fcoe_i, fcoe->limit);
+
+ if (fcoe_i) {
+ /* alloc queues for FCoE separately */
+ fcoe->indices = fcoe_i;
+ fcoe->offset = vmdq_i * tcs;
+
+ /* add queues to adapter */
+ adapter->num_tx_queues += fcoe_i;
+ adapter->num_rx_queues += fcoe_i;
+ } else if (tcs > 1) {
+ /* use queue belonging to FcoE TC */
+ fcoe->indices = 1;
+ fcoe->offset = ixgbe_fcoe_get_tc(adapter);
+ } else {
+ adapter->flags &= ~IXGBE_FLAG_FCOE_ENABLED;
+
+ fcoe->indices = 0;
+ fcoe->offset = 0;
+ }
+ }
+
+#endif /* IXGBE_FCOE */
+ /* configure TC to queue mapping */
+ for (i = 0; i < tcs; i++)
+ netdev_set_tc_queue(adapter->netdev, i, 1, i);
+
+ return true;
+}
+
static bool ixgbe_set_dcb_queues(struct ixgbe_adapter *adapter)
{
struct net_device *dev = adapter->netdev;
@@ -262,6 +479,117 @@ static bool ixgbe_set_dcb_queues(struct ixgbe_adapter *adapter)
#endif
/**
+ * ixgbe_set_sriov_queues - Allocate queues for SR-IOV devices
+ * @adapter: board private structure to initialize
+ *
+ * When SR-IOV (Single Root IO Virtualiztion) is enabled, allocate queues
+ * and VM pools where appropriate. If RSS is available, then also try and
+ * enable RSS and map accordingly.
+ *
+ **/
+static bool ixgbe_set_sriov_queues(struct ixgbe_adapter *adapter)
+{
+ u16 vmdq_i = adapter->ring_feature[RING_F_VMDQ].limit;
+ u16 vmdq_m = 0;
+ u16 rss_i = adapter->ring_feature[RING_F_RSS].limit;
+ u16 rss_m = IXGBE_RSS_DISABLED_MASK;
+#ifdef IXGBE_FCOE
+ u16 fcoe_i = 0;
+#endif
+
+ /* only proceed if SR-IOV is enabled */
+ if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
+ return false;
+
+ /* Add starting offset to total pool count */
+ vmdq_i += adapter->ring_feature[RING_F_VMDQ].offset;
+
+ /* double check we are limited to maximum pools */
+ vmdq_i = min_t(u16, IXGBE_MAX_VMDQ_INDICES, vmdq_i);
+
+ /* 64 pool mode with 2 queues per pool */
+ if ((vmdq_i > 32) || (rss_i < 4)) {
+ vmdq_m = IXGBE_82599_VMDQ_2Q_MASK;
+ rss_m = IXGBE_RSS_2Q_MASK;
+ rss_i = min_t(u16, rss_i, 2);
+ /* 32 pool mode with 4 queues per pool */
+ } else {
+ vmdq_m = IXGBE_82599_VMDQ_4Q_MASK;
+ rss_m = IXGBE_RSS_4Q_MASK;
+ rss_i = 4;
+ }
+
+#ifdef IXGBE_FCOE
+ /* queues in the remaining pools are available for FCoE */
+ fcoe_i = 128 - (vmdq_i * __ALIGN_MASK(1, ~vmdq_m));
+
+#endif
+ /* remove the starting offset from the pool count */
+ vmdq_i -= adapter->ring_feature[RING_F_VMDQ].offset;
+
+ /* save features for later use */
+ adapter->ring_feature[RING_F_VMDQ].indices = vmdq_i;
+ adapter->ring_feature[RING_F_VMDQ].mask = vmdq_m;
+
+ /* limit RSS based on user input and save for later use */
+ adapter->ring_feature[RING_F_RSS].indices = rss_i;
+ adapter->ring_feature[RING_F_RSS].mask = rss_m;
+
+ adapter->num_rx_pools = vmdq_i;
+ adapter->num_rx_queues_per_pool = rss_i;
+
+ adapter->num_rx_queues = vmdq_i * rss_i;
+ adapter->num_tx_queues = vmdq_i * rss_i;
+
+ /* disable ATR as it is not supported when VMDq is enabled */
+ adapter->flags &= ~IXGBE_FLAG_FDIR_HASH_CAPABLE;
+
+#ifdef IXGBE_FCOE
+ /*
+ * FCoE can use rings from adjacent buffers to allow RSS
+ * like behavior. To account for this we need to add the
+ * FCoE indices to the total ring count.
+ */
+ if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) {
+ struct ixgbe_ring_feature *fcoe;
+
+ fcoe = &adapter->ring_feature[RING_F_FCOE];
+
+ /* limit ourselves based on feature limits */
+ fcoe_i = min_t(u16, fcoe_i, fcoe->limit);
+
+ if (vmdq_i > 1 && fcoe_i) {
+ /* reserve no more than number of CPUs */
+ fcoe_i = min_t(u16, fcoe_i, num_online_cpus());
+
+ /* alloc queues for FCoE separately */
+ fcoe->indices = fcoe_i;
+ fcoe->offset = vmdq_i * rss_i;
+ } else {
+ /* merge FCoE queues with RSS queues */
+ fcoe_i = min_t(u16, fcoe_i + rss_i, num_online_cpus());
+
+ /* limit indices to rss_i if MSI-X is disabled */
+ if (!(adapter->flags & IXGBE_FLAG_MSIX_ENABLED))
+ fcoe_i = rss_i;
+
+ /* attempt to reserve some queues for just FCoE */
+ fcoe->indices = min_t(u16, fcoe_i, fcoe->limit);
+ fcoe->offset = fcoe_i - fcoe->indices;
+
+ fcoe_i -= rss_i;
+ }
+
+ /* add queues to adapter */
+ adapter->num_tx_queues += fcoe_i;
+ adapter->num_rx_queues += fcoe_i;
+ }
+
+#endif
+ return true;
+}
+
+/**
* ixgbe_set_rss_queues - Allocate queues for RSS
* @adapter: board private structure to initialize
*
@@ -353,14 +681,17 @@ static void ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
adapter->num_rx_pools = adapter->num_rx_queues;
adapter->num_rx_queues_per_pool = 1;
- if (ixgbe_set_sriov_queues(adapter))
+#ifdef CONFIG_IXGBE_DCB
+ if (ixgbe_set_dcb_sriov_queues(adapter))
return;
-#ifdef CONFIG_IXGBE_DCB
if (ixgbe_set_dcb_queues(adapter))
return;
#endif
+ if (ixgbe_set_sriov_queues(adapter))
+ return;
+
ixgbe_set_rss_queues(adapter);
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ea94fa2..454e556 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -3161,9 +3161,18 @@ static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)
* Set up VF register offsets for selected VT Mode,
* i.e. 32 or 64 VFs for SR-IOV
*/
- gcr_ext = IXGBE_READ_REG(hw, IXGBE_GCR_EXT);
- gcr_ext |= IXGBE_GCR_EXT_MSIX_EN;
- gcr_ext |= IXGBE_GCR_EXT_VT_MODE_64;
+ switch (adapter->ring_feature[RING_F_VMDQ].mask) {
+ case IXGBE_82599_VMDQ_8Q_MASK:
+ gcr_ext = IXGBE_GCR_EXT_VT_MODE_16;
+ break;
+ case IXGBE_82599_VMDQ_4Q_MASK:
+ gcr_ext = IXGBE_GCR_EXT_VT_MODE_32;
+ break;
+ default:
+ gcr_ext = IXGBE_GCR_EXT_VT_MODE_64;
+ break;
+ }
+
IXGBE_WRITE_REG(hw, IXGBE_GCR_EXT, gcr_ext);
/* enable Tx loopback for VF/PF communication */
@@ -3947,7 +3956,18 @@ static void ixgbe_setup_gpie(struct ixgbe_adapter *adapter)
if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) {
gpie &= ~IXGBE_GPIE_VTMODE_MASK;
- gpie |= IXGBE_GPIE_VTMODE_64;
+
+ switch (adapter->ring_feature[RING_F_VMDQ].mask) {
+ case IXGBE_82599_VMDQ_8Q_MASK:
+ gpie |= IXGBE_GPIE_VTMODE_16;
+ break;
+ case IXGBE_82599_VMDQ_4Q_MASK:
+ gpie |= IXGBE_GPIE_VTMODE_32;
+ break;
+ default:
+ gpie |= IXGBE_GPIE_VTMODE_64;
+ break;
+ }
}
/* Enable Thermal over heat sensor interrupt */
@@ -6674,11 +6694,6 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc)
return -EINVAL;
}
- if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) {
- e_err(drv, "Enable failed, SR-IOV enabled\n");
- return -EINVAL;
- }
-
/* Hardware supports up to 8 traffic classes */
if (tc > adapter->dcb_cfg.num_tcs.pg_tcs ||
(hw->mac.type == ixgbe_mac_82598EB &&
@@ -7225,10 +7240,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
netdev->priv_flags |= IFF_UNICAST_FLT;
netdev->priv_flags |= IFF_SUPP_NOFCS;
- if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)
- adapter->flags &= ~(IXGBE_FLAG_RSS_ENABLED |
- IXGBE_FLAG_DCB_ENABLED);
-
#ifdef CONFIG_IXGBE_DCB
netdev->dcbnl_ops = &dcbnl_ops;
#endif
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index eb3f67c..d285443 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -107,15 +107,21 @@ void ixgbe_enable_sriov(struct ixgbe_adapter *adapter,
"VF drivers to avoid spoofed packet errors\n");
} else {
err = pci_enable_sriov(adapter->pdev, adapter->num_vfs);
+ if (err) {
+ e_err(probe, "Failed to enable PCI sriov: %d\n", err);
+ goto err_novfs;
+ }
}
- if (err) {
- e_err(probe, "Failed to enable PCI sriov: %d\n", err);
- goto err_novfs;
- }
- adapter->flags |= IXGBE_FLAG_SRIOV_ENABLED;
+ adapter->flags |= IXGBE_FLAG_SRIOV_ENABLED;
e_info(probe, "SR-IOV enabled with %d VFs\n", adapter->num_vfs);
+ /* Enable VMDq flag so device will be set in VM mode */
+ adapter->flags |= IXGBE_FLAG_VMDQ_ENABLED;
+ if (!adapter->ring_feature[RING_F_VMDQ].limit)
+ adapter->ring_feature[RING_F_VMDQ].limit = 1;
+ adapter->ring_feature[RING_F_VMDQ].offset = adapter->num_vfs;
+
num_vf_macvlans = hw->mac.num_rar_entries -
(IXGBE_MAX_PF_MACVLANS + 1 + adapter->num_vfs);
@@ -146,12 +152,39 @@ void ixgbe_enable_sriov(struct ixgbe_adapter *adapter,
* and memory allocated set up the mailbox parameters
*/
ixgbe_init_mbx_params_pf(hw);
- memcpy(&hw->mbx.ops, ii->mbx_ops,
- sizeof(hw->mbx.ops));
+ memcpy(&hw->mbx.ops, ii->mbx_ops, sizeof(hw->mbx.ops));
+
+ /* limit trafffic classes based on VFs enabled */
+ if ((adapter->hw.mac.type == ixgbe_mac_82599EB) &&
+ (adapter->num_vfs < 16)) {
+ adapter->dcb_cfg.num_tcs.pg_tcs = MAX_TRAFFIC_CLASS;
+ adapter->dcb_cfg.num_tcs.pfc_tcs = MAX_TRAFFIC_CLASS;
+ } else if (adapter->num_vfs < 32) {
+ adapter->dcb_cfg.num_tcs.pg_tcs = 4;
+ adapter->dcb_cfg.num_tcs.pfc_tcs = 4;
+ } else {
+ adapter->dcb_cfg.num_tcs.pg_tcs = 1;
+ adapter->dcb_cfg.num_tcs.pfc_tcs = 1;
+ }
+
+ /* We do not support RSS w/ SR-IOV */
+ adapter->ring_feature[RING_F_RSS].limit = 1;
/* Disable RSC when in SR-IOV mode */
adapter->flags2 &= ~(IXGBE_FLAG2_RSC_CAPABLE |
IXGBE_FLAG2_RSC_ENABLED);
+
+#ifdef IXGBE_FCOE
+ /*
+ * When SR-IOV is enabled 82599 cannot support jumbo frames
+ * so we must disable FCoE because we cannot support FCoE MTU.
+ */
+ if (adapter->hw.mac.type == ixgbe_mac_82599EB)
+ adapter->flags &= ~(IXGBE_FLAG_FCOE_ENABLED |
+ IXGBE_FLAG_FCOE_CAPABLE);
+#endif
+
+ /* enable spoof checking for all VFs */
for (i = 0; i < adapter->num_vfs; i++)
adapter->vfinfo[i].spoofchk_enabled = true;
return;
@@ -171,7 +204,6 @@ err_novfs:
void ixgbe_disable_sriov(struct ixgbe_adapter *adapter)
{
struct ixgbe_hw *hw = &adapter->hw;
- u32 gcr;
u32 gpie;
u32 vmdctl;
int i;
@@ -182,9 +214,7 @@ void ixgbe_disable_sriov(struct ixgbe_adapter *adapter)
#endif
/* turn off device IOV mode */
- gcr = IXGBE_READ_REG(hw, IXGBE_GCR_EXT);
- gcr &= ~(IXGBE_GCR_EXT_SRIOV);
- IXGBE_WRITE_REG(hw, IXGBE_GCR_EXT, gcr);
+ IXGBE_WRITE_REG(hw, IXGBE_GCR_EXT, 0);
gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
gpie &= ~IXGBE_GPIE_VTMODE_MASK;
IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
--
1.7.10.4
^ permalink raw reply related
* [net-next 8/9] ixgbe: Retire RSS enabled and capable flags
From: Jeff Kirsher @ 2012-07-18 20:31 UTC (permalink / raw)
To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1342643516-2696-1-git-send-email-jeffrey.t.kirsher@intel.com>
From: Alexander Duyck <alexander.h.duyck@intel.com>
All of our hardware supports RSS even if it is only for a single queue. So
instead of toting around the RSS enable flag I am updating the code so that
all devices are enabled and if we want to disable RSS it is indicated via
the RSS mask.
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 --
drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 4 ---
drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 10 +-------
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 29 ++++++----------------
4 files changed, 8 insertions(+), 37 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 67743aa..4ca10e6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -446,8 +446,6 @@ struct ixgbe_adapter {
#define IXGBE_FLAG_IMIR_ENABLED (u32)(1 << 12)
#define IXGBE_FLAG_MQ_CAPABLE (u32)(1 << 13)
#define IXGBE_FLAG_DCB_ENABLED (u32)(1 << 14)
-#define IXGBE_FLAG_RSS_ENABLED (u32)(1 << 16)
-#define IXGBE_FLAG_RSS_CAPABLE (u32)(1 << 17)
#define IXGBE_FLAG_VMDQ_CAPABLE (u32)(1 << 18)
#define IXGBE_FLAG_VMDQ_ENABLED (u32)(1 << 19)
#define IXGBE_FLAG_FAN_FAIL_CAPABLE (u32)(1 << 20)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 8e1be50..4104ea25 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -2245,10 +2245,6 @@ static int ixgbe_get_rss_hash_opts(struct ixgbe_adapter *adapter,
{
cmd->data = 0;
- /* if RSS is disabled then report no hashing */
- if (!(adapter->flags & IXGBE_FLAG_RSS_ENABLED))
- return 0;
-
/* Report default options for RSS on ixgbe */
switch (cmd->flow_type) {
case TCP_V4_FLOW:
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index 676e93f..38d1b65 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -265,9 +265,6 @@ static bool ixgbe_cache_ring_rss(struct ixgbe_adapter *adapter)
{
int i;
- if (!(adapter->flags & IXGBE_FLAG_RSS_ENABLED))
- return false;
-
for (i = 0; i < adapter->num_rx_queues; i++)
adapter->rx_ring[i]->reg_idx = i;
for (i = 0; i < adapter->num_tx_queues; i++)
@@ -602,11 +599,6 @@ static bool ixgbe_set_rss_queues(struct ixgbe_adapter *adapter)
struct ixgbe_ring_feature *f;
u16 rss_i;
- if (!(adapter->flags & IXGBE_FLAG_RSS_ENABLED)) {
- adapter->flags &= ~IXGBE_FLAG_FDIR_HASH_CAPABLE;
- return false;
- }
-
/* set mask for 16 queue limit of RSS */
f = &adapter->ring_feature[RING_F_RSS];
rss_i = f->limit;
@@ -1062,7 +1054,6 @@ static void ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
}
adapter->flags &= ~IXGBE_FLAG_DCB_ENABLED;
- adapter->flags &= ~IXGBE_FLAG_RSS_ENABLED;
if (adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE) {
e_err(probe,
"ATR is not supported while multiple "
@@ -1073,6 +1064,7 @@ static void ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)
ixgbe_disable_sriov(adapter);
+ adapter->ring_feature[RING_F_RSS].limit = 1;
ixgbe_set_num_queues(adapter);
adapter->num_q_vectors = 1;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 454e556..a3dc965 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2891,9 +2891,6 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter)
int i, j;
u16 rss_i = adapter->ring_feature[RING_F_RSS].indices;
- if (!(adapter->flags & IXGBE_FLAG_RSS_ENABLED))
- rss_i = 1;
-
/*
* Program table for at least 2 queues w/ SR-IOV so that VFs can
* make full use of any rings they may have. We will use the
@@ -2923,7 +2920,7 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter)
IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum);
if (adapter->hw.mac.type == ixgbe_mac_82598EB) {
- if (adapter->flags & IXGBE_FLAG_RSS_ENABLED)
+ if (adapter->ring_feature[RING_F_RSS].mask)
mrqc = IXGBE_MRQC_RSSEN;
} else {
u8 tcs = netdev_get_num_tc(adapter->netdev);
@@ -3102,6 +3099,7 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
static void ixgbe_setup_psrtype(struct ixgbe_adapter *adapter)
{
struct ixgbe_hw *hw = &adapter->hw;
+ int rss_i = adapter->ring_feature[RING_F_RSS].indices;
int p;
/* PSRTYPE must be initialized in non 82598 adapters */
@@ -3114,13 +3112,10 @@ static void ixgbe_setup_psrtype(struct ixgbe_adapter *adapter)
if (hw->mac.type == ixgbe_mac_82598EB)
return;
- if (adapter->flags & IXGBE_FLAG_RSS_ENABLED) {
- int rss_i = adapter->ring_feature[RING_F_RSS].indices;
- if (rss_i > 3)
- psrtype |= 2 << 29;
- else if (rss_i > 1)
- psrtype |= 1 << 29;
- }
+ if (rss_i > 3)
+ psrtype |= 2 << 29;
+ else if (rss_i > 1)
+ psrtype |= 1 << 29;
for (p = 0; p < adapter->num_rx_pools; p++)
IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(adapter->num_vfs + p),
@@ -4408,7 +4403,6 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
/* Set capability flags */
rss = min_t(int, IXGBE_MAX_RSS_INDICES, num_online_cpus());
adapter->ring_feature[RING_F_RSS].limit = rss;
- adapter->flags |= IXGBE_FLAG_RSS_ENABLED;
switch (hw->mac.type) {
case ixgbe_mac_82598EB:
if (hw->device_id == IXGBE_DEV_ID_82598AT)
@@ -6756,10 +6750,6 @@ static netdev_features_t ixgbe_fix_features(struct net_device *netdev,
{
struct ixgbe_adapter *adapter = netdev_priv(netdev);
- /* return error if RXHASH is being enabled when RSS is not supported */
- if (!(adapter->flags & IXGBE_FLAG_RSS_ENABLED))
- features &= ~NETIF_F_RXHASH;
-
/* If Rx checksum is disabled, then RSC/LRO should also be disabled */
if (!(features & NETIF_F_RXCSUM))
features &= ~NETIF_F_LRO;
@@ -6802,7 +6792,7 @@ static int ixgbe_set_features(struct net_device *netdev,
if (!(features & NETIF_F_NTUPLE)) {
if (adapter->flags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE) {
/* turn off Flow Director, set ATR and reset */
- if ((adapter->flags & IXGBE_FLAG_RSS_ENABLED) &&
+ if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) &&
!(adapter->flags & IXGBE_FLAG_DCB_ENABLED))
adapter->flags |= IXGBE_FLAG_FDIR_HASH_CAPABLE;
need_reset = true;
@@ -7294,11 +7284,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
if (err)
goto err_sw_init;
- if (!(adapter->flags & IXGBE_FLAG_RSS_ENABLED)) {
- netdev->hw_features &= ~NETIF_F_RXHASH;
- netdev->features &= ~NETIF_F_RXHASH;
- }
-
/* WOL not supported for all devices */
adapter->wol = 0;
hw->eeprom.ops.read(hw, 0x2c, &adapter->eeprom_cap);
--
1.7.10.4
^ permalink raw reply related
* [net-next 6/9] ixgbe: Update configure virtualization to allow for multiple PF pools
From: Jeff Kirsher @ 2012-07-18 20:31 UTC (permalink / raw)
To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1342643516-2696-1-git-send-email-jeffrey.t.kirsher@intel.com>
From: Alexander Duyck <alexander.h.duyck@intel.com>
This change allows all pools from the default pool forward to be enabled vi
ixgbe_configure_virtualization. This is needed as we are planning to use
queues belonging to adjacent pools for FCoE when SR-IOV and FCoE are both
enabled.
In addition this patch contains some minor formatting changes as there were
a few spots that seemed to be in need of some cleanup.
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Stephen Ko <stephen.s.ko@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 2b4b791..ea94fa2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -3130,28 +3130,28 @@ static void ixgbe_setup_psrtype(struct ixgbe_adapter *adapter)
static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)
{
struct ixgbe_hw *hw = &adapter->hw;
- u32 gcr_ext;
- u32 vt_reg_bits;
u32 reg_offset, vf_shift;
- u32 vmdctl;
+ u32 gcr_ext, vmdctl;
int i;
if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
return;
vmdctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL);
- vt_reg_bits = IXGBE_VMD_CTL_VMDQ_EN | IXGBE_VT_CTL_REPLEN;
- vt_reg_bits |= (adapter->num_vfs << IXGBE_VT_CTL_POOL_SHIFT);
- IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vmdctl | vt_reg_bits);
+ vmdctl |= IXGBE_VMD_CTL_VMDQ_EN;
+ vmdctl &= ~IXGBE_VT_CTL_POOL_MASK;
+ vmdctl |= (adapter->num_vfs << IXGBE_VT_CTL_POOL_SHIFT);
+ vmdctl |= IXGBE_VT_CTL_REPLEN;
+ IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vmdctl);
vf_shift = adapter->num_vfs % 32;
reg_offset = (adapter->num_vfs >= 32) ? 1 : 0;
/* Enable only the PF's pool for Tx/Rx */
- IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), (1 << vf_shift));
- IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset ^ 1), 0);
- IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), (1 << vf_shift));
- IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset ^ 1), 0);
+ IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), (~0) << vf_shift);
+ IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset ^ 1), reg_offset - 1);
+ IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), (~0) << vf_shift);
+ IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset ^ 1), reg_offset - 1);
IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, IXGBE_PFDTXGSWC_VT_LBEN);
/* Map PF MAC address in RAR Entry 0 to first pool following VFs */
@@ -3168,9 +3168,9 @@ static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)
/* enable Tx loopback for VF/PF communication */
IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, IXGBE_PFDTXGSWC_VT_LBEN);
+
/* Enable MAC Anti-Spoofing */
- hw->mac.ops.set_mac_anti_spoofing(hw,
- (adapter->num_vfs != 0),
+ hw->mac.ops.set_mac_anti_spoofing(hw, (adapter->num_vfs != 0),
adapter->num_vfs);
/* For VFs that have spoof checking turned off */
for (i = 0; i < adapter->num_vfs; i++) {
--
1.7.10.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox