* [PATCH net-next 1/4] net: tcp: refactor reinitialization of congestion control
From: Daniel Borkmann @ 2014-12-05 15:24 UTC (permalink / raw)
To: davem; +Cc: hannes, fw, netdev
In-Reply-To: <1417793092-6263-1-git-send-email-dborkman@redhat.com>
We can just move this to an extra function and make the code
a bit more readable, no functional change.
Joint work with Florian Westphal.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
net/ipv4/tcp_cong.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0d..38f2f8a 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -107,6 +107,18 @@ void tcp_init_congestion_control(struct sock *sk)
icsk->icsk_ca_ops->init(sk);
}
+static void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = ca;
+
+ if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
+}
+
/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
@@ -262,21 +274,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
#endif
if (!ca)
err = -ENOENT;
-
else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
err = -EPERM;
-
else if (!try_module_get(ca->owner))
err = -EBUSY;
-
- else {
- tcp_cleanup_congestion_control(sk);
- icsk->icsk_ca_ops = ca;
-
- if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
- icsk->icsk_ca_ops->init(sk);
- }
+ else
+ tcp_reinit_congestion_control(sk, ca);
out:
rcu_read_unlock();
return err;
--
1.7.11.7
^ permalink raw reply related
* [PATCH net-next 3/4] net: tcp: add RTAX_CC_ALGO fib handling
From: Daniel Borkmann @ 2014-12-05 15:24 UTC (permalink / raw)
To: davem; +Cc: hannes, fw, netdev
In-Reply-To: <1417793092-6263-1-git-send-email-dborkman@redhat.com>
This patch adds the minimum necessary for the RTAX_CC_ALGO congestion
control metric to be set up and dumped back to user space.
While the internal representation of RTAX_CC_ALGO is handled as a u32
key, we avoided to expose this implementation detail to user space, thus
instead, we chose the netlink attribute that is being exchanged between
user space to be the actual congestion control algorithm name, similarly
as in the setsockopt(2) API in order to allow for maximum flexibility,
even for 3rd party modules.
It is a bit unfortunate that RTAX_QUICKACK used up a whole RTAX slot as
it should have been stored in RTAX_FEATURES instead, we first thought
about reusing it for the congestion control key, but it brings more
complications and/or confusion than worth it. Trying to load a non
present congestion algorithm name will be rejected.
Joint work with Florian Westphal.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
include/net/tcp.h | 7 +++++++
include/uapi/linux/rtnetlink.h | 2 ++
net/core/rtnetlink.c | 15 +++++++++++++--
net/decnet/dn_fib.c | 3 ++-
net/decnet/dn_table.c | 3 ++-
net/ipv4/fib_semantics.c | 14 ++++++++++++--
net/ipv6/ip6_fib.c | 15 ++++++++++++++-
net/ipv6/route.c | 3 ++-
8 files changed, 54 insertions(+), 8 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 135b70c..95bb237 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -846,7 +846,14 @@ extern struct tcp_congestion_ops tcp_reno;
struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
u32 tcp_ca_get_key_by_name(const char *name);
+#ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
+#else
+static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+ return NULL;
+}
+#endif
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9c9b8b4..d81f22d 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -389,6 +389,8 @@ enum {
#define RTAX_INITRWND RTAX_INITRWND
RTAX_QUICKACK,
#define RTAX_QUICKACK RTAX_QUICKACK
+ RTAX_CC_ALGO,
+#define RTAX_CC_ALGO RTAX_CC_ALGO
__RTAX_MAX
};
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 61cb7e7..3566f41 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -50,6 +50,7 @@
#include <net/arp.h>
#include <net/route.h>
#include <net/udp.h>
+#include <net/tcp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/fib_rules.h>
@@ -669,9 +670,19 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
for (i = 0; i < RTAX_MAX; i++) {
if (metrics[i]) {
+ if (i == RTAX_CC_ALGO - 1) {
+ char tmp[TCP_CA_NAME_MAX], *name;
+
+ name = tcp_ca_get_name_by_key(metrics[i], tmp);
+ if (!name)
+ continue;
+ if (nla_put_string(skb, i + 1, name))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u32(skb, i + 1, metrics[i]))
+ goto nla_put_failure;
+ }
valid++;
- if (nla_put_u32(skb, i+1, metrics[i]))
- goto nla_put_failure;
}
}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index d332aef..df48034 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -298,7 +298,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *att
int type = nla_type(attr);
if (type) {
- if (type > RTAX_MAX || nla_len(attr) < 4)
+ if (type > RTAX_MAX || type == RTAX_CC_ALGO ||
+ nla_len(attr) < 4)
goto err_inval;
fi->fib_metrics[type-1] = nla_get_u32(attr);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 86e3807..75c20a9 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -273,7 +273,8 @@ static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(4) /* RTA_TABLE */
+ nla_total_size(2) /* RTA_DST */
- + nla_total_size(4); /* RTA_PRIORITY */
+ + nla_total_size(4) /* RTA_PRIORITY */
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f99f41b..d2b7b55 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
+ nla_total_size(4) /* RTA_TABLE */
+ nla_total_size(4) /* RTA_DST */
+ nla_total_size(4) /* RTA_PRIORITY */
- + nla_total_size(4); /* RTA_PREFSRC */
+ + nla_total_size(4) /* RTA_PREFSRC */
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
@@ -859,7 +860,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (type > RTAX_MAX)
goto err_inval;
- val = nla_get_u32(nla);
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp);
+ if (val == TCP_CA_UNSPEC)
+ goto err_inval;
+ } else {
+ val = nla_get_u32(nla);
+ }
if (type == RTAX_ADVMSS && val > 65535 - 40)
val = 65535 - 40;
if (type == RTAX_MTU && val > 65535 - 15)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b2d1838..0998ac6 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <net/ipv6.h>
+#include <net/tcp.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
@@ -650,10 +651,22 @@ static int fib6_commit_metrics(struct dst_entry *dst,
int type = nla_type(nla);
if (type) {
+ u32 val;
+
if (type > RTAX_MAX)
return -EINVAL;
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp);
+ if (val == TCP_CA_UNSPEC)
+ return -EINVAL;
+ } else {
+ val = nla_get_u32(nla);
+ }
- mp[type - 1] = nla_get_u32(nla);
+ mp[type - 1] = val;
}
}
return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c910831..818c99a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2534,7 +2534,8 @@ static inline size_t rt6_nlmsg_size(void)
+ nla_total_size(4) /* RTA_OIF */
+ nla_total_size(4) /* RTA_PRIORITY */
+ RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
- + nla_total_size(sizeof(struct rta_cacheinfo));
+ + nla_total_size(sizeof(struct rta_cacheinfo))
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
}
static int rt6_fill_node(struct net *net,
--
1.7.11.7
^ permalink raw reply related
* [PATCH net-next 2/4] net: tcp: add key management to congestion control
From: Daniel Borkmann @ 2014-12-05 15:24 UTC (permalink / raw)
To: davem; +Cc: hannes, fw, netdev
In-Reply-To: <1417793092-6263-1-git-send-email-dborkman@redhat.com>
For a per-route congestion control possibility, our aim is to store
a unique u32 key identifier into dst metrics, which can then be
mapped into a tcp_congestion_ops struct. We argue that having a
RTAX key entry is the most simple, generic and easy way to manage,
and also keeps the memory footprint of dst entries lower on 64 bit
than with storing a pointer directly, for example. Having a unique
key id also allows for decoupling actual TCP congestion control
module management from the FIB layer, i.e. we don't have to care
about expensive module refcounting inside the FIB at this point.
We first thought of using an IDR store for the realization, which
takes over dynamic assignment of unused key space and also performs
the key to pointer mapping in RCU. While doing so, we stumbled upon
the issue that due to the nature of dynamic key distribution, it
just so happens, arguably in very rare occasions, that excessive
module loads and unloads can lead to a possible reuse of previously
used key space. Thus, previously stale keys in the dst metric are
now being reassigned to a different congestion control algorithm,
which might lead to unexpected behaviour. One way to resolve this
would have been to walk FIBs on the actually rare occasion of a
module unload and reset the metric keys for each FIB in each netns,
but that's just very costly.
Therefore, we argue a better solution is to reuse the unique
congestion control algorithm name member and map that into u32 key
space through jhash. For that, we split the flags attribute (as it
currently uses 2 bits only anyway) into two u32 attributes, flags
and key, so that we can keep the cacheline boundary of 2 cachelines
on x86_64 and cache the precalculated key at registration time for
the fast path. On average we might expect 2 - 4 modules being loaded
worst case perhaps 15, so a key collision possibility is extremely
low, and guaranteed collision-free on LE/BE for all in-tree modules.
Overall this results in much simpler code, and all without the
overhead of an IDR. Due to the deterministic nature, modules can
now be unloaded, the congestion control algorithm for a specific
but unloaded key will fall back to the default one, and on module
reload time it will switch back to the expected algorithm
transparently.
Joint work with Florian Westphal.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
include/net/inet_connection_sock.h | 3 +-
include/net/tcp.h | 9 +++++-
net/ipv4/tcp_cong.c | 63 ++++++++++++++++++++++++++++++++++++--
3 files changed, 71 insertions(+), 4 deletions(-)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 848e85c..5976bde 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -98,7 +98,8 @@ struct inet_connection_sock {
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
- __u8 icsk_ca_state;
+ __u8 icsk_ca_state:7,
+ icsk_ca_dst_locked:1;
__u8 icsk_retransmits;
__u8 icsk_pending;
__u8 icsk_backoff;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf..135b70c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -787,6 +787,8 @@ enum tcp_ca_ack_event_flags {
#define TCP_CA_MAX 128
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
+#define TCP_CA_UNSPEC 0
+
/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
@@ -794,7 +796,8 @@ enum tcp_ca_ack_event_flags {
struct tcp_congestion_ops {
struct list_head list;
- unsigned long flags;
+ u32 key;
+ u32 flags;
/* initialize private data (optional) */
void (*init)(struct sock *sk);
@@ -841,6 +844,10 @@ u32 tcp_reno_ssthresh(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
+u32 tcp_ca_get_key_by_name(const char *name);
+char *tcp_ca_get_name_by_key(u32 key, char *buffer);
+
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 38f2f8a..8fd348c 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -13,6 +13,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
+#include <linux/jhash.h>
#include <net/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -31,6 +32,19 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
return NULL;
}
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+ struct tcp_congestion_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+ if (e->key == key)
+ return e;
+ }
+
+ return NULL;
+}
+
/*
* Attach new congestion control algorithm to the list
* of available options.
@@ -45,9 +59,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
return -EINVAL;
}
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
spin_lock(&tcp_cong_list_lock);
- if (tcp_ca_find(ca->name)) {
- pr_notice("%s already registered\n", ca->name);
+ if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+ pr_notice("%s already registered or non-unique key\n",
+ ca->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
@@ -70,9 +87,48 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
spin_lock(&tcp_cong_list_lock);
list_del_rcu(&ca->list);
spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module gets removed entirely.
+ *
+ * A try_module_get() should fail by now as our module is
+ * in "going" state since no refs are held anymore and
+ * module_exit() handler being called.
+ */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+u32 tcp_ca_get_key_by_name(const char *name)
+{
+ const struct tcp_congestion_ops *ca;
+ u32 key;
+
+ rcu_read_lock();
+ ca = tcp_ca_find(name);
+ key = ca ? ca->key : TCP_CA_UNSPEC;
+ rcu_read_unlock();
+
+ return key;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+ const struct tcp_congestion_ops *ca;
+ char *ret = NULL;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(key);
+ if (ca)
+ ret = strncpy(buffer, ca->name,
+ TCP_CA_NAME_MAX);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
+
/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
@@ -256,6 +312,9 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
struct tcp_congestion_ops *ca;
int err = 0;
+ if (icsk->icsk_ca_dst_locked)
+ return -EPERM;
+
rcu_read_lock();
ca = tcp_ca_find(name);
--
1.7.11.7
^ permalink raw reply related
* [PATCH net-next 0/4] net: allow setting congctl via routing table
From: Daniel Borkmann @ 2014-12-05 15:24 UTC (permalink / raw)
To: davem; +Cc: hannes, fw, netdev
This is the second part of our work and allows for setting the congestion
control algorithm via routing table. For details, please see individual
patches.
Joint work with Florian Westphal, suggested by Hannes Frederic Sowa.
Thanks!
Daniel Borkmann (4):
net: tcp: refactor reinitialization of congestion control
net: tcp: add key management to congestion control
net: tcp: add RTAX_CC_ALGO fib handling
net: tcp: add per route congestion control
include/net/inet_connection_sock.h | 3 +-
include/net/tcp.h | 22 +++++++++-
include/uapi/linux/rtnetlink.h | 2 +
net/core/rtnetlink.c | 15 ++++++-
net/decnet/dn_fib.c | 3 +-
net/decnet/dn_table.c | 3 +-
net/ipv4/fib_semantics.c | 14 +++++-
net/ipv4/tcp_cong.c | 87 ++++++++++++++++++++++++++++++++------
net/ipv4/tcp_ipv4.c | 2 +
net/ipv4/tcp_minisocks.c | 30 +++++++++++--
net/ipv4/tcp_output.c | 21 +++++++++
net/ipv6/ip6_fib.c | 15 ++++++-
net/ipv6/route.c | 3 +-
net/ipv6/tcp_ipv6.c | 2 +
14 files changed, 196 insertions(+), 26 deletions(-)
--
1.7.11.7
^ permalink raw reply
* [PATCH net-next 4/4] net: tcp: add per route congestion control
From: Daniel Borkmann @ 2014-12-05 15:24 UTC (permalink / raw)
To: davem; +Cc: hannes, fw, netdev
In-Reply-To: <1417793092-6263-1-git-send-email-dborkman@redhat.com>
This work adds the possibility to define a per route/destination congestion
control algorithm. Generally, this opens up the possibility for a machine
with different links to enforce specific congestion control algorithms with
optimal strategies for each of them based on their network characteristics,
even transparently for a single application listening on all links.
For our specific use case, this additionally facilitates deployment of DCTCP,
for example, applications can easily serve internal traffic/dsts in DCTCP and
external one with CUBIC. Other scenarios would also allow for utilizing e.g.
long living, low priority background flows for certain destinations/routes
while still being able for normal traffic to utilize the default congestion
control algorithm. We also thought about a per netns setting (where different
defaults are possible), but given its actually a link specific property, we
argue that a per route/destination setting is the most natural and flexible.
The administrator can utilize this through ip-route(8) by appending "congctl
[lock] <name>", where <name> denotes the name of a loaded congestion control
algorithm and the optional lock parameter allows to enforce the given algorithm
so that applications in user space would not be allowed to overwrite that
algorithm for that destination.
The dst metric lookups are being done when a dst entry is already available
in order to avoid a costly lookup and still before the algorithms are being
initialized, thus overhead is very low when the feature is not being used.
While the client side would need to drop the current reference on the module,
on server side this can actually even be avoided as we just got a flat-copied
socket clone.
In case a dst metric contains a key to a congestion control module that has
just been removed from the kernel, it will fallback to the default congestion
control discipline for those and not reassign anything. Should at a later
point in time that module be reinserted into the kernel, then this algorithm
will be used again as keys are deterministic/static.
Joint work with Florian Westphal.
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
include/net/tcp.h | 6 ++++++
net/ipv4/tcp_ipv4.c | 2 ++
net/ipv4/tcp_minisocks.c | 30 ++++++++++++++++++++++++++----
net/ipv4/tcp_output.c | 21 +++++++++++++++++++++
net/ipv6/tcp_ipv6.c | 2 ++
5 files changed, 57 insertions(+), 4 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 95bb237..b8fdc6b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -448,6 +448,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(struct sock *sk,
struct request_sock *req,
struct sk_buff *skb);
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst);
@@ -636,6 +637,11 @@ static inline u32 tcp_rto_min_us(struct sock *sk)
return jiffies_to_usecs(tcp_rto_min(sk));
}
+static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
+{
+ return dst_metric_locked(dst, RTAX_CC_ALGO);
+}
+
/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 33f5ff0..ed94fdc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1340,6 +1340,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
}
sk_setup_caps(newsk, dst);
+ tcp_ca_openreq_child(newsk, dst);
+
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric_advmss(dst);
if (tcp_sk(sk)->rx_opt.user_mss &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d2680..bc9216d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -399,6 +399,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp,
tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+ bool ca_got_dst = false;
+
+ if (ca_key != TCP_CA_UNSPEC) {
+ const struct tcp_congestion_ops *ca;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && try_module_get(ca->owner))) {
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ ca_got_dst = true;
+ }
+ rcu_read_unlock();
+ }
+
+ if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
+ tcp_assign_congestion_control(sk);
+
+ tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -451,10 +477,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
- if (!try_module_get(newicsk->icsk_ca_ops->owner))
- tcp_assign_congestion_control(newsk);
-
- tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
__skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f5bd4bd..5eb5676 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2916,6 +2916,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
}
EXPORT_SYMBOL(tcp_make_synack);
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_congestion_ops *ca;
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+
+ if (ca_key == TCP_CA_UNSPEC)
+ return;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && try_module_get(ca->owner))) {
+ module_put(icsk->icsk_ca_ops->owner);
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ }
+ rcu_read_unlock();
+}
+
/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
{
@@ -2941,6 +2960,8 @@ static void tcp_connect_init(struct sock *sk)
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
+ tcp_ca_dst_init(sk, dst);
+
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
tp->advmss = dst_metric_advmss(dst);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d06af89..20f44fe 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1199,6 +1199,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
newnp->opt->opt_flen);
+ tcp_ca_openreq_child(newsk, dst);
+
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric_advmss(dst);
if (tcp_sk(sk)->rx_opt.user_mss &&
--
1.7.11.7
^ permalink raw reply related
* [PATCH iproute2 -next] ip: route: add congestion control setting
From: Daniel Borkmann @ 2014-12-05 15:28 UTC (permalink / raw)
To: stephen; +Cc: hannes, fw, netdev
This patch adds configuration and dumping of congestion control metric
for ip route, f.e.: ip route add <dst> dev <dev> congctl [lock] <name>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
Stephen, this patch is already rebased on top of Florian's
ECN patch [1]. Thanks!
[1] http://patchwork.ozlabs.org/patch/407729/
include/linux/rtnetlink.h | 2 ++
ip/iproute.c | 24 +++++++++++++++++++++---
man/man8/ip-route.8.in | 25 ++++++++++++++++++++++++-
3 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index ae23d94..0c68a1a 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -390,6 +390,8 @@ enum {
#define RTAX_INITRWND RTAX_INITRWND
RTAX_QUICKACK,
#define RTAX_QUICKACK RTAX_QUICKACK
+ RTAX_CC_ALGO,
+#define RTAX_CC_ALGO RTAX_CC_ALGO
__RTAX_MAX
};
diff --git a/ip/iproute.c b/ip/iproute.c
index 5a496a9..18f7de7 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -53,6 +53,7 @@ static const char *mx_names[RTAX_MAX+1] = {
[RTAX_RTO_MIN] = "rto_min",
[RTAX_INITRWND] = "initrwnd",
[RTAX_QUICKACK] = "quickack",
+ [RTAX_CC_ALGO] = "congctl",
};
static void usage(void) __attribute__((noreturn));
@@ -80,8 +81,7 @@ static void usage(void)
fprintf(stderr, " [ window NUMBER] [ cwnd NUMBER ] [ initcwnd NUMBER ]\n");
fprintf(stderr, " [ ssthresh NUMBER ] [ realms REALM ] [ src ADDRESS ]\n");
fprintf(stderr, " [ rto_min TIME ] [ hoplimit NUMBER ] [ initrwnd NUMBER ]\n");
- fprintf(stderr, " [ features FEATURES ]\n");
- fprintf(stderr, " [ quickack BOOL ]\n");
+ fprintf(stderr, " [ features FEATURES ] [ quickack BOOL ] [ congctl NAME ]\n");
fprintf(stderr, "TYPE := [ unicast | local | broadcast | multicast | throw |\n");
fprintf(stderr, " unreachable | prohibit | blackhole | nat ]\n");
fprintf(stderr, "TABLE_ID := [ local | main | default | all | NUMBER ]\n");
@@ -545,10 +545,12 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
fprintf(fp, " %s", mx_names[i]);
else
fprintf(fp, " metric %d", i);
+
if (mxlock & (1<<i))
fprintf(fp, " lock");
+ if (i != RTAX_CC_ALGO)
+ val = *(unsigned*)RTA_DATA(mxrta[i]);
- val = *(unsigned*)RTA_DATA(mxrta[i]);
switch (i) {
case RTAX_FEATURES:
print_rtax_features(fp, val);
@@ -573,6 +575,10 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
fprintf(fp, " %gs", val/1e3);
else
fprintf(fp, " %ums", val);
+ break;
+ case RTAX_CC_ALGO:
+ fprintf(fp, " %s", (char *)RTA_DATA(mxrta[i]));
+ break;
}
}
}
@@ -925,6 +931,18 @@ static int iproute_modify(int cmd, unsigned flags, int argc, char **argv)
if (quickack != 1 && quickack != 0)
invarg("\"quickack\" value should be 0 or 1\n", *argv);
rta_addattr32(mxrta, sizeof(mxbuf), RTAX_QUICKACK, quickack);
+ } else if (matches(*argv, "congctl") == 0) {
+ char cc[16];
+ NEXT_ARG();
+ memset(cc, 0, sizeof(cc));
+ if (strcmp(*argv, "lock") == 0) {
+ mxlock |= (1<<RTAX_CC_ALGO);
+ NEXT_ARG();
+ }
+ strncpy(cc, *argv, sizeof(cc) - 1);
+ if (strlen(cc) == 0)
+ invarg("\"conctl\" value must be a algorithm name\n", *argv);
+ rta_addattr_l(mxrta, sizeof(mxbuf), RTAX_CC_ALGO, cc, strlen(cc));
} else if (matches(*argv, "rttvar") == 0) {
unsigned win;
NEXT_ARG();
diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in
index 89960c1..6e4b94c 100644
--- a/man/man8/ip-route.8.in
+++ b/man/man8/ip-route.8.in
@@ -116,7 +116,9 @@ replace " } "
.B features
.IR FEATURES " ] [ "
.B quickack
-.IR BOOL " ]"
+.IR BOOL " ] [ "
+.B congctl
+.IR NAME " ]"
.ti -8
.IR TYPE " := [ "
@@ -433,6 +435,27 @@ sysctl is set to 0.
Enable or disable quick ack for connections to this destination.
.TP
+.BI congctl " NAME " "(3.19+ only)"
+.TP
+.BI "congctl lock" " NAME " "(3.19+ only)"
+Sets a specific TCP congestion control algorithm only for a given destination.
+If not specified, Linux keeps the current global default TCP congestion control
+algorithm, or the one set from the application. If the modifier
+.B lock
+is not used, an application may nevertheless overwrite the suggested congestion
+control algorithm for that destination. If the modifier
+.B lock
+is used, then an application is not allowed to overwrite the specified congestion
+control algorithm for that destination, thus it will be enforced/guaranteed to
+use the proposed algorithm. Should a congestion control module be unloaded, the
+specified congestion control algorithm will fall back to the current global
+default on connection establishment. In case the same congestion control module
+will be reloaded at a later point in time into the kernel and the congctl route
+attribute has not been modified until then, new connections for that destination
+will make use of it again. Note that the kernel will not try to autoload non-present
+congestion control modules.
+
+.TP
.BI advmss " NUMBER " "(2.3.15+ only)"
the MSS ('Maximal Segment Size') to advertise to these
destinations when establishing TCP connections. If it is not given,
--
1.9.3
^ permalink raw reply related
* Re: [patch net-next 1/2] net: sched: cls_basic: fix error path in basic_change()
From: John Fastabend @ 2014-12-05 15:29 UTC (permalink / raw)
To: Jiri Pirko; +Cc: netdev, davem, jhs
In-Reply-To: <1417791023-28124-1-git-send-email-jiri@resnulli.us>
On 12/05/2014 06:50 AM, Jiri Pirko wrote:
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> ---
> net/sched/cls_basic.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
> index 7cf0a62..5aed341 100644
> --- a/net/sched/cls_basic.c
> +++ b/net/sched/cls_basic.c
> @@ -178,10 +178,9 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
> return -EINVAL;
> }
>
> - err = -ENOBUFS;
> fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
> - if (fnew == NULL)
> - goto errout;
> + if (!fnew)
> + return -ENOBUFS;
>
> tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
> err = -EINVAL;
>
Nice catch, thanks!
Reviewed-by: John Fastabend <john.r.fastabend@intel.com>
--
John Fastabend Intel Corporation
^ permalink raw reply
* Re: [PATCH net-next] tcp: refine TSO autosizing
From: Neal Cardwell @ 2014-12-05 15:32 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, Yuchung Cheng, Nandita Dukkipati
In-Reply-To: <1417788937.4322.21.camel@edumazet-glaptop2.roam.corp.google.com>
On Fri, Dec 5, 2014 at 9:15 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> Commit 95bd09eb2750 ("tcp: TSO packets automatic sizing") tried to
> control TSO size, but did this at the wrong place (sendmsg() time)
>
> At sendmsg() time, we might have a pessimistic view of flow rate,
> and we end up building very small skbs (with 2 MSS per skb).
>
> This is bad because :
>
> - It sends small TSO packets even in Slow Start where rate quickly
> increases.
> - It tends to make socket write queue very big, increasing tcp_ack()
> processing time, but also increasing memory needs, not necessarily
> accounted for, as fast clones overhead is currently ignored.
> - Lower GRO efficiency and more ACK packets.
>
> Servers with a lot of small lived connections suffer from this.
>
> Lets instead fill skbs as much as possible (64KB of payload), but split
> them at xmit time, when we have a precise idea of the flow rate.
> skb split is actually quite efficient.
Nice. I definitely agree this is the right direction.
However, from my experience testing a variant of this approach, this
kind of late decision about packet size was sometimes causing
performance shortfalls on long-RTT, medium-bandwidth paths unless
tcp_tso_should_defer() was also modified to use the new/smaller packet
size goal.
The issue is that tcp_tso_should_defer() uses tp->xmit_size_goal_segs
as a yardstick, and says, "hey, if cwnd and rwin allow us to send
tp->xmit_size_goal_segs * tp->mss_cache then let's go ahead and send
it now."
But if we remove the sendmsg-time autosizing logic that was tuning
tp->xmit_size_goal_segs, then tcp_tso_should_defer() is now going to
be waiting to try to accumulate permission to send a big skb with
tp->xmit_size_goal_segs (e.g. ~40) MSS in it.
In my tests I was able to fix this issue by making
tcp_tso_should_defer() use the latest size goal instead of
tp->xmit_size_goal_segs.
So, how about making the rate-based TSO autosizing goal (stored in
"segs" in this patch) at the top of tcp_write_xmit()? Then we could
pass in that segment goal to tcp_tso_should_defer() for use instead of
tp->xmit_size_goal_segs in deciding whether we have a big enough chunk
to send now. Similarly, that segment goal could be passed in to
tcp_mss_split_point instead of sk->sk_gso_max_segs.
(The autosizing calculation could be in a helper function to keep
tcp_write_xmit() manageable.)
neal
^ permalink raw reply
* Re: [patch net-next 1/2] net: sched: cls_basic: fix error path in basic_change()
From: Eric Dumazet @ 2014-12-05 16:03 UTC (permalink / raw)
To: John Fastabend; +Cc: Jiri Pirko, netdev, davem, jhs
In-Reply-To: <5481CF4C.2060607@gmail.com>
On Fri, 2014-12-05 at 07:29 -0800, John Fastabend wrote:
> On 12/05/2014 06:50 AM, Jiri Pirko wrote:
> > Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> > ---
> > net/sched/cls_basic.c | 5 ++---
> > 1 file changed, 2 insertions(+), 3 deletions(-)
> >
> > diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
> > index 7cf0a62..5aed341 100644
> > --- a/net/sched/cls_basic.c
> > +++ b/net/sched/cls_basic.c
> > @@ -178,10 +178,9 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
> > return -EINVAL;
> > }
> >
> > - err = -ENOBUFS;
> > fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
> > - if (fnew == NULL)
> > - goto errout;
> > + if (!fnew)
> > + return -ENOBUFS;
> >
> > tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
> > err = -EINVAL;
> >
>
> Nice catch, thanks!
>
> Reviewed-by: John Fastabend <john.r.fastabend@intel.com>
Sorry, but this looks a cosmetic change, right ?
If it is a fix, we'd like a 'Fixes: ...' tag.
^ permalink raw reply
* [PATCH net-next] macvlan: allow setting LRO independently of lower device
From: Michal Kubecek @ 2014-12-05 16:05 UTC (permalink / raw)
To: Patrick McHardy; +Cc: netdev, linux-kernel
Since commit fbe168ba91f7 ("net: generic dev_disable_lro() stacked
device handling"), dev_disable_lro() zeroes NETIF_F_LRO feature flag
first for a macvlan device and then for its lower device. As an attempt
to set NETIF_F_LRO to zero is ignored, dev_disable_lro() issues a
warning and taints kernel.
Allowing NETIF_F_LRO to be set independently of the lower device
consists of three parts:
- add the flag to hw_features to allow toggling it
- allow setting it to 0 even if lower device has the flag set
- add the flag to MACVLAN_FEATURES to restore copying from lower
device on macvlan creation
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
drivers/net/macvlan.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 9538674..10604db 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -747,7 +747,7 @@ static struct lock_class_key macvlan_netdev_addr_lock_key;
#define MACVLAN_FEATURES \
(NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
- NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | \
+ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_LRO | \
NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \
NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)
@@ -784,6 +784,7 @@ static int macvlan_init(struct net_device *dev)
(lowerdev->state & MACVLAN_STATE_MASK);
dev->features = lowerdev->features & MACVLAN_FEATURES;
dev->features |= ALWAYS_ON_FEATURES;
+ dev->hw_features |= NETIF_F_LRO;
dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES;
dev->gso_max_size = lowerdev->gso_max_size;
dev->iflink = lowerdev->ifindex;
@@ -936,15 +937,15 @@ static netdev_features_t macvlan_fix_features(struct net_device *dev,
netdev_features_t features)
{
struct macvlan_dev *vlan = netdev_priv(dev);
+ netdev_features_t lowerdev_features = vlan->lowerdev->features;
netdev_features_t mask;
features |= NETIF_F_ALL_FOR_ALL;
features &= (vlan->set_features | ~MACVLAN_FEATURES);
mask = features;
- features = netdev_increment_features(vlan->lowerdev->features,
- features,
- mask);
+ lowerdev_features &= (features | ~NETIF_F_LRO);
+ features = netdev_increment_features(lowerdev_features, features, mask);
features |= ALWAYS_ON_FEATURES;
features &= ~NETIF_F_NETNS_LOCAL;
--
1.8.4.5
^ permalink raw reply related
* [PATCH net-next] openvswitch: set correct protocol on route lookup
From: Jiri Benc @ 2014-12-05 16:24 UTC (permalink / raw)
To: netdev; +Cc: dev, Pravin Shelar, Wenyu Zhang
Respect what the caller passed to ovs_tunnel_get_egress_info.
Fixes: 8f0aad6f35f7e ("openvswitch: Extend packet attribute for egress tunnel info")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
---
net/openvswitch/vport.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index e771a46933e5..9584526c0778 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -600,7 +600,7 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
fl.saddr = tun_key->ipv4_src;
fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos);
fl.flowi4_mark = skb_mark;
- fl.flowi4_proto = IPPROTO_GRE;
+ fl.flowi4_proto = ipproto;
rt = ip_route_output_key(net, &fl);
if (IS_ERR(rt))
--
1.8.3.1
^ permalink raw reply related
* Re: [patch net-next 1/2] net: sched: cls_basic: fix error path in basic_change()
From: John Fastabend @ 2014-12-05 16:34 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Jiri Pirko, netdev, davem, jhs
In-Reply-To: <1417795431.15618.6.camel@edumazet-glaptop2.roam.corp.google.com>
On 12/05/2014 08:03 AM, Eric Dumazet wrote:
> On Fri, 2014-12-05 at 07:29 -0800, John Fastabend wrote:
>> On 12/05/2014 06:50 AM, Jiri Pirko wrote:
>>> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>>> ---
>>> net/sched/cls_basic.c | 5 ++---
>>> 1 file changed, 2 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
>>> index 7cf0a62..5aed341 100644
>>> --- a/net/sched/cls_basic.c
>>> +++ b/net/sched/cls_basic.c
>>> @@ -178,10 +178,9 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
>>> return -EINVAL;
>>> }
>>>
>>> - err = -ENOBUFS;
>>> fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
>>> - if (fnew == NULL)
>>> - goto errout;
>>> + if (!fnew)
>>> + return -ENOBUFS;
>>>
>>> tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
>>> err = -EINVAL;
>>>
>>
>> Nice catch, thanks!
>>
>> Reviewed-by: John Fastabend <john.r.fastabend@intel.com>
>
> Sorry, but this looks a cosmetic change, right ?
>
> If it is a fix, we'd like a 'Fixes: ...' tag.
>
>
>
oops, you are right. fnew is null, free'ing the null value
is no issue at all from the error path. And if we are going
to start converting the use of
if (foo == NULL)
to
if (!foo)
in ./net/sched that is OK but not really worth the noise in
my opinion there are a lot of these in the code for quiet
sometime.
Sorry for the noise. I misread it as fixing a free on some
non-null value, which is not the case.
.John
--
John Fastabend Intel Corporation
^ permalink raw reply
* Re: [PATCH net-next 0/4] net: allow setting congctl via routing table
From: Dave Taht @ 2014-12-05 16:35 UTC (permalink / raw)
To: Daniel Borkmann
Cc: davem@davemloft.net, Hannes Frederic Sowa, Florian Westphal,
netdev@vger.kernel.org
In-Reply-To: <1417793092-6263-1-git-send-email-dborkman@redhat.com>
On Fri, Dec 5, 2014 at 7:24 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> This is the second part of our work and allows for setting the congestion
> control algorithm via routing table. For details, please see individual
> patches.
>
> Joint work with Florian Westphal, suggested by Hannes Frederic Sowa.
>
> Thanks!
>
> Daniel Borkmann (4):
> net: tcp: refactor reinitialization of congestion control
> net: tcp: add key management to congestion control
> net: tcp: add RTAX_CC_ALGO fib handling
> net: tcp: add per route congestion control
Very interesting. Have you tried something other than dctcp here
(e.g. westwood or lp?)
Have you considered the case where the route changes underneath
you from one device to another?
Example, here I am routing everything through eth0, where I
would want cubic, probably...
root@ganesha:~/git/tinc# ip route
default via 172.26.16.1 dev eth0 proto babel onlink
69.181.216.0/22 via 172.26.16.1 dev eth0 proto babel onlink
169.254.0.0/16 dev eth0 scope link metric 1000
172.26.16.0/24 dev eth0 proto kernel scope link src 172.26.16.177
172.26.16.1 via 172.26.16.1 dev eth0 proto babel onlink
172.26.16.112 via 172.26.16.112 dev eth0 proto babel onlink
172.26.17.0/24 via 172.26.16.1 dev eth0 proto babel onlink
172.26.17.3 via 172.26.16.1 dev eth0 proto babel onlink
172.26.17.227 via 172.26.16.1 dev eth0 proto babel onlink
192.168.7.0/30 dev eth1 proto kernel scope link src 192.168.7.1 metric 1
192.168.7.2 via 172.26.16.112 dev eth0 proto babel onlink
And I pull the plug, and everything flips over to wlan0,
where I might want westwood (or something saner than
that. It might be nice to have a per-device cc default
algorithm...)
root@ganesha:~/git/tinc# ip route
default via 172.26.17.224 dev wlan0 proto babel onlink
69.181.216.0/22 via 172.26.17.224 dev wlan0 proto babel onlink
169.254.0.0/16 dev eth0 scope link metric 1000
172.26.16.0/24 dev eth0 proto kernel scope link src 172.26.16.177
172.26.16.1 via 172.26.17.227 dev wlan0 proto babel onlink
172.26.16.112 via 172.26.17.227 dev wlan0 proto babel onlink
172.26.17.0/24 via 172.26.17.224 dev wlan0 proto babel onlink
172.26.17.3 via 172.26.17.227 dev wlan0 proto babel onlink
172.26.17.227 via 172.26.17.227 dev wlan0 proto babel onlink
192.168.7.0/30 dev eth1 proto kernel scope link src 192.168.7.1 metric 1
192.168.7.2 via 172.26.17.227 dev wlan0 proto babel onlink
--
Dave Täht
the quest for a fq-friendly vpn with no head of line blocking continues!
https://plus.google.com/u/0/107942175615993706558/posts/QWPWLoGMtrm
^ permalink raw reply
* Re: [PATCH net-next] tcp: refine TSO autosizing
From: Eric Dumazet @ 2014-12-05 17:06 UTC (permalink / raw)
To: Neal Cardwell; +Cc: David Miller, netdev, Yuchung Cheng, Nandita Dukkipati
In-Reply-To: <CADVnQynB32C8vWqX-Tem-GpHuSs+AsG6s8_M1Og=ru1cFVNBcw@mail.gmail.com>
On Fri, 2014-12-05 at 10:32 -0500, Neal Cardwell wrote:
> On Fri, Dec 5, 2014 at 9:15 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > From: Eric Dumazet <edumazet@google.com>
> >
> > Commit 95bd09eb2750 ("tcp: TSO packets automatic sizing") tried to
> > control TSO size, but did this at the wrong place (sendmsg() time)
> >
> > At sendmsg() time, we might have a pessimistic view of flow rate,
> > and we end up building very small skbs (with 2 MSS per skb).
> >
> > This is bad because :
> >
> > - It sends small TSO packets even in Slow Start where rate quickly
> > increases.
> > - It tends to make socket write queue very big, increasing tcp_ack()
> > processing time, but also increasing memory needs, not necessarily
> > accounted for, as fast clones overhead is currently ignored.
> > - Lower GRO efficiency and more ACK packets.
> >
> > Servers with a lot of small lived connections suffer from this.
> >
> > Lets instead fill skbs as much as possible (64KB of payload), but split
> > them at xmit time, when we have a precise idea of the flow rate.
> > skb split is actually quite efficient.
>
> Nice. I definitely agree this is the right direction.
>
> However, from my experience testing a variant of this approach, this
> kind of late decision about packet size was sometimes causing
> performance shortfalls on long-RTT, medium-bandwidth paths unless
> tcp_tso_should_defer() was also modified to use the new/smaller packet
> size goal.
>
> The issue is that tcp_tso_should_defer() uses tp->xmit_size_goal_segs
> as a yardstick, and says, "hey, if cwnd and rwin allow us to send
> tp->xmit_size_goal_segs * tp->mss_cache then let's go ahead and send
> it now."
>
> But if we remove the sendmsg-time autosizing logic that was tuning
> tp->xmit_size_goal_segs, then tcp_tso_should_defer() is now going to
> be waiting to try to accumulate permission to send a big skb with
> tp->xmit_size_goal_segs (e.g. ~40) MSS in it.
>
> In my tests I was able to fix this issue by making
> tcp_tso_should_defer() use the latest size goal instead of
> tp->xmit_size_goal_segs.
>
> So, how about making the rate-based TSO autosizing goal (stored in
> "segs" in this patch) at the top of tcp_write_xmit()? Then we could
> pass in that segment goal to tcp_tso_should_defer() for use instead of
> tp->xmit_size_goal_segs in deciding whether we have a big enough chunk
> to send now. Similarly, that segment goal could be passed in to
> tcp_mss_split_point instead of sk->sk_gso_max_segs.
>
> (The autosizing calculation could be in a helper function to keep
> tcp_write_xmit() manageable.)
>
Sounds an awesome suggestion indeed, I am working on it.
Thanks Neal !
^ permalink raw reply
* Re: [patch net-next 1/2] net: sched: cls_basic: fix error path in basic_change()
From: Sergei Shtylyov @ 2014-12-05 17:07 UTC (permalink / raw)
To: Jiri Pirko, netdev; +Cc: davem, jhs
In-Reply-To: <1417791023-28124-1-git-send-email-jiri@resnulli.us>
Hello.
On 12/05/2014 05:50 PM, Jiri Pirko wrote:
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Care to explain what's wrong?
> ---
> net/sched/cls_basic.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
> diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
> index 7cf0a62..5aed341 100644
> --- a/net/sched/cls_basic.c
> +++ b/net/sched/cls_basic.c
> @@ -178,10 +178,9 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
> return -EINVAL;
> }
>
> - err = -ENOBUFS;
> fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
> - if (fnew == NULL)
> - goto errout;
> + if (!fnew)
> + return -ENOBUFS;
>
> tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
> err = -EINVAL;
WBR, Sergei
^ permalink raw reply
* [3.16.y-ckt stable] Patch "net/ping: handle protocol mismatching scenario" has been added to staging queue
From: Luis Henriques @ 2014-12-05 17:09 UTC (permalink / raw)
To: Jane Zhou
Cc: David S. Miller, Alexey Kuznetsov, James Morris,
Hideaki YOSHIFUJI, Patrick McHardy, netdev, Yiwei Zhao,
Luis Henriques, kernel-team
This is a note to let you know that I have just added a patch titled
net/ping: handle protocol mismatching scenario
to the linux-3.16.y-queue branch of the 3.16.y-ckt extended stable tree
which can be found at:
http://kernel.ubuntu.com/git?p=ubuntu/linux.git;a=shortlog;h=refs/heads/linux-3.16.y-queue
This patch is scheduled to be released in version 3.16.7-ckt3.
If you, or anyone else, feels it should not be added to this tree, please
reply to this email.
For more information about the 3.16.y-ckt tree, see
https://wiki.ubuntu.com/Kernel/Dev/ExtendedStable
Thanks.
-Luis
------
>From ea56ab19520e241612bd1f9e4d9e900823ffb645 Mon Sep 17 00:00:00 2001
From: Jane Zhou <a17711@motorola.com>
Date: Mon, 24 Nov 2014 11:44:08 -0800
Subject: net/ping: handle protocol mismatching scenario
commit 91a0b603469069cdcce4d572b7525ffc9fd352a6 upstream.
ping_lookup() may return a wrong sock if sk_buff's and sock's protocols
dont' match. For example, sk_buff's protocol is ETH_P_IPV6, but sock's
sk_family is AF_INET, in that case, if sk->sk_bound_dev_if is zero, a wrong
sock will be returned.
the fix is to "continue" the searching, if no matching, return NULL.
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Jane Zhou <a17711@motorola.com>
Signed-off-by: Yiwei Zhao <gbjc64@motorola.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Luis Henriques <luis.henriques@canonical.com>
---
net/ipv4/ping.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 044a0ddf6a79..620e8ffa62e8 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -217,6 +217,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
&ipv6_hdr(skb)->daddr))
continue;
#endif
+ } else {
+ continue;
}
if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
^ permalink raw reply related
* Re: [PATCH iproute2 REGRESSIONS v2] ss: Fix layout/output issues introduced by regression
From: Sergei Shtylyov @ 2014-12-05 17:12 UTC (permalink / raw)
To: Vadim Kochan, netdev
In-Reply-To: <1417791796-7739-1-git-send-email-vadim4j@gmail.com>
Hello.
On 12/05/2014 06:03 PM, Vadim Kochan wrote:
> This patch fixes the following issues which was introduced by me in commits:
> #1 (2dc854854b7f1b) ss: Fixed broken output for Netlink 'Peer Address:Port' column
> ISSUE: Broken layout when all sockets are printed out
> #2 (eef43b5052afb7) ss: Identify more netlink protocol names
> ISSUE: Protocol id is not printed if 'numbers only' output was specified (-n)
> Also aligned the width of the local/peer ports to be more wider.
> I tested with a lot of option combinations (I may miss some test cases),
> but layout seems to me better than the previous released version of iproute2/ss.
> Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
> ---
> misc/ss.c | 30 ++++++++++--------------------
> 1 file changed, 10 insertions(+), 20 deletions(-)
> diff --git a/misc/ss.c b/misc/ss.c
> index a99294d..8abaaff 100644
> --- a/misc/ss.c
> +++ b/misc/ss.c
[...]
> @@ -2912,11 +2910,12 @@ static void netlink_show_one(struct filter *f,
> printf("%-*s ", state_width, "UNCONN");
> printf("%-6d %-6d ", rq, wq);
>
> - if (resolve_services)
> - {
> + if (resolve_services) {
> printf("%*s:", addr_width, nl_proto_n2a(prot, prot_name,
> sizeof(prot_name)));
> - }
> + } else
> + printf("%*d:", addr_width, prot);
> +
Extra empty line hardly needed here. And if iproute2 follows the Linux
kernel style, {} should be used in all arms of the *if* statement (since it's
used in one case).
>
> if (pid == -1) {
> printf("%-*s ", serv_width, "*");
WBR, Sergei
^ permalink raw reply
* Re: [PATCH iproute2 REGRESSIONS v2] ss: Fix layout/output issues introduced by regression
From: vadim4j @ 2014-12-05 17:06 UTC (permalink / raw)
To: Sergei Shtylyov; +Cc: netdev
In-Reply-To: <5481E77C.7070102@cogentembedded.com>
[...]
> >@@ -2912,11 +2910,12 @@ static void netlink_show_one(struct filter *f,
> > printf("%-*s ", state_width, "UNCONN");
> > printf("%-6d %-6d ", rq, wq);
> >
> >- if (resolve_services)
> >- {
> >+ if (resolve_services) {
> > printf("%*s:", addr_width, nl_proto_n2a(prot, prot_name,
> > sizeof(prot_name)));
> >- }
> >+ } else
> >+ printf("%*d:", addr_width, prot);
> >+
>
> Extra empty line hardly needed here. And if iproute2 follows the Linux
> kernel style, {} should be used in all arms of the *if* statement (since
> it's used in one case).
>
> >
> > if (pid == -1) {
> > printf("%-*s ", serv_width, "*");
>
> WBR, Sergei
>
You mean change to this ?
if (resolve_services) {
printf("%*s:", addr_width, nl_proto_n2a(prot, prot_name,
sizeof(prot_name)));
} else {
printf("%*d:", addr_width, prot);
}
Thanks,
^ permalink raw reply
* Re: [PATCH iproute2 REGRESSIONS v2] ss: Fix layout/output issues introduced by regression
From: Sergei Shtylyov @ 2014-12-05 17:22 UTC (permalink / raw)
To: vadim4j; +Cc: netdev
In-Reply-To: <20141205170641.GA23226@angus-think.wlc.globallogic.com>
On 12/05/2014 08:06 PM, vadim4j@gmail.com wrote:
> [...]
>>> @@ -2912,11 +2910,12 @@ static void netlink_show_one(struct filter *f,
>>> printf("%-*s ", state_width, "UNCONN");
>>> printf("%-6d %-6d ", rq, wq);
>>>
>>> - if (resolve_services)
>>> - {
>>> + if (resolve_services) {
>>> printf("%*s:", addr_width, nl_proto_n2a(prot, prot_name,
>>> sizeof(prot_name)));
>>> - }
>>> + } else
>>> + printf("%*d:", addr_width, prot);
>>> +
>> Extra empty line hardly needed here. And if iproute2 follows the Linux
>> kernel style, {} should be used in all arms of the *if* statement (since
>> it's used in one case).
>>>
>>> if (pid == -1) {
>>> printf("%-*s ", serv_width, "*");
> You mean change to this ?
> if (resolve_services) {
> printf("%*s:", addr_width, nl_proto_n2a(prot, prot_name,
> sizeof(prot_name)));
> } else {
> printf("%*d:", addr_width, prot);
> }
Yes (but indent } with tab please).
> Thanks,
WBR, Sergei
^ permalink raw reply
* [PATCH iproute2 REGRESSIONS v3] ss: Fix layout/output issues introduced by regression
From: Vadim Kochan @ 2014-12-05 17:19 UTC (permalink / raw)
To: netdev; +Cc: Vadim Kochan
This patch fixes the following issues which was introduced by me in commits:
#1 (2dc854854b7f1b) ss: Fixed broken output for Netlink 'Peer Address:Port' column
ISSUE: Broken layout when all sockets are printed out
#2 (eef43b5052afb7) ss: Identify more netlink protocol names
ISSUE: Protocol id is not printed if 'numbers only' output was specified (-n)
Also aligned the width of the local/peer ports to be more wider.
I tested with a lot of option combinations (I may miss some test cases),
but layout seems to me better than the previous released version of iproute2/ss.
Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
---
misc/ss.c | 28 +++++++++-------------------
1 file changed, 9 insertions(+), 19 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index a99294d..c9733a7 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -101,8 +101,6 @@ int state_width;
int addrp_width;
int addr_width;
int serv_width;
-int paddr_width;
-int pserv_width;
int screen_width;
static const char *TCP_PROTO = "tcp";
@@ -2912,10 +2910,11 @@ static void netlink_show_one(struct filter *f,
printf("%-*s ", state_width, "UNCONN");
printf("%-6d %-6d ", rq, wq);
- if (resolve_services)
- {
+ if (resolve_services) {
printf("%*s:", addr_width, nl_proto_n2a(prot, prot_name,
sizeof(prot_name)));
+ } else {
+ printf("%*d:", addr_width, prot);
}
if (pid == -1) {
@@ -2947,10 +2946,10 @@ static void netlink_show_one(struct filter *f,
if (state == NETLINK_CONNECTED) {
printf("%*d:%-*d",
- paddr_width, dst_group, pserv_width, dst_pid);
+ addr_width, dst_group, serv_width, dst_pid);
} else {
printf("%*s*%-*s",
- paddr_width, "", pserv_width, "");
+ addr_width, "", serv_width, "");
}
char *pid_context = NULL;
@@ -3684,22 +3683,13 @@ int main(int argc, char *argv[])
printf("%-*s ", state_width, "State");
printf("%-6s %-6s ", "Recv-Q", "Send-Q");
- paddr_width = addr_width;
- pserv_width = serv_width;
-
- /* Netlink service column can be resolved as process name/pid thus it
- * can be much wider than address column which is just a
- * protocol name/id.
- */
- if (current_filter.dbs & (1<<NETLINK_DB)) {
- serv_width = addr_width - 10;
- paddr_width = 13;
- pserv_width = 13;
- }
+ /* Make enough space for the local/remote port field */
+ addr_width -= 13;
+ serv_width += 13;
printf("%*s:%-*s %*s:%-*s\n",
addr_width, "Local Address", serv_width, "Port",
- paddr_width, "Peer Address", pserv_width, "Port");
+ addr_width, "Peer Address", serv_width, "Port");
fflush(stdout);
--
2.1.3
^ permalink raw reply related
* Re: [PATCH iproute2 REGRESSIONS v2] ss: Fix layout/output issues introduced by regression
From: vadim4j @ 2014-12-05 17:21 UTC (permalink / raw)
To: Sergei Shtylyov; +Cc: netdev
In-Reply-To: <5481E9E0.6080809@cogentembedded.com>
On Fri, Dec 05, 2014 at 08:22:40PM +0300, Sergei Shtylyov wrote:
>>> sizeof(prot_name)));
> >>>- }
> >>>+ } else
> >>>+ printf("%*d:", addr_width, prot);
> >>>+
>
> >> Extra empty line hardly needed here. And if iproute2 follows the Linux
> >>kernel style, {} should be used in all arms of the *if* statement (since
> >>it's used in one case).
>
> >>>
> >>> if (pid == -1) {
> >>> printf("%-*s ", serv_width, "*");
>
> >You mean change to this ?
>
> > if (resolve_services) {
> > printf("%*s:", addr_width, nl_proto_n2a(prot, prot_name,
> > sizeof(prot_name)));
> > } else {
> > printf("%*d:", addr_width, prot);
> > }
>
> Yes (but indent } with tab please).
>
> >Thanks,
>
> WBR, Sergei
>
>
Thanks for comments, I have sent a v3.
Regards,
Vadim
^ permalink raw reply
* [PATCH net-next ] Documentation (ixgbe.txt): use a decimal address.
From: Rami Rosen @ 2014-12-05 17:35 UTC (permalink / raw)
To: davem; +Cc: netdev, jeffrey.t.kirsher, Rami Rosen
This patch fixes the erronous usage of an hexadecimal address in the
example, by replacing it with a decimal address.
Signed-off-by: Rami Rosen <ramirose@gmail.com>
---
Documentation/networking/ixgbe.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/networking/ixgbe.txt b/Documentation/networking/ixgbe.txt
index 96ccceb..0ace6e7 100644
--- a/Documentation/networking/ixgbe.txt
+++ b/Documentation/networking/ixgbe.txt
@@ -138,7 +138,7 @@ Other ethtool Commands:
To enable Flow Director
ethtool -K ethX ntuple on
To add a filter
- Use -U switch. e.g., ethtool -U ethX flow-type tcp4 src-ip 0x178000a
+ Use -U switch. e.g., ethtool -U ethX flow-type tcp4 src-ip 10.0.128.23
action 1
To see the list of filters currently present:
ethtool -u ethX
--
1.9.0
^ permalink raw reply related
* Re: [PATCH net-next ] Documentation (ixgbe.txt): use a decimal address.
From: Jeff Kirsher @ 2014-12-05 17:40 UTC (permalink / raw)
To: Rami Rosen; +Cc: davem, netdev
In-Reply-To: <1417800943-4429-1-git-send-email-ramirose@gmail.com>
[-- Attachment #1: Type: text/plain, Size: 392 bytes --]
On Fri, 2014-12-05 at 19:35 +0200, Rami Rosen wrote:
> This patch fixes the erronous usage of an hexadecimal address in the
> example, by replacing it with a decimal address.
>
> Signed-off-by: Rami Rosen <ramirose@gmail.com>
> ---
> Documentation/networking/ixgbe.txt | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply
* Re: [PATCH net-next] openvswitch: set correct protocol on route lookup
From: Pravin Shelar @ 2014-12-05 17:43 UTC (permalink / raw)
To: Jiri Benc; +Cc: dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org, netdev
In-Reply-To: <0ead411e38bc57a8971ffd9826f7bf9bfdc6ccf1.1417796557.git.jbenc-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
On Fri, Dec 5, 2014 at 8:24 AM, Jiri Benc <jbenc@redhat.com> wrote:
> Respect what the caller passed to ovs_tunnel_get_egress_info.
>
> Fixes: 8f0aad6f35f7e ("openvswitch: Extend packet attribute for egress tunnel info")
> Signed-off-by: Jiri Benc <jbenc@redhat.com>
Thanks for the fix.
Acked-by: Pravin B Shelar <pshelar@nicira.com>
> ---
> net/openvswitch/vport.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
> index e771a46933e5..9584526c0778 100644
> --- a/net/openvswitch/vport.c
> +++ b/net/openvswitch/vport.c
> @@ -600,7 +600,7 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
> fl.saddr = tun_key->ipv4_src;
> fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos);
> fl.flowi4_mark = skb_mark;
> - fl.flowi4_proto = IPPROTO_GRE;
> + fl.flowi4_proto = ipproto;
>
> rt = ip_route_output_key(net, &fl);
> if (IS_ERR(rt))
> --
> 1.8.3.1
>
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
^ permalink raw reply
* Re: [PATCH] net: fix the flow limitation computation
From: Willem de Bruijn @ 2014-12-05 17:52 UTC (permalink / raw)
To: roy.qing.li; +Cc: Network Development
In-Reply-To: <1417772919-17744-1-git-send-email-roy.qing.li@gmail.com>
On Fri, Dec 5, 2014 at 4:48 AM, <roy.qing.li@gmail.com> wrote:
> From: Li RongQing <roy.qing.li@gmail.com>
>
> Once RPS is enabled, the skb maybe enqueue to different CPU, so the
> flow limitation computation should use the enqueued CPU, not the local
> CPU
No, the decision to do accounting on the initial cpu was deliberate,
to avoids cache line contention with other cpus.
The goal is to detect under stress a single four-tuple that is
responsible for the majority of traffic, based on the assumption
that traffic is spread across many flows under normal conditions.
Accounting on the destination rps cpus is the more obviously
correct solution. Accounting on the source cpu is generally
fine, too, since it still only blocks a flow if that flow sends
> 50% of all packets from that cpu.
The only false positive occurs if multiple cpus have flows that
are large in proportion to their own packet rate, but only a subset
of these cpus actually generate a lot of traffic. Then, the
proportionally large flows from the cpus that generate little
traffic are incorrectly throttled. Throttling is still proportional
to aggregate packet rate, so should be low.
>
> Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
> ---
> net/core/dev.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 945bbd0..e70507d 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3250,7 +3250,7 @@ static int rps_ipi_queued(struct softnet_data *sd)
> int netdev_flow_limit_table_len __read_mostly = (1 << 12);
> #endif
>
> -static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
> +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen, int cpu)
> {
> #ifdef CONFIG_NET_FLOW_LIMIT
> struct sd_flow_limit *fl;
> @@ -3260,7 +3260,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
> if (qlen < (netdev_max_backlog >> 1))
> return false;
>
> - sd = this_cpu_ptr(&softnet_data);
> + sd = &per_cpu(softnet_data, cpu);
>
> rcu_read_lock();
> fl = rcu_dereference(sd->flow_limit);
> @@ -3303,7 +3303,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
>
> rps_lock(sd);
> qlen = skb_queue_len(&sd->input_pkt_queue);
> - if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
> + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen, cpu)) {
> if (skb_queue_len(&sd->input_pkt_queue)) {
> enqueue:
> __skb_queue_tail(&sd->input_pkt_queue, skb);
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox