Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino
From: Y Song @ 2018-05-22  0:24 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alban Crequy, netdev, linux-kernel, containers, cgroups,
	Alban Crequy, tj
In-Reply-To: <20180521162609.lpdrnozowmzdn57m@ast-mbp.dhcp.thefacebook.com>

On Mon, May 21, 2018 at 9:26 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Sun, May 13, 2018 at 07:33:18PM +0200, Alban Crequy wrote:
>>
>> +BPF_CALL_2(bpf_get_current_cgroup_ino, u32, hierarchy, u64, flags)
>> +{
>> +     // TODO: pick the correct hierarchy instead of the mem controller
>> +     struct cgroup *cgrp = task_cgroup(current, memory_cgrp_id);
>> +
>> +     if (unlikely(!cgrp))
>> +             return -EINVAL;
>> +     if (unlikely(hierarchy))
>> +             return -EINVAL;
>> +     if (unlikely(flags))
>> +             return -EINVAL;
>> +
>> +     return cgrp->kn->id.ino;
>
> ino only is not enough to identify cgroup. It needs generation number too.
> I don't quite see how hierarchy and flags can be used in the future.
> Also why limit it to memcg?
>
> How about something like this instead:
>
> BPF_CALL_2(bpf_get_current_cgroup_id)
> {
>         struct cgroup *cgrp = task_dfl_cgroup(current);
>
>         return cgrp->kn->id.id;
> }
> The user space can use fhandle api to get the same 64-bit id.

I think this should work. This will also be useful to bcc as user
space can encode desired id
in the bpf program and compared that id to the current cgroup id, so we can have
cgroup level tracing (esp. stat collection) support. To cope with
cgroup hierarchy, user can use
cgroup-array based approach or explicitly compare against multiple cgroup id's.

^ permalink raw reply

* [PATCH net-next v4 2/2] openvswitch: Support conntrack zone limit
From: Yi-Hung Wei @ 2018-05-22  0:16 UTC (permalink / raw)
  To: netdev, pshelar; +Cc: Yi-Hung Wei
In-Reply-To: <1526948165-32443-1-git-send-email-yihung.wei@gmail.com>

Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max.  In this
case, if one of the VM/container abuses the usage the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table.  Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.

To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone.  For example, we can designate different zone to different VM,
and set conntrack limit to each zone.  By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs.  Moreover, the users can
set various conntrack limit to different zone based on their preference.

The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone.  If the number of
connection is above a configured limitation, ovs will return ENOMEM to the
userspace.  If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.

The following high leve APIs are provided to the userspace:
  - OVS_CT_LIMIT_CMD_SET:
    * set default connection limit for all zones
    * set the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_DEL:
    * remove the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_GET:
    * get the default connection limit for all zones
    * get the connection limit for a particular zone

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
 net/openvswitch/Kconfig     |   3 +-
 net/openvswitch/conntrack.c | 541 +++++++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/conntrack.h |   9 +-
 net/openvswitch/datapath.c  |   7 +-
 net/openvswitch/datapath.h  |   3 +
 5 files changed, 557 insertions(+), 6 deletions(-)

diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2650205cdaf9..89da9512ec1e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -9,7 +9,8 @@ config OPENVSWITCH
 		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
 				     (!NF_NAT || NF_NAT) && \
 				     (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
-				     (!NF_NAT_IPV6 || NF_NAT_IPV6)))
+				     (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
+				     (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 02fc343feb66..e8bb91420ca9 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -16,8 +16,11 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/sctp.h>
+#include <linux/static_key.h>
 #include <net/ip.h>
+#include <net/genetlink.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_count.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
@@ -76,6 +79,31 @@ struct ovs_conntrack_info {
 #endif
 };
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+#define OVS_CT_LIMIT_UNLIMITED	0
+#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
+#define CT_LIMIT_HASH_BUCKETS 512
+static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
+
+struct ovs_ct_limit {
+	/* Elements in ovs_ct_limit_info->limits hash table */
+	struct hlist_node hlist_node;
+	struct rcu_head rcu;
+	u16 zone;
+	u32 limit;
+};
+
+struct ovs_ct_limit_info {
+	u32 default_limit;
+	struct hlist_head *limits;
+	struct nf_conncount_data *data __aligned(8);
+};
+
+static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
+	[OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, },
+};
+#endif
+
 static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
 
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -1036,6 +1064,89 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
 	return false;
 }
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static struct hlist_head *ct_limit_hash_bucket(
+	const struct ovs_ct_limit_info *info, u16 zone)
+{
+	return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)];
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_set(const struct ovs_ct_limit_info *info,
+			 struct ovs_ct_limit *new_ct_limit)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, new_ct_limit->zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == new_ct_limit->zone) {
+			hlist_replace_rcu(&ct_limit->hlist_node,
+					  &new_ct_limit->hlist_node);
+			kfree_rcu(ct_limit, rcu);
+			return;
+		}
+	}
+
+	hlist_add_head_rcu(&new_ct_limit->hlist_node, head);
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+	struct hlist_node *n;
+
+	head = ct_limit_hash_bucket(info, zone);
+	hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) {
+		if (ct_limit->zone == zone) {
+			hlist_del_rcu(&ct_limit->hlist_node);
+			kfree_rcu(ct_limit, rcu);
+			return;
+		}
+	}
+}
+
+/* Call with RCU read lock */
+static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == zone)
+			return ct_limit->limit;
+	}
+
+	return info->default_limit;
+}
+
+static int ovs_ct_check_limit(struct net *net,
+			      const struct ovs_conntrack_info *info,
+			      const struct nf_conntrack_tuple *tuple)
+{
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	u32 per_zone_limit, connections;
+	u32 conncount_key[5];
+
+	conncount_key[0] = info->zone.id;
+
+	per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
+	if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
+		return 0;
+
+	connections = nf_conncount_count(net, ct_limit_info->data,
+					 conncount_key, tuple, &info->zone);
+	if (connections > per_zone_limit)
+		return -ENOMEM;
+
+	return 0;
+}
+#endif
+
 /* Lookup connection and confirm if unconfirmed. */
 static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 			 const struct ovs_conntrack_info *info,
@@ -1054,6 +1165,21 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 	if (!ct)
 		return 0;
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
+		if (!nf_ct_is_confirmed(ct)) {
+			err = ovs_ct_check_limit(net, info,
+				&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+			if (err) {
+				net_warn_ratelimited("openvswitch: zone: %u "
+					"execeeds conntrack limit\n",
+					info->zone.id);
+				return err;
+			}
+		}
+	}
+#endif
+
 	/* Set the conntrack event mask if given.  NEW and DELETE events have
 	 * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
 	 * typically would receive many kinds of updates.  Setting the event
@@ -1655,7 +1781,410 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
 		nf_ct_tmpl_free(ct_info->ct);
 }
 
-void ovs_ct_init(struct net *net)
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
+{
+	int i, err;
+
+	ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info),
+					 GFP_KERNEL);
+	if (!ovs_net->ct_limit_info)
+		return -ENOMEM;
+
+	ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
+	ovs_net->ct_limit_info->limits =
+		kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
+			      GFP_KERNEL);
+	if (!ovs_net->ct_limit_info->limits) {
+		kfree(ovs_net->ct_limit_info);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
+
+	ovs_net->ct_limit_info->data =
+		nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+
+	if (IS_ERR(ovs_net->ct_limit_info->data)) {
+		err = PTR_ERR(ovs_net->ct_limit_info->data);
+		kfree(ovs_net->ct_limit_info->limits);
+		kfree(ovs_net->ct_limit_info);
+		return err;
+	}
+	return 0;
+}
+
+static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
+{
+	const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
+	int i;
+
+	nf_conncount_destroy(net, NFPROTO_INET, info->data);
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+		struct hlist_head *head = &info->limits[i];
+		struct ovs_ct_limit *ct_limit;
+
+		hlist_for_each_entry_rcu(ct_limit, head, hlist_node)
+			kfree_rcu(ct_limit, rcu);
+	}
+	kfree(ovs_net->ct_limit_info->limits);
+	kfree(ovs_net->ct_limit_info);
+}
+
+static struct sk_buff *
+ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
+			     struct ovs_header **ovs_reply_header)
+{
+	struct ovs_header *ovs_header = info->userhdr;
+	struct sk_buff *skb;
+
+	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	*ovs_reply_header = genlmsg_put(skb, info->snd_portid,
+					info->snd_seq,
+					&dp_ct_limit_genl_family, 0, cmd);
+
+	if (!*ovs_reply_header) {
+		nlmsg_free(skb);
+		return ERR_PTR(-EMSGSIZE);
+	}
+	(*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex;
+
+	return skb;
+}
+
+static bool check_zone_id(int zone_id, u16 *pzone)
+{
+	if (zone_id >= 0 && zone_id <= 65535) {
+		*pzone = (u16)zone_id;
+		return true;
+	}
+	return false;
+}
+
+static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct ovs_zone_limit *zone_limit;
+	int rem;
+	u16 zone;
+
+	rem = NLA_ALIGN(nla_len(nla_zone_limit));
+	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+	while (rem >= sizeof(*zone_limit)) {
+		if (unlikely(zone_limit->zone_id == -1)) {
+			ovs_lock();
+			info->default_limit = zone_limit->limit;
+			ovs_unlock();
+		} else if (unlikely(!check_zone_id(
+				zone_limit->zone_id, &zone))) {
+			OVS_NLERR(true, "zone id is out of range");
+		} else {
+			struct ovs_ct_limit *ct_limit;
+
+			ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
+			if (!ct_limit)
+				return -ENOMEM;
+
+			ct_limit->zone = zone;
+			ct_limit->limit = zone_limit->limit;
+
+			ovs_lock();
+			ct_limit_set(info, ct_limit);
+			ovs_unlock();
+		}
+		rem -= NLA_ALIGN(sizeof(*zone_limit));
+		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+				NLA_ALIGN(sizeof(*zone_limit)));
+	}
+
+	if (rem)
+		OVS_NLERR(true, "set zone limit has %d unknown bytes", rem);
+
+	return 0;
+}
+
+static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct ovs_zone_limit *zone_limit;
+	int rem;
+	u16 zone;
+
+	rem = NLA_ALIGN(nla_len(nla_zone_limit));
+	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+	while (rem >= sizeof(*zone_limit)) {
+		if (unlikely(!check_zone_id(zone_limit->zone_id, &zone))) {
+			OVS_NLERR(true, "zone id is out of range");
+		} else {
+			ovs_lock();
+			ct_limit_del(info, zone);
+			ovs_unlock();
+		}
+		rem -= NLA_ALIGN(sizeof(*zone_limit));
+		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+				NLA_ALIGN(sizeof(*zone_limit)));
+	}
+
+	if (rem)
+		OVS_NLERR(true, "del zone limit has %d unknown bytes", rem);
+
+	return 0;
+}
+
+static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
+					  struct sk_buff *reply)
+{
+	struct ovs_zone_limit zone_limit;
+	int err;
+
+	zone_limit.zone_id = -1;
+	zone_limit.limit = info->default_limit;
+	err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int ovs_ct_limit_get_zone_limit(struct net *net,
+				       struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info,
+				       struct sk_buff *reply)
+{
+	struct nf_conntrack_zone ct_zone;
+	struct ovs_zone_limit *zone_limit;
+	int rem, err;
+	u32 conncount_key[5];
+	u16 zone;
+
+	rem = NLA_ALIGN(nla_len(nla_zone_limit));
+	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+	while (rem >= sizeof(*zone_limit)) {
+		if (unlikely(zone_limit->zone_id == -1)) {
+			err = ovs_ct_limit_get_default_limit(info, reply);
+			if (err)
+				return err;
+		} else if (unlikely(!check_zone_id(zone_limit->zone_id,
+							&zone))) {
+			OVS_NLERR(true, "zone id is out of range");
+		} else {
+			rcu_read_lock();
+			zone_limit->limit = ct_limit_get(info, zone);
+			rcu_read_unlock();
+
+			nf_ct_zone_init(&ct_zone, zone, NF_CT_DEFAULT_ZONE_DIR,
+					0);
+			conncount_key[0] = zone;
+			zone_limit->count = nf_conncount_count(
+				net, info->data, conncount_key, NULL, &ct_zone);
+			err = nla_put_nohdr(reply, sizeof(*zone_limit),
+					    zone_limit);
+			if (err)
+				return err;
+		}
+		rem -= NLA_ALIGN(sizeof(*zone_limit));
+		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+				NLA_ALIGN(sizeof(*zone_limit)));
+	}
+
+	if (rem)
+		OVS_NLERR(true, "get zone limit has %d unknown bytes", rem);
+
+	return 0;
+}
+
+static int ovs_ct_limit_get_all_zone_limit(struct net *net,
+					   struct ovs_ct_limit_info *info,
+					   struct sk_buff *reply)
+{
+	struct nf_conntrack_zone ct_zone;
+	struct ovs_zone_limit zone_limit;
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+	u32 conncount_key[5];
+	int i, err = 0;
+
+	err = ovs_ct_limit_get_default_limit(info, reply);
+	if (err)
+		return err;
+
+	rcu_read_lock();
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+		head = &info->limits[i];
+		hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+			zone_limit.zone_id = ct_limit->zone;
+			zone_limit.limit = ct_limit->limit;
+			nf_ct_zone_init(&ct_zone, ct_limit->zone,
+					NF_CT_DEFAULT_ZONE_DIR, 0);
+
+			conncount_key[0] = ct_limit->zone;
+			zone_limit.count = nf_conncount_count(net, info->data,
+					conncount_key, NULL, &ct_zone);
+			err = nla_put_nohdr(reply, sizeof(zone_limit),
+					&zone_limit);
+			if (err)
+				goto exit_err;
+		}
+	}
+
+exit_err:
+	rcu_read_unlock();
+	return err;
+}
+
+static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+		err = -EINVAL;
+		goto exit_err;
+	}
+
+	err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
+					  ct_limit_info);
+	if (err)
+		goto exit_err;
+
+	static_branch_enable(&ovs_ct_limit_enabled);
+
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+		err = -EINVAL;
+		goto exit_err;
+	}
+
+	err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
+					  ct_limit_info);
+	if (err)
+		goto exit_err;
+
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct nlattr *nla_reply;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct net *net = sock_net(skb->sk);
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
+
+	if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+		err = ovs_ct_limit_get_zone_limit(
+			net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info,
+			reply);
+		if (err)
+			goto exit_err;
+	} else {
+		err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info,
+						      reply);
+		if (err)
+			goto exit_err;
+	}
+
+	nla_nest_end(reply, nla_reply);
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static struct genl_ops ct_limit_genl_ops[] = {
+	{ .cmd = OVS_CT_LIMIT_CMD_SET,
+		.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+					   * privilege. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_set,
+	},
+	{ .cmd = OVS_CT_LIMIT_CMD_DEL,
+		.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+					   * privilege. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_del,
+	},
+	{ .cmd = OVS_CT_LIMIT_CMD_GET,
+		.flags = 0,		  /* OK for unprivileged users. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_get,
+	},
+};
+
+static const struct genl_multicast_group ovs_ct_limit_multicast_group = {
+	.name = OVS_CT_LIMIT_MCGROUP,
+};
+
+struct genl_family dp_ct_limit_genl_family __ro_after_init = {
+	.hdrsize = sizeof(struct ovs_header),
+	.name = OVS_CT_LIMIT_FAMILY,
+	.version = OVS_CT_LIMIT_VERSION,
+	.maxattr = OVS_CT_LIMIT_ATTR_MAX,
+	.netnsok = true,
+	.parallel_ops = true,
+	.ops = ct_limit_genl_ops,
+	.n_ops = ARRAY_SIZE(ct_limit_genl_ops),
+	.mcgrps = &ovs_ct_limit_multicast_group,
+	.n_mcgrps = 1,
+	.module = THIS_MODULE,
+};
+#endif
+
+int ovs_ct_init(struct net *net)
 {
 	unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -1666,12 +2195,22 @@ void ovs_ct_init(struct net *net)
 	} else {
 		ovs_net->xt_label = true;
 	}
+
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	return ovs_ct_limit_init(net, ovs_net);
+#else
+	return 0;
+#endif
 }
 
 void ovs_ct_exit(struct net *net)
 {
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	ovs_ct_limit_exit(net, ovs_net);
+#endif
+
 	if (ovs_net->xt_label)
 		nf_connlabels_put(net);
 }
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 399dfdd2c4f9..900dadd70974 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -17,10 +17,11 @@
 #include "flow.h"
 
 struct ovs_conntrack_info;
+struct ovs_ct_limit_info;
 enum ovs_key_attr;
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-void ovs_ct_init(struct net *);
+int ovs_ct_init(struct net *);
 void ovs_ct_exit(struct net *);
 bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
 int ovs_ct_copy_action(struct net *, const struct nlattr *,
@@ -44,7 +45,7 @@ void ovs_ct_free_action(const struct nlattr *a);
 #else
 #include <linux/errno.h>
 
-static inline void ovs_ct_init(struct net *net) { }
+static inline int ovs_ct_init(struct net *net) { return 0; }
 
 static inline void ovs_ct_exit(struct net *net) { }
 
@@ -104,4 +105,8 @@ static inline void ovs_ct_free_action(const struct nlattr *a) { }
 
 #define CT_SUPPORTED_MASK 0
 #endif /* CONFIG_NF_CONNTRACK */
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+extern struct genl_family dp_ct_limit_genl_family;
+#endif
 #endif /* ovs_conntrack.h */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 015e24e08909..a61818e94396 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2288,6 +2288,9 @@ static struct genl_family * const dp_genl_families[] = {
 	&dp_flow_genl_family,
 	&dp_packet_genl_family,
 	&dp_meter_genl_family,
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	&dp_ct_limit_genl_family,
+#endif
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2323,8 +2326,7 @@ static int __net_init ovs_init_net(struct net *net)
 
 	INIT_LIST_HEAD(&ovs_net->dps);
 	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
-	ovs_ct_init(net);
-	return 0;
+	return ovs_ct_init(net);
 }
 
 static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2469,3 +2471,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 523d65526766..c9eb267c6f7e 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,6 +144,9 @@ struct dp_upcall_info {
 struct ovs_net {
 	struct list_head dps;
 	struct work_struct dp_notify_work;
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	struct ovs_ct_limit_info *ct_limit_info;
+#endif
 
 	/* Module reference for configuring conntrack. */
 	bool xt_label;
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v4 1/2] openvswitch: Add conntrack limit netlink definition
From: Yi-Hung Wei @ 2018-05-22  0:16 UTC (permalink / raw)
  To: netdev, pshelar; +Cc: Yi-Hung Wei
In-Reply-To: <1526948165-32443-1-git-send-email-yihung.wei@gmail.com>

Define netlink messages and attributes to support user kernel
communication that uses the conntrack limit feature.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
 include/uapi/linux/openvswitch.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 713e56ce681f..d8da2b7591f5 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -937,4 +937,30 @@ enum ovs_meter_band_type {
 
 #define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
 
+/* Conntrack limit */
+#define OVS_CT_LIMIT_FAMILY  "ovs_ct_limit"
+#define OVS_CT_LIMIT_MCGROUP "ovs_ct_limit"
+#define OVS_CT_LIMIT_VERSION 0x1
+
+enum ovs_ct_limit_cmd {
+	OVS_CT_LIMIT_CMD_UNSPEC,
+	OVS_CT_LIMIT_CMD_SET,		/* Add or modify ct limit. */
+	OVS_CT_LIMIT_CMD_DEL,		/* Delete ct limit. */
+	OVS_CT_LIMIT_CMD_GET		/* Get ct limit. */
+};
+
+enum ovs_ct_limit_attr {
+	OVS_CT_LIMIT_ATTR_UNSPEC,
+	OVS_CT_LIMIT_ATTR_ZONE_LIMIT,	/* Nested struct ovs_zone_limit. */
+	__OVS_CT_LIMIT_ATTR_MAX
+};
+
+#define OVS_CT_LIMIT_ATTR_MAX (__OVS_CT_LIMIT_ATTR_MAX - 1)
+
+struct ovs_zone_limit {
+	int zone_id;
+	__u32 limit;
+	__u32 count;
+};
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v4 0/2] openvswitch: Support conntrack zone limit
From: Yi-Hung Wei @ 2018-05-22  0:16 UTC (permalink / raw)
  To: netdev, pshelar; +Cc: Yi-Hung Wei

Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max.  In this
case, if one of the VM/container abuses the usage the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table.  Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.

To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone.  For example, we can designate different zone to different VM,
and set conntrack limit to each zone.  By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs.  Moreover, the users can
set various conntrack limit to different zone based on their preference.

The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone.  If the number of
connection is above a configured limitation, OVS will return ENOMEM to the
userspace.  If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.

The first patch defines the conntrack limit netlink definition, and the
second patch provides the implementation.

v3->v4:
  - Addresses comments from Parvin that include simplify netlink API,
    and remove unncessary RCU lockings.
  - Rebases to master.

v2->v3:
  - Addresses comments from Parvin that include using static keys to check
    if ovs_ct_limit features is used, only check ct_limit when a ct entry
    is unconfirmed, and reports rate limited warning messages when the ct
    limit is reached.
  - Rebases to master.

v1->v2:
  - Fixes commit log typos suggested by Greg.
  - Fixes memory free issue that Julia found.


Yi-Hung Wei (2):
  openvswitch: Add conntrack limit netlink definition
  openvswitch: Support conntrack zone limit

 include/uapi/linux/openvswitch.h |  26 ++
 net/openvswitch/Kconfig          |   3 +-
 net/openvswitch/conntrack.c      | 541 ++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/conntrack.h      |   9 +-
 net/openvswitch/datapath.c       |   7 +-
 net/openvswitch/datapath.h       |   3 +
 6 files changed, 583 insertions(+), 6 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg
From: kbuild test robot @ 2018-05-22  0:08 UTC (permalink / raw)
  To: Andrey Ignatov
  Cc: kbuild-all, netdev, Andrey Ignatov, davem, ast, daniel,
	kernel-team
In-Reply-To: <654b59700a21ed2342b12f30a583ba9329ef850b.1526694154.git.rdna@fb.com>

[-- Attachment #1: Type: text/plain, Size: 10869 bytes --]

Hi Andrey,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Andrey-Ignatov/bpf-Hooks-for-sys_sendmsg/20180522-065614
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-randconfig-s1-201820 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   net/ipv4/udp.c: In function 'udp_sendmsg':
   net/ipv4/udp.c:1014:44: error: macro "BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK" passed 3 arguments, but takes just 2
             (struct sockaddr *)usin, &ipc.addr);
                                               ^
>> net/ipv4/udp.c:1013:9: error: 'BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK' undeclared (first use in this function)
      err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/ipv4/udp.c:1013:9: note: each undeclared identifier is reported only once for each function it appears in
--
   net/ipv6/udp.c: In function 'udpv6_sendmsg':
   net/ipv6/udp.c:1320:44: error: macro "BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK" passed 3 arguments, but takes just 2
            (struct sockaddr *)sin6, &fl6.saddr);
                                               ^
>> net/ipv6/udp.c:1319:9: error: 'BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK' undeclared (first use in this function)
      err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/ipv6/udp.c:1319:9: note: each undeclared identifier is reported only once for each function it appears in

vim +/BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK +1013 net/ipv4/udp.c

   898	
   899	int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
   900	{
   901		struct inet_sock *inet = inet_sk(sk);
   902		struct udp_sock *up = udp_sk(sk);
   903		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
   904		struct flowi4 fl4_stack;
   905		struct flowi4 *fl4;
   906		int ulen = len;
   907		struct ipcm_cookie ipc;
   908		struct rtable *rt = NULL;
   909		int free = 0;
   910		int connected = 0;
   911		__be32 daddr, faddr, saddr;
   912		__be16 dport;
   913		u8  tos;
   914		int err, is_udplite = IS_UDPLITE(sk);
   915		int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
   916		int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
   917		struct sk_buff *skb;
   918		struct ip_options_data opt_copy;
   919	
   920		if (len > 0xFFFF)
   921			return -EMSGSIZE;
   922	
   923		/*
   924		 *	Check the flags.
   925		 */
   926	
   927		if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
   928			return -EOPNOTSUPP;
   929	
   930		ipc.opt = NULL;
   931		ipc.tx_flags = 0;
   932		ipc.ttl = 0;
   933		ipc.tos = -1;
   934	
   935		getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
   936	
   937		fl4 = &inet->cork.fl.u.ip4;
   938		if (up->pending) {
   939			/*
   940			 * There are pending frames.
   941			 * The socket lock must be held while it's corked.
   942			 */
   943			lock_sock(sk);
   944			if (likely(up->pending)) {
   945				if (unlikely(up->pending != AF_INET)) {
   946					release_sock(sk);
   947					return -EINVAL;
   948				}
   949				goto do_append_data;
   950			}
   951			release_sock(sk);
   952		}
   953		ulen += sizeof(struct udphdr);
   954	
   955		/*
   956		 *	Get and verify the address.
   957		 */
   958		if (usin) {
   959			if (msg->msg_namelen < sizeof(*usin))
   960				return -EINVAL;
   961			if (usin->sin_family != AF_INET) {
   962				if (usin->sin_family != AF_UNSPEC)
   963					return -EAFNOSUPPORT;
   964			}
   965	
   966			daddr = usin->sin_addr.s_addr;
   967			dport = usin->sin_port;
   968			if (dport == 0)
   969				return -EINVAL;
   970		} else {
   971			if (sk->sk_state != TCP_ESTABLISHED)
   972				return -EDESTADDRREQ;
   973			daddr = inet->inet_daddr;
   974			dport = inet->inet_dport;
   975			/* Open fast path for connected socket.
   976			   Route will not be used, if at least one option is set.
   977			 */
   978			connected = 1;
   979		}
   980	
   981		ipc.sockc.tsflags = sk->sk_tsflags;
   982		ipc.addr = inet->inet_saddr;
   983		ipc.oif = sk->sk_bound_dev_if;
   984		ipc.gso_size = up->gso_size;
   985	
   986		if (msg->msg_controllen) {
   987			err = udp_cmsg_send(sk, msg, &ipc.gso_size);
   988			if (err > 0)
   989				err = ip_cmsg_send(sk, msg, &ipc,
   990						   sk->sk_family == AF_INET6);
   991			if (unlikely(err < 0)) {
   992				kfree(ipc.opt);
   993				return err;
   994			}
   995			if (ipc.opt)
   996				free = 1;
   997			connected = 0;
   998		}
   999		if (!ipc.opt) {
  1000			struct ip_options_rcu *inet_opt;
  1001	
  1002			rcu_read_lock();
  1003			inet_opt = rcu_dereference(inet->inet_opt);
  1004			if (inet_opt) {
  1005				memcpy(&opt_copy, inet_opt,
  1006				       sizeof(*inet_opt) + inet_opt->opt.optlen);
  1007				ipc.opt = &opt_copy.opt;
  1008			}
  1009			rcu_read_unlock();
  1010		}
  1011	
  1012		if (!connected) {
> 1013			err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
> 1014						    (struct sockaddr *)usin, &ipc.addr);
  1015			if (err)
  1016				goto out_free;
  1017			if (usin) {
  1018				if (usin->sin_port == 0) {
  1019					/* BPF program set invalid port. Reject it. */
  1020					err = -EINVAL;
  1021					goto out_free;
  1022				}
  1023				daddr = usin->sin_addr.s_addr;
  1024				dport = usin->sin_port;
  1025			}
  1026		}
  1027	
  1028		saddr = ipc.addr;
  1029		ipc.addr = faddr = daddr;
  1030	
  1031		sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
  1032	
  1033		if (ipc.opt && ipc.opt->opt.srr) {
  1034			if (!daddr) {
  1035				err = -EINVAL;
  1036				goto out_free;
  1037			}
  1038			faddr = ipc.opt->opt.faddr;
  1039			connected = 0;
  1040		}
  1041		tos = get_rttos(&ipc, inet);
  1042		if (sock_flag(sk, SOCK_LOCALROUTE) ||
  1043		    (msg->msg_flags & MSG_DONTROUTE) ||
  1044		    (ipc.opt && ipc.opt->opt.is_strictroute)) {
  1045			tos |= RTO_ONLINK;
  1046			connected = 0;
  1047		}
  1048	
  1049		if (ipv4_is_multicast(daddr)) {
  1050			if (!ipc.oif)
  1051				ipc.oif = inet->mc_index;
  1052			if (!saddr)
  1053				saddr = inet->mc_addr;
  1054			connected = 0;
  1055		} else if (!ipc.oif) {
  1056			ipc.oif = inet->uc_index;
  1057		} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
  1058			/* oif is set, packet is to local broadcast and
  1059			 * and uc_index is set. oif is most likely set
  1060			 * by sk_bound_dev_if. If uc_index != oif check if the
  1061			 * oif is an L3 master and uc_index is an L3 slave.
  1062			 * If so, we want to allow the send using the uc_index.
  1063			 */
  1064			if (ipc.oif != inet->uc_index &&
  1065			    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
  1066								      inet->uc_index)) {
  1067				ipc.oif = inet->uc_index;
  1068			}
  1069		}
  1070	
  1071		if (connected)
  1072			rt = (struct rtable *)sk_dst_check(sk, 0);
  1073	
  1074		if (!rt) {
  1075			struct net *net = sock_net(sk);
  1076			__u8 flow_flags = inet_sk_flowi_flags(sk);
  1077	
  1078			fl4 = &fl4_stack;
  1079	
  1080			flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
  1081					   RT_SCOPE_UNIVERSE, sk->sk_protocol,
  1082					   flow_flags,
  1083					   faddr, saddr, dport, inet->inet_sport,
  1084					   sk->sk_uid);
  1085	
  1086			security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
  1087			rt = ip_route_output_flow(net, fl4, sk);
  1088			if (IS_ERR(rt)) {
  1089				err = PTR_ERR(rt);
  1090				rt = NULL;
  1091				if (err == -ENETUNREACH)
  1092					IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
  1093				goto out;
  1094			}
  1095	
  1096			err = -EACCES;
  1097			if ((rt->rt_flags & RTCF_BROADCAST) &&
  1098			    !sock_flag(sk, SOCK_BROADCAST))
  1099				goto out;
  1100			if (connected)
  1101				sk_dst_set(sk, dst_clone(&rt->dst));
  1102		}
  1103	
  1104		if (msg->msg_flags&MSG_CONFIRM)
  1105			goto do_confirm;
  1106	back_from_confirm:
  1107	
  1108		saddr = fl4->saddr;
  1109		if (!ipc.addr)
  1110			daddr = ipc.addr = fl4->daddr;
  1111	
  1112		/* Lockless fast path for the non-corking case. */
  1113		if (!corkreq) {
  1114			struct inet_cork cork;
  1115	
  1116			skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
  1117					  sizeof(struct udphdr), &ipc, &rt,
  1118					  &cork, msg->msg_flags);
  1119			err = PTR_ERR(skb);
  1120			if (!IS_ERR_OR_NULL(skb))
  1121				err = udp_send_skb(skb, fl4, &cork);
  1122			goto out;
  1123		}
  1124	
  1125		lock_sock(sk);
  1126		if (unlikely(up->pending)) {
  1127			/* The socket is already corked while preparing it. */
  1128			/* ... which is an evident application bug. --ANK */
  1129			release_sock(sk);
  1130	
  1131			net_dbg_ratelimited("socket already corked\n");
  1132			err = -EINVAL;
  1133			goto out;
  1134		}
  1135		/*
  1136		 *	Now cork the socket to pend data.
  1137		 */
  1138		fl4 = &inet->cork.fl.u.ip4;
  1139		fl4->daddr = daddr;
  1140		fl4->saddr = saddr;
  1141		fl4->fl4_dport = dport;
  1142		fl4->fl4_sport = inet->inet_sport;
  1143		up->pending = AF_INET;
  1144	
  1145	do_append_data:
  1146		up->len += ulen;
  1147		err = ip_append_data(sk, fl4, getfrag, msg, ulen,
  1148				     sizeof(struct udphdr), &ipc, &rt,
  1149				     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
  1150		if (err)
  1151			udp_flush_pending_frames(sk);
  1152		else if (!corkreq)
  1153			err = udp_push_pending_frames(sk);
  1154		else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
  1155			up->pending = 0;
  1156		release_sock(sk);
  1157	
  1158	out:
  1159		ip_rt_put(rt);
  1160	out_free:
  1161		if (free)
  1162			kfree(ipc.opt);
  1163		if (!err)
  1164			return len;
  1165		/*
  1166		 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
  1167		 * ENOBUFS might not be good (it's not tunable per se), but otherwise
  1168		 * we don't have a good statistic (IpOutDiscards but it can be too many
  1169		 * things).  We could add another new stat but at least for now that
  1170		 * seems like overkill.
  1171		 */
  1172		if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
  1173			UDP_INC_STATS(sock_net(sk),
  1174				      UDP_MIB_SNDBUFERRORS, is_udplite);
  1175		}
  1176		return err;
  1177	
  1178	do_confirm:
  1179		if (msg->msg_flags & MSG_PROBE)
  1180			dst_confirm_neigh(&rt->dst, &fl4->daddr);
  1181		if (!(msg->msg_flags&MSG_PROBE) || len)
  1182			goto back_from_confirm;
  1183		err = 0;
  1184		goto out;
  1185	}
  1186	EXPORT_SYMBOL(udp_sendmsg);
  1187	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 24538 bytes --]

^ permalink raw reply

* Re: [Cake] [PATCH net-next v14 6/7] sch_cake: Add overhead compensation support to the rate shaper
From: Marcelo Ricardo Leitner @ 2018-05-21 23:45 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: netdev, cake
In-Reply-To: <152693495874.32668.11244869690098290078.stgit@alrua-kau>

On Mon, May 21, 2018 at 10:35:58PM +0200, Toke Høiland-Jørgensen wrote:
> +static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
> +{
> +	const struct skb_shared_info *shinfo = skb_shinfo(skb);
> +	unsigned int hdr_len, last_len = 0;
> +	u32 off = skb_network_offset(skb);
> +	u32 len = qdisc_pkt_len(skb);
> +	u16 segs = 1;
> +
> +	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
> +
> +	if (!shinfo->gso_size)
> +		return cake_calc_overhead(q, len, off);
> +
> +	/* borrowed from qdisc_pkt_len_init() */
> +	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
> +
> +	/* + transport layer */
> +	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
> +						SKB_GSO_TCPV6))) {
> +		const struct tcphdr *th;
> +		struct tcphdr _tcphdr;
> +
> +		th = skb_header_pointer(skb, skb_transport_offset(skb),
> +					sizeof(_tcphdr), &_tcphdr);
> +		if (likely(th))
> +			hdr_len += __tcp_hdrlen(th);
> +	} else {

I didn't see some code limiting GSO packets to just TCP or UDP. Is it
safe to assume that this packet is an UDP one, and not SCTP or ESP,
for example?

> +		struct udphdr _udphdr;
> +
> +		if (skb_header_pointer(skb, skb_transport_offset(skb),
> +				       sizeof(_udphdr), &_udphdr))
> +			hdr_len += sizeof(struct udphdr);
> +	}
> +
> +	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
> +		segs = DIV_ROUND_UP(skb->len - hdr_len,
> +				    shinfo->gso_size);
> +	else
> +		segs = shinfo->gso_segs;
> +
> +	len = shinfo->gso_size + hdr_len;
> +	last_len = skb->len - shinfo->gso_size * (segs - 1);
> +
> +	return (cake_calc_overhead(q, len, off) * (segs - 1) +
> +		cake_calc_overhead(q, last_len, off));
> +}
> +

^ permalink raw reply

* [PATCH v2 net] netfilter: provide correct argument to nla_strlcpy()
From: Eric Dumazet @ 2018-05-21 23:35 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Eric Dumazet, Florian Westphal,
	Pablo Neira Ayuso

Recent patch forgot to remove nla_data(), upsetting syzkaller a bit.

BUG: KASAN: slab-out-of-bounds in nla_strlcpy+0x13d/0x150 lib/nlattr.c:314
Read of size 1 at addr ffff8801ad1f4fdd by task syz-executor189/4509

CPU: 1 PID: 4509 Comm: syz-executor189 Not tainted 4.17.0-rc6+ #62
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
 nla_strlcpy+0x13d/0x150 lib/nlattr.c:314
 nfnl_acct_new+0x574/0xc50 net/netfilter/nfnetlink_acct.c:118
 nfnetlink_rcv_msg+0xdb5/0xff0 net/netfilter/nfnetlink.c:212
 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
 nfnetlink_rcv+0x1fe/0x1ba0 net/netfilter/nfnetlink.c:513
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg+0xd5/0x120 net/socket.c:639
 sock_write_iter+0x35a/0x5a0 net/socket.c:908
 call_write_iter include/linux/fs.h:1784 [inline]
 new_sync_write fs/read_write.c:474 [inline]
 __vfs_write+0x64d/0x960 fs/read_write.c:487
 vfs_write+0x1f8/0x560 fs/read_write.c:549
 ksys_write+0xf9/0x250 fs/read_write.c:598
 __do_sys_write fs/read_write.c:610 [inline]
 __se_sys_write fs/read_write.c:607 [inline]
 __x64_sys_write+0x73/0xb0 fs/read_write.c:607

Fixes: 4e09fc873d92 ("netfilter: prefer nla_strlcpy for dealing with NLA_STRING attributes")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/netfilter/nfnetlink_acct.c     | 2 +-
 net/netfilter/nfnetlink_cthelper.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 6ddf89183e7b47e6c029b28cf5b524c73a790498..a0e5adf0b3b6ddbcd01f956d25dda01c611a7663 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -115,7 +115,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
 		nfacct->flags = flags;
 	}
 
-	nla_strlcpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+	nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
 
 	if (tb[NFACCT_BYTES]) {
 		atomic64_set(&nfacct->bytes,
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index fa026b269b3691d5186e28020eb2b08e93dc3679..cb5b5f2077774c29fb987b7f1eb2d75e9efc2fe1 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -150,7 +150,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
 		return -EINVAL;
 
 	nla_strlcpy(expect_policy->name,
-		    nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
+		    tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN);
 	expect_policy->max_expected =
 		ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
 	if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
@@ -235,7 +235,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
 		goto err1;
 
 	nla_strlcpy(helper->name,
-		    nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
+		    tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN);
 	size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
 	if (size > FIELD_SIZEOF(struct nf_conn_help, data)) {
 		ret = -ENOMEM;
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* Re: [PATCH net] netfilter: provide correct argument to nla_strlcpy()
From: Eric Dumazet @ 2018-05-21 23:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Eric Dumazet, Florian Westphal, Pablo Neira Ayuso
In-Reply-To: <20180521233320.66082-1-edumazet@google.com>

On Mon, May 21, 2018 at 4:33 PM Eric Dumazet <edumazet@google.com> wrote:

> Recent patch forgot to remove nla_data(), upsetting syzkaller a bit.


Humpff.

I forgot to add one file in the change.

Will send V2.

^ permalink raw reply

* Re: [Cake] [PATCH net-next v14 4/7] sch_cake: Add NAT awareness to packet classifier
From: Marcelo Ricardo Leitner @ 2018-05-21 23:34 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: netdev, cake, netfilter-devel
In-Reply-To: <152693495866.32668.5164616056948127124.stgit@alrua-kau>

[Cc'ing netfilter-devel@ for awareness]

On Mon, May 21, 2018 at 10:35:58PM +0200, Toke Høiland-Jørgensen wrote:
> When CAKE is deployed on a gateway that also performs NAT (which is a
> common deployment mode), the host fairness mechanism cannot distinguish
> internal hosts from each other, and so fails to work correctly.
> 
> To fix this, we add an optional NAT awareness mode, which will query the
> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet
> and use that in the flow and host hashing.
> 
> When the shaper is enabled and the host is already performing NAT, the cost
> of this lookup is negligible. However, in unlimited mode with no NAT being
> performed, there is a significant CPU cost at higher bandwidths. For this
> reason, the feature is turned off by default.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
> ---
>  net/sched/sch_cake.c |   79 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 79 insertions(+)
> 
> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
> index 92623160d43e..04364993ce19 100644
> --- a/net/sched/sch_cake.c
> +++ b/net/sched/sch_cake.c
> @@ -71,6 +71,12 @@
>  #include <net/tcp.h>
>  #include <net/flow_dissector.h>
>  
> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
> +#include <net/netfilter/nf_conntrack_core.h>
> +#include <net/netfilter/nf_conntrack_zones.h>
> +#include <net/netfilter/nf_conntrack.h>
> +#endif
> +
>  #define CAKE_SET_WAYS (8)
>  #define CAKE_MAX_TINS (8)
>  #define CAKE_QUEUES (1024)
> @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
>  	return drop;
>  }
>  
> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
> +
> +static void cake_update_flowkeys(struct flow_keys *keys,
> +				 const struct sk_buff *skb)
> +{
> +	const struct nf_conntrack_tuple *tuple;
> +	enum ip_conntrack_info ctinfo;
> +	struct nf_conn *ct;
> +	bool rev = false;
> +
> +	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
> +		return;
> +
> +	ct = nf_ct_get(skb, &ctinfo);
> +	if (ct) {
> +		tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
> +	} else {
> +		const struct nf_conntrack_tuple_hash *hash;
> +		struct nf_conntrack_tuple srctuple;
> +
> +		if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
> +				       NFPROTO_IPV4, dev_net(skb->dev),
> +				       &srctuple))
> +			return;
> +
> +		hash = nf_conntrack_find_get(dev_net(skb->dev),
> +					     &nf_ct_zone_dflt,
> +					     &srctuple);
> +		if (!hash)
> +			return;
> +
> +		rev = true;
> +		ct = nf_ct_tuplehash_to_ctrack(hash);
> +		tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
> +	}
> +
> +	keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip;
> +	keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip;
> +
> +	if (keys->ports.ports) {
> +		keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all;
> +		keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all;
> +	}
> +	if (rev)
> +		nf_ct_put(ct);
> +}
> +#else
> +static void cake_update_flowkeys(struct flow_keys *keys,
> +				 const struct sk_buff *skb)
> +{
> +	/* There is nothing we can do here without CONNTRACK */
> +}
> +#endif
> +
>  /* Cake has several subtle multiple bit settings. In these cases you
>   *  would be matching triple isolate mode as well.
>   */
> @@ -543,6 +603,9 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
>  	skb_flow_dissect_flow_keys(skb, &keys,
>  				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
>  
> +	if (flow_mode & CAKE_FLOW_NAT_FLAG)
> +		cake_update_flowkeys(&keys, skb);
> +
>  	/* flow_hash_from_keys() sorts the addresses by value, so we have
>  	 * to preserve their order in a separate data structure to treat
>  	 * src and dst host addresses as independently selectable.
> @@ -1894,6 +1957,18 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
>  	if (err < 0)
>  		return err;
>  
> +	if (tb[TCA_CAKE_NAT]) {
> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
> +		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
> +		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
> +			!!nla_get_u32(tb[TCA_CAKE_NAT]);
> +#else
> +		NL_SET_ERR_MSG_ATTR(extack, "No conntrack support in kernel",
> +				    tb[TCA_CAKE_NAT]);
> +		return -EOPNOTSUPP;
> +#endif
> +	}
> +
>  	if (tb[TCA_CAKE_BASE_RATE64])
>  		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
>  
> @@ -2066,6 +2141,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
>  		goto nla_put_failure;
>  
> +	if (nla_put_u32(skb, TCA_CAKE_NAT,
> +			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
> +		goto nla_put_failure;
> +
>  	return nla_nest_end(skb, opts);
>  
>  nla_put_failure:
> 
> _______________________________________________
> Cake mailing list
> Cake@lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/cake

^ permalink raw reply

* [PATCH net] netfilter: provide correct argument to nla_strlcpy()
From: Eric Dumazet @ 2018-05-21 23:33 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Eric Dumazet, Florian Westphal,
	Pablo Neira Ayuso

Recent patch forgot to remove nla_data(), upsetting syzkaller a bit.

BUG: KASAN: slab-out-of-bounds in nla_strlcpy+0x13d/0x150 lib/nlattr.c:314
Read of size 1 at addr ffff8801ad1f4fdd by task syz-executor189/4509

CPU: 1 PID: 4509 Comm: syz-executor189 Not tainted 4.17.0-rc6+ #62
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
 nla_strlcpy+0x13d/0x150 lib/nlattr.c:314
 nfnl_acct_new+0x574/0xc50 net/netfilter/nfnetlink_acct.c:118
 nfnetlink_rcv_msg+0xdb5/0xff0 net/netfilter/nfnetlink.c:212
 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
 nfnetlink_rcv+0x1fe/0x1ba0 net/netfilter/nfnetlink.c:513
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg+0xd5/0x120 net/socket.c:639
 sock_write_iter+0x35a/0x5a0 net/socket.c:908
 call_write_iter include/linux/fs.h:1784 [inline]
 new_sync_write fs/read_write.c:474 [inline]
 __vfs_write+0x64d/0x960 fs/read_write.c:487
 vfs_write+0x1f8/0x560 fs/read_write.c:549
 ksys_write+0xf9/0x250 fs/read_write.c:598
 __do_sys_write fs/read_write.c:610 [inline]
 __se_sys_write fs/read_write.c:607 [inline]
 __x64_sys_write+0x73/0xb0 fs/read_write.c:607

Fixes: 4e09fc873d92 ("netfilter: prefer nla_strlcpy for dealing with NLA_STRING attributes")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/netfilter/nfnetlink_acct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 6ddf89183e7b47e6c029b28cf5b524c73a790498..a0e5adf0b3b6ddbcd01f956d25dda01c611a7663 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -115,7 +115,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
 		nfacct->flags = flags;
 	}
 
-	nla_strlcpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+	nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
 
 	if (tb[NFACCT_BYTES]) {
 		atomic64_set(&nfacct->bytes,
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* Re: [PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg
From: kbuild test robot @ 2018-05-21 23:33 UTC (permalink / raw)
  To: Andrey Ignatov
  Cc: kbuild-all, netdev, Andrey Ignatov, davem, ast, daniel,
	kernel-team
In-Reply-To: <654b59700a21ed2342b12f30a583ba9329ef850b.1526694154.git.rdna@fb.com>

[-- Attachment #1: Type: text/plain, Size: 11021 bytes --]

Hi Andrey,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Andrey-Ignatov/bpf-Hooks-for-sys_sendmsg/20180522-065614
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: x86_64-randconfig-x003-201820 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   net/ipv4/udp.c: In function 'udp_sendmsg':
>> net/ipv4/udp.c:1014:44: error: macro "BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK" passed 3 arguments, but takes just 2
             (struct sockaddr *)usin, &ipc.addr);
                                               ^
>> net/ipv4/udp.c:1013:9: error: 'BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK' undeclared (first use in this function); did you mean 'BPF_CGROUP_UDP4_SENDMSG'?
      err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            BPF_CGROUP_UDP4_SENDMSG
   net/ipv4/udp.c:1013:9: note: each undeclared identifier is reported only once for each function it appears in
--
   net/ipv6/udp.c: In function 'udpv6_sendmsg':
>> net/ipv6/udp.c:1320:44: error: macro "BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK" passed 3 arguments, but takes just 2
            (struct sockaddr *)sin6, &fl6.saddr);
                                               ^
>> net/ipv6/udp.c:1319:9: error: 'BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK' undeclared (first use in this function); did you mean 'BPF_CGROUP_UDP6_SENDMSG'?
      err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            BPF_CGROUP_UDP6_SENDMSG
   net/ipv6/udp.c:1319:9: note: each undeclared identifier is reported only once for each function it appears in

vim +/BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK +1014 net/ipv4/udp.c

   898	
   899	int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
   900	{
   901		struct inet_sock *inet = inet_sk(sk);
   902		struct udp_sock *up = udp_sk(sk);
   903		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
   904		struct flowi4 fl4_stack;
   905		struct flowi4 *fl4;
   906		int ulen = len;
   907		struct ipcm_cookie ipc;
   908		struct rtable *rt = NULL;
   909		int free = 0;
   910		int connected = 0;
   911		__be32 daddr, faddr, saddr;
   912		__be16 dport;
   913		u8  tos;
   914		int err, is_udplite = IS_UDPLITE(sk);
   915		int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
   916		int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
   917		struct sk_buff *skb;
   918		struct ip_options_data opt_copy;
   919	
   920		if (len > 0xFFFF)
   921			return -EMSGSIZE;
   922	
   923		/*
   924		 *	Check the flags.
   925		 */
   926	
   927		if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
   928			return -EOPNOTSUPP;
   929	
   930		ipc.opt = NULL;
   931		ipc.tx_flags = 0;
   932		ipc.ttl = 0;
   933		ipc.tos = -1;
   934	
   935		getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
   936	
   937		fl4 = &inet->cork.fl.u.ip4;
   938		if (up->pending) {
   939			/*
   940			 * There are pending frames.
   941			 * The socket lock must be held while it's corked.
   942			 */
   943			lock_sock(sk);
   944			if (likely(up->pending)) {
   945				if (unlikely(up->pending != AF_INET)) {
   946					release_sock(sk);
   947					return -EINVAL;
   948				}
   949				goto do_append_data;
   950			}
   951			release_sock(sk);
   952		}
   953		ulen += sizeof(struct udphdr);
   954	
   955		/*
   956		 *	Get and verify the address.
   957		 */
   958		if (usin) {
   959			if (msg->msg_namelen < sizeof(*usin))
   960				return -EINVAL;
   961			if (usin->sin_family != AF_INET) {
   962				if (usin->sin_family != AF_UNSPEC)
   963					return -EAFNOSUPPORT;
   964			}
   965	
   966			daddr = usin->sin_addr.s_addr;
   967			dport = usin->sin_port;
   968			if (dport == 0)
   969				return -EINVAL;
   970		} else {
   971			if (sk->sk_state != TCP_ESTABLISHED)
   972				return -EDESTADDRREQ;
   973			daddr = inet->inet_daddr;
   974			dport = inet->inet_dport;
   975			/* Open fast path for connected socket.
   976			   Route will not be used, if at least one option is set.
   977			 */
   978			connected = 1;
   979		}
   980	
   981		ipc.sockc.tsflags = sk->sk_tsflags;
   982		ipc.addr = inet->inet_saddr;
   983		ipc.oif = sk->sk_bound_dev_if;
   984		ipc.gso_size = up->gso_size;
   985	
   986		if (msg->msg_controllen) {
   987			err = udp_cmsg_send(sk, msg, &ipc.gso_size);
   988			if (err > 0)
   989				err = ip_cmsg_send(sk, msg, &ipc,
   990						   sk->sk_family == AF_INET6);
   991			if (unlikely(err < 0)) {
   992				kfree(ipc.opt);
   993				return err;
   994			}
   995			if (ipc.opt)
   996				free = 1;
   997			connected = 0;
   998		}
   999		if (!ipc.opt) {
  1000			struct ip_options_rcu *inet_opt;
  1001	
  1002			rcu_read_lock();
  1003			inet_opt = rcu_dereference(inet->inet_opt);
  1004			if (inet_opt) {
  1005				memcpy(&opt_copy, inet_opt,
  1006				       sizeof(*inet_opt) + inet_opt->opt.optlen);
  1007				ipc.opt = &opt_copy.opt;
  1008			}
  1009			rcu_read_unlock();
  1010		}
  1011	
  1012		if (!connected) {
> 1013			err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
> 1014						    (struct sockaddr *)usin, &ipc.addr);
  1015			if (err)
  1016				goto out_free;
  1017			if (usin) {
  1018				if (usin->sin_port == 0) {
  1019					/* BPF program set invalid port. Reject it. */
  1020					err = -EINVAL;
  1021					goto out_free;
  1022				}
  1023				daddr = usin->sin_addr.s_addr;
  1024				dport = usin->sin_port;
  1025			}
  1026		}
  1027	
  1028		saddr = ipc.addr;
  1029		ipc.addr = faddr = daddr;
  1030	
  1031		sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
  1032	
  1033		if (ipc.opt && ipc.opt->opt.srr) {
  1034			if (!daddr) {
  1035				err = -EINVAL;
  1036				goto out_free;
  1037			}
  1038			faddr = ipc.opt->opt.faddr;
  1039			connected = 0;
  1040		}
  1041		tos = get_rttos(&ipc, inet);
  1042		if (sock_flag(sk, SOCK_LOCALROUTE) ||
  1043		    (msg->msg_flags & MSG_DONTROUTE) ||
  1044		    (ipc.opt && ipc.opt->opt.is_strictroute)) {
  1045			tos |= RTO_ONLINK;
  1046			connected = 0;
  1047		}
  1048	
  1049		if (ipv4_is_multicast(daddr)) {
  1050			if (!ipc.oif)
  1051				ipc.oif = inet->mc_index;
  1052			if (!saddr)
  1053				saddr = inet->mc_addr;
  1054			connected = 0;
  1055		} else if (!ipc.oif) {
  1056			ipc.oif = inet->uc_index;
  1057		} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
  1058			/* oif is set, packet is to local broadcast and
  1059			 * and uc_index is set. oif is most likely set
  1060			 * by sk_bound_dev_if. If uc_index != oif check if the
  1061			 * oif is an L3 master and uc_index is an L3 slave.
  1062			 * If so, we want to allow the send using the uc_index.
  1063			 */
  1064			if (ipc.oif != inet->uc_index &&
  1065			    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
  1066								      inet->uc_index)) {
  1067				ipc.oif = inet->uc_index;
  1068			}
  1069		}
  1070	
  1071		if (connected)
  1072			rt = (struct rtable *)sk_dst_check(sk, 0);
  1073	
  1074		if (!rt) {
  1075			struct net *net = sock_net(sk);
  1076			__u8 flow_flags = inet_sk_flowi_flags(sk);
  1077	
  1078			fl4 = &fl4_stack;
  1079	
  1080			flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
  1081					   RT_SCOPE_UNIVERSE, sk->sk_protocol,
  1082					   flow_flags,
  1083					   faddr, saddr, dport, inet->inet_sport,
  1084					   sk->sk_uid);
  1085	
  1086			security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
  1087			rt = ip_route_output_flow(net, fl4, sk);
  1088			if (IS_ERR(rt)) {
  1089				err = PTR_ERR(rt);
  1090				rt = NULL;
  1091				if (err == -ENETUNREACH)
  1092					IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
  1093				goto out;
  1094			}
  1095	
  1096			err = -EACCES;
  1097			if ((rt->rt_flags & RTCF_BROADCAST) &&
  1098			    !sock_flag(sk, SOCK_BROADCAST))
  1099				goto out;
  1100			if (connected)
  1101				sk_dst_set(sk, dst_clone(&rt->dst));
  1102		}
  1103	
  1104		if (msg->msg_flags&MSG_CONFIRM)
  1105			goto do_confirm;
  1106	back_from_confirm:
  1107	
  1108		saddr = fl4->saddr;
  1109		if (!ipc.addr)
  1110			daddr = ipc.addr = fl4->daddr;
  1111	
  1112		/* Lockless fast path for the non-corking case. */
  1113		if (!corkreq) {
  1114			struct inet_cork cork;
  1115	
  1116			skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
  1117					  sizeof(struct udphdr), &ipc, &rt,
  1118					  &cork, msg->msg_flags);
  1119			err = PTR_ERR(skb);
  1120			if (!IS_ERR_OR_NULL(skb))
  1121				err = udp_send_skb(skb, fl4, &cork);
  1122			goto out;
  1123		}
  1124	
  1125		lock_sock(sk);
  1126		if (unlikely(up->pending)) {
  1127			/* The socket is already corked while preparing it. */
  1128			/* ... which is an evident application bug. --ANK */
  1129			release_sock(sk);
  1130	
  1131			net_dbg_ratelimited("socket already corked\n");
  1132			err = -EINVAL;
  1133			goto out;
  1134		}
  1135		/*
  1136		 *	Now cork the socket to pend data.
  1137		 */
  1138		fl4 = &inet->cork.fl.u.ip4;
  1139		fl4->daddr = daddr;
  1140		fl4->saddr = saddr;
  1141		fl4->fl4_dport = dport;
  1142		fl4->fl4_sport = inet->inet_sport;
  1143		up->pending = AF_INET;
  1144	
  1145	do_append_data:
  1146		up->len += ulen;
  1147		err = ip_append_data(sk, fl4, getfrag, msg, ulen,
  1148				     sizeof(struct udphdr), &ipc, &rt,
  1149				     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
  1150		if (err)
  1151			udp_flush_pending_frames(sk);
  1152		else if (!corkreq)
  1153			err = udp_push_pending_frames(sk);
  1154		else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
  1155			up->pending = 0;
  1156		release_sock(sk);
  1157	
  1158	out:
  1159		ip_rt_put(rt);
  1160	out_free:
  1161		if (free)
  1162			kfree(ipc.opt);
  1163		if (!err)
  1164			return len;
  1165		/*
  1166		 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
  1167		 * ENOBUFS might not be good (it's not tunable per se), but otherwise
  1168		 * we don't have a good statistic (IpOutDiscards but it can be too many
  1169		 * things).  We could add another new stat but at least for now that
  1170		 * seems like overkill.
  1171		 */
  1172		if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
  1173			UDP_INC_STATS(sock_net(sk),
  1174				      UDP_MIB_SNDBUFERRORS, is_udplite);
  1175		}
  1176		return err;
  1177	
  1178	do_confirm:
  1179		if (msg->msg_flags & MSG_PROBE)
  1180			dst_confirm_neigh(&rt->dst, &fl4->daddr);
  1181		if (!(msg->msg_flags&MSG_PROBE) || len)
  1182			goto back_from_confirm;
  1183		err = 0;
  1184		goto out;
  1185	}
  1186	EXPORT_SYMBOL(udp_sendmsg);
  1187	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 27551 bytes --]

^ permalink raw reply

* Re: [PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg
From: Martin KaFai Lau @ 2018-05-21 23:16 UTC (permalink / raw)
  To: Andrey Ignatov; +Cc: netdev, davem, ast, daniel, kernel-team
In-Reply-To: <654b59700a21ed2342b12f30a583ba9329ef850b.1526694154.git.rdna@fb.com>

On Fri, May 18, 2018 at 07:21:09PM -0700, Andrey Ignatov wrote:
> In addition to already existing BPF hooks for sys_bind and sys_connect,
> the patch provides new hooks for sys_sendmsg.
> 
> It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR`
> that provides access to socket itlself (properties like family, type,
> protocol) and user-passed `struct sockaddr *` so that BPF program can
> override destination IP and port for system calls such as sendto(2) or
> sendmsg(2) and/or assign source IP to the socket.
> 
> The hooks are implemented as two new attach types:
> `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and
> UDPv6 correspondingly.
> 
> UDPv4 and UDPv6 separate attach types for same reason as sys_bind and
> sys_connect hooks, i.e. to prevent reading from / writing to e.g.
> user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound.
> 
> The difference with already existing hooks is sys_sendmsg are
> implemented only for unconnected UDP.
> 
> For TCP it doesn't make sense to change user-provided `struct sockaddr *`
> at sendto(2)/sendmsg(2) time since socket either was already connected
> and has source/destination set or wasn't connected and call to
> sendto(2)/sendmsg(2) would lead to ENOTCONN anyway.
> 
> Connected UDP is already handled by sys_connect hooks that can override
> source/destination at connect time and use fast-path later, i.e. these
> hooks don't affect UDP fast-path.
> 
> Rewriting source IP is implemented differently than that in sys_connect
> hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to
> just bind socket to desired local IP address since source IP can be set
> on per-packet basis by using ancillary data (cmsg(3)). So no matter if
> socket is bound or not, source IP has to be rewritten on every call to
> sys_sendmsg.
> 
> To do so two new fields are added to UAPI `struct bpf_sock_addr`;
> * `msg_src_ip4` to set source IPv4 for UDPv4;
> * `msg_src_ip6` to set source IPv6 for UDPv6.
> 
> Signed-off-by: Andrey Ignatov <rdna@fb.com>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf-cgroup.h | 23 +++++++++++++++++------
>  include/linux/filter.h     |  1 +
>  include/uapi/linux/bpf.h   |  8 ++++++++
>  kernel/bpf/cgroup.c        | 11 ++++++++++-
>  kernel/bpf/syscall.c       |  8 ++++++++
>  net/core/filter.c          | 39 +++++++++++++++++++++++++++++++++++++++
>  net/ipv4/udp.c             | 20 ++++++++++++++++++--
>  net/ipv6/udp.c             | 17 +++++++++++++++++
>  8 files changed, 118 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 30d15e6..46f01ba 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
>  
>  int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
>  				      struct sockaddr *uaddr,
> -				      enum bpf_attach_type type);
> +				      enum bpf_attach_type type,
> +				      void *t_ctx);
>  
>  int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
>  				     struct bpf_sock_ops_kern *sock_ops,
> @@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
>  ({									       \
>  	int __ret = 0;							       \
>  	if (cgroup_bpf_enabled)						       \
> -		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
> +		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
> +							  NULL);	       \
>  	__ret;								       \
>  })
>  
> -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)			       \
> +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)		       \
>  ({									       \
>  	int __ret = 0;							       \
>  	if (cgroup_bpf_enabled)	{					       \
>  		lock_sock(sk);						       \
> -		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
> +		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
> +							  t_ctx);	       \
>  		release_sock(sk);					       \
>  	}								       \
>  	__ret;								       \
> @@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
>  	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
>  
>  #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
> -	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
> +	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL)
>  
>  #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
> -	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
> +	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL)
> +
> +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
> +	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx)
> +
> +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
> +	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
>  
>  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
>  ({									       \
> @@ -197,6 +206,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
>  #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
>  
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d358d18..d90abda 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern {
>  	 * only two (src and dst) are available at convert_ctx_access time
>  	 */
>  	u64 tmp_reg;
> +	void *t_ctx;	/* Attach type specific context. */
>  };
>  
>  struct bpf_sock_ops_kern {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 97446bb..b70ad2c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -158,6 +158,8 @@ enum bpf_attach_type {
>  	BPF_CGROUP_INET6_CONNECT,
>  	BPF_CGROUP_INET4_POST_BIND,
>  	BPF_CGROUP_INET6_POST_BIND,
> +	BPF_CGROUP_UDP4_SENDMSG,
> +	BPF_CGROUP_UDP6_SENDMSG,
>  	__MAX_BPF_ATTACH_TYPE
>  };
>  
> @@ -2247,6 +2249,12 @@ struct bpf_sock_addr {
>  	__u32 family;		/* Allows 4-byte read, but no write */
>  	__u32 type;		/* Allows 4-byte read, but no write */
>  	__u32 protocol;		/* Allows 4-byte read, but no write */
> +	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read an 4-byte write.
> +				 * Stored in network byte order.
> +				 */
> +	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
> +				 * Stored in network byte order.
> +				 */
>  };
>  
>  /* User bpf_sock_ops struct to access socket values and specify request ops
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 43171a0..f7c00bd 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
>   * @sk: sock struct that will use sockaddr
>   * @uaddr: sockaddr struct provided by user
>   * @type: The type of program to be exectuted
> + * @t_ctx: Pointer to attach type specific context
>   *
>   * socket is expected to be of type INET or INET6.
>   *
> @@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
>   */
>  int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
>  				      struct sockaddr *uaddr,
> -				      enum bpf_attach_type type)
> +				      enum bpf_attach_type type,
> +				      void *t_ctx)
>  {
>  	struct bpf_sock_addr_kern ctx = {
>  		.sk = sk,
>  		.uaddr = uaddr,
> +		.t_ctx = t_ctx,
>  	};
> +	struct sockaddr_storage unspec;
>  	struct cgroup *cgrp;
>  	int ret;
>  
> @@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
>  	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
>  		return 0;
>  
> +	if (!ctx.uaddr) {
> +		memset(&unspec, 0, sizeof(unspec));
> +		ctx.uaddr = (struct sockaddr *)&unspec;
> +	}
> +
>  	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
>  	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
>  
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index bfcde94..11a5a95 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1247,6 +1247,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
>  		case BPF_CGROUP_INET6_BIND:
>  		case BPF_CGROUP_INET4_CONNECT:
>  		case BPF_CGROUP_INET6_CONNECT:
> +		case BPF_CGROUP_UDP4_SENDMSG:
> +		case BPF_CGROUP_UDP6_SENDMSG:
>  			return 0;
>  		default:
>  			return -EINVAL;
> @@ -1563,6 +1565,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>  	case BPF_CGROUP_INET6_BIND:
>  	case BPF_CGROUP_INET4_CONNECT:
>  	case BPF_CGROUP_INET6_CONNECT:
> +	case BPF_CGROUP_UDP4_SENDMSG:
> +	case BPF_CGROUP_UDP6_SENDMSG:
>  		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
>  		break;
>  	case BPF_CGROUP_SOCK_OPS:
> @@ -1633,6 +1637,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
>  	case BPF_CGROUP_INET6_BIND:
>  	case BPF_CGROUP_INET4_CONNECT:
>  	case BPF_CGROUP_INET6_CONNECT:
> +	case BPF_CGROUP_UDP4_SENDMSG:
> +	case BPF_CGROUP_UDP6_SENDMSG:
>  		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
>  		break;
>  	case BPF_CGROUP_SOCK_OPS:
> @@ -1690,6 +1696,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
>  	case BPF_CGROUP_INET6_POST_BIND:
>  	case BPF_CGROUP_INET4_CONNECT:
>  	case BPF_CGROUP_INET6_CONNECT:
> +	case BPF_CGROUP_UDP4_SENDMSG:
> +	case BPF_CGROUP_UDP6_SENDMSG:
>  	case BPF_CGROUP_SOCK_OPS:
>  	case BPF_CGROUP_DEVICE:
>  		break;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index aec5eba..f696dc9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5010,6 +5010,7 @@ static bool sock_addr_is_valid_access(int off, int size,
>  		switch (prog->expected_attach_type) {
>  		case BPF_CGROUP_INET4_BIND:
>  		case BPF_CGROUP_INET4_CONNECT:
> +		case BPF_CGROUP_UDP4_SENDMSG:
>  			break;
>  		default:
>  			return false;
> @@ -5019,6 +5020,24 @@ static bool sock_addr_is_valid_access(int off, int size,
>  		switch (prog->expected_attach_type) {
>  		case BPF_CGROUP_INET6_BIND:
>  		case BPF_CGROUP_INET6_CONNECT:
> +		case BPF_CGROUP_UDP6_SENDMSG:
> +			break;
> +		default:
> +			return false;
> +		}
> +		break;
> +	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
> +		switch (prog->expected_attach_type) {
> +		case BPF_CGROUP_UDP4_SENDMSG:
> +			break;
> +		default:
> +			return false;
> +		}
> +		break;
> +	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> +				msg_src_ip6[3]):
> +		switch (prog->expected_attach_type) {
> +		case BPF_CGROUP_UDP6_SENDMSG:
>  			break;
>  		default:
>  			return false;
> @@ -5029,6 +5048,9 @@ static bool sock_addr_is_valid_access(int off, int size,
>  	switch (off) {
>  	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
>  	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
> +	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
> +	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> +				msg_src_ip6[3]):
>  		/* Only narrow read access allowed for now. */
>  		if (type == BPF_READ) {
>  			bpf_ctx_record_field_size(info, size_default);
> @@ -5783,6 +5805,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
>  		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
>  					SK_FL_PROTO_SHIFT);
>  		break;
> +
> +	case offsetof(struct bpf_sock_addr, msg_src_ip4):
> +		/* Treat t_ctx as struct in_addr for msg_src_ip4. */
> +		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
> +			struct bpf_sock_addr_kern, struct in_addr, t_ctx,
> +			s_addr, BPF_SIZE(si->code), 0, tmp_reg);
> +		break;
> +
> +	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> +				msg_src_ip6[3]):
> +		off = si->off;
> +		off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
> +		/* Treat t_ctx as struct in6_addr for msg_src_ip6. */
> +		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
> +			struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
> +			s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
> +		break;
>  	}
>  
>  	return insn - insn_buf;
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index ff4d4ba..a1f9ba2 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -900,6 +900,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  {
>  	struct inet_sock *inet = inet_sk(sk);
>  	struct udp_sock *up = udp_sk(sk);
> +	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
>  	struct flowi4 fl4_stack;
>  	struct flowi4 *fl4;
>  	int ulen = len;
> @@ -954,8 +955,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  	/*
>  	 *	Get and verify the address.
>  	 */
> -	if (msg->msg_name) {
> -		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
> +	if (usin) {
>  		if (msg->msg_namelen < sizeof(*usin))
>  			return -EINVAL;
>  		if (usin->sin_family != AF_INET) {
> @@ -1009,6 +1009,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  		rcu_read_unlock();
>  	}
>  
> +	if (!connected) {
> +		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
> +					    (struct sockaddr *)usin, &ipc.addr);
> +		if (err)
> +			goto out_free;
> +		if (usin) {
> +			if (usin->sin_port == 0) {
> +				/* BPF program set invalid port. Reject it. */
> +				err = -EINVAL;
> +				goto out_free;
> +			}
> +			daddr = usin->sin_addr.s_addr;
> +			dport = usin->sin_port;
> +		}
> +	}
> +
>  	saddr = ipc.addr;
>  	ipc.addr = faddr = daddr;
>  
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 2839c1b..6f580ea 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1315,6 +1315,22 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  		fl6.saddr = np->saddr;
>  	fl6.fl6_sport = inet->inet_sport;
>  
> +	if (!connected) {
> +		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
> +					   (struct sockaddr *)sin6, &fl6.saddr);
> +		if (err)
> +			goto out_no_dst;
> +		if (sin6) {
> +			if (sin6->sin6_port == 0) {
> +				/* BPF program set invalid port. Reject it. */
> +				err = -EINVAL;
> +				goto out_no_dst;
> +			}
> +			fl6.fl6_dport = sin6->sin6_port;
> +			fl6.daddr = sin6->sin6_addr;
Could the bpf_prog change sin6 to a v4mapped address?

> +		}
> +	}
> +
>  	final_p = fl6_update_dst(&fl6, opt, &final);
It seems fl6_update_dst() may update fl6->daddr.
Is it fine (or inline with the expectation after running
a bpf_prog that has changed user_ip6)?

>  	if (final_p)
>  		connected = false;
> @@ -1394,6 +1410,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  
>  out:
>  	dst_release(dst);
> +out_no_dst:
A nit.  If dst is init to NULL, can a new exit label
be avoided?


>  	fl6_sock_release(flowlabel);
>  	txopt_put(opt_to_free);
>  	if (!err)
> -- 
> 2.9.5
> 

^ permalink raw reply

* Re: [PATCH v2] ath10k: transmit queued frames after waking queues
From: Rajkumar Manoharan @ 2018-05-21 23:11 UTC (permalink / raw)
  To: Niklas Cassel
  Cc: Kalle Valo, David S. Miller, ath10k, linux-wireless, netdev,
	linux-kernel, linux-wireless-owner
In-Reply-To: <20180521204359.23884-1-niklas.cassel@linaro.org>

On 2018-05-21 13:43, Niklas Cassel wrote:
> The following problem was observed when running iperf:
[...]
> 
> In order to avoid trying to flush the queue every time we free a frame,
> only do this when there are 3 or less frames pending, and while we
> actually have frames in the queue. This logic was copied from
> mt76_txq_schedule (mt76), one of few other drivers that are actually
> using wake_tx_queue.
> 
> Suggested-by: Toke Høiland-Jørgensen <toke@toke.dk>
> Signed-off-by: Niklas Cassel <niklas.cassel@linaro.org>
> ---
> Changes since V1: use READ_ONCE() to disallow the compiler
> optimizing things in undesirable ways.
> 
>  drivers/net/wireless/ath/ath10k/txrx.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/net/wireless/ath/ath10k/txrx.c
> b/drivers/net/wireless/ath/ath10k/txrx.c
> index cda164f6e9f6..264cf0bd5c00 100644
> --- a/drivers/net/wireless/ath/ath10k/txrx.c
> +++ b/drivers/net/wireless/ath/ath10k/txrx.c
> @@ -95,6 +95,9 @@ int ath10k_txrx_tx_unref(struct ath10k_htt *htt,
>  		wake_up(&htt->empty_tx_wq);
>  	spin_unlock_bh(&htt->tx_lock);
> 
> +	if (READ_ONCE(htt->num_pending_tx) <= 3 && !list_empty(&ar->txqs))
> +		ath10k_mac_tx_push_pending(ar);
> +
Niklas,

Sorry for the late response. ath10k_mac_tx_push_pending is already 
called
at the end of NAPI handler. Isn't that enough to process pending frames?

Earlier we observed performance issues in calling push_pending from each
tx completion. IMHO this change may introduce the same problem again.

-Rajkumar

^ permalink raw reply

* Re: [PATCH net-next] r8169: perform reset synchronously in __rtl8169_resume
From: Heiner Kallweit @ 2018-05-21 22:38 UTC (permalink / raw)
  To: Francois Romieu
  Cc: David Miller, Realtek linux nic maintainers,
	netdev@vger.kernel.org
In-Reply-To: <20180521212422.GA25067@electric-eye.fr.zoreil.com>

Am 21.05.2018 um 23:24 schrieb Francois Romieu:
> Heiner Kallweit <hkallweit1@gmail.com> :
> [...]
>> diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
>> index 75dfac024..1eb4f625a 100644
>> --- a/drivers/net/ethernet/realtek/r8169.c
>> +++ b/drivers/net/ethernet/realtek/r8169.c
>> @@ -7327,9 +7327,9 @@ static void __rtl8169_resume(struct net_device *dev)
>>  	rtl_lock_work(tp);
>>  	napi_enable(&tp->napi);
>>  	set_bit(RTL_FLAG_TASK_ENABLED, tp->wk.flags);
>> +	if (netif_running(dev))
>> +		rtl_reset_work(tp);
>>  	rtl_unlock_work(tp);
>> -
>> -	rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
>>  }
>>  
>>  static int rtl8169_resume(struct device *device)
> 
> netif_running() returns true before rtl_open(): issuing rtl_reset_work()
> synchronously against uninitialized rings right at the start of rtl_open()
> won't work as expected.
> 
rtl8169_runtime_resume() has a check for tp->TxDescArray at the beginning.
Therefore it will return immediately before even calling
__rtl8169_resume(), because tp->TxDescArray is set in rtl_open() only.
So I don't think the described issue exists.

However I have to admit that I wasn't aware that netif_running() is true
already when entering rtl_open(), so thanks for the hint.

> pm_runtime_get_sync() could be issued later in rtl_open() (but I would
> not mind something different with runtime_{resume/suspend} in open/close).
> 

^ permalink raw reply

* Re: [RFC PATCH net-next 10/12] vhost_net: build xdp buff
From: Michael S. Tsirkin @ 2018-05-21 22:21 UTC (permalink / raw)
  To: Jesse Brandeburg; +Cc: Jason Wang, kvm, virtualization, netdev, linux-kernel
In-Reply-To: <20180521095611.00005caa@intel.com>

On Mon, May 21, 2018 at 09:56:11AM -0700, Jesse Brandeburg wrote:
> On Mon, 21 May 2018 17:04:31 +0800 Jason wrote:
> > This patch implement build XDP buffers in vhost_net. The idea is do
> > userspace copy in vhost_net and build XDP buff based on the
> > page. Vhost_net can then submit one or an array of XDP buffs to
> > underlayer socket (e.g TUN). TUN can choose to do XDP or call
> > build_skb() to build skb. To support build skb, vnet header were also
> > stored into the header of the XDP buff.
> > 
> > This userspace copy and XDP buffs building is key to achieve XDP
> > batching in TUN, since TUN does not need to care about userspace copy
> > and then can disable premmption for several XDP buffs to achieve
> > batching from XDP.
> > 
> > TODO: reserve headroom based on the TUN XDP.
> > 
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> >  drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 74 insertions(+)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index f0639d7..1209e84 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
> >  	       likely(!vhost_exceeds_maxpend(net));
> >  }
> >  
> > +#define VHOST_NET_HEADROOM 256
> > +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> > +
> > +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> > +			       struct iov_iter *from,
> > +			       struct xdp_buff *xdp)
> > +{
> > +	struct vhost_virtqueue *vq = &nvq->vq;
> > +	struct page_frag *alloc_frag = &current->task_frag;
> > +	struct virtio_net_hdr *gso;
> > +	size_t len = iov_iter_count(from);
> > +	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > +	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
> > +				 + nvq->sock_hlen);
> > +	int sock_hlen = nvq->sock_hlen;
> > +	void *buf;
> > +	int copied;
> > +
> > +	if (len < nvq->sock_hlen)
> > +		return -EFAULT;
> > +
> > +	if (SKB_DATA_ALIGN(len + pad) +
> > +	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
> > +		return -ENOSPC;
> > +
> > +	buflen += SKB_DATA_ALIGN(len + pad);
> 
> maybe store the result of SKB_DATA_ALIGN in a local instead of doing
> the work twice?

I don't mind, but I guess gcc can always do it itself?

> > +	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
> > +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
> > +		return -ENOMEM;
> > +
> > +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > +
> > +	/* We store two kinds of metadata in the header which will be
> > +	 * used for XDP_PASS to do build_skb():
> > +	 * offset 0: buflen
> > +	 * offset sizeof(int): vnet header
> > +	 */
> > +	copied = copy_page_from_iter(alloc_frag->page,
> > +				     alloc_frag->offset + sizeof(int), sock_hlen, from);
> > +	if (copied != sock_hlen)
> > +		return -EFAULT;
> > +
> > +	gso = (struct virtio_net_hdr *)(buf + sizeof(int));
> > +
> > +	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> > +	    vhost16_to_cpu(vq, gso->csum_start) +
> > +	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
> > +	    vhost16_to_cpu(vq, gso->hdr_len)) {
> > +		gso->hdr_len = cpu_to_vhost16(vq,
> > +			       vhost16_to_cpu(vq, gso->csum_start) +
> > +			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
> > +
> > +		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
> > +			return -EINVAL;
> > +	}
> > +
> > +	len -= sock_hlen;
> > +	copied = copy_page_from_iter(alloc_frag->page,
> > +				     alloc_frag->offset + pad,
> > +				     len, from);
> > +	if (copied != len)
> > +		return -EFAULT;
> > +
> > +	xdp->data_hard_start = buf;
> > +	xdp->data = buf + pad;
> > +	xdp->data_end = xdp->data + len;
> > +	*(int *)(xdp->data_hard_start)= buflen;
> 
> space before =
> 
> > +
> > +	get_page(alloc_frag->page);
> > +	alloc_frag->offset += buflen;
> > +
> > +	return 0;
> > +}
> > +
> >  static void handle_tx_copy(struct vhost_net *net)
> >  {
> >  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];

^ permalink raw reply

* Reply
From: conapreb @ 2018-05-21 15:29 UTC (permalink / raw)
  To: Recipients

Are you free for discussion?

^ permalink raw reply

* [PATCH net-next 2/2] tcp: do not aggressively quick ack after ECN events
From: Eric Dumazet @ 2018-05-21 22:08 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Van Jacobson, Neal Cardwell, Yuchung Cheng,
	Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet
In-Reply-To: <20180521220857.229273-1-edumazet@google.com>

ECN signals currently forces TCP to enter quickack mode for
up to 16 (TCP_MAX_QUICKACKS) following incoming packets.

We believe this is not needed, and only sending one immediate ack
for the current packet should be enough.

This should reduce the extra load noticed in DCTCP environments,
after congestion events.

This is part 2 of our effort to reduce pure ACK packets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2e970e9f4e09d966b703af2d14d521a4328eba7e..1191cac72109f2f7e2b688ddbc1d404151d274d6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -263,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 		 * it is probably a retransmit.
 		 */
 		if (tp->ecn_flags & TCP_ECN_SEEN)
-			tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
+			tcp_enter_quickack_mode((struct sock *)tp, 1);
 		break;
 	case INET_ECN_CE:
 		if (tcp_ca_needs_ecn((struct sock *)tp))
@@ -271,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
-			tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
+			tcp_enter_quickack_mode((struct sock *)tp, 1);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
 		tp->ecn_flags |= TCP_ECN_SEEN;
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* [PATCH net-next 1/2] tcp: add max_quickacks param to tcp_incr_quickack and tcp_enter_quickack_mode
From: Eric Dumazet @ 2018-05-21 22:08 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Van Jacobson, Neal Cardwell, Yuchung Cheng,
	Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet
In-Reply-To: <20180521220857.229273-1-edumazet@google.com>

We want to add finer control of the number of ACK packets sent after
ECN events.

This patch is not changing current behavior, it only enables following
change.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_input.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aebb29ab2fdf2ceaa182cd11928f145a886149ff..2e970e9f4e09d966b703af2d14d521a4328eba7e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -203,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 	}
 }
 
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
 
 	if (quickacks == 0)
 		quickacks = 2;
+	quickacks = min(quickacks, max_quickacks);
 	if (quickacks > icsk->icsk_ack.quick)
-		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+		icsk->icsk_ack.quick = quickacks;
 }
 
-static void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	tcp_incr_quickack(sk);
+
+	tcp_incr_quickack(sk, max_quickacks);
 	icsk->icsk_ack.pingpong = 0;
 	icsk->icsk_ack.ato = TCP_ATO_MIN;
 }
@@ -261,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 		 * it is probably a retransmit.
 		 */
 		if (tp->ecn_flags & TCP_ECN_SEEN)
-			tcp_enter_quickack_mode((struct sock *)tp);
+			tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
 		break;
 	case INET_ECN_CE:
 		if (tcp_ca_needs_ecn((struct sock *)tp))
@@ -269,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
-			tcp_enter_quickack_mode((struct sock *)tp);
+			tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
 		tp->ecn_flags |= TCP_ECN_SEEN;
@@ -686,7 +688,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 		/* The _first_ data packet received, initialize
 		 * delayed ACK engine.
 		 */
-		tcp_incr_quickack(sk);
+		tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
 		icsk->icsk_ack.ato = TCP_ATO_MIN;
 	} else {
 		int m = now - icsk->icsk_ack.lrcvtime;
@@ -702,7 +704,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 			/* Too long gap. Apparently sender failed to
 			 * restart window, so that we send ACKs quickly.
 			 */
-			tcp_incr_quickack(sk);
+			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
 			sk_mem_reclaim(sk);
 		}
 	}
@@ -4179,7 +4181,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
-		tcp_enter_quickack_mode(sk);
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 
 		if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4706,7 +4708,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
 out_of_window:
-		tcp_enter_quickack_mode(sk);
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 		inet_csk_schedule_ack(sk);
 drop:
 		tcp_drop(sk, skb);
@@ -5790,7 +5792,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			 * to stand against the temptation 8)     --ANK
 			 */
 			inet_csk_schedule_ack(sk);
-			tcp_enter_quickack_mode(sk);
+			tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						  TCP_DELACK_MAX, TCP_RTO_MAX);
 
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* [PATCH net-next 0/2] tcp: reduce quickack pressure for ECN
From: Eric Dumazet @ 2018-05-21 22:08 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Van Jacobson, Neal Cardwell, Yuchung Cheng,
	Soheil Hassas Yeganeh, Eric Dumazet, Eric Dumazet

Small patch series changing TCP behavior vs quickack and ECN

First patch is a refactoring, adding parameter to tcp_incr_quickack()
and tcp_enter_quickack_mode() helpers.

Second patch implements the change, lowering number of ACK packets
sent after an ECN event.

Eric Dumazet (2):
  tcp: add max_quickacks param to tcp_incr_quickack and
    tcp_enter_quickack_mode
  tcp: do not aggressively quick ack after ECN events

 net/ipv4/tcp_input.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply

* Re: [PATCH net] tuntap: raise EPOLLOUT on device up
From: Michael S. Tsirkin @ 2018-05-21 22:08 UTC (permalink / raw)
  To: David Miller; +Cc: jasowang, netdev, linux-kernel, hannes, edumazet
In-Reply-To: <20180521.114742.427929977852677864.davem@davemloft.net>

On Mon, May 21, 2018 at 11:47:42AM -0400, David Miller wrote:
> From: Jason Wang <jasowang@redhat.com>
> Date: Fri, 18 May 2018 21:00:43 +0800
> 
> > We return -EIO on device down but can not raise EPOLLOUT after it was
> > up. This may confuse user like vhost which expects tuntap to raise
> > EPOLLOUT to re-enable its TX routine after tuntap is down. This could
> > be easily reproduced by transmitting packets from VM while down and up
> > the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.
> > 
> > Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
> > Cc: Eric Dumazet <edumazet@google.com>
> > Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> 
> I'm no so sure what to do with this patch.
> 
> Like Michael says, this flag bit is only checks upon transmit which
> may or may not happen after this point.  It doesn't seem to be
> guaranteed.

Jason, can't we detect a link up transition and respond accordingly?
What do you think?

-- 
MST

^ permalink raw reply

* Re: [PATCH net] tuntap: raise EPOLLOUT on device up
From: Michael S. Tsirkin @ 2018-05-21 22:06 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, Hannes Frederic Sowa, Eric Dumazet
In-Reply-To: <e178f169-43c4-8ac4-8334-bc3eba1bd10e@redhat.com>

On Sat, May 19, 2018 at 09:09:11AM +0800, Jason Wang wrote:
> 
> 
> On 2018年05月18日 22:46, Michael S. Tsirkin wrote:
> > On Fri, May 18, 2018 at 10:11:54PM +0800, Jason Wang wrote:
> > > 
> > > On 2018年05月18日 22:06, Michael S. Tsirkin wrote:
> > > > On Fri, May 18, 2018 at 10:00:31PM +0800, Jason Wang wrote:
> > > > > On 2018年05月18日 21:26, Jason Wang wrote:
> > > > > > On 2018年05月18日 21:13, Michael S. Tsirkin wrote:
> > > > > > > On Fri, May 18, 2018 at 09:00:43PM +0800, Jason Wang wrote:
> > > > > > > > We return -EIO on device down but can not raise EPOLLOUT after it was
> > > > > > > > up. This may confuse user like vhost which expects tuntap to raise
> > > > > > > > EPOLLOUT to re-enable its TX routine after tuntap is down. This could
> > > > > > > > be easily reproduced by transmitting packets from VM while down and up
> > > > > > > > the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.
> > > > > > > > 
> > > > > > > > Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
> > > > > > > > Cc: Eric Dumazet <edumazet@google.com>
> > > > > > > > Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > ---
> > > > > > > >     drivers/net/tun.c | 4 +++-
> > > > > > > >     1 file changed, 3 insertions(+), 1 deletion(-)
> > > > > > > > 
> > > > > > > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > > > > > > > index d45ac37..1b29761 100644
> > > > > > > > --- a/drivers/net/tun.c
> > > > > > > > +++ b/drivers/net/tun.c
> > > > > > > > @@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct
> > > > > > > > tun_struct *tun, struct tun_file *tfile,
> > > > > > > >         int skb_xdp = 1;
> > > > > > > >         bool frags = tun_napi_frags_enabled(tun);
> > > > > > > >     -    if (!(tun->dev->flags & IFF_UP))
> > > > > > > > +    if (!(tun->dev->flags & IFF_UP)) {
> > > > > > > Isn't this racy?  What if flag is cleared at this point?
> > > > > > I think you mean "set at this point"? Then yes, so we probably need to
> > > > > > set the bit during tun_net_close().
> > > > > > 
> > > > > > Thanks
> > > > > Looks no need, vhost will poll socket after it see EIO. So we are ok here?
> > > > > 
> > > > > Thanks
> > > > In fact I don't even understand why does this help any longer.
> > > > 
> > > We disable tx polling and only enable it on demand for a better rx
> > > performance. You may want to have a look at :
> > > 
> > > commit feb8892cb441c742d4220cf7ced001e7fa070731
> > > Author: Jason Wang <jasowang@redhat.com>
> > > Date:   Mon Nov 13 11:45:34 2017 +0800
> > > 
> > >      vhost_net: conditionally enable tx polling
> > > 
> > > Thanks
> > 
> > Question is, what looks at SOCKWQ_ASYNC_NOSPACE.
> > I think it's tested when packet is transmitted,
> > but there is no guarantee here any packet will
> > ever be transmitted.
> > 
> 
> Well, actually, I do plan to disable vq polling from the beginning. But
> looks like you do not want this:
> 
> See https://patchwork.kernel.org/patch/10034025/
> 
> Thanks

Not sure I understand what you are saying, it's enabling polling we are
talking about.

-- 
MST

^ permalink raw reply

* [PATCH net-next 12/12] amd-xgbe: Improve SFP 100Mbps auto-negotiation
From: Tom Lendacky @ 2018-05-21 22:00 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180521215818.8135.83100.stgit@tlendack-t1.amdoffice.net>

After changing speed to 100Mbps as a result of auto-negotiation (AN),
some 10/100/1000Mbps SFPs indicate a successful link (no faults or loss
of signal), but cannot successfully transmit or receive data.  These
SFPs required an extra auto-negotiation (AN) after the speed change in
order to operate properly.  Add a quirk for these SFPs so that if the
outcome of the AN actually results in changing to a new speed, re-initiate
AN at that new speed.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |   77 +++++++++++++++------------
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    6 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h        |    1 
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 450b89c..4b5d625 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -331,13 +331,15 @@ static void xgbe_switch_mode(struct xgbe_prv_data *pdata)
 	xgbe_change_mode(pdata, pdata->phy_if.phy_impl.switch_mode(pdata));
 }
 
-static void xgbe_set_mode(struct xgbe_prv_data *pdata,
+static bool xgbe_set_mode(struct xgbe_prv_data *pdata,
 			  enum xgbe_mode mode)
 {
 	if (mode == xgbe_cur_mode(pdata))
-		return;
+		return false;
 
 	xgbe_change_mode(pdata, mode);
+
+	return true;
 }
 
 static bool xgbe_use_mode(struct xgbe_prv_data *pdata,
@@ -1178,21 +1180,23 @@ static int xgbe_phy_config_fixed(struct xgbe_prv_data *pdata)
 	return 0;
 }
 
-static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata, bool set_mode)
 {
 	int ret;
 
+	mutex_lock(&pdata->an_mutex);
+
 	set_bit(XGBE_LINK_INIT, &pdata->dev_state);
 	pdata->link_check = jiffies;
 
 	ret = pdata->phy_if.phy_impl.an_config(pdata);
 	if (ret)
-		return ret;
+		goto out;
 
 	if (pdata->phy.autoneg != AUTONEG_ENABLE) {
 		ret = xgbe_phy_config_fixed(pdata);
 		if (ret || !pdata->kr_redrv)
-			return ret;
+			goto out;
 
 		netif_dbg(pdata, link, pdata->netdev, "AN redriver support\n");
 	} else {
@@ -1202,24 +1206,27 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	/* Disable auto-negotiation interrupt */
 	disable_irq(pdata->an_irq);
 
-	/* Start auto-negotiation in a supported mode */
-	if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KR);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SFI);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
-		xgbe_set_mode(pdata, XGBE_MODE_X);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
-	} else {
-		enable_irq(pdata->an_irq);
-		return -EINVAL;
+	if (set_mode) {
+		/* Start auto-negotiation in a supported mode */
+		if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KR);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SFI);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
+			xgbe_set_mode(pdata, XGBE_MODE_X);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
+		} else {
+			enable_irq(pdata->an_irq);
+			ret = -EINVAL;
+			goto out;
+		}
 	}
 
 	/* Disable and stop any in progress auto-negotiation */
@@ -1239,16 +1246,7 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	xgbe_an_init(pdata);
 	xgbe_an_restart(pdata);
 
-	return 0;
-}
-
-static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
-{
-	int ret;
-
-	mutex_lock(&pdata->an_mutex);
-
-	ret = __xgbe_phy_config_aneg(pdata);
+out:
 	if (ret)
 		set_bit(XGBE_LINK_ERR, &pdata->dev_state);
 	else
@@ -1259,6 +1257,16 @@ static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	return ret;
 }
 
+static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+{
+	return __xgbe_phy_config_aneg(pdata, true);
+}
+
+static int xgbe_phy_reconfig_aneg(struct xgbe_prv_data *pdata)
+{
+	return __xgbe_phy_config_aneg(pdata, false);
+}
+
 static bool xgbe_phy_aneg_done(struct xgbe_prv_data *pdata)
 {
 	return (pdata->an_result == XGBE_AN_COMPLETE);
@@ -1315,7 +1323,8 @@ static void xgbe_phy_status_result(struct xgbe_prv_data *pdata)
 
 	pdata->phy.duplex = DUPLEX_FULL;
 
-	xgbe_set_mode(pdata, mode);
+	if (xgbe_set_mode(pdata, mode) && pdata->an_again)
+		xgbe_phy_reconfig_aneg(pdata);
 }
 
 static void xgbe_phy_status(struct xgbe_prv_data *pdata)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 194a569..3ceb4f9 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -902,6 +902,9 @@ static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
 		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
 		return false;
 
+	/* For Bel-Fuse, use the extra AN flag */
+	pdata->an_again = 1;
+
 	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
 		   XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
 		return false;
@@ -978,6 +981,9 @@ static int xgbe_phy_find_phy_device(struct xgbe_prv_data *pdata)
 	if (phy_data->phydev)
 		return 0;
 
+	/* Clear the extra AN flag */
+	pdata->an_again = 0;
+
 	/* Check for the use of an external PHY */
 	if (phy_data->phydev_mode == XGBE_MDIO_MODE_NONE)
 		return 0;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 7a412cf..47bcbcf 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -1261,6 +1261,7 @@ struct xgbe_prv_data {
 	enum xgbe_rx kr_state;
 	enum xgbe_rx kx_state;
 	struct work_struct an_work;
+	unsigned int an_again;
 	unsigned int an_supported;
 	unsigned int parallel_detect;
 	unsigned int fec_ability;

^ permalink raw reply related

* [PATCH net-next 11/12] amd-xgbe: Update the BelFuse quirk to support SGMII
From: Tom Lendacky @ 2018-05-21 22:00 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180521215818.8135.83100.stgit@tlendack-t1.amdoffice.net>

Instead of using a quirk to make the BelFuse 1GBT-SFP06 part look like
a 1000baseX part, program the SFP PHY to support SGMII and 10/100/1000
baseT.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |  109 +++++++++++++++++++--------
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index dd747f6..194a569 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -860,6 +860,9 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
 	unsigned int phy_id = phy_data->phydev->phy_id;
 
+	if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+		return false;
+
 	if ((phy_id & 0xfffffff0) != 0x01ff0cc0)
 		return false;
 
@@ -885,8 +888,80 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
 	return true;
 }
 
+static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+	struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
+	unsigned int phy_id = phy_data->phydev->phy_id;
+	int reg;
+
+	if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+		return false;
+
+	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
+		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
+		return false;
+
+	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
+		   XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
+		return false;
+
+	if ((phy_id & 0xfffffff0) != 0x03625d10)
+		return false;
+
+	/* Disable RGMII mode */
+	phy_write(phy_data->phydev, 0x18, 0x7007);
+	reg = phy_read(phy_data->phydev, 0x18);
+	phy_write(phy_data->phydev, 0x18, reg & ~0x0080);
+
+	/* Enable fiber register bank */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0001;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0001);
+
+	/* Power down SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg | 0x00800);
+
+	/* Configure SGMII-to-Copper mode */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0006;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0004);
+
+	/* Power up SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+	/* Enable copper register bank */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0001;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg);
+
+	/* Power up SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+	phy_data->phydev->supported = PHY_GBIT_FEATURES;
+	phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+	phy_data->phydev->advertising = phy_data->phydev->supported;
+
+	netif_dbg(pdata, drv, pdata->netdev,
+		  "BelFuse PHY quirk in place\n");
+
+	return true;
+}
+
 static void xgbe_phy_external_phy_quirks(struct xgbe_prv_data *pdata)
 {
+	if (xgbe_phy_belfuse_phy_quirks(pdata))
+		return;
+
 	if (xgbe_phy_finisar_phy_quirks(pdata))
 		return;
 }
@@ -1027,37 +1102,6 @@ static bool xgbe_phy_check_sfp_mod_absent(struct xgbe_phy_data *phy_data)
 	return false;
 }
 
-static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata)
-{
-	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
-
-	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
-		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
-		return false;
-
-	if (!memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
-		    XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN)) {
-		phy_data->sfp_base = XGBE_SFP_BASE_1000_SX;
-		phy_data->sfp_cable = XGBE_SFP_CABLE_ACTIVE;
-		phy_data->sfp_speed = XGBE_SFP_SPEED_1000;
-		if (phy_data->sfp_changed)
-			netif_dbg(pdata, drv, pdata->netdev,
-				  "Bel-Fuse SFP quirk in place\n");
-		return true;
-	}
-
-	return false;
-}
-
-static bool xgbe_phy_sfp_parse_quirks(struct xgbe_prv_data *pdata)
-{
-	if (xgbe_phy_belfuse_parse_quirks(pdata))
-		return true;
-
-	return false;
-}
-
 static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
@@ -1076,9 +1120,6 @@ static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
 	phy_data->sfp_tx_fault = xgbe_phy_check_sfp_tx_fault(phy_data);
 	phy_data->sfp_rx_los = xgbe_phy_check_sfp_rx_los(phy_data);
 
-	if (xgbe_phy_sfp_parse_quirks(pdata))
-		return;
-
 	/* Assume ACTIVE cable unless told it is PASSIVE */
 	if (sfp_base[XGBE_SFP_BASE_CABLE] & XGBE_SFP_BASE_CABLE_PASSIVE) {
 		phy_data->sfp_cable = XGBE_SFP_CABLE_PASSIVE;

^ permalink raw reply related

* [PATCH net-next 10/12] amd-xgbe: Advertise FEC support with the KR re-driver
From: Tom Lendacky @ 2018-05-21 21:59 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180521215818.8135.83100.stgit@tlendack-t1.amdoffice.net>

When a KR re-driver is present, indicate the FEC support is available
during auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 141bb13..dd747f6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1720,6 +1720,10 @@ static void xgbe_phy_an_advertising(struct xgbe_prv_data *pdata,
 	XGBE_CLR_ADV(dlks, 1000baseKX_Full);
 	XGBE_CLR_ADV(dlks, 10000baseKR_Full);
 
+	/* Advertise FEC support is present */
+	if (pdata->fec_ability & MDIO_PMA_10GBR_FECABLE_ABLE)
+		XGBE_SET_ADV(dlks, 10000baseR_FEC);
+
 	switch (phy_data->port_mode) {
 	case XGBE_PORT_MODE_BACKPLANE:
 		XGBE_SET_ADV(dlks, 10000baseKR_Full);

^ permalink raw reply related

* [PATCH net-next 09/12] amd-xgbe: Always attempt link training in KR mode
From: Tom Lendacky @ 2018-05-21 21:59 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180521215818.8135.83100.stgit@tlendack-t1.amdoffice.net>

Link training is always attempted when in KR mode, but the code is
structured to check if link training has been enabled before attempting
to perform it.  Since that check will always be true, simplify the code
to always enable and start link training during KR auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |   69 +++++++----------------------
 1 file changed, 16 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 9c39c72..450b89c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -216,31 +216,8 @@ static void xgbe_an_clear_interrupts_all(struct xgbe_prv_data *pdata)
 	xgbe_an37_clear_interrupts(pdata);
 }
 
-static void xgbe_an73_enable_kr_training(struct xgbe_prv_data *pdata)
-{
-	unsigned int reg;
-
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-	reg |= XGBE_KR_TRAINING_ENABLE;
-	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
-static void xgbe_an73_disable_kr_training(struct xgbe_prv_data *pdata)
-{
-	unsigned int reg;
-
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-	reg &= ~XGBE_KR_TRAINING_ENABLE;
-	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
 static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 {
-	/* Enable KR training */
-	xgbe_an73_enable_kr_training(pdata);
-
 	/* Set MAC to 10G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -250,9 +227,6 @@ static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 2.5G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_2500);
 
@@ -262,9 +236,6 @@ static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_1000_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -278,9 +249,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 	if (pdata->kr_redrv)
 		return xgbe_kr_mode(pdata);
 
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 10G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -290,9 +258,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -302,9 +267,6 @@ static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -314,9 +276,6 @@ static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sgmii_100_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -425,6 +384,12 @@ static void xgbe_an73_set(struct xgbe_prv_data *pdata, bool enable,
 {
 	unsigned int reg;
 
+	/* Disable KR training for now */
+	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+	reg &= ~XGBE_KR_TRAINING_ENABLE;
+	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
+
+	/* Update AN settings */
 	reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1);
 	reg &= ~MDIO_AN_CTRL1_ENABLE;
 
@@ -522,21 +487,19 @@ static enum xgbe_an xgbe_an73_tx_training(struct xgbe_prv_data *pdata,
 	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL, reg);
 
 	/* Start KR training */
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-	if (reg & XGBE_KR_TRAINING_ENABLE) {
-		if (pdata->phy_if.phy_impl.kr_training_pre)
-			pdata->phy_if.phy_impl.kr_training_pre(pdata);
+	if (pdata->phy_if.phy_impl.kr_training_pre)
+		pdata->phy_if.phy_impl.kr_training_pre(pdata);
 
-		reg |= XGBE_KR_TRAINING_START;
-		XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
-			    reg);
+	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+	reg |= XGBE_KR_TRAINING_ENABLE;
+	reg |= XGBE_KR_TRAINING_START;
+	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
 
-		netif_dbg(pdata, link, pdata->netdev,
-			  "KR training initiated\n");
+	netif_dbg(pdata, link, pdata->netdev,
+		  "KR training initiated\n");
 
-		if (pdata->phy_if.phy_impl.kr_training_post)
-			pdata->phy_if.phy_impl.kr_training_post(pdata);
-	}
+	if (pdata->phy_if.phy_impl.kr_training_post)
+		pdata->phy_if.phy_impl.kr_training_post(pdata);
 
 	return XGBE_AN_PAGE_RECEIVED;
 }

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox