Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
From: Sridhar Samudrala @ 2017-08-31 23:27 UTC (permalink / raw)
  To: alexander.h.duyck, netdev

This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
to enable symmetric tx and rx queues on a socket.

This option is specifically useful for epoll based multi threaded workloads
where each thread handles packets received on a single RX queue . In this model,
we have noticed that it helps to send the packets on the same TX queue
corresponding to the queue-pair associated with the RX queue specifically when
busy poll is enabled with epoll().

Two new fields are added to struct sock_common to cache the last rx ifindex and
the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
rx queue when this option is enabled and the TX is happening on the same device.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 include/net/request_sock.h        |  1 +
 include/net/sock.h                | 17 +++++++++++++++++
 include/uapi/asm-generic/socket.h |  2 ++
 net/core/dev.c                    |  8 +++++++-
 net/core/sock.c                   | 10 ++++++++++
 net/ipv4/tcp_input.c              |  1 +
 net/ipv4/tcp_ipv4.c               |  1 +
 net/ipv4/tcp_minisocks.c          |  1 +
 8 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 23e2205..c3bc12e 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
 	req_to_sk(req)->sk_prot = sk_listener->sk_prot;
 	sk_node_init(&req_to_sk(req)->sk_node);
 	sk_tx_queue_clear(req_to_sk(req));
+	req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
 	req->saved_syn = NULL;
 	refcount_set(&req->rsk_refcnt, 0);
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 03a3625..3421809 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_rx_queue_mapping: rx queue number for this connection
+ *	@skc_rx_ifindex: rx ifindex for this connection
  *	@skc_flags: place holder for sk_flags
  *		%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *		%SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *	@skc_incoming_cpu: record/match cpu processing incoming packets
  *	@skc_refcnt: reference count
+ *	@skc_symmetric_queues: symmetric tx/rx queues
  *
  *	This is the minimal network layer representation of sockets, the header
  *	for struct sock and struct inet_timewait_sock.
@@ -177,6 +180,7 @@ struct sock_common {
 	unsigned char		skc_reuseport:1;
 	unsigned char		skc_ipv6only:1;
 	unsigned char		skc_net_refcnt:1;
+	unsigned char		skc_symmetric_queues:1;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
@@ -214,6 +218,8 @@ struct sock_common {
 		struct hlist_nulls_node skc_nulls_node;
 	};
 	int			skc_tx_queue_mapping;
+	int			skc_rx_queue_mapping;
+	int			skc_rx_ifindex;
 	union {
 		int		skc_incoming_cpu;
 		u32		skc_rcv_wnd;
@@ -324,6 +330,8 @@ struct sock {
 #define sk_nulls_node		__sk_common.skc_nulls_node
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping
+#define sk_rx_queue_mapping	__sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex		__sk_common.skc_rx_ifindex
 
 #define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end		__sk_common.skc_dontcopy_end
@@ -340,6 +348,7 @@ struct sock {
 #define sk_reuseport		__sk_common.skc_reuseport
 #define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
+#define sk_symmetric_queues	__sk_common.skc_symmetric_queues
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
@@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
 	return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+	if (sk->sk_symmetric_queues) {
+		sk->sk_rx_ifindex = skb->skb_iif;
+		sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+	}
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
 	sk_tx_queue_clear(sk);
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index e47c9e4..f6b416e 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -106,4 +106,6 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_SYMMETRIC_QUEUES	61
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 270b547..d96cda8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 
 	if (queue_index < 0 || skb->ooo_okay ||
 	    queue_index >= dev->real_num_tx_queues) {
-		int new_index = get_xps_queue(dev, skb);
+		int new_index = -1;
+
+		if (sk && sk->sk_symmetric_queues && dev->ifindex == sk->sk_rx_ifindex)
+			new_index = sk->sk_rx_queue_mapping;
+
+		if (new_index < 0 || new_index >= dev->real_num_tx_queues)
+			new_index = get_xps_queue(dev, skb);
 
 		if (new_index < 0)
 			new_index = skb_tx_hash(dev, skb);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9b7b6bb..3876cce 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
 		break;
 
+	case SO_SYMMETRIC_QUEUES:
+		sk->sk_symmetric_queues = valbool;
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
 		break;
 
+	case SO_SYMMETRIC_QUEUES:
+		v.val = sk->sk_symmetric_queues;
+		break;
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
@@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_max_pacing_rate = ~0U;
 	sk->sk_pacing_rate = ~0U;
 	sk->sk_incoming_cpu = -1;
+	sk->sk_rx_ifindex = -1;
+	sk->sk_rx_queue_mapping = -1;
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5d7656..12381e0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_openreq_init_rwin(req, sk, dst);
+	sk_mark_rx_queue(req_to_sk(req), skb);
 	if (!want_cookie) {
 		tcp_reqsk_record_syn(sk, req, skb);
 		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a63486a..82f9af4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_mark_rx_queue(sk, skb);
 		if (dst) {
 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 			    !dst->ops->check(dst, 0)) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 188a6f3..2b5efd5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 
 	/* record NAPI ID of child */
 	sk_mark_napi_id(child, skb);
+	sk_mark_rx_queue(child, skb);
 
 	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next v5 2/2] tcp_diag: report TCP MD5 signing keys and addresses
From: Sabrina Dubroca @ 2017-08-31 23:26 UTC (permalink / raw)
  To: Ivan Delalande; +Cc: David Miller, Eric Dumazet, netdev
In-Reply-To: <20170831165939.5121-3-colona@arista.com>

2017-08-31, 09:59:39 -0700, Ivan Delalande wrote:
> diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
> index a748c74aa8b7..abbf0edcf6c2 100644
> --- a/net/ipv4/tcp_diag.c
> +++ b/net/ipv4/tcp_diag.c
[...]
> +static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
> +			    struct sk_buff *skb)
> +{
> +#ifdef CONFIG_TCP_MD5SIG
> +	if (net_admin) {

In tcp_diag_get_aux_size() you put a check for sk_fullsock. I don't
see anything preventing you from reaching this with a !fullsock?


> +		struct tcp_md5sig_info *md5sig;
> +		int err = 0;
> +
> +		rcu_read_lock();
> +		md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
> +		if (md5sig)
> +			err = tcp_diag_put_md5sig(skb, md5sig);
> +		rcu_read_unlock();
> +		if (err < 0)
> +			return err;
> +	}
> +#endif
> +
> +	return 0;
> +}
> +
> +static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
> +{
> +	size_t size = 0;
> +
> +#ifdef CONFIG_TCP_MD5SIG
> +	if (net_admin && sk_fullsock(sk)) {
> +		const struct tcp_md5sig_info *md5sig;
> +		const struct tcp_md5sig_key *key;
> +		size_t md5sig_count = 0;
> +
> +		rcu_read_lock();
> +		md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
> +		if (md5sig) {
> +			hlist_for_each_entry_rcu(key, &md5sig->head, node)
> +				md5sig_count++;
> +		}
> +		rcu_read_unlock();
> +		size += nla_total_size(md5sig_count *
> +				       sizeof(struct tcp_diag_md5sig));
> +	}
> +#endif
> +
> +	return size;
> +}
> +
>  static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
>  			  const struct inet_diag_req_v2 *r, struct nlattr *bc)
>  {

-- 
Sabrina

^ permalink raw reply

* [PATCH net-next 2/2] netvsc: allow driver to be removed even if VF is present
From: Stephen Hemminger @ 2017-08-31 23:16 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin; +Cc: devel, netdev
In-Reply-To: <20170831231613.15805-1-sthemmin@microsoft.com>

If VF is attached then can still allow netvsc driver module to
be removed. Just have to make sure and do the cleanup.

Also, avoid extra rtnl round trip when calling unregister.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/netvsc_drv.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 128165e49c9a..97ed4bdc439f 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1834,9 +1834,6 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 
 	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
 
-	/* Prevent this module from being unloaded while VF is registered */
-	try_module_get(THIS_MODULE);
-
 	dev_hold(vf_netdev);
 	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
 	return NOTIFY_OK;
@@ -1880,10 +1877,11 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev)
 
 	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
 
+	netdev_rx_handler_unregister(vf_netdev);
 	netdev_upper_dev_unlink(vf_netdev, ndev);
 	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
 	dev_put(vf_netdev);
-	module_put(THIS_MODULE);
+
 	return NOTIFY_OK;
 }
 
@@ -1987,11 +1985,11 @@ static int netvsc_probe(struct hv_device *dev,
 
 static int netvsc_remove(struct hv_device *dev)
 {
-	struct net_device *net;
 	struct net_device_context *ndev_ctx;
+	struct net_device *vf_netdev;
+	struct net_device *net;
 
 	net = hv_get_drvdata(dev);
-
 	if (net == NULL) {
 		dev_err(&dev->device, "No net device to remove\n");
 		return 0;
@@ -2008,12 +2006,15 @@ static int netvsc_remove(struct hv_device *dev)
 	 * removed. Also blocks mtu and channel changes.
 	 */
 	rtnl_lock();
+	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
+	if (vf_netdev)
+		netvsc_unregister_vf(vf_netdev);
+
 	rndis_filter_device_remove(dev,
 				   rtnl_dereference(ndev_ctx->nvdev));
+	unregister_netdevice(net);
 	rtnl_unlock();
 
-	unregister_netdev(net);
-
 	hv_set_drvdata(dev, NULL);
 
 	free_percpu(ndev_ctx->vf_stats);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/2] netvsc: cleanup datapath switch
From: Stephen Hemminger @ 2017-08-31 23:16 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin; +Cc: devel, netdev
In-Reply-To: <20170831231613.15805-1-sthemmin@microsoft.com>

Use one routine for datapath up/down. Don't need to reopen
the rndis layer.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/netvsc_drv.c | 38 +++++++-------------------------------
 1 file changed, 7 insertions(+), 31 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index fac44c5c8d0d..128165e49c9a 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1842,11 +1842,13 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 	return NOTIFY_OK;
 }
 
-static int netvsc_vf_up(struct net_device *vf_netdev)
+/* VF up/down change detected, schedule to change data path */
+static int netvsc_vf_changed(struct net_device *vf_netdev)
 {
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
 	struct net_device *ndev;
+	bool vf_is_up = netif_running(vf_netdev);
 
 	ndev = get_netvsc_byref(vf_netdev);
 	if (!ndev)
@@ -1857,34 +1859,9 @@ static int netvsc_vf_up(struct net_device *vf_netdev)
 	if (!netvsc_dev)
 		return NOTIFY_DONE;
 
-	/* Bump refcount when datapath is acvive - Why? */
-	rndis_filter_open(netvsc_dev);
-
-	/* notify the host to switch the data path. */
-	netvsc_switch_datapath(ndev, true);
-	netdev_info(ndev, "Data path switched to VF: %s\n", vf_netdev->name);
-
-	return NOTIFY_OK;
-}
-
-static int netvsc_vf_down(struct net_device *vf_netdev)
-{
-	struct net_device_context *net_device_ctx;
-	struct netvsc_device *netvsc_dev;
-	struct net_device *ndev;
-
-	ndev = get_netvsc_byref(vf_netdev);
-	if (!ndev)
-		return NOTIFY_DONE;
-
-	net_device_ctx = netdev_priv(ndev);
-	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
-	if (!netvsc_dev)
-		return NOTIFY_DONE;
-
-	netvsc_switch_datapath(ndev, false);
-	netdev_info(ndev, "Data path switched from VF: %s\n", vf_netdev->name);
-	rndis_filter_close(netvsc_dev);
+	netvsc_switch_datapath(ndev, vf_is_up);
+	netdev_info(ndev, "Data path switched %s VF: %s\n",
+		    vf_is_up ? "to" : "from", vf_netdev->name);
 
 	return NOTIFY_OK;
 }
@@ -2094,9 +2071,8 @@ static int netvsc_netdev_event(struct notifier_block *this,
 	case NETDEV_UNREGISTER:
 		return netvsc_unregister_vf(event_dev);
 	case NETDEV_UP:
-		return netvsc_vf_up(event_dev);
 	case NETDEV_DOWN:
-		return netvsc_vf_down(event_dev);
+		return netvsc_vf_changed(event_dev);
 	default:
 		return NOTIFY_DONE;
 	}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 0/2] netvsc: transparent VF related cleanups
From: Stephen Hemminger @ 2017-08-31 23:16 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin; +Cc: devel, netdev

The first gets rid of unnecessary ref counting, and second
allows removing hv_netvsc driver even if VF present.

Stephen Hemminger (2):
  netvsc: cleanup datapath switch
  netvsc: allow driver to be removed even if VF is present

 drivers/net/hyperv/netvsc_drv.c | 55 ++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 39 deletions(-)

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t
From: Eric Dumazet @ 2017-08-31 23:15 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Eric Dumazet, David S . Miller, netdev, Willem de Bruijn
In-Reply-To: <CAF=yD-KoztUEn-K5G9mvjE6wpq5+V9opAs5pim5cBr5cdvv3ig@mail.gmail.com>

On Thu, 2017-08-31 at 18:45 -0400, Willem de Bruijn wrote:
> On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet <edumazet@google.com> wrote:
> > refcount_t type and corresponding API should be
> > used instead of atomic_t when the variable is used as
> > a reference counter. This allows to avoid accidental
> > refcounter overflows that might lead to use-after-free
> > situations.
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> >  include/linux/skbuff.h | 5 +++--
> >  net/core/skbuff.c      | 8 ++++----
> >  2 files changed, 7 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -22,6 +22,7 @@
> >  #include <linux/cache.h>
> >  #include <linux/rbtree.h>
> >  #include <linux/socket.h>
> > +#include <linux/refcount.h>
> >
> >  #include <linux/atomic.h>
> >  #include <asm/types.h>
> > @@ -456,7 +457,7 @@ struct ubuf_info {
> >                         u32 bytelen;
> >                 };
> >         };
> > -       atomic_t refcnt;
> > +       refcount_t refcnt;
> >
> >         struct mmpin {
> >                 struct user_struct *user;
> > @@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
> >
> >  static inline void sock_zerocopy_get(struct ubuf_info *uarg)
> >  {
> > -       atomic_inc(&uarg->refcnt);
> > +       refcount_inc(&uarg->refcnt);
> >  }
> >
> >  void sock_zerocopy_put(struct ubuf_info *uarg);
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 65b9ca3945f8fd2b1bef4aef5dd774be04e5d128..ed86ca9afd9d8d1ac47983acf6006c179285a612 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
> >         uarg->len = 1;
> >         uarg->bytelen = size;
> >         uarg->zerocopy = 1;
> > -       atomic_set(&uarg->refcnt, 1);
> > +       refcount_set(&uarg->refcnt, 1);
> >         sock_hold(sk);
> >
> >         return uarg;
> > @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
> >
> >  void sock_zerocopy_put(struct ubuf_info *uarg)
> >  {
> > -       if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
> > +       if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
> >                 if (uarg->callback)
> >                         uarg->callback(uarg, uarg->zerocopy);
> >                 else
> > @@ -1108,7 +1108,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
> >                  * avoid an skb send inside the main loop triggering uarg free.
> >                  */
> >                 if (sk->sk_type != SOCK_STREAM)
> > -                       atomic_inc(&uarg->refcnt);
> > +                       refcount_inc(&uarg->refcnt);
> 
> This would be a 0 -> 1 transition. It is taken only on the error path from
> a socket that does not take an explicit hold at the start of sendmsg.
> 

Arg.

> The code is vestigial, really, as the final patchset only includes tcp.
> It is safe to leave as is for now, or I can remove it.
> 

OK I probably should remove this chunk in patch 1/2 then.

> In 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") I also updated
> other users of ubuf_info to initialize refcnt. We probably also want to
> convert the call in handle_tx in drivers/vhost/net.c.

Thanks ! :)

^ permalink raw reply

* Re: [PATCH net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t
From: Willem de Bruijn @ 2017-08-31 22:45 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Willem de Bruijn, Eric Dumazet
In-Reply-To: <20170831203013.9219-3-edumazet@google.com>

On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet <edumazet@google.com> wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/linux/skbuff.h | 5 +++--
>  net/core/skbuff.c      | 8 ++++----
>  2 files changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -22,6 +22,7 @@
>  #include <linux/cache.h>
>  #include <linux/rbtree.h>
>  #include <linux/socket.h>
> +#include <linux/refcount.h>
>
>  #include <linux/atomic.h>
>  #include <asm/types.h>
> @@ -456,7 +457,7 @@ struct ubuf_info {
>                         u32 bytelen;
>                 };
>         };
> -       atomic_t refcnt;
> +       refcount_t refcnt;
>
>         struct mmpin {
>                 struct user_struct *user;
> @@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
>
>  static inline void sock_zerocopy_get(struct ubuf_info *uarg)
>  {
> -       atomic_inc(&uarg->refcnt);
> +       refcount_inc(&uarg->refcnt);
>  }
>
>  void sock_zerocopy_put(struct ubuf_info *uarg);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 65b9ca3945f8fd2b1bef4aef5dd774be04e5d128..ed86ca9afd9d8d1ac47983acf6006c179285a612 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
>         uarg->len = 1;
>         uarg->bytelen = size;
>         uarg->zerocopy = 1;
> -       atomic_set(&uarg->refcnt, 1);
> +       refcount_set(&uarg->refcnt, 1);
>         sock_hold(sk);
>
>         return uarg;
> @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
>
>  void sock_zerocopy_put(struct ubuf_info *uarg)
>  {
> -       if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
> +       if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
>                 if (uarg->callback)
>                         uarg->callback(uarg, uarg->zerocopy);
>                 else
> @@ -1108,7 +1108,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
>                  * avoid an skb send inside the main loop triggering uarg free.
>                  */
>                 if (sk->sk_type != SOCK_STREAM)
> -                       atomic_inc(&uarg->refcnt);
> +                       refcount_inc(&uarg->refcnt);

This would be a 0 -> 1 transition. It is taken only on the error path from
a socket that does not take an explicit hold at the start of sendmsg.

The code is vestigial, really, as the final patchset only includes tcp.
It is safe to leave as is for now, or I can remove it.

In 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") I also updated
other users of ubuf_info to initialize refcnt. We probably also want to
convert the call in handle_tx in drivers/vhost/net.c.

^ permalink raw reply

* Re: [PATCH net-next] bridge: add tracepoint in br_fdb_update
From: Stephen Hemminger @ 2017-08-31 22:43 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Miller, roopa, netdev, nikolay, f.fainelli, andrew, bridge
In-Reply-To: <20170831235026.6377bf86@redhat.com>

On Thu, 31 Aug 2017 23:50:26 +0200
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> On Thu, 31 Aug 2017 11:43:25 -0700 (PDT)
> David Miller <davem@davemloft.net> wrote:
> 
> > From: Roopa Prabhu <roopa@cumulusnetworks.com>
> > Date: Wed, 30 Aug 2017 22:18:13 -0700
> >   
> > > From: Roopa Prabhu <roopa@cumulusnetworks.com>
> > > 
> > > This extends bridge fdb table tracepoints to also cover
> > > learned fdb entries in the br_fdb_update path. Note that
> > > unlike other tracepoints I have moved this to when the fdb
> > > is modified because this is in the datapath and can generate
> > > a lot of noise in the trace output. br_fdb_update is also called
> > > from added_by_user context in the NTF_USE case which is already
> > > traced ..hence the !added_by_user check.
> > > 
> > > Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>    
> > 
> > Applied.
> > 
> > Let's use dev->name for now and if the tooling can eventually
> > do transparent ifindex->name then we can consider redoing
> > a bunch of networking tracepoints.  
> 
> I agree! :-)
> 

Agreed, but it is yet another case of tracepoints not having
a stable ABI. There is no ABI guarantee on tracing, but it still
makes for user complaints.

^ permalink raw reply

* Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
From: Daniel Borkmann @ 2017-08-31 22:38 UTC (permalink / raw)
  To: Chenbo Feng, linux-security-module
  Cc: Jeffrey Vander Stoep, netdev, SELinux, Alexei Starovoitov,
	lorenzo, Chenbo Feng
In-Reply-To: <20170831205635.80256-3-chenbofeng.kernel@gmail.com>

On 08/31/2017 10:56 PM, Chenbo Feng wrote:
> From: Chenbo Feng <fengc@google.com>
>
> Introduce a pointer into struct bpf_map to hold the security information
> about the map. The actual security struct varies based on the security
> models implemented. Place the LSM hooks before each of the unrestricted
> eBPF operations, the map_update_elem and map_delete_elem operations are
> checked by security_map_modify. The map_lookup_elem and map_get_next_key
> operations are checked by securtiy_map_read.
>
> Signed-off-by: Chenbo Feng <fengc@google.com>

Against which tree is this by the way, net-next? There are
changes here which require a rebase of your set.

> ---
>   include/linux/bpf.h  |  3 +++
>   kernel/bpf/syscall.c | 28 ++++++++++++++++++++++++++++
>   2 files changed, 31 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index b69e7a5869ff..ca3e6ff7091d 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -53,6 +53,9 @@ struct bpf_map {
>   	struct work_struct work;
>   	atomic_t usercnt;
>   	struct bpf_map *inner_map_meta;
> +#ifdef CONFIG_SECURITY
> +	void *security;
> +#endif
>   };
>
>   /* function argument constraints */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 045646da97cc..b15580bcf3b1 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr)
>   	if (err)
>   		return -EINVAL;
>
> +	err = security_map_create();

Seems a bit limited to me, don't you want to be able to
also differentiate between different map types? Same goes
for security_prog_load() wrt prog types, no?

> +	if (err)
> +		return -EACCES;
> +
>   	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
>   	map = find_and_alloc_map(attr);
>   	if (IS_ERR(map))
> @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr)
>   	if (err)
>   		goto free_map_nouncharge;
>
> +	err = security_post_create(map);
> +	if (err < 0)
> +		goto free_map;
> +

So the hook you implement in patch 3/3 does:

+static int selinux_bpf_post_create(struct bpf_map *map)
+{
+	struct bpf_security_struct *bpfsec;
+
+	bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
+	if (!bpfsec)
+		return -ENOMEM;
+
+	bpfsec->sid = current_sid();
+	map->security = bpfsec;
+
+	return 0;
+}

Where do you kfree() bpfsec when the map gets released
normally or in error path?

>   	err = bpf_map_alloc_id(map);
>   	if (err)
>   		goto free_map;
> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
>   	if (IS_ERR(map))
>   		return PTR_ERR(map);
>
> +	err = security_map_read(map);
> +	if (err)
> +		return -EACCES;

How about actually dropping ref?

> +
>   	key = memdup_user(ukey, map->key_size);
>   	if (IS_ERR(key)) {
>   		err = PTR_ERR(key);
> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
>   	if (IS_ERR(map))
>   		return PTR_ERR(map);
>
> +	err = security_map_modify(map);
> +	if (err)
> +		return -EACCES;

Ditto ...

>   	key = memdup_user(ukey, map->key_size);
>   	if (IS_ERR(key)) {
>   		err = PTR_ERR(key);
> @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr)
>   	if (IS_ERR(map))
>   		return PTR_ERR(map);
>
> +	err = security_map_modify(map);
> +	if (err)
> +		return -EACCES;

Ditto ...

>   	key = memdup_user(ukey, map->key_size);
>   	if (IS_ERR(key)) {
>   		err = PTR_ERR(key);
> @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr)
>   	if (IS_ERR(map))
>   		return PTR_ERR(map);
>
> +	err = security_map_read(map);
> +	if (err)
> +		return -EACCES;

And once again here ... :(

>   	if (ukey) {
>   		key = memdup_user(ukey, map->key_size);
>   		if (IS_ERR(key)) {
> @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr)
>   	if (CHECK_ATTR(BPF_PROG_LOAD))
>   		return -EINVAL;
>
> +	err = security_prog_load();
> +	if (err)
> +		return -EACCES;
> +
>   	if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
>   		return -EINVAL;
>
>

^ permalink raw reply

* Re: [PATCH net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion
From: Willem de Bruijn @ 2017-08-31 22:35 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Willem de Bruijn, Eric Dumazet
In-Reply-To: <20170831203013.9219-2-edumazet@google.com>

On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet <edumazet@google.com> wrote:
> In order to convert this atomic_t refcnt to refcount_t,
> we need to init the refcount to one to not trigger
> a 0 -> 1 transition.
>
> This also removes one atomic operation in fast path.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Acked-by: Willem de Bruijn <willemb@google.com>

Thanks, Eric.

^ permalink raw reply

* [PATCH net-next 2/2] flow_dissector: Add limits for encapsulation and EH
From: Tom Herbert @ 2017-08-31 22:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, alex.popov, hannes, Tom Herbert
In-Reply-To: <20170831222239.21509-1-tom@quantonium.net>

In flow dissector there are no limits to the number of nested
encapsulations that might be dissected which makes for a nice DOS
attack. This patch limits for dissecting nested encapsulations
as well as for dissecting over extension headers.

Reported-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/core/flow_dissector.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5110180a3e96..1bca748de27d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -396,6 +396,35 @@ __skb_flow_dissect_ipv6(const struct sk_buff *skb,
 	key_ip->ttl = iph->hop_limit;
 }
 
+/* Maximum number of nested encapsulations that can be processed in
+ * __skb_flow_dissect
+ */
+#define MAX_FLOW_DISSECT_ENCAPS	5
+
+static bool skb_flow_dissect_encap_allowed(int *num_encaps, unsigned int *flags)
+{
+	++*num_encaps;
+
+	if (*num_encaps >= MAX_FLOW_DISSECT_ENCAPS) {
+		if (*num_encaps == MAX_FLOW_DISSECT_ENCAPS) {
+			/* Allow one more pass but ignore disregard
+			 * further encapsulations
+			 */
+			*flags |= FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+		} else {
+			/* Max encaps reached */
+			return  false;
+		}
+	}
+
+	return true;
+}
+
+/* Maximum number of extension headers can be processed in __skb_flow_dissect
+ * per IPv6 packet
+ */
+#define MAX_FLOW_DISSECT_EH	5
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -426,6 +455,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_tags *key_tags;
 	struct flow_dissector_key_vlan *key_vlan;
 	enum flow_dissect_ret fdret;
+	int num_eh, num_encaps = 0;
 	bool skip_vlan = false;
 	u8 ip_proto = 0;
 	bool ret;
@@ -714,7 +744,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	case FLOW_DISSECT_RET_OUT_GOOD:
 		goto out_good;
 	case FLOW_DISSECT_RET_PROTO_AGAIN:
-		goto proto_again;
+		if (skb_flow_dissect_encap_allowed(&num_encaps, &flags))
+			goto proto_again;
+		goto out_good;
 	case FLOW_DISSECT_RET_CONTINUE:
 	case FLOW_DISSECT_RET_IPPROTO_AGAIN:
 	case FLOW_DISSECT_RET_IPPROTO_AGAIN_EH:
@@ -724,6 +756,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		goto out_bad;
 	}
 
+	num_eh = 0;
+
 ip_proto_again:
 	fdret = FLOW_DISSECT_RET_CONTINUE;
 
@@ -844,10 +878,18 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	/* Process result of IP proto processing */
 	switch (fdret) {
 	case FLOW_DISSECT_RET_PROTO_AGAIN:
-		goto proto_again;
+		if (skb_flow_dissect_encap_allowed(&num_encaps, &flags))
+			goto proto_again;
+		break;
 	case FLOW_DISSECT_RET_IPPROTO_AGAIN:
+		if (skb_flow_dissect_encap_allowed(&num_encaps, &flags))
+			goto ip_proto_again;
+		break;
 	case FLOW_DISSECT_RET_IPPROTO_AGAIN_EH:
-		goto ip_proto_again;
+		++num_eh;
+		if (num_eh <= MAX_FLOW_DISSECT_EH)
+			goto ip_proto_again;
+		break;
 	case FLOW_DISSECT_RET_OUT_GOOD:
 	case FLOW_DISSECT_RET_CONTINUE:
 		break;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/2] flow_dissector: Cleanup control flow
From: Tom Herbert @ 2017-08-31 22:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, alex.popov, hannes, Tom Herbert
In-Reply-To: <20170831222239.21509-1-tom@quantonium.net>

__skb_flow_dissect is riddled with gotos that make discerning the flow,
debugging, and extending the capability difficult. This patch
reorganizes things so that we only perform goto's after the two main
switch statements (no gotos within the cases now). It also eliminates
several goto labels so that there are only two labels that can be target
for goto.

Reported-by: Alexander Popov <alex.popov@linux.com>
Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/net/flow_dissector.h |   9 ++
 net/core/flow_dissector.c    | 225 ++++++++++++++++++++++++++++---------------
 2 files changed, 156 insertions(+), 78 deletions(-)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index e2663e900b0a..c358c3ff6acc 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -19,6 +19,15 @@ struct flow_dissector_key_control {
 #define FLOW_DIS_FIRST_FRAG	BIT(1)
 #define FLOW_DIS_ENCAPSULATION	BIT(2)
 
+enum flow_dissect_ret {
+	FLOW_DISSECT_RET_OUT_GOOD,
+	FLOW_DISSECT_RET_OUT_BAD,
+	FLOW_DISSECT_RET_PROTO_AGAIN,
+	FLOW_DISSECT_RET_IPPROTO_AGAIN,
+	FLOW_DISSECT_RET_IPPROTO_AGAIN_EH,
+	FLOW_DISSECT_RET_CONTINUE,
+};
+
 /**
  * struct flow_dissector_key_basic:
  * @thoff: Transport header offset
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index e2eaa1ff948d..5110180a3e96 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -115,12 +115,6 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
-enum flow_dissect_ret {
-	FLOW_DISSECT_RET_OUT_GOOD,
-	FLOW_DISSECT_RET_OUT_BAD,
-	FLOW_DISSECT_RET_OUT_PROTO_AGAIN,
-};
-
 static enum flow_dissect_ret
 __skb_flow_dissect_mpls(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
@@ -341,7 +335,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
 	if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
 		return FLOW_DISSECT_RET_OUT_GOOD;
 
-	return FLOW_DISSECT_RET_OUT_PROTO_AGAIN;
+	return FLOW_DISSECT_RET_PROTO_AGAIN;
 }
 
 static void
@@ -431,6 +425,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_icmp *key_icmp;
 	struct flow_dissector_key_tags *key_tags;
 	struct flow_dissector_key_vlan *key_vlan;
+	enum flow_dissect_ret fdret;
 	bool skip_vlan = false;
 	u8 ip_proto = 0;
 	bool ret;
@@ -482,14 +477,19 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	}
 
 proto_again:
+	fdret = FLOW_DISSECT_RET_CONTINUE;
+
 	switch (proto) {
 	case htons(ETH_P_IP): {
 		const struct iphdr *iph;
 		struct iphdr _iph;
-ip:
+
 		iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
-		if (!iph || iph->ihl < 5)
-			goto out_bad;
+		if (!iph || iph->ihl < 5) {
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
+		}
+
 		nhoff += iph->ihl * 4;
 
 		ip_proto = iph->protocol;
@@ -509,19 +509,25 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			key_control->flags |= FLOW_DIS_IS_FRAGMENT;
 
 			if (iph->frag_off & htons(IP_OFFSET)) {
-				goto out_good;
+				fdret = FLOW_DISSECT_RET_OUT_GOOD;
+				break;
 			} else {
 				key_control->flags |= FLOW_DIS_FIRST_FRAG;
-				if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
-					goto out_good;
+				if (!(flags &
+				      FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
+					fdret = FLOW_DISSECT_RET_OUT_GOOD;
+					break;
+				}
 			}
 		}
 
 		__skb_flow_dissect_ipv4(skb, flow_dissector,
 					target_container, data, iph);
 
-		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
-			goto out_good;
+		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
+			fdret = FLOW_DISSECT_RET_OUT_GOOD;
+			break;
+		}
 
 		break;
 	}
@@ -529,10 +535,11 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		const struct ipv6hdr *iph;
 		struct ipv6hdr _iph;
 
-ipv6:
 		iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
-		if (!iph)
-			goto out_bad;
+		if (!iph) {
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
+		}
 
 		ip_proto = iph->nexthdr;
 		nhoff += sizeof(struct ipv6hdr);
@@ -561,15 +568,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 								     target_container);
 				key_tags->flow_label = ntohl(flow_label);
 			}
-			if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)
-				goto out_good;
+			if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) {
+				fdret = FLOW_DISSECT_RET_OUT_GOOD;
+				break;
+			}
 		}
 
 		__skb_flow_dissect_ipv6(skb, flow_dissector,
 					target_container, data, iph);
 
 		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
-			goto out_good;
+			fdret = FLOW_DISSECT_RET_OUT_GOOD;
 
 		break;
 	}
@@ -585,12 +594,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		if (!vlan_tag_present || eth_type_vlan(skb->protocol)) {
 			vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
 						    data, hlen, &_vlan);
-			if (!vlan)
-				goto out_bad;
+			if (!vlan) {
+				fdret = FLOW_DISSECT_RET_OUT_BAD;
+				break;
+			}
+
 			proto = vlan->h_vlan_encapsulated_proto;
 			nhoff += sizeof(*vlan);
-			if (skip_vlan)
-				goto proto_again;
+			if (skip_vlan) {
+				fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+				break;
+			}
 		}
 
 		skip_vlan = true;
@@ -613,7 +627,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			}
 		}
 
-		goto proto_again;
+		fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+		break;
 	}
 	case htons(ETH_P_PPP_SES): {
 		struct {
@@ -621,18 +636,27 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			__be16 proto;
 		} *hdr, _hdr;
 		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
-		if (!hdr)
-			goto out_bad;
+		if (!hdr) {
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
+		}
+
 		proto = hdr->proto;
 		nhoff += PPPOE_SES_HLEN;
 		switch (proto) {
 		case htons(PPP_IP):
-			goto ip;
+			proto = htons(ETH_P_IP);
+			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+			break;
 		case htons(PPP_IPV6):
-			goto ipv6;
+			proto = htons(ETH_P_IPV6);
+			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+			break;
 		default:
-			goto out_bad;
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
 		}
+		break;
 	}
 	case htons(ETH_P_TIPC): {
 		struct {
@@ -640,8 +664,10 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			__be32 srcnode;
 		} *hdr, _hdr;
 		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
-		if (!hdr)
-			goto out_bad;
+		if (!hdr) {
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
+		}
 
 		if (dissector_uses_key(flow_dissector,
 				       FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
@@ -651,56 +677,63 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			key_addrs->tipcaddrs.srcnode = hdr->srcnode;
 			key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS;
 		}
-		goto out_good;
+		fdret = FLOW_DISSECT_RET_OUT_GOOD;
+		break;
 	}
 
 	case htons(ETH_P_MPLS_UC):
 	case htons(ETH_P_MPLS_MC):
-mpls:
-		switch (__skb_flow_dissect_mpls(skb, flow_dissector,
+		fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
 						target_container, data,
-						nhoff, hlen)) {
-		case FLOW_DISSECT_RET_OUT_GOOD:
-			goto out_good;
-		case FLOW_DISSECT_RET_OUT_BAD:
-		default:
-			goto out_bad;
-		}
+						nhoff, hlen);
+		break;
 	case htons(ETH_P_FCOE):
-		if ((hlen - nhoff) < FCOE_HEADER_LEN)
-			goto out_bad;
+		if ((hlen - nhoff) < FCOE_HEADER_LEN) {
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
+		}
 
 		nhoff += FCOE_HEADER_LEN;
-		goto out_good;
+		fdret = FLOW_DISSECT_RET_OUT_GOOD;
+		break;
 
 	case htons(ETH_P_ARP):
 	case htons(ETH_P_RARP):
-		switch (__skb_flow_dissect_arp(skb, flow_dissector,
+		fdret = __skb_flow_dissect_arp(skb, flow_dissector,
 					       target_container, data,
-					       nhoff, hlen)) {
-		case FLOW_DISSECT_RET_OUT_GOOD:
-			goto out_good;
-		case FLOW_DISSECT_RET_OUT_BAD:
-		default:
-			goto out_bad;
-		}
+					       nhoff, hlen);
+		break;
+
+	default:
+		fdret = FLOW_DISSECT_RET_OUT_BAD;
+		break;
+	}
+
+	/* Process result of proto processing */
+	switch (fdret) {
+	case FLOW_DISSECT_RET_OUT_GOOD:
+		goto out_good;
+	case FLOW_DISSECT_RET_PROTO_AGAIN:
+		goto proto_again;
+	case FLOW_DISSECT_RET_CONTINUE:
+	case FLOW_DISSECT_RET_IPPROTO_AGAIN:
+	case FLOW_DISSECT_RET_IPPROTO_AGAIN_EH:
+		break;
+	case FLOW_DISSECT_RET_OUT_BAD:
 	default:
 		goto out_bad;
 	}
 
 ip_proto_again:
+	fdret = FLOW_DISSECT_RET_CONTINUE;
+
 	switch (ip_proto) {
 	case IPPROTO_GRE:
-		switch (__skb_flow_dissect_gre(skb, key_control, flow_dissector,
+		fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector,
 					       target_container, data,
-					       &proto, &nhoff, &hlen, flags)) {
-		case FLOW_DISSECT_RET_OUT_GOOD:
-			goto out_good;
-		case FLOW_DISSECT_RET_OUT_BAD:
-			goto out_bad;
-		case FLOW_DISSECT_RET_OUT_PROTO_AGAIN:
-			goto proto_again;
-		}
+					       &proto, &nhoff, &hlen, flags);
+		break;
+
 	case NEXTHDR_HOP:
 	case NEXTHDR_ROUTING:
 	case NEXTHDR_DEST: {
@@ -711,13 +744,16 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 
 		opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
 					      data, hlen, &_opthdr);
-		if (!opthdr)
-			goto out_bad;
+		if (!opthdr) {
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
+		}
 
 		ip_proto = opthdr[0];
 		nhoff += (opthdr[1] + 1) << 3;
 
-		goto ip_proto_again;
+		fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN_EH;
+		break;
 	}
 	case NEXTHDR_FRAGMENT: {
 		struct frag_hdr _fh, *fh;
@@ -728,8 +764,10 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		fh = __skb_header_pointer(skb, nhoff, sizeof(_fh),
 					  data, hlen, &_fh);
 
-		if (!fh)
-			goto out_bad;
+		if (!fh) {
+			fdret = FLOW_DISSECT_RET_OUT_BAD;
+			break;
+		}
 
 		key_control->flags |= FLOW_DIS_IS_FRAGMENT;
 
@@ -738,34 +776,50 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 
 		if (!(fh->frag_off & htons(IP6_OFFSET))) {
 			key_control->flags |= FLOW_DIS_FIRST_FRAG;
-			if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)
-				goto ip_proto_again;
+			if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
+				fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN_EH;
+				break;
+			}
 		}
-		goto out_good;
+
+		fdret = FLOW_DISSECT_RET_OUT_GOOD;
+		break;
 	}
 	case IPPROTO_IPIP:
 		proto = htons(ETH_P_IP);
 
 		key_control->flags |= FLOW_DIS_ENCAPSULATION;
-		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
-			goto out_good;
+		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
+			fdret = FLOW_DISSECT_RET_OUT_GOOD;
+			break;
+		}
+
+		fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+		break;
 
-		goto ip;
 	case IPPROTO_IPV6:
 		proto = htons(ETH_P_IPV6);
 
 		key_control->flags |= FLOW_DIS_ENCAPSULATION;
-		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
-			goto out_good;
+		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
+			fdret = FLOW_DISSECT_RET_OUT_GOOD;
+			break;
+		}
+
+		fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+		break;
+
 
-		goto ipv6;
 	case IPPROTO_MPLS:
 		proto = htons(ETH_P_MPLS_UC);
-		goto mpls;
+		fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+		break;
+
 	case IPPROTO_TCP:
 		__skb_flow_dissect_tcp(skb, flow_dissector, target_container,
 				       data, nhoff, hlen);
 		break;
+
 	default:
 		break;
 	}
@@ -787,6 +841,21 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
 	}
 
+	/* Process result of IP proto processing */
+	switch (fdret) {
+	case FLOW_DISSECT_RET_PROTO_AGAIN:
+		goto proto_again;
+	case FLOW_DISSECT_RET_IPPROTO_AGAIN:
+	case FLOW_DISSECT_RET_IPPROTO_AGAIN_EH:
+		goto ip_proto_again;
+	case FLOW_DISSECT_RET_OUT_GOOD:
+	case FLOW_DISSECT_RET_CONTINUE:
+		break;
+	case FLOW_DISSECT_RET_OUT_BAD:
+	default:
+		goto out_bad;
+	}
+
 out_good:
 	ret = true;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 0/2] flow_dissector: Flow dissector fixes
From: Tom Herbert @ 2017-08-31 22:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, alex.popov, hannes, Tom Herbert

This patch set fixes some basic issues with __skb_flow_dissect function.

Items addressed:
  - Cleanup control flow in the fucntion; in particular eliminate a
    bunch of goto's and implement a simplified control flow model
  - Add limits for number of encapsulations of extension headers that
    can be dissected

Tested:

Ran normal traffic, GUE, and VXLAN traffic.


Tom Herbert (2):
  flow_dissector: Cleanup control flow
  flow_dissector: Add limits for encapsulation and EH

 include/net/flow_dissector.h |   9 ++
 net/core/flow_dissector.c    | 267 ++++++++++++++++++++++++++++++-------------
 2 files changed, 198 insertions(+), 78 deletions(-)

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH v3 net-next 2/7] bpf: Allow cgroup sock filters to use get_current_uid_gid helper
From: Daniel Borkmann @ 2017-08-31 22:22 UTC (permalink / raw)
  To: David Ahern, netdev, ast
In-Reply-To: <1504217150-16151-3-git-send-email-dsahern@gmail.com>

On 09/01/2017 12:05 AM, David Ahern wrote:
> Allow BPF programs run on sock create to use the get_current_uid_gid
> helper. IPv4 and IPv6 sockets are created in a process context so
> there is always a valid uid/gid
>
> Signed-off-by: David Ahern <dsahern@gmail.com>
> Acked-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

^ permalink raw reply

* Re: [PATCH v3 net-next 1/7] bpf: Add mark and priority to sock options that can be set
From: Daniel Borkmann @ 2017-08-31 22:21 UTC (permalink / raw)
  To: David Ahern, netdev, ast
In-Reply-To: <1504217150-16151-2-git-send-email-dsahern@gmail.com>

On 09/01/2017 12:05 AM, David Ahern wrote:
> Add socket mark and priority to fields that can be set by
> ebpf program when a socket is created.
>
> Signed-off-by: David Ahern <dsahern@gmail.com>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> ---
>   include/uapi/linux/bpf.h |  2 ++
>   net/core/filter.c        | 26 ++++++++++++++++++++++++++
>   2 files changed, 28 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d46cf326b95f..e9c89e20adff 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -758,6 +758,8 @@ struct bpf_sock {
>   	__u32 family;
>   	__u32 type;
>   	__u32 protocol;
> +	__u32 mark;
> +	__u32 priority;
>   };
>
>   #define XDP_PACKET_HEADROOM 256
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c6a37fe0285b..f51b9690adf3 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int size,
>   		switch (off) {
>   		case offsetof(struct bpf_sock, bound_dev_if):
>   			break;
> +		case offsetof(struct bpf_sock, mark):
> +			break;
> +		case offsetof(struct bpf_sock, priority):
> +			break;

Can also be follow-up, but please do keep this consistent
to all the other *_is_valid_access() helpers, meaning:

	switch (off) {
	case offsetof(struct bpf_sock, bound_dev_if):
	case offsetof(struct bpf_sock, mark):
	case offsetof(struct bpf_sock, priority):
		break;
	default:
		return false;
	}

Rest:

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

^ permalink raw reply

* Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
From: Chenbo Feng @ 2017-08-31 22:17 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: Chenbo Feng, linux-security-module, Jeffrey Vander Stoep, netdev,
	SELinux, Alexei Starovoitov, Lorenzo Colitti
In-Reply-To: <1504214241.6901.31.camel@linux.vnet.ibm.com>

On Thu, Aug 31, 2017 at 2:17 PM, Mimi Zohar <zohar@linux.vnet.ibm.com> wrote:
> On Thu, 2017-08-31 at 13:56 -0700, Chenbo Feng wrote:
>> From: Chenbo Feng <fengc@google.com>
>>
>> Introduce a pointer into struct bpf_map to hold the security information
>> about the map. The actual security struct varies based on the security
>> models implemented. Place the LSM hooks before each of the unrestricted
>> eBPF operations, the map_update_elem and map_delete_elem operations are
>> checked by security_map_modify. The map_lookup_elem and map_get_next_key
>> operations are checked by securtiy_map_read.
>>
>> Signed-off-by: Chenbo Feng <fengc@google.com>
>> ---
>>  include/linux/bpf.h  |  3 +++
>>  kernel/bpf/syscall.c | 28 ++++++++++++++++++++++++++++
>>  2 files changed, 31 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index b69e7a5869ff..ca3e6ff7091d 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -53,6 +53,9 @@ struct bpf_map {
>>       struct work_struct work;
>>       atomic_t usercnt;
>>       struct bpf_map *inner_map_meta;
>> +#ifdef CONFIG_SECURITY
>> +     void *security;
>> +#endif
>>  };
>>
>>  /* function argument constraints */
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 045646da97cc..b15580bcf3b1 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr)
>>       if (err)
>>               return -EINVAL;
>>
>> +     err = security_map_create();
>> +     if (err)
>> +             return -EACCES;
>
> Any reason not to just return err?
>
> Mimi
>
Nope... return err might be better. I will fix this in next version.

Thanks
Chenbo
>> +
>>       /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
>>       map = find_and_alloc_map(attr);
>>       if (IS_ERR(map))
>> @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr)
>>       if (err)
>>               goto free_map_nouncharge;
>>
>> +     err = security_post_create(map);
>> +     if (err < 0)
>> +             goto free_map;
>> +
>>       err = bpf_map_alloc_id(map);
>>       if (err)
>>               goto free_map;
>> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
>>       if (IS_ERR(map))
>>               return PTR_ERR(map);
>>
>> +     err = security_map_read(map);
>> +     if (err)
>> +             return -EACCES;
>> +
>>       key = memdup_user(ukey, map->key_size);
>>       if (IS_ERR(key)) {
>>               err = PTR_ERR(key);
>> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
>>       if (IS_ERR(map))
>>               return PTR_ERR(map);
>>
>> +     err = security_map_modify(map);
>> +     if (err)
>> +             return -EACCES;
>> +
>>       key = memdup_user(ukey, map->key_size);
>>       if (IS_ERR(key)) {
>>               err = PTR_ERR(key);
>> @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr)
>>       if (IS_ERR(map))
>>               return PTR_ERR(map);
>>
>> +     err = security_map_modify(map);
>> +     if (err)
>> +             return -EACCES;
>> +
>>       key = memdup_user(ukey, map->key_size);
>>       if (IS_ERR(key)) {
>>               err = PTR_ERR(key);
>> @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr)
>>       if (IS_ERR(map))
>>               return PTR_ERR(map);
>>
>> +     err = security_map_read(map);
>> +     if (err)
>> +             return -EACCES;
>> +
>>       if (ukey) {
>>               key = memdup_user(ukey, map->key_size);
>>               if (IS_ERR(key)) {
>> @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr)
>>       if (CHECK_ATTR(BPF_PROG_LOAD))
>>               return -EINVAL;
>>
>> +     err = security_prog_load();
>> +     if (err)
>> +             return -EACCES;
>> +
>>       if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
>>               return -EINVAL;
>>
>

^ permalink raw reply

* [PATCH v3 net-next 7/7] samples/bpf: Update cgroup socket examples to use uid gid helper
From: David Ahern @ 2017-08-31 22:05 UTC (permalink / raw)
  To: netdev, daniel, ast; +Cc: David Ahern
In-Reply-To: <1504217150-16151-1-git-send-email-dsahern@gmail.com>

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/sock_flags_kern.c |  5 +++++
 samples/bpf/test_cgrp2_sock.c | 12 +++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c
index 533dd11a6baa..05dcdf8a4baa 100644
--- a/samples/bpf/sock_flags_kern.c
+++ b/samples/bpf/sock_flags_kern.c
@@ -9,8 +9,13 @@ SEC("cgroup/sock1")
 int bpf_prog1(struct bpf_sock *sk)
 {
 	char fmt[] = "socket: family %d type %d protocol %d\n";
+	char fmt2[] = "socket: uid %u gid %u\n";
+	__u64 gid_uid = bpf_get_current_uid_gid();
+	__u32 uid = gid_uid & 0xffffffff;
+	__u32 gid = gid_uid >> 32;
 
 	bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+	bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid);
 
 	/* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets
 	 * ie., make ping6 fail
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index 5a688837720c..e79594dd629b 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -46,8 +46,18 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio)
 
 	/* set mark on socket */
 	struct bpf_insn prog_mark[] = {
-		BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+		/* get uid of process */
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_current_uid_gid),
+		BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff),
+
+		/* if uid is 0, use given mark, else use the uid as the mark */
+		BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
 		BPF_MOV64_IMM(BPF_REG_3, mark),
+
+		/* set the mark on the new socket */
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
 		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)),
 		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)),
 	};
-- 
2.1.4

^ permalink raw reply related

* [PATCH v3 net-next 6/7] samples/bpf: Update cgrp2 socket tests
From: David Ahern @ 2017-08-31 22:05 UTC (permalink / raw)
  To: netdev, daniel, ast; +Cc: David Ahern
In-Reply-To: <1504217150-16151-1-git-send-email-dsahern@gmail.com>

Update cgrp2 bpf sock tests to check that device, mark and priority
can all be set on a socket via bpf programs attached to a cgroup.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/test_cgrp2_sock.sh | 162 +++++++++++++++++++++++++++++++----------
 1 file changed, 124 insertions(+), 38 deletions(-)

diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
index 1153c33e8964..a81f38eef417 100755
--- a/samples/bpf/test_cgrp2_sock.sh
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -1,47 +1,133 @@
-#!/bin/bash
-
-function config_device {
-	ip netns add at_ns0
-	ip link add veth0 type veth peer name veth0b
-	ip link set veth0b up
-	ip link set veth0 netns at_ns0
-	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
-	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
-	ip netns exec at_ns0 ip link set dev veth0 up
-	ip link add foo type vrf table 1234
-	ip link set foo up
-	ip addr add 172.16.1.101/24 dev veth0b
-	ip addr add 2401:db00::2/64 dev veth0b nodad
-	ip link set veth0b master foo
+#!/bin/sh
+
+# Test various socket options that can be set by attaching programs to cgroups.
+
+CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock"
+
+################################################################################
+#
+print_result()
+{
+	local rc=$1
+	local status=" OK "
+
+	[ $rc -ne 0 ] && status="FAIL"
+
+	printf "%-50s    [%4s]\n" "$2" "$status"
 }
 
-function attach_bpf {
-	rm -rf /tmp/cgroupv2
-	mkdir -p /tmp/cgroupv2
-	mount -t cgroup2 none /tmp/cgroupv2
-	mkdir -p /tmp/cgroupv2/foo
-	test_cgrp2_sock -b foo /tmp/cgroupv2/foo
-	echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+check_sock()
+{
+	out=$(test_cgrp2_sock)
+	echo $out | grep -q "$1"
+	if [ $? -ne 0 ]; then
+		print_result 1 "IPv4: $2"
+		echo "    expected: $1"
+		echo "        have: $out"
+		rc=1
+	else
+		print_result 0 "IPv4: $2"
+	fi
 }
 
-function cleanup {
-	set +ex
-	ip netns delete at_ns0
-	ip link del veth0
-	ip link del foo
-	umount /tmp/cgroupv2
-	rm -rf /tmp/cgroupv2
-	set -ex
+check_sock6()
+{
+	out=$(test_cgrp2_sock -6)
+	echo $out | grep -q "$1"
+	if [ $? -ne 0 ]; then
+		print_result 1 "IPv6: $2"
+		echo "    expected: $1"
+		echo "        have: $out"
+		rc=1
+	else
+		print_result 0 "IPv6: $2"
+	fi
 }
 
-function do_test {
-	ping -c1 -w1 172.16.1.100
-	ping6 -c1 -w1 2401:db00::1
+################################################################################
+#
+
+cleanup()
+{
+	echo $$ >> ${CGRP_MNT}/cgroup.procs
+	rmdir ${CGRP_MNT}/sockopts
 }
 
+cleanup_and_exit()
+{
+	local rc=$1
+	local msg="$2"
+
+	[ -n "$msg" ] && echo "ERROR: $msg"
+
+	ip li del cgrp2_sock
+	umount ${CGRP_MNT}
+
+	exit $rc
+}
+
+
+################################################################################
+# main
+
+rc=0
+
+ip li add cgrp2_sock type dummy 2>/dev/null
+
+set -e
+mkdir -p ${CGRP_MNT}
+mount -t cgroup2 none ${CGRP_MNT}
+set +e
+
+
+# make sure we have a known start point
 cleanup 2>/dev/null
-config_device
-attach_bpf
-do_test
-cleanup
-echo "*** PASS ***"
+
+mkdir -p ${CGRP_MNT}/sockopts
+[ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy"
+
+
+# set pid into cgroup
+echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs
+
+# no bpf program attached, so socket should show no settings
+check_sock "dev , mark 0, priority 0" "No programs attached"
+check_sock6 "dev , mark 0, priority 0" "No programs attached"
+
+# verify device is set
+#
+test_cgrp2_sock -b cgrp2_sock ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+	cleanup_and_exit 1 "Failed to install program to set device"
+fi
+check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set"
+check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set"
+
+# verify mark is set
+#
+test_cgrp2_sock -m 666 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+	cleanup_and_exit 1 "Failed to install program to set mark"
+fi
+check_sock "dev , mark 666, priority 0" "Mark set"
+check_sock6 "dev , mark 666, priority 0" "Mark set"
+
+# verify priority is set
+#
+test_cgrp2_sock -p 123 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+	cleanup_and_exit 1 "Failed to install program to set priority"
+fi
+check_sock "dev , mark 0, priority 123" "Priority set"
+check_sock6 "dev , mark 0, priority 123" "Priority set"
+
+# all 3 at once
+#
+test_cgrp2_sock -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+	cleanup_and_exit 1 "Failed to install program to set device, mark and priority"
+fi
+check_sock "dev cgrp2_sock, mark 666, priority 123" "Priority set"
+check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Priority set"
+
+cleanup_and_exit $rc
-- 
2.1.4

^ permalink raw reply related

* [PATCH v3 net-next 5/7] samples/bpf: Add option to dump socket settings
From: David Ahern @ 2017-08-31 22:05 UTC (permalink / raw)
  To: netdev, daniel, ast; +Cc: David Ahern
In-Reply-To: <1504217150-16151-1-git-send-email-dsahern@gmail.com>

Add option to dump socket settings. Will be used in the next patch
to verify bpf programs are correctly setting mark, priority and
device based on the cgroup attachment for the program run.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/test_cgrp2_sock.c | 75 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index 15396761c5cc..5a688837720c 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -112,6 +112,70 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio)
 	return ret;
 }
 
+static int get_bind_to_device(int sd, char *name, size_t len)
+{
+	socklen_t optlen = len;
+	int rc;
+
+	name[0] = '\0';
+	rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen);
+	if (rc < 0)
+		perror("setsockopt(SO_BINDTODEVICE)");
+
+	return rc;
+}
+
+static unsigned int get_somark(int sd)
+{
+	unsigned int mark = 0;
+	socklen_t optlen = sizeof(mark);
+	int rc;
+
+	rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen);
+	if (rc < 0)
+		perror("getsockopt(SO_MARK)");
+
+	return mark;
+}
+
+static unsigned int get_priority(int sd)
+{
+	unsigned int prio = 0;
+	socklen_t optlen = sizeof(prio);
+	int rc;
+
+	rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen);
+	if (rc < 0)
+		perror("getsockopt(SO_PRIORITY)");
+
+	return prio;
+}
+
+static int show_sockopts(int family)
+{
+	unsigned int mark, prio;
+	char name[16];
+	int sd;
+
+	sd = socket(family, SOCK_DGRAM, 17);
+	if (sd < 0) {
+		perror("socket");
+		return 1;
+	}
+
+	if (get_bind_to_device(sd, name, sizeof(name)) < 0)
+		return 1;
+
+	mark = get_somark(sd);
+	prio = get_priority(sd);
+
+	close(sd);
+
+	printf("sd %d: dev %s, mark %u, priority %u\n", sd, name, mark, prio);
+
+	return 0;
+}
+
 static int usage(const char *argv0)
 {
 	printf("Usage:\n");
@@ -120,6 +184,9 @@ static int usage(const char *argv0)
 	printf("\n");
 	printf("  Detach a program\n");
 	printf("  %s -d cg-path\n", argv0);
+	printf("\n");
+	printf("  Show inherited socket settings (mark, priority, and device)\n");
+	printf("  %s [-6]\n", argv0);
 	return EXIT_FAILURE;
 }
 
@@ -128,10 +195,11 @@ int main(int argc, char **argv)
 	__u32 idx = 0, mark = 0, prio = 0;
 	const char *cgrp_path = NULL;
 	int cg_fd, prog_fd, ret;
+	int family = PF_INET;
 	int do_attach = 1;
 	int rc;
 
-	while ((rc = getopt(argc, argv, "db:m:p:")) != -1) {
+	while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) {
 		switch (rc) {
 		case 'd':
 			do_attach = 0;
@@ -152,13 +220,16 @@ int main(int argc, char **argv)
 		case 'p':
 			prio = strtoumax(optarg, NULL, 0);
 			break;
+		case '6':
+			family = PF_INET6;
+			break;
 		default:
 			return usage(argv[0]);
 		}
 	}
 
 	if (optind == argc)
-		return usage(argv[0]);
+		return show_sockopts(family);
 
 	cgrp_path = argv[optind];
 	if (!cgrp_path) {
-- 
2.1.4

^ permalink raw reply related

* [PATCH v3 net-next 3/7] samples/bpf: Update sock test to allow setting mark and priority
From: David Ahern @ 2017-08-31 22:05 UTC (permalink / raw)
  To: netdev, daniel, ast; +Cc: David Ahern
In-Reply-To: <1504217150-16151-1-git-send-email-dsahern@gmail.com>

Update sock test to set mark and priority on socket create.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/test_cgrp2_sock.c  | 134 ++++++++++++++++++++++++++++++++++++-----
 samples/bpf/test_cgrp2_sock.sh |   2 +-
 2 files changed, 119 insertions(+), 17 deletions(-)

diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index c3cfb23e23b5..681abbe6c85e 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -19,59 +19,161 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <net/if.h>
+#include <inttypes.h>
 #include <linux/bpf.h>
 
 #include "libbpf.h"
 
 char bpf_log_buf[BPF_LOG_BUF_SIZE];
 
-static int prog_load(int idx)
+static int prog_load(__u32 idx, __u32 mark, __u32 prio)
 {
-	struct bpf_insn prog[] = {
+	/* save pointer to context */
+	struct bpf_insn prog_start[] = {
 		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	};
+	struct bpf_insn prog_end[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+
+	/* set sk_bound_dev_if on socket */
+	struct bpf_insn prog_dev[] = {
 		BPF_MOV64_IMM(BPF_REG_3, idx),
 		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
 		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
-		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
-		BPF_EXIT_INSN(),
 	};
-	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
 
-	return bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt,
+	/* set mark on socket */
+	struct bpf_insn prog_mark[] = {
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+		BPF_MOV64_IMM(BPF_REG_3, mark),
+		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)),
+		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)),
+	};
+
+	/* set priority on socket */
+	struct bpf_insn prog_prio[] = {
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+		BPF_MOV64_IMM(BPF_REG_3, prio),
+		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)),
+		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)),
+	};
+
+	struct bpf_insn *prog;
+	size_t insns_cnt;
+	void *p;
+	int ret;
+
+	insns_cnt = sizeof(prog_start) + sizeof(prog_end);
+	if (idx)
+		insns_cnt += sizeof(prog_dev);
+
+	if (mark)
+		insns_cnt += sizeof(prog_mark);
+
+	if (prio)
+		insns_cnt += sizeof(prog_prio);
+
+	p = prog = malloc(insns_cnt);
+	if (!prog) {
+		fprintf(stderr, "Failed to allocate memory for instructions\n");
+		return EXIT_FAILURE;
+	}
+
+	memcpy(p, prog_start, sizeof(prog_start));
+	p += sizeof(prog_start);
+
+	if (idx) {
+		memcpy(p, prog_dev, sizeof(prog_dev));
+		p += sizeof(prog_dev);
+	}
+
+	if (mark) {
+		memcpy(p, prog_mark, sizeof(prog_mark));
+		p += sizeof(prog_mark);
+	}
+
+	if (prio) {
+		memcpy(p, prog_prio, sizeof(prog_prio));
+		p += sizeof(prog_prio);
+	}
+
+	memcpy(p, prog_end, sizeof(prog_end));
+	p += sizeof(prog_end);
+
+	insns_cnt /= sizeof(struct bpf_insn);
+
+	ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt,
 				"GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+	free(prog);
+
+	return ret;
 }
 
 static int usage(const char *argv0)
 {
-	printf("Usage: %s cg-path device-index\n", argv0);
+	printf("Usage: %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
 	return EXIT_FAILURE;
 }
 
 int main(int argc, char **argv)
 {
+	__u32 idx = 0, mark = 0, prio = 0;
+	const char *cgrp_path = NULL;
 	int cg_fd, prog_fd, ret;
-	unsigned int idx;
+	int rc;
+
+	while ((rc = getopt(argc, argv, "b:m:p:")) != -1) {
+		switch (rc) {
+		case 'b':
+			idx = if_nametoindex(optarg);
+			if (!idx) {
+				idx = strtoumax(optarg, NULL, 0);
+				if (!idx) {
+					printf("Invalid device name\n");
+					return EXIT_FAILURE;
+				}
+			}
+			break;
+		case 'm':
+			mark = strtoumax(optarg, NULL, 0);
+			break;
+		case 'p':
+			prio = strtoumax(optarg, NULL, 0);
+			break;
+		default:
+			return usage(argv[0]);
+		}
+	}
 
-	if (argc < 2)
+	if (optind == argc)
 		return usage(argv[0]);
 
-	idx = if_nametoindex(argv[2]);
-	if (!idx) {
-		printf("Invalid device name\n");
+	cgrp_path = argv[optind];
+	if (!cgrp_path) {
+		fprintf(stderr, "cgroup path not given\n");
 		return EXIT_FAILURE;
 	}
 
-	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+	if (!idx && !mark && !prio) {
+		fprintf(stderr,
+			"One of device, mark or priority must be given\n");
+		return EXIT_FAILURE;
+	}
+
+	cg_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY);
 	if (cg_fd < 0) {
 		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
 		return EXIT_FAILURE;
 	}
 
-	prog_fd = prog_load(idx);
-	printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
-
+	prog_fd = prog_load(idx, mark, prio);
 	if (prog_fd < 0) {
 		printf("Failed to load prog: '%s'\n", strerror(errno));
+		printf("Output from kernel verifier:\n%s\n-------\n",
+		       bpf_log_buf);
 		return EXIT_FAILURE;
 	}
 
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
index 925fd467c7cc..1153c33e8964 100755
--- a/samples/bpf/test_cgrp2_sock.sh
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -20,7 +20,7 @@ function attach_bpf {
 	mkdir -p /tmp/cgroupv2
 	mount -t cgroup2 none /tmp/cgroupv2
 	mkdir -p /tmp/cgroupv2/foo
-	test_cgrp2_sock /tmp/cgroupv2/foo foo
+	test_cgrp2_sock -b foo /tmp/cgroupv2/foo
 	echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
 }
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH v3 net-next 4/7] samples/bpf: Add detach option to test_cgrp2_sock
From: David Ahern @ 2017-08-31 22:05 UTC (permalink / raw)
  To: netdev, daniel, ast; +Cc: David Ahern
In-Reply-To: <1504217150-16151-1-git-send-email-dsahern@gmail.com>

Add option to detach programs from a cgroup.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/test_cgrp2_sock.c | 50 ++++++++++++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index 681abbe6c85e..15396761c5cc 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -114,7 +114,12 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio)
 
 static int usage(const char *argv0)
 {
-	printf("Usage: %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
+	printf("Usage:\n");
+	printf("  Attach a program\n");
+	printf("  %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
+	printf("\n");
+	printf("  Detach a program\n");
+	printf("  %s -d cg-path\n", argv0);
 	return EXIT_FAILURE;
 }
 
@@ -123,10 +128,14 @@ int main(int argc, char **argv)
 	__u32 idx = 0, mark = 0, prio = 0;
 	const char *cgrp_path = NULL;
 	int cg_fd, prog_fd, ret;
+	int do_attach = 1;
 	int rc;
 
-	while ((rc = getopt(argc, argv, "b:m:p:")) != -1) {
+	while ((rc = getopt(argc, argv, "db:m:p:")) != -1) {
 		switch (rc) {
+		case 'd':
+			do_attach = 0;
+			break;
 		case 'b':
 			idx = if_nametoindex(optarg);
 			if (!idx) {
@@ -157,7 +166,7 @@ int main(int argc, char **argv)
 		return EXIT_FAILURE;
 	}
 
-	if (!idx && !mark && !prio) {
+	if (do_attach && !idx && !mark && !prio) {
 		fprintf(stderr,
 			"One of device, mark or priority must be given\n");
 		return EXIT_FAILURE;
@@ -169,20 +178,31 @@ int main(int argc, char **argv)
 		return EXIT_FAILURE;
 	}
 
-	prog_fd = prog_load(idx, mark, prio);
-	if (prog_fd < 0) {
-		printf("Failed to load prog: '%s'\n", strerror(errno));
-		printf("Output from kernel verifier:\n%s\n-------\n",
-		       bpf_log_buf);
-		return EXIT_FAILURE;
-	}
+	if (do_attach) {
+		prog_fd = prog_load(idx, mark, prio);
+		if (prog_fd < 0) {
+			printf("Failed to load prog: '%s'\n", strerror(errno));
+			printf("Output from kernel verifier:\n%s\n-------\n",
+			       bpf_log_buf);
+			return EXIT_FAILURE;
+		}
 
-	ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0);
-	if (ret < 0) {
-		printf("Failed to attach prog to cgroup: '%s'\n",
-		       strerror(errno));
-		return EXIT_FAILURE;
+		ret = bpf_prog_attach(prog_fd, cg_fd,
+				      BPF_CGROUP_INET_SOCK_CREATE, 0);
+		if (ret < 0) {
+			printf("Failed to attach prog to cgroup: '%s'\n",
+			       strerror(errno));
+			return EXIT_FAILURE;
+		}
+	} else {
+		ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
+		if (ret < 0) {
+			printf("Failed to detach prog from cgroup: '%s'\n",
+			       strerror(errno));
+			return EXIT_FAILURE;
+		}
 	}
 
+	close(cg_fd);
 	return EXIT_SUCCESS;
 }
-- 
2.1.4

^ permalink raw reply related

* [PATCH v3 net-next 2/7] bpf: Allow cgroup sock filters to use get_current_uid_gid helper
From: David Ahern @ 2017-08-31 22:05 UTC (permalink / raw)
  To: netdev, daniel, ast; +Cc: David Ahern
In-Reply-To: <1504217150-16151-1-git-send-email-dsahern@gmail.com>

Allow BPF programs run on sock create to use the get_current_uid_gid
helper. IPv4 and IPv6 sockets are created in a process context so
there is always a valid uid/gid

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index f51b9690adf3..9dad3e7e2e10 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3150,6 +3150,20 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
+sock_filter_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	/* inet and inet6 sockets are created in a process
+	 * context so there is always a valid uid/gid
+	 */
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -4233,7 +4247,7 @@ const struct bpf_verifier_ops lwt_xmit_prog_ops = {
 };
 
 const struct bpf_verifier_ops cg_sock_prog_ops = {
-	.get_func_proto		= bpf_base_func_proto,
+	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
 	.convert_ctx_access	= sock_filter_convert_ctx_access,
 };
-- 
2.1.4

^ permalink raw reply related

* [PATCH v3 net-next 1/7] bpf: Add mark and priority to sock options that can be set
From: David Ahern @ 2017-08-31 22:05 UTC (permalink / raw)
  To: netdev, daniel, ast; +Cc: David Ahern
In-Reply-To: <1504217150-16151-1-git-send-email-dsahern@gmail.com>

Add socket mark and priority to fields that can be set by
ebpf program when a socket is created.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  2 ++
 net/core/filter.c        | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d46cf326b95f..e9c89e20adff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -758,6 +758,8 @@ struct bpf_sock {
 	__u32 family;
 	__u32 type;
 	__u32 protocol;
+	__u32 mark;
+	__u32 priority;
 };
 
 #define XDP_PACKET_HEADROOM 256
diff --git a/net/core/filter.c b/net/core/filter.c
index c6a37fe0285b..f51b9690adf3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int size,
 		switch (off) {
 		case offsetof(struct bpf_sock, bound_dev_if):
 			break;
+		case offsetof(struct bpf_sock, mark):
+			break;
+		case offsetof(struct bpf_sock, priority):
+			break;
 		default:
 			return false;
 		}
@@ -3958,6 +3962,28 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct sock, sk_bound_dev_if));
 		break;
 
+	case offsetof(struct bpf_sock, mark):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+					offsetof(struct sock, sk_mark));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct sock, sk_mark));
+		break;
+
+	case offsetof(struct bpf_sock, priority):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+					offsetof(struct sock, sk_priority));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct sock, sk_priority));
+		break;
+
 	case offsetof(struct bpf_sock, family):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH v3 net-next 0/7] bpf: Add option to set mark and priority in cgroup sock programs
From: David Ahern @ 2017-08-31 22:05 UTC (permalink / raw)
  To: netdev, daniel, ast; +Cc: David Ahern

Add option to set mark and priority in addition to bound device for newly
created sockets. Also, allow the bpf programs to use the get_current_uid_gid
helper meaning socket marks, priority and device can be set based on the
uid/gid of the running process.

Sample programs are updated to demonstrate the new options.

v3
- no changes to Patches 1 and 2 which Alexei acked in previous versions
- dropped change related to recursive programs in a cgroup
- updated tests per dropped patch

v2
- added flag to control recursive behavior as requested by Alexei
- added comment to sock_filter_func_proto regarding use of
  get_current_uid_gid helper
- updated test programs for recursive option

David Ahern (7):
  bpf: Add mark and priority to sock options that can be set
  bpf: Allow cgroup sock filters to use get_current_uid_gid helper
  samples/bpf: Update sock test to allow setting mark and priority
  samples/bpf: Add detach option to test_cgrp2_sock
  samples/bpf: Add option to dump socket settings
  samples/bpf: Update cgrp2 socket tests
  samples/bpf: Update cgroup socket examples to use uid gid helper

 include/uapi/linux/bpf.h       |   2 +
 net/core/filter.c              |  42 ++++++-
 samples/bpf/sock_flags_kern.c  |   5 +
 samples/bpf/test_cgrp2_sock.c  | 255 ++++++++++++++++++++++++++++++++++++-----
 samples/bpf/test_cgrp2_sock.sh | 162 ++++++++++++++++++++------
 5 files changed, 401 insertions(+), 65 deletions(-)

-- 
2.1.4

^ permalink raw reply

* Re: [PATCH net-next] bridge: add tracepoint in br_fdb_update
From: Jesper Dangaard Brouer @ 2017-08-31 21:50 UTC (permalink / raw)
  To: David Miller; +Cc: brouer, roopa, netdev, nikolay, f.fainelli, andrew, bridge
In-Reply-To: <20170831.114325.2027598728601121750.davem@davemloft.net>

On Thu, 31 Aug 2017 11:43:25 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> Date: Wed, 30 Aug 2017 22:18:13 -0700
> 
> > From: Roopa Prabhu <roopa@cumulusnetworks.com>
> > 
> > This extends bridge fdb table tracepoints to also cover
> > learned fdb entries in the br_fdb_update path. Note that
> > unlike other tracepoints I have moved this to when the fdb
> > is modified because this is in the datapath and can generate
> > a lot of noise in the trace output. br_fdb_update is also called
> > from added_by_user context in the NTF_USE case which is already
> > traced ..hence the !added_by_user check.
> > 
> > Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>  
> 
> Applied.
> 
> Let's use dev->name for now and if the tooling can eventually
> do transparent ifindex->name then we can consider redoing
> a bunch of networking tracepoints.

I agree! :-)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox