Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 2/2] mlxsw: spectrum_router: Return an error for routes added after abort
From: Ido Schimmel @ 2018-05-01  8:16 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, dsahern, mlxsw, Ido Schimmel
In-Reply-To: <20180501081639.29162-1-idosch@mellanox.com>

We currently do not perform accounting in the driver and thus can't
reject routes before resources are exceeded.

However, in order to make users aware of the fact that routes are no
longer offloaded we can return an error for routes configured after the
abort mechanism was triggered.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index baea97560029..c9fce669cee4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -5928,6 +5928,13 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 						     router->mlxsw_sp);
 		if (!err || info->extack)
 			return notifier_from_errno(err);
+		break;
+	case FIB_EVENT_ENTRY_ADD:
+		if (router->aborted) {
+			NL_SET_ERR_MSG_MOD(info->extack, "FIB offload was aborted. Not configuring route");
+			return notifier_from_errno(-EINVAL);
+		}
+		break;
 	}
 
 	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next 1/2] mlxsw: spectrum_router: Return an error for non-default FIB rules
From: Ido Schimmel @ 2018-05-01  8:16 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, dsahern, mlxsw, Ido Schimmel
In-Reply-To: <20180501081639.29162-1-idosch@mellanox.com>

Since commit 9776d32537d2 ("net: Move call_fib_rule_notifiers up in
fib_nl_newrule") it is possible to forbid the installation of
unsupported FIB rules.

Have mlxsw return an error for non-default FIB rules in addition to the
existing extack message.

Example:
# ip rule add from 198.51.100.1 table 10
Error: mlxsw_spectrum: FIB rules not supported.

Note that offload is only aborted when non-default FIB rules are already
installed and merely replayed during module initialization.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 8e4edb634b11..baea97560029 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -5899,7 +5899,7 @@ static int mlxsw_sp_router_fib_rule_event(unsigned long event,
 	}
 
 	if (err < 0)
-		NL_SET_ERR_MSG_MOD(extack, "FIB rules not supported. Aborting offload");
+		NL_SET_ERR_MSG_MOD(extack, "FIB rules not supported");
 
 	return err;
 }
@@ -5926,8 +5926,8 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	case FIB_EVENT_RULE_DEL:
 		err = mlxsw_sp_router_fib_rule_event(event, info,
 						     router->mlxsw_sp);
-		if (!err)
-			return NOTIFY_DONE;
+		if (!err || info->extack)
+			return notifier_from_errno(err);
 	}
 
 	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next 0/2] mlxsw: Reject unsupported FIB configurations
From: Ido Schimmel @ 2018-05-01  8:16 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, dsahern, mlxsw, Ido Schimmel

Recently it became possible for listeners of the FIB notification chain
to veto operations such as addition of routes and rules.

Adjust the mlxsw driver to take advantage of it and return an error for
unsupported FIB rules and for routes configured after the abort
mechanism was triggered (due to exceeded resources for example).

Ido Schimmel (2):
  mlxsw: spectrum_router: Return an error for non-default FIB rules
  mlxsw: spectrum_router: Return an error for routes added after abort

 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

-- 
2.14.3

^ permalink raw reply

* Re: [PATCH RFC 6/9] veth: Add ndo_xdp_xmit
From: Jesper Dangaard Brouer @ 2018-05-01  8:14 UTC (permalink / raw)
  To: Toshiaki Makita
  Cc: Toshiaki Makita, netdev, Tariq Toukan, brouer, Daniel Borkmann,
	Alexei Starovoitov, Eran Ben Elisha
In-Reply-To: <6ed7c0c5-d6cb-7870-38da-4bb49707a63c@lab.ntt.co.jp>

On Tue, 1 May 2018 10:02:01 +0900
Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:

> On 2018/05/01 2:27, Jesper Dangaard Brouer wrote:
> > On Thu, 26 Apr 2018 19:52:40 +0900
> > Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:
> >   
> >> On 2018/04/26 5:24, Jesper Dangaard Brouer wrote:  
> >>> On Tue, 24 Apr 2018 23:39:20 +0900
> >>> Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:
> >>>     
> >>>> +static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
> >>>> +{
> >>>> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> >>>> +	int headroom = frame->data - (void *)frame;
> >>>> +	struct net_device *rcv;
> >>>> +	int err = 0;
> >>>> +
> >>>> +	rcv = rcu_dereference(priv->peer);
> >>>> +	if (unlikely(!rcv))
> >>>> +		return -ENXIO;
> >>>> +
> >>>> +	rcv_priv = netdev_priv(rcv);
> >>>> +	/* xdp_ring is initialized on receive side? */
> >>>> +	if (rcu_access_pointer(rcv_priv->xdp_prog)) {
> >>>> +		err = xdp_ok_fwd_dev(rcv, frame->len);
> >>>> +		if (unlikely(err))
> >>>> +			return err;
> >>>> +
> >>>> +		err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
> >>>> +	} else {
> >>>> +		struct sk_buff *skb;
> >>>> +
> >>>> +		skb = veth_build_skb(frame, headroom, frame->len, 0);
> >>>> +		if (unlikely(!skb))
> >>>> +			return -ENOMEM;
> >>>> +
> >>>> +		/* Get page ref in case skb is dropped in netif_rx.
> >>>> +		 * The caller is responsible for freeing the page on error.
> >>>> +		 */
> >>>> +		get_page(virt_to_page(frame->data));    
> >>>
> >>> I'm not sure you can make this assumption, that xdp_frames coming from
> >>> another device driver uses a refcnt based memory model. But maybe I'm
> >>> confused, as this looks like an SKB receive path, but in the
> >>> ndo_xdp_xmit().    
> >>
> >> I find this path similar to cpumap, which creates skb from redirected
> >> xdp frame. Once it is converted to skb, skb head is freed by
> >> page_frag_free, so anyway I needed to get the refcount here regardless
> >> of memory model.  
> > 
> > Yes I know, I wrote cpumap ;-)
> > 
> > First of all, I don't want to see such xdp_frame to SKB conversion code
> > in every driver.  Because that increase the chances of errors.  And
> > when looking at the details, then it seems that you have made the
> > mistake of making it possible to leak xdp_frame info to the SKB (which
> > cpumap takes into account).  
> 
> Do you mean leaving xdp_frame in skb->head is leaking something? how?

Like commit 97e19cce05e5 ("bpf: reserve xdp_frame size in xdp headroom")
and commit 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data
on page reuse").

But this time, the concern is a bpf_prog attached at TC/bpf_cls level, that can 
that can adjust head via bpf_skb_change_head (for XDP it is
bpf_xdp_adjust_head) into the area used by xdp_frame.  As described in
https://git.kernel.org/davem/net-next/c/6dfb970d3dbd in is not super
critical at the moment, as this _currently_ runs as CAP_SYS_ADMIN, but
we would like to move towards CAP_NET_ADMIN.

 
> > Second, I think the refcnt scheme here is wrong. The xdp_frame should
> > be "owned" by XDP and have the proper refcnt to deliver it directly to
> > the network stack.
> > 
> > Third, if we choose that we want a fallback, in-case XDP is not enabled
> > on egress dev (but it have an ndo_xdp_xmit), then it should be placed
> > in the generic/core code.  E.g. __bpf_tx_xdp_map() could look at the
> > return code from dev->netdev_ops->ndo_xdp() and create an SKB.  (Hint,
> > this would make it easy to implement TX bulking towards the dev).  
> 
> Right, this is a much cleaner way.
>
> Although I feel like we should add this fallback for veth because it
> requires something which is different from other drivers (enabling XDP
> on the peer device of the egress device), 

(This is why I Cc'ed Tariq...)

This is actually a general problem with the xdp "xmit" side (and not
specific to veth driver). The problem exists for other drivers as well.

The problem is that a driver can implement ndo_xdp_xmit(), but the
driver might not have allocated the necessary XDP TX-queue resources
yet (or it might not be possible due to system resource limits).

The current "hack" is to load an XDP prog on the egress device, and
then assume that this cause the driver to also allocate the XDP
ndo_xdo_xmit side HW resources.  This is IMHO a wrong assumption.

We need a more generic way to test if a net_device is "ready/enabled"
for handling xdp_frames via ndo_xdp_xmit.  And Tariq had some ideas on
how to implement this...

My opinion is that it is a waste of (HW/mem) resources to always
allocate resources for ndo_xdp_xmit when loading an XDP program.
Because what if my use-cases are XDP_DROP DDoS filter, or CPUMAP
redirect load-balancer, then I don't want/need ndo_xdp_xmit.  E.g.
today using ixgbe on machines with more than 96 CPUs, will fail due to
limited TX queue resources. Thus, blocking the mentioned use-cases.


> I'll drop the part for now. It should not be resolved in the driver
> code.

Thank you.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH RFC 4/9] veth: Use NAPI for XDP
From: Toshiaki Makita @ 2018-05-01  8:02 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Toshiaki Makita, netdev
In-Reply-To: <20180501095008.207c354f@brouer.com>

On 2018/05/01 16:50, Jesper Dangaard Brouer wrote:
> On Tue, 24 Apr 2018 23:39:18 +0900
> Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:
> 
>> +static int veth_xdp_enqueue(struct veth_priv *priv, void *ptr)
>> +{
>> +	if (unlikely(ptr_ring_produce(&priv->xdp_ring, ptr)))
>> +		return -ENOSPC;
>> +
>> +	return 0;
>> +}
> 
> Here we have a lock per (enqueued) packet.  I'm working on changing the
> ndo_xdp_xmit API to allow bulking.  And the tun driver have exact same
> issue/need.

Probably I should have noted in commitlog, but this per-packet lock is
removed in patch 9.
I'm curious about if any change is needed by your new API.

-- 
Toshiaki Makita

^ permalink raw reply

* Re: [PATCH] change the comment of vti6_ioctl
From: Steffen Klassert @ 2018-05-01  8:00 UTC (permalink / raw)
  To: David Miller; +Cc: sunlw.fnst, netdev, herbert
In-Reply-To: <20180430.115731.1213104451620131850.davem@davemloft.net>

On Mon, Apr 30, 2018 at 11:57:31AM -0400, David Miller wrote:
> From: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
> Date: Sun, 29 Apr 2018 15:05:52 +0800
> 
> > The comment of vti6_ioctl() is wrong. which use vti6_tnl_ioctl
> > instead of vti6_ioctl.
> > 
> > Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
> 
> Please CC: the IPSEC maintainers on future patch submissions to IPSEC
> files, as per the top level MAINTAINERS file.
> 
> Steffen, please queue this up, thank you.

Now applied to ipsec-next, thanks everyone!

^ permalink raw reply

* Re: IP_PKTINFO broken by acf568ee859f0 (xfrm: Reinject transport-mode packets through tasklet)
From: Steffen Klassert @ 2018-05-01  7:51 UTC (permalink / raw)
  To: Maxime Bizon; +Cc: Herbert Xu, netdev
In-Reply-To: <1524863813.15003.174.camel@sakura.staff.proxad.net>

On Fri, Apr 27, 2018 at 11:16:53PM +0200, Maxime Bizon wrote:
> 
> Hello Herbert,
> 
> That patch just went into stable 4.14 and is causing a regression on my
> setup.
> 
> Basically, IP_PKTINFO does not work anymore on transport-mode packets,
> because skb->cb is now used to store the finish callback.
> 
> Was that expected or is it an unforeseen side effect ?

This should be fixed by:

commit 9a3fb9fb84cc ("xfrm: Fix transport mode skb control buffer usage.")

^ permalink raw reply

* Re: [PATCH RFC 4/9] veth: Use NAPI for XDP
From: Jesper Dangaard Brouer @ 2018-05-01  7:50 UTC (permalink / raw)
  To: Toshiaki Makita; +Cc: brouer, netdev, Toshiaki Makita
In-Reply-To: <20180424143923.26519-5-toshiaki.makita1@gmail.com>

On Tue, 24 Apr 2018 23:39:18 +0900
Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:

> +static int veth_xdp_enqueue(struct veth_priv *priv, void *ptr)
> +{
> +	if (unlikely(ptr_ring_produce(&priv->xdp_ring, ptr)))
> +		return -ENOSPC;
> +
> +	return 0;
> +}

Here we have a lock per (enqueued) packet.  I'm working on changing the
ndo_xdp_xmit API to allow bulking.  And the tun driver have exact same
issue/need.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH net-next v9 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Jiri Pirko @ 2018-05-01  7:33 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh, aaron.f.brown
In-Reply-To: <c965d7f2-9ba6-fc82-ce1c-4d70c3aceb6d@intel.com>

Mon, Apr 30, 2018 at 09:26:34PM CEST, sridhar.samudrala@intel.com wrote:
>On 4/30/2018 12:12 AM, Jiri Pirko wrote:
>> Mon, Apr 30, 2018 at 05:00:33AM CEST, sridhar.samudrala@intel.com wrote:
>> > On 4/28/2018 1:24 AM, Jiri Pirko wrote:
>> > > Fri, Apr 27, 2018 at 07:06:59PM CEST, sridhar.samudrala@intel.com wrote:
>> > > > This patch enables virtio_net to switch over to a VF datapath when a VF
>> > > > netdev is present with the same MAC address. It allows live migration
>> > > > of a VM with a direct attached VF without the need to setup a bond/team
>> > > > between a VF and virtio net device in the guest.
>> > > > 
>> > > > The hypervisor needs to enable only one datapath at any time so that
>> > > > packets don't get looped back to the VM over the other datapath. When a VF
>> > > Why? Both datapaths could be enabled at a time. Why the loop on
>> > > hypervisor side would be a problem. This in not an issue for
>> > > bonding/team as well.
>> > Somehow the hypervisor needs to make sure that the broadcasts/multicasts from the VM
>> > sent over the VF datapath don't get looped back to the VM via the virtio-net datapth.
>> Why? Please see below.
>> 
>> 
>> > This can happen if both datapaths are enabled at the same time.
>> > 
>> > I would think this is an issue even with bonding/team as well when virtio-net and
>> > VF are backed by the same PF.
>> > 
>> > 
>> I believe that the scenario is the same as on an ordinary nic/swich
>> network:
>> 
>> ...................
>> 
>>    host
>>         bond0
>>        /     \
>>      eth0   eth1
>>       |       |
>> ...................
>>       |       |
>>       p1      p2
>> 
>>    switch
>> 
>> ...................
>> 
>> It is perfectly valid to p1 and p2 be up and "bridged" together. Bond
>> has to cope with loop-backed frames. "Failover driver" should too,
>> it's the same scenario.
>
>OK. So looks like we should be able to handle this by returning RX_HANDLER_EXACT
>for frames received on standby device when primary is present.

Yep.

^ permalink raw reply

* [PATCH] tipc: fix a potential missing-check bug
From: Wenwen Wang @ 2018-05-01  4:26 UTC (permalink / raw)
  To: Wenwen Wang
  Cc: Kangjie Lu, Jon Maloy, Ying Xue, David S. Miller,
	open list:TIPC NETWORK LAYER, open list:TIPC NETWORK LAYER,
	open list

In tipc_link_xmit(), the member field "len" of l->backlog[imp] must
be less than the member field "limit" of l->backlog[imp] when imp is
equal to TIPC_SYSTEM_IMPORTANCE. Otherwise, an error code, i.e., -ENOBUFS,
is returned. This is enforced by the security check. However, at the end
of tipc_link_xmit(), the length of "list" is added to l->backlog[imp].len
without any further check. This can potentially cause unexpected values for
l->backlog[imp].len. If imp is equal to TIPC_SYSTEM_IMPORTANCE and the
original value of l->backlog[imp].len is less than l->backlog[imp].limit,
after this addition, l->backlog[imp] could be larger than
l->backlog[imp].limit. That means the security check can potentially be
bypassed, especially when an adversary can control the length of "list".

This patch performs such a check after the modification to
l->backlog[imp].len (if imp is TIPC_SYSTEM_IMPORTANCE) to avoid such
security issues. An error code will be returned if an unexpected value of
l->backlog[imp].len is generated.

Signed-off-by: Wenwen Wang <wang6495@umn.edu>
---
 net/tipc/link.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 695acb7..62972fa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -948,6 +948,11 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 			continue;
 		}
 		l->backlog[imp].len += skb_queue_len(list);
+		if (imp == TIPC_SYSTEM_IMPORTANCE &&
+		    l->backlog[imp].len >= l->backlog[imp].limit) {
+			pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
+			return -ENOBUFS;
+		}
 		skb_queue_splice_tail_init(list, backlogq);
 	}
 	l->snd_nxt = seqno;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH] ipv6: Allow non-gateway ECMP for IPv6
From: David Ahern @ 2018-05-01  2:59 UTC (permalink / raw)
  To: Thomas Winter, netdev
  Cc: David S. Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Ido Schimmel
In-Reply-To: <20180430211529.8295-1-Thomas.Winter@alliedtelesis.co.nz>

On 4/30/18 3:15 PM, Thomas Winter wrote:
> It is valid to have static routes where the nexthop
> is an interface not an address such as tunnels.
> For IPv4 it was possible to use ECMP on these routes
> but not for IPv6.
> 
> Signed-off-by: Thomas Winter <Thomas.Winter@alliedtelesis.co.nz>
> Cc: David Ahern <dsahern@gmail.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
> ---
>  include/net/ip6_route.h | 3 +--
>  net/ipv6/ip6_fib.c      | 3 ---
>  2 files changed, 1 insertion(+), 5 deletions(-)
> 

Interesting. Existing code inserts the dev nexthop as a separate route.

Change looks good to me.

Acked-by: David Ahern <dsahern@gmail.com>

^ permalink raw reply

* Re: [PATCH net-next 0/4] net/smc: fixes 2018/04/30
From: David Miller @ 2018-05-01  1:03 UTC (permalink / raw)
  To: ubraun; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl
In-Reply-To: <20180430145119.72479-1-ubraun@linux.ibm.com>

From: Ursula Braun <ubraun@linux.ibm.com>
Date: Mon, 30 Apr 2018 16:51:15 +0200

> From: Ursula Braun <ursula.braun@linux.ibm.com>
> 
> Dave,
> 
> here are 4 smc patches for net-next covering different areas:
>    * link health check
>    * diagnostics for IPv6 smc sockets
>    * ioctl
>    * improvement for vlan determination

You say "fixes" in your Subject line but adding ipv6 smc
socket diag support is a feature not a fix.

Actually, generally speaking your patch submissions are
confusing.

You do submit really pure bug fixes, but for some odd
reason you target the net-next tree instead of net.

Maybe you have a good reason for doing this and you can
explain it to me?

^ permalink raw reply

* Re: [PATCH RFC 6/9] veth: Add ndo_xdp_xmit
From: Toshiaki Makita @ 2018-05-01  1:02 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Toshiaki Makita, netdev, Tariq Toukan
In-Reply-To: <20180430192709.1fc47265@redhat.com>

On 2018/05/01 2:27, Jesper Dangaard Brouer wrote:
> On Thu, 26 Apr 2018 19:52:40 +0900
> Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:
> 
>> On 2018/04/26 5:24, Jesper Dangaard Brouer wrote:
>>> On Tue, 24 Apr 2018 23:39:20 +0900
>>> Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:
>>>   
>>>> +static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
>>>> +{
>>>> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
>>>> +	int headroom = frame->data - (void *)frame;
>>>> +	struct net_device *rcv;
>>>> +	int err = 0;
>>>> +
>>>> +	rcv = rcu_dereference(priv->peer);
>>>> +	if (unlikely(!rcv))
>>>> +		return -ENXIO;
>>>> +
>>>> +	rcv_priv = netdev_priv(rcv);
>>>> +	/* xdp_ring is initialized on receive side? */
>>>> +	if (rcu_access_pointer(rcv_priv->xdp_prog)) {
>>>> +		err = xdp_ok_fwd_dev(rcv, frame->len);
>>>> +		if (unlikely(err))
>>>> +			return err;
>>>> +
>>>> +		err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
>>>> +	} else {
>>>> +		struct sk_buff *skb;
>>>> +
>>>> +		skb = veth_build_skb(frame, headroom, frame->len, 0);
>>>> +		if (unlikely(!skb))
>>>> +			return -ENOMEM;
>>>> +
>>>> +		/* Get page ref in case skb is dropped in netif_rx.
>>>> +		 * The caller is responsible for freeing the page on error.
>>>> +		 */
>>>> +		get_page(virt_to_page(frame->data));  
>>>
>>> I'm not sure you can make this assumption, that xdp_frames coming from
>>> another device driver uses a refcnt based memory model. But maybe I'm
>>> confused, as this looks like an SKB receive path, but in the
>>> ndo_xdp_xmit().  
>>
>> I find this path similar to cpumap, which creates skb from redirected
>> xdp frame. Once it is converted to skb, skb head is freed by
>> page_frag_free, so anyway I needed to get the refcount here regardless
>> of memory model.
> 
> Yes I know, I wrote cpumap ;-)
> 
> First of all, I don't want to see such xdp_frame to SKB conversion code
> in every driver.  Because that increase the chances of errors.  And
> when looking at the details, then it seems that you have made the
> mistake of making it possible to leak xdp_frame info to the SKB (which
> cpumap takes into account).

Do you mean leaving xdp_frame in skb->head is leaking something? how?

> 
> Second, I think the refcnt scheme here is wrong. The xdp_frame should
> be "owned" by XDP and have the proper refcnt to deliver it directly to
> the network stack.
> 
> Third, if we choose that we want a fallback, in-case XDP is not enabled
> on egress dev (but it have an ndo_xdp_xmit), then it should be placed
> in the generic/core code.  E.g. __bpf_tx_xdp_map() could look at the
> return code from dev->netdev_ops->ndo_xdp() and create an SKB.  (Hint,
> this would make it easy to implement TX bulking towards the dev).

Right, this is a much cleaner way.
Although I feel like we should add this fallback for veth because it
requires something which is different from other drivers (enabling XDP
on the peer device of the egress device), I'll drop the part for now. It
should not be resolved in the driver code.

-- 
Toshiaki Makita

^ permalink raw reply

* [PATCH net-next] udp: Complement partial checksum for GSO packet
From: Sean Tranchetti @ 2018-05-01  0:01 UTC (permalink / raw)
  To: willemb, davem, netdev; +Cc: Sean Tranchetti, Subash Abhinov Kasiviswanathan

Using the udp_v4_check() function to calculate the pseudo header
for the newly segmented UDP packets results in assigning the complement
of the value to the UDP header checksum field.

Always undo the complement the partial checksum value in order to
match the case where GSO is not used on the UDP transmit path.

Fixes: ee80d1ebe5ba ("udp: add udp gso")
Signed-off-by: Sean Tranchetti <stranche@codeaurora.org>
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 net/ipv4/udp_offload.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f78fb36..0062570 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -223,6 +223,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 			csum_replace2(&uh->check, htons(mss),
 				      htons(seg->len - hdrlen - sizeof(*uh)));
 
+		uh->check = ~uh->check;
 		seg->destructor = sock_wfree;
 		seg->sk = sk;
 		sum_truesize += seg->truesize;
-- 
1.9.1

^ permalink raw reply related

* Re: [RFC net-next 0/5] Support for PHY test modes
From: Andrew Lunn @ 2018-04-30 23:24 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: David Miller, netdev, rmk, linux-kernel, cphealy, nikita.yoush,
	vivien.didelot, Nisar.Sayed, UNGLinuxDriver
In-Reply-To: <eea70076-891c-5caa-a590-ef1b0421f3b2@gmail.com>

> Turning these tests on will typically result in the link partner
> dropping the link with us, and the interface will be non-functional as
> far as the data path is concerned (similar to an isolation mode). This
> might warrant properly reporting that to user-space through e.g: a
> private IFF_* value maybe?

Hi Florian

I think a IFF_* value would be a good idea. We want to give the user
some indicate why they don't have working networking. ip link show
showing PHY-TEST-MODE would help.

	Andrew

^ permalink raw reply

* Re: [RFC net-next 4/5] net: phy: Add support for IEEE standard test modes
From: Andrew Lunn @ 2018-04-30 23:20 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, Russell King, open list, davem, cphealy, nikita.yoush,
	vivien.didelot, Nisar.Sayed, UNGLinuxDriver
In-Reply-To: <20180428003237.1536-5-f.fainelli@gmail.com>

> +/* genphy_set_test - Make a PHY enter one of the standard IEEE defined
> + * test modes
> + * @phydev: the PHY device instance
> + * @test: the desired test mode
> + * @data: test specific data (none)
> + *
> + * This function makes the designated @phydev enter the desired standard
> + * 100BaseT2 or 1000BaseT test mode as defined in IEEE 802.3-2012 section TWO
> + * and THREE under 32.6.1.2.1 and 40.6.1.1.2 respectively
> + */
> +int genphy_set_test(struct phy_device *phydev,
> +		    struct ethtool_phy_test *test, const u8 *data)
> +{
> +	u16 shift, base, bmcr = 0;
> +	int ret;
> +
> +	/* Exit test mode */
> +	if (test->mode == PHY_STD_TEST_MODE_NORMAL) {
> +		ret = phy_read(phydev, MII_CTRL1000);
> +		if (ret < 0)
> +			return ret;
> +
> +		ret &= ~GENMASK(15, 13);
> +
> +		return phy_write(phydev, MII_CTRL1000, ret);
> +	}

Hi Florain

I looked at the Marvell SDK for PHYs. It performs a soft reset after
swapping back to normal mode. I assume the broadcom PHY does not need
this? But maybe we can add it anyway?

> +
> +	switch (test->mode) {
> +	case PHY_STD_TEST_MODE_100BASET2_1:
> +	case PHY_STD_TEST_MODE_100BASET2_2:
> +	case PHY_STD_TEST_MODE_100BASET2_3:
> +		if (!(phydev->supported & PHY_100BT_FEATURES))
> +			return -EOPNOTSUPP;
> +
> +		shift = 14;
> +		base = test->mode - PHY_STD_TEST_MODE_NORMAL;
> +		bmcr = BMCR_SPEED100;
> +		break;
> +
> +	case PHY_STD_TEST_MODE_1000BASET_1:
> +	case PHY_STD_TEST_MODE_1000BASET_2:
> +	case PHY_STD_TEST_MODE_1000BASET_3:
> +	case PHY_STD_TEST_MODE_1000BASET_4:
> +		if (!(phydev->supported & PHY_1000BT_FEATURES))
> +			return -EOPNOTSUPP;
> +
> +		shift = 13;
> +		base = test->mode - PHY_STD_TEST_MODE_100BASET2_MAX;
> +		bmcr = BMCR_SPEED1000;
> +		break;
> +
> +	default:
> +		/* Let an upper driver deal with additional modes it may
> +		 * support
> +		 */
> +		return -EOPNOTSUPP;
> +	}
> +
> +	/* Force speed and duplex */
> +	ret = phy_write(phydev, MII_BMCR, bmcr | BMCR_FULLDPLX);
> +	if (ret < 0)
> +		return ret;

Should there be something to undo this when returning to normal mode?

       Andrew

^ permalink raw reply

* Re: [PATCH net-next v6] Add Common Applications Kept Enhanced (cake) qdisc
From: Cong Wang @ 2018-04-30 23:04 UTC (permalink / raw)
  To: Dave Taht
  Cc: Toke Høiland-Jørgensen, Linux Kernel Network Developers,
	Cake List
In-Reply-To: <CAA93jw6F+c-QRXe+MA2QmRkwiKEBqFgOFKTvWGfO7FvCQ5tFvw@mail.gmail.com>

On Mon, Apr 30, 2018 at 2:27 PM, Dave Taht <dave.taht@gmail.com> wrote:
> On Mon, Apr 30, 2018 at 2:21 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> On Sun, Apr 29, 2018 at 2:34 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>>> sch_cake targets the home router use case and is intended to squeeze the
>>> most bandwidth and latency out of even the slowest ISP links and routers,
>>> while presenting an API simple enough that even an ISP can configure it.
>>>
>>> Example of use on a cable ISP uplink:
>>>
>>> tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter
>>>
>>> To shape a cable download link (ifb and tc-mirred setup elided)
>>>
>>> tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash
>>>
>>> CAKE is filled with:
>>>
>>> * A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel
>>>   derived Flow Queuing system, which autoconfigures based on the bandwidth.
>>> * A novel "triple-isolate" mode (the default) which balances per-host
>>>   and per-flow FQ even through NAT.
>>> * An deficit based shaper, that can also be used in an unlimited mode.
>>> * 8 way set associative hashing to reduce flow collisions to a minimum.
>>> * A reasonable interpretation of various diffserv latency/loss tradeoffs.
>>> * Support for zeroing diffserv markings for entering and exiting traffic.
>>> * Support for interacting well with Docsis 3.0 shaper framing.
>>> * Extensive support for DSL framing types.
>>> * Support for ack filtering.
>>
>> Why this TCP ACK filtering has to be built into CAKE qdisc rather than
>> an independent TC filter? Why other qdisc's can't use it?
>
> I actually have a tc - bpf based ack filter, during the development of
> cake's ack-thinner, that I should submit one of these days. It
> proved to be of limited use.

Yeah.

>
> Probably the biggest mistake we made is by calling this cake feature a
> filter. It isn't.


It inspects the payload of each packet and drops packets, therefore
it is a filter by definition, no matter how you name it.

>
> Maybe we should have called it a "thinner" or something like that? In
> order to properly "thin" or "reduce" an ack stream
> you have to have a queue to look at and some related state. TC filters
> do not operate on queues, qdiscs do. Thus the "ack-filter" here is
> deeply embedded into cake's flow isolation and queue structures.


TC filters are installed on qdiscs and in the beginning qdiscs were
queues,for example, pfifo. We already have flow-based filters too
(cls_flower),so we can make them work together, although probably
it is not straight forward.

^ permalink raw reply

* Re: [PATCH net-next 2/4] net/smc: ipv6 support for smc_diag.c
From: kbuild test robot @ 2018-04-30 22:46 UTC (permalink / raw)
  To: Ursula Braun
  Cc: kbuild-all, davem, netdev, linux-s390, schwidefsky,
	heiko.carstens, raspl, ubraun
In-Reply-To: <20180430145119.72479-3-ubraun@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 6126 bytes --]

Hi Karsten,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Ursula-Braun/net-smc-periodic-testlink-support/20180501-045940
config: x86_64-randconfig-x016-201817 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/sock_diag.h:8:0,
                    from net/smc/smc_diag.c:15:
   net/smc/smc_diag.c: In function 'smc_diag_msg_common_fill':
>> include/net/sock.h:350:37: error: 'struct sock_common' has no member named 'skc_v6_rcv_saddr'; did you mean 'skc_rcv_saddr'?
    #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
                                        ^
>> net/smc/smc_diag.c:49:47: note: in expansion of macro 'sk_v6_rcv_saddr'
      memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
                                                  ^~~~~~~~~~~~~~~
>> include/net/sock.h:350:37: error: 'struct sock_common' has no member named 'skc_v6_rcv_saddr'; did you mean 'skc_rcv_saddr'?
    #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
                                        ^
   net/smc/smc_diag.c:50:35: note: in expansion of macro 'sk_v6_rcv_saddr'
             sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
                                      ^~~~~~~~~~~~~~~
>> include/net/sock.h:349:34: error: 'struct sock_common' has no member named 'skc_v6_daddr'; did you mean 'skc_daddr'?
    #define sk_v6_daddr  __sk_common.skc_v6_daddr
                                     ^
>> net/smc/smc_diag.c:51:47: note: in expansion of macro 'sk_v6_daddr'
      memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
                                                  ^~~~~~~~~~~
>> include/net/sock.h:349:34: error: 'struct sock_common' has no member named 'skc_v6_daddr'; did you mean 'skc_daddr'?
    #define sk_v6_daddr  __sk_common.skc_v6_daddr
                                     ^
   net/smc/smc_diag.c:52:35: note: in expansion of macro 'sk_v6_daddr'
             sizeof(smc->clcsock->sk->sk_v6_daddr));
                                      ^~~~~~~~~~~
--
   In file included from include/linux/sock_diag.h:8:0,
                    from net//smc/smc_diag.c:15:
   net//smc/smc_diag.c: In function 'smc_diag_msg_common_fill':
>> include/net/sock.h:350:37: error: 'struct sock_common' has no member named 'skc_v6_rcv_saddr'; did you mean 'skc_rcv_saddr'?
    #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
                                        ^
   net//smc/smc_diag.c:49:47: note: in expansion of macro 'sk_v6_rcv_saddr'
      memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
                                                  ^~~~~~~~~~~~~~~
>> include/net/sock.h:350:37: error: 'struct sock_common' has no member named 'skc_v6_rcv_saddr'; did you mean 'skc_rcv_saddr'?
    #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
                                        ^
   net//smc/smc_diag.c:50:35: note: in expansion of macro 'sk_v6_rcv_saddr'
             sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
                                      ^~~~~~~~~~~~~~~
>> include/net/sock.h:349:34: error: 'struct sock_common' has no member named 'skc_v6_daddr'; did you mean 'skc_daddr'?
    #define sk_v6_daddr  __sk_common.skc_v6_daddr
                                     ^
   net//smc/smc_diag.c:51:47: note: in expansion of macro 'sk_v6_daddr'
      memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
                                                  ^~~~~~~~~~~
>> include/net/sock.h:349:34: error: 'struct sock_common' has no member named 'skc_v6_daddr'; did you mean 'skc_daddr'?
    #define sk_v6_daddr  __sk_common.skc_v6_daddr
                                     ^
   net//smc/smc_diag.c:52:35: note: in expansion of macro 'sk_v6_daddr'
             sizeof(smc->clcsock->sk->sk_v6_daddr));
                                      ^~~~~~~~~~~

vim +/sk_v6_rcv_saddr +49 net/smc/smc_diag.c

  > 15	#include <linux/sock_diag.h>
    16	#include <linux/inet_diag.h>
    17	#include <linux/smc_diag.h>
    18	#include <net/netlink.h>
    19	#include <net/smc.h>
    20	
    21	#include "smc.h"
    22	#include "smc_core.h"
    23	
    24	static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
    25	{
    26		sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
    27			be16_to_cpu(((__be16 *)gid_raw)[0]),
    28			be16_to_cpu(((__be16 *)gid_raw)[1]),
    29			be16_to_cpu(((__be16 *)gid_raw)[2]),
    30			be16_to_cpu(((__be16 *)gid_raw)[3]),
    31			be16_to_cpu(((__be16 *)gid_raw)[4]),
    32			be16_to_cpu(((__be16 *)gid_raw)[5]),
    33			be16_to_cpu(((__be16 *)gid_raw)[6]),
    34			be16_to_cpu(((__be16 *)gid_raw)[7]));
    35	}
    36	
    37	static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
    38	{
    39		struct smc_sock *smc = smc_sk(sk);
    40	
    41		if (!smc->clcsock)
    42			return;
    43		r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
    44		r->id.idiag_dport = smc->clcsock->sk->sk_dport;
    45		r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
    46		sock_diag_save_cookie(sk, r->id.idiag_cookie);
    47		if (sk->sk_protocol == SMCPROTO_SMC6) {
    48			r->diag_family = PF_INET6;
  > 49			memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
  > 50			       sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
  > 51			memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
    52			       sizeof(smc->clcsock->sk->sk_v6_daddr));
    53		} else {
    54			r->diag_family = PF_INET;
    55			memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
    56			memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
    57			r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
    58			r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
    59		}
    60	}
    61	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 30034 bytes --]

^ permalink raw reply

* [PATCH bpf-next 3/3] bpf: add faked "ending" subprog
From: Jiong Wang @ 2018-04-30 22:28 UTC (permalink / raw)
  To: alexei.starovoitov, borkmann; +Cc: ecree, netdev, oss-drivers, Jiong Wang
In-Reply-To: <1525127296-3573-1-git-send-email-jiong.wang@netronome.com>

There are quite a few code snippet like the following in verifier:

       subprog_start = 0;
       if (env->subprog_cnt == cur_subprog + 1)
               subprog_end = insn_cnt;
       else
               subprog_end = env->subprog_info[cur_subprog + 1].start;

The reason is there is no marker in subprog_info array to tell the end of
it.

We could resolve this issue by introducing a faked "ending" subprog.
The special "ending" subprog is with "insn_cnt" as start offset, so it is
serving as the end mark whenever we iterate over all subprogs.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 kernel/bpf/verifier.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9764b9b..4a081e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -766,7 +766,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 	ret = find_subprog(env, off);
 	if (ret >= 0)
 		return 0;
-	if (env->subprog_cnt > BPF_MAX_SUBPROGS) {
+	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
@@ -807,16 +807,18 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			return ret;
 	}
 
+	/* Add a fake 'exit' subprog which could simplify subprog iteration
+	 * logic. 'subprog_cnt' should not be increased.
+	 */
+	subprog[env->subprog_cnt].start = insn_cnt;
+
 	if (env->log.level > 1)
 		for (i = 0; i < env->subprog_cnt; i++)
 			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
 	/* now check that all jumps are within the same subprog */
-	subprog_start = 0;
-	if (env->subprog_cnt == cur_subprog + 1)
-		subprog_end = insn_cnt;
-	else
-		subprog_end = subprog[cur_subprog + 1].start;
+	subprog_start = subprog[cur_subprog].start;
+	subprog_end = subprog[cur_subprog + 1].start;
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
@@ -840,11 +842,9 @@ static int check_subprogs(struct bpf_verifier_env *env)
 				verbose(env, "last insn is not an exit or jmp\n");
 				return -EINVAL;
 			}
-			cur_subprog++;
 			subprog_start = subprog_end;
-			if (env->subprog_cnt == cur_subprog + 1)
-				subprog_end = insn_cnt;
-			else
+			cur_subprog++;
+			if (cur_subprog < env->subprog_cnt)
 				subprog_end = subprog[cur_subprog + 1].start;
 		}
 	}
@@ -1499,7 +1499,6 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 	int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
 	int ret_insn[MAX_CALL_FRAMES];
 	int ret_prog[MAX_CALL_FRAMES];
 
@@ -1514,10 +1513,7 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 		return -EACCES;
 	}
 continue_func:
-	if (env->subprog_cnt == idx + 1)
-		subprog_end = insn_cnt;
-	else
-		subprog_end = subprog[idx + 1].start;
+	subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
 			continue;
@@ -5268,10 +5264,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 
 	for (i = 0; i < env->subprog_cnt; i++) {
 		subprog_start = subprog_end;
-		if (env->subprog_cnt == i + 1)
-			subprog_end = prog->len;
-		else
-			subprog_end = env->subprog_info[i + 1].start;
+		subprog_end = env->subprog_info[i + 1].start;
 
 		len = subprog_end - subprog_start;
 		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
-- 
2.7.4

^ permalink raw reply related

* [PATCH bpf-next 2/3] bpf: centre subprog information fields
From: Jiong Wang @ 2018-04-30 22:28 UTC (permalink / raw)
  To: alexei.starovoitov, borkmann; +Cc: ecree, netdev, oss-drivers, Jiong Wang
In-Reply-To: <1525127296-3573-1-git-send-email-jiong.wang@netronome.com>

It is better to centre all subprog information fields into one structure.
This structure could later serve as function node in call graph.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 include/linux/bpf_verifier.h |  9 ++++---
 kernel/bpf/verifier.c        | 62 +++++++++++++++++++++++---------------------
 2 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f655b92..8f70dc1 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -173,6 +173,11 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 
 #define BPF_MAX_SUBPROGS 256
 
+struct bpf_subprog_info {
+	u32 start; /* insn idx of function entry point */
+	u16 stack_depth; /* max. stack depth used by this function */
+};
+
 /* single container for all structs
  * one verifier_env per bpf_check() call
  */
@@ -191,9 +196,7 @@ struct bpf_verifier_env {
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	struct bpf_verifier_log log;
-	u32 subprog_starts[BPF_MAX_SUBPROGS + 1];
-	/* computes the stack depth of each bpf function */
-	u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
+	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
 	u32 subprog_cnt;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 16ec977..9764b9b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -738,18 +738,19 @@ enum reg_arg_type {
 
 static int cmp_subprogs(const void *a, const void *b)
 {
-	return *(int *)a - *(int *)b;
+	return ((struct bpf_subprog_info *)a)->start -
+	       ((struct bpf_subprog_info *)b)->start;
 }
 
 static int find_subprog(struct bpf_verifier_env *env, int off)
 {
-	u32 *p;
+	struct bpf_subprog_info *p;
 
-	p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
-		    sizeof(env->subprog_starts[0]), cmp_subprogs);
+	p = bsearch(&off, env->subprog_info, env->subprog_cnt,
+		    sizeof(env->subprog_info[0]), cmp_subprogs);
 	if (!p)
 		return -ENOENT;
-	return p - env->subprog_starts;
+	return p - env->subprog_info;
 
 }
 
@@ -769,15 +770,16 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
-	env->subprog_starts[env->subprog_cnt++] = off;
-	sort(env->subprog_starts, env->subprog_cnt,
-	     sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
+	env->subprog_info[env->subprog_cnt++].start = off;
+	sort(env->subprog_info, env->subprog_cnt,
+	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
 	return 0;
 }
 
 static int check_subprogs(struct bpf_verifier_env *env)
 {
 	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
+	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
 
@@ -807,14 +809,14 @@ static int check_subprogs(struct bpf_verifier_env *env)
 
 	if (env->log.level > 1)
 		for (i = 0; i < env->subprog_cnt; i++)
-			verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
+			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
 	/* now check that all jumps are within the same subprog */
 	subprog_start = 0;
 	if (env->subprog_cnt == cur_subprog + 1)
 		subprog_end = insn_cnt;
 	else
-		subprog_end = env->subprog_starts[cur_subprog + 1];
+		subprog_end = subprog[cur_subprog + 1].start;
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
@@ -843,8 +845,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			if (env->subprog_cnt == cur_subprog + 1)
 				subprog_end = insn_cnt;
 			else
-				subprog_end =
-					env->subprog_starts[cur_subprog + 1];
+				subprog_end = subprog[cur_subprog + 1].start;
 		}
 	}
 	return 0;
@@ -1477,13 +1478,13 @@ static int update_stack_depth(struct bpf_verifier_env *env,
 			      const struct bpf_func_state *func,
 			      int off)
 {
-	u16 stack = env->subprog_stack_depth[func->subprogno];
+	u16 stack = env->subprog_info[func->subprogno].stack_depth;
 
 	if (stack >= -off)
 		return 0;
 
 	/* update known max for given subprogram */
-	env->subprog_stack_depth[func->subprogno] = -off;
+	env->subprog_info[func->subprogno].stack_depth = -off;
 	return 0;
 }
 
@@ -1495,7 +1496,8 @@ static int update_stack_depth(struct bpf_verifier_env *env,
  */
 static int check_max_stack_depth(struct bpf_verifier_env *env)
 {
-	int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+	int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
+	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
 	int ret_insn[MAX_CALL_FRAMES];
@@ -1505,17 +1507,17 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 	/* round up to 32-bytes, since this is granularity
 	 * of interpreter stack size
 	 */
-	depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+	depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
 	if (depth > MAX_BPF_STACK) {
 		verbose(env, "combined stack size of %d calls is %d. Too large\n",
 			frame + 1, depth);
 		return -EACCES;
 	}
 continue_func:
-	if (env->subprog_cnt == subprog + 1)
+	if (env->subprog_cnt == idx + 1)
 		subprog_end = insn_cnt;
 	else
-		subprog_end = env->subprog_starts[subprog + 1];
+		subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
 			continue;
@@ -1523,12 +1525,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 			continue;
 		/* remember insn and function to return to */
 		ret_insn[frame] = i + 1;
-		ret_prog[frame] = subprog;
+		ret_prog[frame] = idx;
 
 		/* find the callee */
 		i = i + insn[i].imm + 1;
-		subprog = find_subprog(env, i);
-		if (subprog < 0) {
+		idx = find_subprog(env, i);
+		if (idx < 0) {
 			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
 				  i);
 			return -EFAULT;
@@ -1545,10 +1547,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 	 */
 	if (frame == 0)
 		return 0;
-	depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+	depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
 	frame--;
 	i = ret_insn[frame];
-	subprog = ret_prog[frame];
+	idx = ret_prog[frame];
 	goto continue_func;
 }
 
@@ -1564,7 +1566,7 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 			  start);
 		return -EFAULT;
 	}
-	return env->subprog_stack_depth[subprog];
+	return env->subprog_info[subprog].stack_depth;
 }
 #endif
 
@@ -4855,14 +4857,14 @@ static int do_check(struct bpf_verifier_env *env)
 	verbose(env, "processed %d insns (limit %d), stack depth ",
 		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
 	for (i = 0; i < env->subprog_cnt; i++) {
-		u32 depth = env->subprog_stack_depth[i];
+		u32 depth = env->subprog_info[i].stack_depth;
 
 		verbose(env, "%d", depth);
 		if (i + 1 < env->subprog_cnt)
 			verbose(env, "+");
 	}
 	verbose(env, "\n");
-	env->prog->aux->stack_depth = env->subprog_stack_depth[0];
+	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return 0;
 }
 
@@ -5069,9 +5071,9 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
 	if (len == 1)
 		return;
 	for (i = 0; i < env->subprog_cnt; i++) {
-		if (env->subprog_starts[i] < off)
+		if (env->subprog_info[i].start < off)
 			continue;
-		env->subprog_starts[i] += len - 1;
+		env->subprog_info[i].start += len - 1;
 	}
 }
 
@@ -5269,7 +5271,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		if (env->subprog_cnt == i + 1)
 			subprog_end = prog->len;
 		else
-			subprog_end = env->subprog_starts[i + 1];
+			subprog_end = env->subprog_info[i + 1].start;
 
 		len = subprog_end - subprog_start;
 		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5286,7 +5288,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		 * Long term would need debug info to populate names
 		 */
 		func[i]->aux->name[0] = 'F';
-		func[i]->aux->stack_depth = env->subprog_stack_depth[i];
+		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
-- 
2.7.4

^ permalink raw reply related

* [PATCH bpf-next 1/3] bpf: unify main prog and subprog
From: Jiong Wang @ 2018-04-30 22:28 UTC (permalink / raw)
  To: alexei.starovoitov, borkmann; +Cc: ecree, netdev, oss-drivers, Jiong Wang
In-Reply-To: <1525127296-3573-1-git-send-email-jiong.wang@netronome.com>

Currently, verifier treat main prog and subprog differently. All subprogs
detected are kept in env->subprog_starts while main prog is not kept there.
Instead, main prog is implicitly defined as the prog start at 0.

There is actually no difference between main prog and subprog, it is better
to unify them, and register all progs detected into env->subprog_starts.

This could also help simplifying some code logic.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 include/linux/bpf_verifier.h |  2 +-
 kernel/bpf/verifier.c        | 57 ++++++++++++++++++++++++--------------------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e61c39..f655b92 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -191,7 +191,7 @@ struct bpf_verifier_env {
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	struct bpf_verifier_log log;
-	u32 subprog_starts[BPF_MAX_SUBPROGS];
+	u32 subprog_starts[BPF_MAX_SUBPROGS + 1];
 	/* computes the stack depth of each bpf function */
 	u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
 	u32 subprog_cnt;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596..16ec977 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -765,7 +765,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 	ret = find_subprog(env, off);
 	if (ret >= 0)
 		return 0;
-	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
+	if (env->subprog_cnt > BPF_MAX_SUBPROGS) {
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
@@ -781,6 +781,11 @@ static int check_subprogs(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
 
+	/* Add entry function. */
+	ret = add_subprog(env, 0);
+	if (ret < 0)
+		return ret;
+
 	/* determine subprog starts. The end is one before the next starts */
 	for (i = 0; i < insn_cnt; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
@@ -806,10 +811,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
 
 	/* now check that all jumps are within the same subprog */
 	subprog_start = 0;
-	if (env->subprog_cnt == cur_subprog)
+	if (env->subprog_cnt == cur_subprog + 1)
 		subprog_end = insn_cnt;
 	else
-		subprog_end = env->subprog_starts[cur_subprog++];
+		subprog_end = env->subprog_starts[cur_subprog + 1];
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
@@ -833,11 +838,13 @@ static int check_subprogs(struct bpf_verifier_env *env)
 				verbose(env, "last insn is not an exit or jmp\n");
 				return -EINVAL;
 			}
+			cur_subprog++;
 			subprog_start = subprog_end;
-			if (env->subprog_cnt == cur_subprog)
+			if (env->subprog_cnt == cur_subprog + 1)
 				subprog_end = insn_cnt;
 			else
-				subprog_end = env->subprog_starts[cur_subprog++];
+				subprog_end =
+					env->subprog_starts[cur_subprog + 1];
 		}
 	}
 	return 0;
@@ -1505,10 +1512,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 		return -EACCES;
 	}
 continue_func:
-	if (env->subprog_cnt == subprog)
+	if (env->subprog_cnt == subprog + 1)
 		subprog_end = insn_cnt;
 	else
-		subprog_end = env->subprog_starts[subprog];
+		subprog_end = env->subprog_starts[subprog + 1];
 	for (; i < subprog_end; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
 			continue;
@@ -1526,7 +1533,6 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 				  i);
 			return -EFAULT;
 		}
-		subprog++;
 		frame++;
 		if (frame >= MAX_CALL_FRAMES) {
 			WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
@@ -1558,7 +1564,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 			  start);
 		return -EFAULT;
 	}
-	subprog++;
 	return env->subprog_stack_depth[subprog];
 }
 #endif
@@ -2087,7 +2092,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_FUNC_tail_call:
 		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
 			goto error;
-		if (env->subprog_cnt) {
+		if (env->subprog_cnt > 1) {
 			verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
 			return -EINVAL;
 		}
@@ -2259,7 +2264,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			/* remember the callsite, it will be used by bpf_exit */
 			*insn_idx /* callsite */,
 			state->curframe + 1 /* frameno within this callchain */,
-			subprog + 1 /* subprog number within this prog */);
+			subprog /* subprog number within this prog */);
 
 	/* copy r1 - r5 args that callee can access */
 	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
@@ -3818,7 +3823,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EINVAL;
 	}
 
-	if (env->subprog_cnt) {
+	if (env->subprog_cnt > 1) {
 		/* when program has LD_ABS insn JITs and interpreter assume
 		 * that r1 == ctx == skb which is not the case for callees
 		 * that can have arbitrary arguments. It's problematic
@@ -4849,11 +4854,11 @@ static int do_check(struct bpf_verifier_env *env)
 
 	verbose(env, "processed %d insns (limit %d), stack depth ",
 		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
-	for (i = 0; i < env->subprog_cnt + 1; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		u32 depth = env->subprog_stack_depth[i];
 
 		verbose(env, "%d", depth);
-		if (i + 1 < env->subprog_cnt + 1)
+		if (i + 1 < env->subprog_cnt)
 			verbose(env, "+");
 	}
 	verbose(env, "\n");
@@ -5230,7 +5235,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	void *old_bpf_func;
 	int err = -ENOMEM;
 
-	if (env->subprog_cnt == 0)
+	if (env->subprog_cnt <= 1)
 		return 0;
 
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5246,7 +5251,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		/* temporarily remember subprog id inside insn instead of
 		 * aux_data, since next loop will split up all insns into funcs
 		 */
-		insn->off = subprog + 1;
+		insn->off = subprog;
 		/* remember original imm in case JIT fails and fallback
 		 * to interpreter will be needed
 		 */
@@ -5255,16 +5260,16 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		insn->imm = 1;
 	}
 
-	func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
+	func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL);
 	if (!func)
 		return -ENOMEM;
 
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		subprog_start = subprog_end;
-		if (env->subprog_cnt == i)
+		if (env->subprog_cnt == i + 1)
 			subprog_end = prog->len;
 		else
-			subprog_end = env->subprog_starts[i];
+			subprog_end = env->subprog_starts[i + 1];
 
 		len = subprog_end - subprog_start;
 		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5294,7 +5299,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * now populate all bpf_calls with correct addresses and
 	 * run last pass of JIT
 	 */
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
 			if (insn->code != (BPF_JMP | BPF_CALL) ||
@@ -5307,7 +5312,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 				__bpf_call_base;
 		}
 	}
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
 		tmp = bpf_int_jit_compile(func[i]);
 		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
@@ -5321,7 +5326,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	/* finally lock prog and jit images for all functions and
 	 * populate kallsysm
 	 */
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		bpf_prog_lock_ro(func[i]);
 		bpf_prog_kallsyms_add(func[i]);
 	}
@@ -5338,7 +5343,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog + 1]->bpf_func;
+		addr  = (unsigned long)func[subprog]->bpf_func;
 		addr &= PAGE_MASK;
 		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 			    addr - __bpf_call_base;
@@ -5347,10 +5352,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->jited = 1;
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
-	prog->aux->func_cnt = env->subprog_cnt + 1;
+	prog->aux->func_cnt = env->subprog_cnt;
 	return 0;
 out_free:
-	for (i = 0; i <= env->subprog_cnt; i++)
+	for (i = 0; i < env->subprog_cnt; i++)
 		if (func[i])
 			bpf_jit_free(func[i]);
 	kfree(func);
-- 
2.7.4

^ permalink raw reply related

* [PATCH bpf-next 0/3] bpf: cleanups on managing subprog information
From: Jiong Wang @ 2018-04-30 22:28 UTC (permalink / raw)
  To: alexei.starovoitov, borkmann; +Cc: ecree, netdev, oss-drivers, Jiong Wang

This patch set clean up some code logic related with managing subprog
information.

Part of the set are inspried by Edwin's code in his RFC:

  "bpf/verifier: subprog/func_call simplifications"

but with clearer separation so it could be easier to review.

  - Path 1 unifies main prog and subprogs. All of them are registered in
    env->subprog_starts.

  - After patch 1, it is clear that subprog_starts and subprog_stack_depth
    could be merged as both of them now have main and subprog unified.
    Patch 2 therefore does the merge, all subprog information are centred
    at bpf_subprog_info.

  - Patch 3 goes further to introduce a new fake "exit" subprog which
    serves as an ending marker to the subprog list. We could then turn the
    following code snippets across verifier:

       if (env->subprog_cnt == cur_subprog + 1)
               subprog_end = insn_cnt;
       else
               subprog_end = env->subprog_info[cur_subprog + 1].start;

    into:
       subprog_end = env->subprog_info[cur_subprog + 1].start;

There is no functional change by this patch set.
No bpf selftest regression found after this patch set.

Jiong Wang (3):
  bpf: unify main prog and subprog
  bpf: centre subprog information fields
  bpf: add faked "ending" subprog

 include/linux/bpf_verifier.h |   9 ++--
 kernel/bpf/verifier.c        | 118 +++++++++++++++++++++----------------------
 2 files changed, 65 insertions(+), 62 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: Request for stable 4.14.x inclusion: net: don't call update_pmtu unconditionally
From: Thomas Deutschmann @ 2018-04-30 22:15 UTC (permalink / raw)
  To: Greg KH, Eddie Chapman; +Cc: stable, davem, nicolas.dichtel, netdev
In-Reply-To: <20180430182222.GC12118@kroah.com>

Hi,

On 2018-04-30 20:22, Greg KH wrote:
> The geneve hunk doesn't apply at all to the 4.14.y tree, so I think
> someone has a messed up tree somewhere...
> 
> I'll go look into this now.

Mh?

> $ git clone https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git
> $ cd linux-stable
> $ git checkout v4.14.38
> $ git cherry-pick 52a589d51f1008f62569bf89e95b26221ee76690

Works for me... then I cherry-pick
f15ca723c1ebe6c1a06bc95fda6b62cd87b44559 on top, adjust
"net/ipv6/ip6_tunnel.c" like shown in my previous mail and everything is
fine for me...


-- 
Regards,
Thomas Deutschmann / Gentoo Linux Developer
C4DD 695F A713 8F24 2AA1 5638 5849 7EE5 1D5D 74A5

^ permalink raw reply

* [PATCH RESEND] connector: add parent pid and tgid to coredump and exit events
From: Stefan Strogin @ 2018-04-30 22:04 UTC (permalink / raw)
  To: Evgeniy Polyakov, David Miller, netdev
  Cc: Stefan Strogin, linux-kernel, xe-linux-external, Jesper Derehag,
	Matt Helsley

The intention is to get notified of process failures as soon
as possible, before a possible core dumping (which could be very long)
(e.g. in some process-manager). Coredump and exit process events
are perfect for such use cases (see 2b5faa4c553f "connector: Added
coredumping event to the process connector").

The problem is that for now the process-manager cannot know the parent
of a dying process using connectors. This could be useful if the
process-manager should monitor for failures only children of certain
parents, so we could filter the coredump and exit events by parent
process and/or thread ID.

Add parent pid and tgid to coredump and exit process connectors event
data.

Signed-off-by: Stefan Strogin <sstrogin@cisco.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
---
 drivers/connector/cn_proc.c  | 4 ++++
 include/uapi/linux/cn_proc.h | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index a782ce87715c..ed5e42461094 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -262,6 +262,8 @@ void proc_coredump_connector(struct task_struct *task)
 	ev->what = PROC_EVENT_COREDUMP;
 	ev->event_data.coredump.process_pid = task->pid;
 	ev->event_data.coredump.process_tgid = task->tgid;
+	ev->event_data.coredump.parent_pid = task->real_parent->pid;
+	ev->event_data.coredump.parent_tgid = task->real_parent->tgid;
 
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
@@ -288,6 +290,8 @@ void proc_exit_connector(struct task_struct *task)
 	ev->event_data.exit.process_tgid = task->tgid;
 	ev->event_data.exit.exit_code = task->exit_code;
 	ev->event_data.exit.exit_signal = task->exit_signal;
+	ev->event_data.exit.parent_pid = task->real_parent->pid;
+	ev->event_data.exit.parent_tgid = task->real_parent->tgid;
 
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h
index 68ff25414700..db210625cee8 100644
--- a/include/uapi/linux/cn_proc.h
+++ b/include/uapi/linux/cn_proc.h
@@ -116,12 +116,16 @@ struct proc_event {
 		struct coredump_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
+			__kernel_pid_t parent_pid;
+			__kernel_pid_t parent_tgid;
 		} coredump;
 
 		struct exit_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
 			__u32 exit_code, exit_signal;
+			__kernel_pid_t parent_pid;
+			__kernel_pid_t parent_tgid;
 		} exit;
 
 	} event_data;
-- 
2.16.1

^ permalink raw reply related

* [PATCH net-next v3 2/2] openvswitch: Support conntrack zone limit
From: Yi-Hung Wei @ 2018-04-30 21:28 UTC (permalink / raw)
  To: netdev, pshelar; +Cc: Yi-Hung Wei
In-Reply-To: <1525123713-38891-1-git-send-email-yihung.wei@gmail.com>

Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max.  In this
case, if one of the VM/container abuses the usage the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table.  Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.

To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone.  For example, we can designate different zone to different VM,
and set conntrack limit to each zone.  By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs.  Moreover, the users can
set various conntrack limit to different zone based on their preference.

The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone.  If the number of
connection is above a configured limitation, ovs will return ENOMEM to the
userspace.  If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.

The following high leve APIs are provided to the userspace:
  - OVS_CT_LIMIT_CMD_SET:
    * set default connection limit for all zones
    * set the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_DEL:
    * remove the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_GET:
    * get the default connection limit for all zones
    * get the connection limit for a particular zone

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
 net/openvswitch/Kconfig     |   3 +-
 net/openvswitch/conntrack.c | 508 +++++++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/conntrack.h |   9 +-
 net/openvswitch/datapath.c  |   7 +-
 net/openvswitch/datapath.h  |   1 +
 5 files changed, 522 insertions(+), 6 deletions(-)

diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2650205cdaf9..89da9512ec1e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -9,7 +9,8 @@ config OPENVSWITCH
 		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
 				     (!NF_NAT || NF_NAT) && \
 				     (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
-				     (!NF_NAT_IPV6 || NF_NAT_IPV6)))
+				     (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
+				     (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c5904f629091..8234964889d9 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -16,8 +16,11 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/sctp.h>
+#include <linux/static_key.h>
 #include <net/ip.h>
+#include <net/genetlink.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_count.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
@@ -76,6 +79,39 @@ struct ovs_conntrack_info {
 #endif
 };
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+#define OVS_CT_LIMIT_UNLIMITED	0
+#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
+#define CT_LIMIT_HASH_BUCKETS 512
+DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
+
+struct ovs_ct_limit {
+	/* Elements in ovs_ct_limit_info->limits hash table */
+	struct hlist_node hlist_node;
+	struct rcu_head rcu;
+	u16 zone;
+	u32 limit;
+};
+
+struct ovs_ct_limit_info {
+	u32 default_limit;
+	struct hlist_head *limits;
+	struct nf_conncount_data *data __aligned(8);
+};
+
+static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
+	[OVS_CT_LIMIT_ATTR_OPTION] = { .type = NLA_NESTED, },
+};
+
+static const struct nla_policy
+	ct_zone_limit_policy[OVS_CT_ZONE_LIMIT_ATTR_MAX + 1] = {
+		[OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT] = { .type = NLA_U32, },
+		[OVS_CT_ZONE_LIMIT_ATTR_ZONE] = { .type = NLA_U16, },
+		[OVS_CT_ZONE_LIMIT_ATTR_LIMIT] = { .type = NLA_U32, },
+		[OVS_CT_ZONE_LIMIT_ATTR_COUNT] = { .type = NLA_U32, },
+};
+#endif
+
 static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
 
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -1036,6 +1072,94 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
 	return false;
 }
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static struct hlist_head *ct_limit_hash_bucket(
+	const struct ovs_ct_limit_info *info, u16 zone)
+{
+	return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)];
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_set(const struct ovs_ct_limit_info *info,
+			 struct ovs_ct_limit *new_ct_limit)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, new_ct_limit->zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == new_ct_limit->zone) {
+			hlist_replace_rcu(&ct_limit->hlist_node,
+					  &new_ct_limit->hlist_node);
+			kfree_rcu(ct_limit, rcu);
+			return;
+		}
+	}
+
+	hlist_add_head_rcu(&new_ct_limit->hlist_node, head);
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == zone) {
+			hlist_del_rcu(&ct_limit->hlist_node);
+			kfree_rcu(ct_limit, rcu);
+			return;
+		}
+	}
+}
+
+/* Call with RCU read lock */
+static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == zone)
+			return ct_limit->limit;
+	}
+
+	return info->default_limit;
+}
+
+static int ovs_ct_check_limit(struct net *net,
+			      const struct ovs_conntrack_info *info,
+			      const struct nf_conntrack_tuple *tuple)
+{
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	u32 per_zone_limit, connections;
+	u32 conncount_key[5];
+
+	conncount_key[0] = info->zone.id;
+
+	rcu_read_lock();
+	per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
+	if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	connections = nf_conncount_count(net, ct_limit_info->data,
+					 conncount_key, tuple, &info->zone);
+	if (connections > per_zone_limit) {
+		rcu_read_unlock();
+		return -ENOMEM;
+	}
+
+	rcu_read_unlock();
+	return 0;
+}
+#endif
+
 /* Lookup connection and confirm if unconfirmed. */
 static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 			 const struct ovs_conntrack_info *info,
@@ -1054,6 +1178,20 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 	if (!ct)
 		return 0;
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
+		if (!nf_ct_is_confirmed(ct)) {
+			err = ovs_ct_check_limit(net, info,
+				&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+			if (err) {
+				net_warn_ratelimited("openvswitch: zone: %u "
+					"execeeds conntrack limit\n", info->zone.id);
+				return err;
+			}
+		}
+	}
+#endif
+
 	/* Set the conntrack event mask if given.  NEW and DELETE events have
 	 * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
 	 * typically would receive many kinds of updates.  Setting the event
@@ -1655,7 +1793,365 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
 		nf_ct_tmpl_free(ct_info->ct);
 }
 
-void ovs_ct_init(struct net *net)
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
+{
+	int i, err;
+
+	ovs_net->ct_limit_info = kmalloc(sizeof *ovs_net->ct_limit_info,
+					 GFP_KERNEL);
+	if (!ovs_net->ct_limit_info)
+		return -ENOMEM;
+
+	ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
+	ovs_net->ct_limit_info->limits =
+		kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
+			      GFP_KERNEL);
+	if (!ovs_net->ct_limit_info->limits) {
+		kfree(ovs_net->ct_limit_info);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
+
+	ovs_net->ct_limit_info->data =
+		nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+
+	if (IS_ERR(ovs_net->ct_limit_info->data)) {
+		err = PTR_ERR(ovs_net->ct_limit_info->data);
+		kfree(ovs_net->ct_limit_info->limits);
+		kfree(ovs_net->ct_limit_info);
+		return err;
+	}
+	return 0;
+}
+
+static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
+{
+	const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
+	int i;
+
+	nf_conncount_destroy(net, NFPROTO_INET, info->data);
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+		struct hlist_head *head = &info->limits[i];
+		struct ovs_ct_limit *ct_limit;
+
+		hlist_for_each_entry_rcu(ct_limit, head, hlist_node)
+			kfree_rcu(ct_limit, rcu);
+	}
+	kfree(ovs_net->ct_limit_info->limits);
+	kfree(ovs_net->ct_limit_info);
+}
+
+static struct sk_buff *
+ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
+			     struct ovs_header **ovs_reply_header)
+{
+	struct sk_buff *skb;
+	struct ovs_header *ovs_header = info->userhdr;
+
+	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	*ovs_reply_header = genlmsg_put(skb, info->snd_portid,
+					info->snd_seq,
+					&dp_ct_limit_genl_family, 0, cmd);
+
+	if (!*ovs_reply_header) {
+		nlmsg_free(skb);
+		return ERR_PTR(-EMSGSIZE);
+	}
+	(*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex;
+
+	return skb;
+}
+
+static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct nlattr *nla;
+	int rem, err;
+
+	nla_for_each_nested(nla, nla_zone_limit, rem) {
+		struct nlattr *attr[OVS_CT_ZONE_LIMIT_ATTR_MAX + 1];
+		struct ovs_ct_limit *ct_limit;
+
+		if (nla_type(nla) != OVS_CT_ZONE_LIMIT_ATTR_SET_REQ)
+			return  -EINVAL;
+
+		err = nla_parse((struct nlattr **)&attr,
+				OVS_CT_ZONE_LIMIT_ATTR_MAX, nla_data(nla),
+				nla_len(nla), ct_zone_limit_policy, NULL);
+		if (err)
+			return err;
+
+		if (attr[OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT]) {
+			u32 default_limit = nla_get_u32(
+				attr[OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT]);
+			ovs_lock();
+			info->default_limit = default_limit;
+			ovs_unlock();
+		} else {
+			if (!attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE] ||
+			    !attr[OVS_CT_ZONE_LIMIT_ATTR_LIMIT]) {
+				return -EINVAL;
+			}
+
+			ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
+			if (!ct_limit)
+				return -ENOMEM;
+
+			ct_limit->zone = nla_get_u16(
+				attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE]);
+			ct_limit->limit = nla_get_u32(
+				attr[OVS_CT_ZONE_LIMIT_ATTR_LIMIT]);
+
+			ovs_lock();
+			ct_limit_set(info, ct_limit);
+			ovs_unlock();
+		}
+	}
+	return 0;
+}
+
+static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct nlattr *nla;
+	int rem, err;
+
+	nla_for_each_nested(nla, nla_zone_limit, rem) {
+		struct nlattr *attr[OVS_CT_ZONE_LIMIT_ATTR_MAX + 1];
+		u16 zone;
+
+		if (nla_type(nla) != OVS_CT_ZONE_LIMIT_ATTR_DEL_REQ)
+			return  -EINVAL;
+
+		err = nla_parse((struct nlattr **)&attr,
+				OVS_CT_ZONE_LIMIT_ATTR_MAX, nla_data(nla),
+				nla_len(nla), ct_zone_limit_policy, NULL);
+		if (err)
+			return err;
+
+		if (!attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE])
+			return -EINVAL;
+
+		zone = nla_get_u16(attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE]);
+
+		ovs_lock();
+		ct_limit_del(info, zone);
+		ovs_unlock();
+	}
+	return 0;
+}
+
+static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
+					  struct sk_buff *reply)
+{
+	int err;
+	struct nlattr *nla_nested;
+
+	nla_nested = nla_nest_start(reply, OVS_CT_ZONE_LIMIT_ATTR_GET_RLY);
+
+	err = nla_put_u32(reply, OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT,
+			  info->default_limit);
+	if (err)
+		return err;
+
+	nla_nest_end(reply, nla_nested);
+	return 0;
+}
+
+static int ovs_ct_limit_get_zone_limit(struct net *net,
+				       struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info,
+				       struct sk_buff *reply)
+{
+	struct nlattr *nla, *nla_nested;
+	int rem, err;
+	u16 zone;
+	u32 limit, count, conncount_key[5];
+	struct nf_conntrack_zone ct_zone;
+
+	nla_for_each_nested(nla, nla_zone_limit, rem) {
+		struct nlattr *attr[OVS_CT_ZONE_LIMIT_ATTR_MAX + 1];
+
+		if (nla_type(nla) != OVS_CT_ZONE_LIMIT_ATTR_GET_REQ)
+			return -EINVAL;
+
+		err = nla_parse((struct nlattr **)&attr,
+				OVS_CT_ZONE_LIMIT_ATTR_MAX, nla_data(nla),
+				nla_len(nla), ct_zone_limit_policy, NULL);
+		if (err)
+			return err;
+
+		if (!attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE])
+			return -EINVAL;
+
+		zone = nla_get_u16(attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE]);
+		nf_ct_zone_init(&ct_zone, zone, NF_CT_DEFAULT_ZONE_DIR, 0);
+		rcu_read_lock();
+		limit = ct_limit_get(info, zone);
+		rcu_read_unlock();
+
+		conncount_key[0] = zone;
+		count = nf_conncount_count(net, info->data, conncount_key,
+					   NULL, &ct_zone);
+
+		nla_nested = nla_nest_start(reply,
+					    OVS_CT_ZONE_LIMIT_ATTR_GET_RLY);
+		if (nla_put_u16(reply, OVS_CT_ZONE_LIMIT_ATTR_ZONE, zone) ||
+		    nla_put_u32(reply, OVS_CT_ZONE_LIMIT_ATTR_LIMIT, limit) ||
+		    nla_put_u32(reply, OVS_CT_ZONE_LIMIT_ATTR_COUNT, count))
+			return -EMSGSIZE;
+		nla_nest_end(reply, nla_nested);
+	}
+
+	return 0;
+}
+
+static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	if (!a[OVS_CT_LIMIT_ATTR_OPTION])
+		return -EINVAL;
+
+	err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_OPTION],
+					  ct_limit_info);
+	if (err)
+		goto exit_err;
+	static_branch_enable(&ovs_ct_limit_enabled);
+
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	if (!a[OVS_CT_LIMIT_ATTR_OPTION])
+		return -EINVAL;
+
+	err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_OPTION],
+					  ct_limit_info);
+	if (err)
+		goto exit_err;
+
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct nlattr *nla_reply;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct net *net = sock_net(skb->sk);
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_OPTION);
+
+	err = ovs_ct_limit_get_default_limit(ct_limit_info, reply);
+	if (err)
+		goto exit_err;
+
+	if (a[OVS_CT_LIMIT_ATTR_OPTION]) {
+		err = ovs_ct_limit_get_zone_limit(
+			net, a[OVS_CT_LIMIT_ATTR_OPTION], ct_limit_info,
+			reply);
+		if (err)
+			goto exit_err;
+	}
+
+	nla_nest_end(reply, nla_reply);
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static struct genl_ops ct_limit_genl_ops[] = {
+	{ .cmd = OVS_CT_LIMIT_CMD_SET,
+		.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+					   * privilege. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_set,
+	},
+	{ .cmd = OVS_CT_LIMIT_CMD_DEL,
+		.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+					   * privilege. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_del,
+	},
+	{ .cmd = OVS_CT_LIMIT_CMD_GET,
+		.flags = 0,		  /* OK for unprivileged users. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_get,
+	},
+};
+
+static const struct genl_multicast_group ovs_ct_limit_multicast_group = {
+	.name = OVS_CT_LIMIT_MCGROUP,
+};
+
+struct genl_family dp_ct_limit_genl_family __ro_after_init = {
+	.hdrsize = sizeof(struct ovs_header),
+	.name = OVS_CT_LIMIT_FAMILY,
+	.version = OVS_CT_LIMIT_VERSION,
+	.maxattr = OVS_CT_LIMIT_ATTR_MAX,
+	.netnsok = true,
+	.parallel_ops = true,
+	.ops = ct_limit_genl_ops,
+	.n_ops = ARRAY_SIZE(ct_limit_genl_ops),
+	.mcgrps = &ovs_ct_limit_multicast_group,
+	.n_mcgrps = 1,
+	.module = THIS_MODULE,
+};
+#endif
+
+int ovs_ct_init(struct net *net)
 {
 	unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -1666,12 +2162,22 @@ void ovs_ct_init(struct net *net)
 	} else {
 		ovs_net->xt_label = true;
 	}
+
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	return ovs_ct_limit_init(net, ovs_net);
+#else
+	return 0;
+#endif
 }
 
 void ovs_ct_exit(struct net *net)
 {
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	ovs_ct_limit_exit(net, ovs_net);
+#endif
+
 	if (ovs_net->xt_label)
 		nf_connlabels_put(net);
 }
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 399dfdd2c4f9..900dadd70974 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -17,10 +17,11 @@
 #include "flow.h"
 
 struct ovs_conntrack_info;
+struct ovs_ct_limit_info;
 enum ovs_key_attr;
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-void ovs_ct_init(struct net *);
+int ovs_ct_init(struct net *);
 void ovs_ct_exit(struct net *);
 bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
 int ovs_ct_copy_action(struct net *, const struct nlattr *,
@@ -44,7 +45,7 @@ void ovs_ct_free_action(const struct nlattr *a);
 #else
 #include <linux/errno.h>
 
-static inline void ovs_ct_init(struct net *net) { }
+static inline int ovs_ct_init(struct net *net) { return 0; }
 
 static inline void ovs_ct_exit(struct net *net) { }
 
@@ -104,4 +105,8 @@ static inline void ovs_ct_free_action(const struct nlattr *a) { }
 
 #define CT_SUPPORTED_MASK 0
 #endif /* CONFIG_NF_CONNTRACK */
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+extern struct genl_family dp_ct_limit_genl_family;
+#endif
 #endif /* ovs_conntrack.h */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 015e24e08909..a61818e94396 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2288,6 +2288,9 @@ static struct genl_family * const dp_genl_families[] = {
 	&dp_flow_genl_family,
 	&dp_packet_genl_family,
 	&dp_meter_genl_family,
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	&dp_ct_limit_genl_family,
+#endif
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2323,8 +2326,7 @@ static int __net_init ovs_init_net(struct net *net)
 
 	INIT_LIST_HEAD(&ovs_net->dps);
 	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
-	ovs_ct_init(net);
-	return 0;
+	return ovs_ct_init(net);
 }
 
 static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2469,3 +2471,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 523d65526766..51bd4dcb6c8b 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,6 +144,7 @@ struct dp_upcall_info {
 struct ovs_net {
 	struct list_head dps;
 	struct work_struct dp_notify_work;
+	struct ovs_ct_limit_info *ct_limit_info;
 
 	/* Module reference for configuring conntrack. */
 	bool xt_label;
-- 
2.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox