Netdev List

Netdev List
 help / color / mirror / Atom feed

* [patch net-next v9 2/3] net: core: Add offload stats to if_stats_msg
From: Jiri Pirko @ 2016-09-14  9:28 UTC (permalink / raw)
  To: netdev
  Cc: davem, nogahf, idosch, eladr, yotamg, ogerlitz, roopa, nikolay,
	linville, tgraf, gospo, sfeldma, sd, eranbe, ast, edumazet,
	hannes, f.fainelli, dsa
In-Reply-To: <1473845322-16679-1-git-send-email-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

Add a nested attribute of offload stats to if_stats_msg
named IFLA_STATS_LINK_OFFLOAD_XSTATS.
Under it, add SW stats, meaning stats only for packets that went via
slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/uapi/linux/if_link.h |   9 ++++
 net/core/rtnetlink.c         | 111 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 116 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9bf3aec..2351776 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -826,6 +826,7 @@ enum {
 	IFLA_STATS_LINK_64,
 	IFLA_STATS_LINK_XSTATS,
 	IFLA_STATS_LINK_XSTATS_SLAVE,
+	IFLA_STATS_LINK_OFFLOAD_XSTATS,
 	__IFLA_STATS_MAX,
 };
 
@@ -845,6 +846,14 @@ enum {
 };
 #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1)
 
+/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */
+enum {
+	IFLA_OFFLOAD_XSTATS_UNSPEC,
+	IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */
+	__IFLA_OFFLOAD_XSTATS_MAX
+};
+#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1)
+
 /* XDP section */
 
 enum {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 937e459..ae2048a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3577,6 +3577,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
 	       (!idxattr || idxattr == attrid);
 }
 
+#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
+static int rtnl_get_offload_stats_attr_size(int attr_id)
+{
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		return sizeof(struct rtnl_link_stats64);
+	}
+
+	return 0;
+}
+
+static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
+				  int *prividx)
+{
+	struct nlattr *attr = NULL;
+	int attr_id, size;
+	void *attr_data;
+	int err;
+
+	if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+	      dev->netdev_ops->ndo_get_offload_stats))
+		return -ENODATA;
+
+	for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+	     attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+		if (attr_id < *prividx)
+			continue;
+
+		size = rtnl_get_offload_stats_attr_size(attr_id);
+		if (!size)
+			continue;
+
+		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+			continue;
+
+		attr = nla_reserve_64bit(skb, attr_id, size,
+					 IFLA_OFFLOAD_XSTATS_UNSPEC);
+		if (!attr)
+			goto nla_put_failure;
+
+		attr_data = nla_data(attr);
+		memset(attr_data, 0, size);
+		err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
+							     attr_data);
+		if (err)
+			goto get_offload_stats_failure;
+	}
+
+	if (!attr)
+		return -ENODATA;
+
+	*prividx = 0;
+	return 0;
+
+nla_put_failure:
+	err = -EMSGSIZE;
+get_offload_stats_failure:
+	*prividx = attr_id;
+	return err;
+}
+
+static int rtnl_get_offload_stats_size(const struct net_device *dev)
+{
+	int nla_size = 0;
+	int attr_id;
+	int size;
+
+	if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+	      dev->netdev_ops->ndo_get_offload_stats))
+		return 0;
+
+	for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+	     attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+			continue;
+		size = rtnl_get_offload_stats_attr_size(attr_id);
+		nla_size += nla_total_size_64bit(size);
+	}
+
+	if (nla_size != 0)
+		nla_size += nla_total_size(0);
+
+	return nla_size;
+}
+
 static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 			       int type, u32 pid, u32 seq, u32 change,
 			       unsigned int flags, unsigned int filter_mask,
@@ -3586,6 +3671,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 	struct nlmsghdr *nlh;
 	struct nlattr *attr;
 	int s_prividx = *prividx;
+	int err;
 
 	ASSERT_RTNL();
 
@@ -3614,8 +3700,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
 
 		if (ops && ops->fill_linkxstats) {
-			int err;
-
 			*idxattr = IFLA_STATS_LINK_XSTATS;
 			attr = nla_nest_start(skb,
 					      IFLA_STATS_LINK_XSTATS);
@@ -3639,8 +3723,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		if (master)
 			ops = master->rtnl_link_ops;
 		if (ops && ops->fill_linkxstats) {
-			int err;
-
 			*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
 			attr = nla_nest_start(skb,
 					      IFLA_STATS_LINK_XSTATS_SLAVE);
@@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+			     *idxattr)) {
+		*idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+		attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
+		if (!attr)
+			goto nla_put_failure;
+
+		err = rtnl_get_offload_stats(skb, dev, prividx);
+		if (err == -ENODATA)
+			nla_nest_cancel(skb, attr);
+		else
+			nla_nest_end(skb, attr);
+
+		if ((err) && (err != -ENODATA))
+			goto nla_put_failure;
+		*idxattr = 0;
+	}
+
 	nlmsg_end(skb, nlh);
 
 	return 0;
@@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
 		}
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
+		size += rtnl_get_offload_stats_size(dev);
+
 	return size;
 }
 
-- 
2.5.5

^ permalink raw reply related

* [patch net-next v9 1/3] netdevice: Add offload statistics ndo
From: Jiri Pirko @ 2016-09-14  9:28 UTC (permalink / raw)
  To: netdev
  Cc: davem, nogahf, idosch, eladr, yotamg, ogerlitz, roopa, nikolay,
	linville, tgraf, gospo, sfeldma, sd, eranbe, ast, edumazet,
	hannes, f.fainelli, dsa
In-Reply-To: <1473845322-16679-1-git-send-email-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

Add a new ndo to return statistics for offloaded operation.
Since there can be many different offloaded operations with many
stats types, the ndo gets an attribute id by which it knows which
stats are wanted. The ndo also gets a void pointer to be cast according
to the attribute id.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/linux/netdevice.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2095b6a..a10d8d1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -924,6 +924,14 @@ struct netdev_xdp {
  *	3. Update dev->stats asynchronously and atomically, and define
  *	   neither operation.
  *
+ * bool (*ndo_has_offload_stats)(int attr_id)
+ *	Return true if this device supports offload stats of this attr_id.
+ *
+ * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
+ *	void *attr_data)
+ *	Get statistics for offload operations by attr_id. Write it into the
+ *	attr_data pointer.
+ *
  * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
  *	If device supports VLAN filtering this function is called when a
  *	VLAN id is registered.
@@ -1155,6 +1163,10 @@ struct net_device_ops {
 
 	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
 						     struct rtnl_link_stats64 *storage);
+	bool			(*ndo_has_offload_stats)(int attr_id);
+	int			(*ndo_get_offload_stats)(int attr_id,
+							 const struct net_device *dev,
+							 void *attr_data);
 	struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 
 	int			(*ndo_vlan_rx_add_vid)(struct net_device *dev,
-- 
2.5.5

^ permalink raw reply related

* [patch net-next v9 0/3] return offloaded stats as default and expose original sw stats
From: Jiri Pirko @ 2016-09-14  9:28 UTC (permalink / raw)
  To: netdev
  Cc: davem, nogahf, idosch, eladr, yotamg, ogerlitz, roopa, nikolay,
	linville, tgraf, gospo, sfeldma, sd, eranbe, ast, edumazet,
	hannes, f.fainelli, dsa

From: Jiri Pirko <jiri@mellanox.com>

The problem we try to handle is about offloaded forwarded packets
which are not seen by kernel. Let me try to draw it:

    port1                       port2 (HW stats are counted here)
      \                          /
       \                        /
        \                      /
         --(A)---- ASIC --(B)--
                    |
                   (C)
                    |
                   CPU (SW stats are counted here)


Now we have a couple of flows for TX and RX (direction does not matter here):

1) port1->A->ASIC->C->CPU

   For this flow, HW and SW stats are equal.

2) port1->A->ASIC->C->CPU->C->ASIC->B->port2

   For this flow, HW and SW stats are equal.

3) port1->A->ASIC->B->port2

   For this flow, SW stats are 0.

The purpose of this patchset is to provide facility for user to
find out the difference between flows 1+2 and 3. In other words, user
will be able to see the statistics for the slow-path (through kernel).

Also note that HW stats are what someone calls "accumulated" stats.
Every packet counted by SW is also counted by HW. Not the other way around.

As a default the accumulated stats (HW) will be exposed to user
so the userspace apps can react properly.

This patchset adds the SW stats (flows 1+2) under offload related stats, so
in the future we can expose other offload related stats in a similar way.

---
v8->v9:
- patch 2/3
 - add using of idxattr and prividx
v7->v8:
- patch 2/3
 - move helping const from uapi to rtnetlink
 - cancel driver xstat nesting if it is empty
v6->v7:
- patch 1/3:
 - ndo interface changed to get the wanted stats type as an input.
 - change commit message.
- patch 2/3:
 - create a nesting for offloaded stat and put SW stats under it.
 - change the ndo call to indicate which offload stats we want.
 - change commit message.
- patch 3/3:
 - change ndo implementation to match the changes in the previous patches.
 - change commit message.
v5->v6:
- patch 2/4 was dropped as requested by Roopa
- patch 1/3:
 - comment changed to indicate that default stats are combined stats
 - commit message changed
- patch 2/3: (previously 3/4)
 - SW stats return nothing if there is no SW stats ndo
v4->v5:
- updated cover letter
- patch3/4:
  - using memcpy directly to copy stats as requested by DaveM
v3->v4:
- patch1/4:
  - fixed "return ()" pointed out by EricD
- patch2/4:
  - fixed if_nlmsg_size as pointed out by EricD
v2->v3:
- patch1/4:
  - added dev_have_sw_stats helper
- patch2/4:
  - avoided memcpy as requested by DaveM
- patch3/4:
  - use new dev_have_sw_stats helper
v1->v2:
- patch3/4:
  - fixed NULL initialization

Nogah Frankel (3):
  netdevice: Add offload statistics ndo
  net: core: Add offload stats to if_stats_msg
  mlxsw: spectrum: Implement offload stats ndo and expose HW stats by
    default

 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 129 +++++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |   5 +
 include/linux/netdevice.h                      |  12 +++
 include/uapi/linux/if_link.h                   |   9 ++
 net/core/rtnetlink.c                           | 111 ++++++++++++++++++++-
 5 files changed, 255 insertions(+), 11 deletions(-)

-- 
2.5.5

^ permalink raw reply

* Re: [PATCH RFC 1/6] spinlock: Add library function to allocate spinlock buckets array
From: Thomas Graf @ 2016-09-14  9:27 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, kernel-team
In-Reply-To: <1473463197-3076903-2-git-send-email-tom@herbertland.com>

On 09/09/16 at 04:19pm, Tom Herbert wrote:
> Add two new library functions alloc_bucket_spinlocks and
> free_bucket_spinlocks. These are used to allocate and free an array
> of spinlocks that are useful as locks for hash buckets. The interface
> specifies the maximum number of spinlocks in the array as well
> as a CPU multiplier to derive the number of spinlocks to allocate.
> The number to allocate is rounded up to a power of two to make
> the array amenable to hash lookup.
> 
> Signed-off-by: Tom Herbert <tom@herbertland.com>

Acked-by: Thomas Graf <tgraf@suug.ch>

^ permalink raw reply

* Re: [RFC 07/11] Add support for memory registeration verbs
From: Sagi Grimberg @ 2016-09-14  9:25 UTC (permalink / raw)
  To: Kalderon, Michal, Ram Amrani, dledford@redhat.com,
	davem@davemloft.net
  Cc: Yuval.Mintz@qlogic.com, Ariel.Elior@qlogic.com,
	Michal.Kalderon@qlogic.com, rajesh.borundia@qlogic.com,
	linux-rdma@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <BLUPR0701MB192207820BA5CF59A835181088F10@BLUPR0701MB1922.namprd07.prod.outlook.com>


>>> +struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, int
>>> +max_page_list_len) {
>>> +	struct qedr_pd *pd = get_qedr_pd(ibpd);
>>> +	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
>>> +	struct qedr_mr *mr;
>>> +	int rc = -ENOMEM;
>>> +
>>> +	DP_VERBOSE(dev, QEDR_MSG_MR,
>>> +		   "qedr_alloc_frmr pd = %d max_page_list_len= %d\n", pd-
>>> pd_id,
>>> +		   max_page_list_len);
>>> +
>>> +	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
>>> +	if (!mr)
>>> +		return ERR_PTR(rc);
>>> +
>>> +	mr->dev = dev;
>>> +	mr->type = QEDR_MR_FRMR;
>>> +
>>> +	rc = init_mr_info(dev, &mr->info, max_page_list_len, 1);
>>> +	if (rc)
>>> +		goto err0;
>>> +
>>> +	rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid);
>>> +	if (rc) {
>>> +		DP_ERR(dev, "roce alloc tid returned an error %d\n", rc);
>>> +		goto err0;
>>> +	}
>>> +
>>> +	/* Index only, 18 bit long, lkey = itid << 8 | key */
>>> +	mr->hw_mr.tid_type = QED_RDMA_TID_FMR;
>>> +	mr->hw_mr.key = 0;
>>> +	mr->hw_mr.pd = pd->pd_id;
>>
>> Do you have a real MR<->PD association in HW? If so, can you point me to the
>> code that binds it? If not, any reason not to expose the local_dma_lkey?
>>
> Yes, we send the pd id to the FW in function qed_rdma_register_tid.

Right, thanks.

> In any case, if we didn't have the association in HW
> Wouldn't the local_dma_lkey be relevant only to dma_mr ? ( code snippet above refers to FMR)

I was just sticking to a location in the code where you associate
MR<->PD...

The local_dma_lkey is a special key that spans the entire memory space
and unlike the notorious dma_mr, it's not associated with a PD.

See the code in ib_alloc_pd(), if the device does not support a single
device local dma lkey, the core allocates a dma mr associated with
the pd. If your device has such a key, you can save a dma mr allocation
for each pd in the system.

^ permalink raw reply

* Re: [PATCH RFC 08/11] net/mlx5e: XDP fast RX drop bpf programs support
From: Tariq Toukan via iovisor-dev @ 2016-09-14  9:24 UTC (permalink / raw)
  To: Or Gerlitz, Jesper Dangaard Brouer
  Cc: Tom Herbert, iovisor-dev, Jamal Hadi Salim, Saeed Mahameed,
	Eric Dumazet, Linux Netdev List, Rana Shahout
In-Reply-To: <CAJ3xEMiDBZ2-FdE7wniW0Y_S6k8NKfKEdy3w+1vs83oPuMAG5Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>



On 08/09/2016 12:31 PM, Or Gerlitz wrote:
> On Thu, Sep 8, 2016 at 10:38 AM, Jesper Dangaard Brouer
> <brouer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> On Wed, 7 Sep 2016 23:55:42 +0300
>> Or Gerlitz <gerlitz.or-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>>
>>> On Wed, Sep 7, 2016 at 3:42 PM, Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>>>> From: Rana Shahout <ranas-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>
>>>> Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx5e driver.
>>>>
>>>> When XDP is on we make sure to change channels RQs type to
>>>> MLX5_WQ_TYPE_LINKED_LIST rather than "striding RQ" type to
>>>> ensure "page per packet".
>>>>
>>>> On XDP set, we fail if HW LRO is set and request from user to turn it
>>>> off.  Since on ConnectX4-LX HW LRO is always on by default, this will be
>>>> annoying, but we prefer not to enforce LRO off from XDP set function.
>>>>
>>>> Full channels reset (close/open) is required only when setting XDP
>>>> on/off.
>>>>
>>>> When XDP set is called just to exchange programs, we will update
>>>> each RQ xdp program on the fly and for synchronization with current
>>>> data path RX activity of that RQ, we temporarily disable that RQ and
>>>> ensure RX path is not running, quickly update and re-enable that RQ,
>>>> for that we do:
>>>>          - rq.state = disabled
>>>>          - napi_synchronize
>>>>          - xchg(rq->xdp_prg)
>>>>          - rq.state = enabled
>>>>          - napi_schedule // Just in case we've missed an IRQ
>>>>
>>>> Packet rate performance testing was done with pktgen 64B packets and on
>>>> TX side and, TC drop action on RX side compared to XDP fast drop.
>>>>
>>>> CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
>>>>
>>>> Comparison is done between:
>>>>          1. Baseline, Before this patch with TC drop action
>>>>          2. This patch with TC drop action
>>>>          3. This patch with XDP RX fast drop
>>>>
>>>> Streams    Baseline(TC drop)    TC drop    XDP fast Drop
>>>> --------------------------------------------------------------
>>>> 1           5.51Mpps            5.14Mpps     13.5Mpps
>>> This (13.5 M PPS) is less than 50% of the result we presented @ the
>>> XDP summit which was obtained by Rana. Please see if/how much does
>>> this grows if you use more sender threads, but all of them to xmit the
>>> same stream/flows, so we're on one ring. That (XDP with single RX ring
>>> getting packets from N remote TX rings) would be your canonical
>>> base-line for any further numbers.
>> Well, my experiments with this hardware (mlx5/CX4 at 50Gbit/s) show
>> that you should be able to reach 23Mpps on a single CPU.  This is
>> a XDP-drop-simulation with order-0 pages being recycled through my
>> page_pool code, plus avoiding the cache-misses (notice you are using a
>> CPU E5-2680 with DDIO, thus you should only see a L3 cache miss).
> so this takes up from 13M to 23M, good.
>
> Could you explain why the move from order-3 to order-0 is hurting the
> performance so much (drop from 32M to 23M), any way we can overcome that?
The issue is not moving from high-order to order-0.
It's moving from Striding RQ to non-Striding RQ without using a 
page-reuse mechanism (not cache).
In current memory-scheme, each 64B packet consumes a 4K page, including 
allocate/release (from cache in this case, but still...).
I believe that once we implement page-reuse for non Striding RQ we'll 
hit 32M PPS again.
>> The 23Mpps number looks like some HW limitation, as the increase was
> not HW, I think. As I said, Rana got 32M with striding RQ when she was
> using order-3
> (or did we use order-5?)
order-5.
>> is not proportional to page-allocator overhead I removed (and CPU freq
>> starts to decrease).  I also did scaling tests to more CPUs, which
>> showed it scaled up to 40Mpps (you reported 45M).  And at the Phy RX
>> level I see 60Mpps (50G max is 74Mpps).

^ permalink raw reply

* Re: [PATCH RFC 4/6] rhashtable: abstract out function to get hash
From: Thomas Graf @ 2016-09-14  9:23 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, kernel-team
In-Reply-To: <1473463197-3076903-5-git-send-email-tom@herbertland.com>

On 09/09/16 at 04:19pm, Tom Herbert wrote:
> Split out most of rht_key_hashfn which is calculating the hash into
> its own function. This way the hash function can be called separately to
> get the hash value.
> 
> Signed-off-by: Tom Herbert <tom@herbertland.com>

Acked-by: Thomas Graf <tgraf@suug.ch>

^ permalink raw reply

* Re: [PATCH] [RFC] proc connector: add namespace events
From: Jiri Benc @ 2016-09-14  9:23 UTC (permalink / raw)
  To: Alban Crequy
  Cc: Evgeniy Polyakov, Alban Crequy, Tejun Heo, Aditya Kali,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	Iago Lopez Galeiras, Serge E. Hallyn
In-Reply-To: <CAMXgnP4tNg1U5v1R3XEDp9jFKhX4n70iqiDmt1Vp5vySpO9LhQ@mail.gmail.com>

On Tue, 13 Sep 2016 16:42:43 +0200, Alban Crequy wrote:
> Note that I will probably not have the chance to spend more time on
> this patch soon because Iago will explore other methods with
> eBPF+kprobes instead. eBPF+kprobes would not have the same API
> stability though. I was curious to see if anyone would find the
> namespace addition to proc connector interesting for other projects.

Yes, this is a sorely missing feature. I don't care how this is done
(proc connector or something else) but the feature itself is quite
important for system management daemons. In particular, we need an
application that monitors network configuration changes on the machine,
displays the current configuration and records history of the changes.
This is currently impossible to do reliably if net name spaces are in
use - which they are with OpenStack and Docker and similar things in
place on those machines. The current tools try to do things like
monitoring /var/run/netns which is obviously unreliable and broken.

There are actually two (orthogonal) problems here: apart from the one
described above, there's also the startup of such a daemon. There's currently no
way to find all current name spaces from the user space. We'll need an
API for this, too.

And no, eBPF is not the answer. This should just work like any other
system daemon. I can't imagine that we would need llvm compiler and
kernel sources/debuginfo/whatever on every machine that runs such
daemon.

Thanks,

 Jiri

^ permalink raw reply

* Re: [PATCH RFC 2/6] rhashtable: Call library function alloc_bucket_locks
From: Thomas Graf @ 2016-09-14  9:18 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, kernel-team
In-Reply-To: <1473463197-3076903-3-git-send-email-tom@herbertland.com>

On 09/09/16 at 04:19pm, Tom Herbert wrote:
> To allocate the array of bucket locks for the hash table we now
> call library function alloc_bucket_spinlocks. This function is
> based on the old alloc_bucket_locks in rhashtable and should
> produce the same effect.
> 
> Signed-off-by: Tom Herbert <tom@herbertland.com>

Acked-by: Thomas Graf <tgraf@suug.ch>

^ permalink raw reply

* Re: [PATCH v5 0/6] Add eBPF hooks for cgroups
From: Thomas Graf @ 2016-09-14  9:03 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Pablo Neira Ayuso, Daniel Mack, htejun, daniel, ast, davem, kafai,
	fw, harald, netdev, sargun, cgroups
In-Reply-To: <20160914044217.GA44742@ast-mbp.thefacebook.com>

[Sorry for the repost, gmail decided to start sending HTML crap along
 overnight for some reason]

On 09/13/16 at 09:42pm, Alexei Starovoitov wrote:
> On Tue, Sep 13, 2016 at 07:24:08PM +0200, Pablo Neira Ayuso wrote:
> > Then you have to explain me how can anyone else than systemd use this
> > infrastructure?
> 
> Jokes aside. I'm puzzled why systemd is even being mentioned here.
> Here we use tupperware (our internal container management system) that
> is heavily using cgroups and has nothing to do with systemd.

Just confirming that we are planning to use this decoupled from
systemd as well.  I fail to see how this is at all systemd specific.

> For us this cgroup+bpf is _not_ for filtering and _not_ for security.
> We run a ton of tasks in cgroups that launch all sorts of
> things on their own. We need to monitor what they do from networking
> point of view. Therefore bpf programs need to monitor the traffic in
> particular part of cgroup hierarchy. Not globally and no pass/drop decisions.

+10, although filtering/drop is a valid use case, the really strong
use case is definitely introspection at networking level. Statistics,
monitoring, verification of application correctness, etc. 

I don't see why this is at all an either or discussion. If nft wants
cgroups integration similar to this effort, I see no reason why that
should stop this effort.

^ permalink raw reply

* RE: [RFC 07/11] Add support for memory registeration verbs
From: Kalderon, Michal @ 2016-09-14  8:59 UTC (permalink / raw)
  To: Sagi Grimberg, Ram Amrani,
	dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org
  Cc: Yuval.Mintz-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	Ariel.Elior-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	Michal.Kalderon-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	rajesh.borundia-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <7fa4a9b8-7cb1-0f83-d6e5-1055ae59bce4-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>

> > +struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, int
> > +max_page_list_len) {
> > +	struct qedr_pd *pd = get_qedr_pd(ibpd);
> > +	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
> > +	struct qedr_mr *mr;
> > +	int rc = -ENOMEM;
> > +
> > +	DP_VERBOSE(dev, QEDR_MSG_MR,
> > +		   "qedr_alloc_frmr pd = %d max_page_list_len= %d\n", pd-
> >pd_id,
> > +		   max_page_list_len);
> > +
> > +	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
> > +	if (!mr)
> > +		return ERR_PTR(rc);
> > +
> > +	mr->dev = dev;
> > +	mr->type = QEDR_MR_FRMR;
> > +
> > +	rc = init_mr_info(dev, &mr->info, max_page_list_len, 1);
> > +	if (rc)
> > +		goto err0;
> > +
> > +	rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid);
> > +	if (rc) {
> > +		DP_ERR(dev, "roce alloc tid returned an error %d\n", rc);
> > +		goto err0;
> > +	}
> > +
> > +	/* Index only, 18 bit long, lkey = itid << 8 | key */
> > +	mr->hw_mr.tid_type = QED_RDMA_TID_FMR;
> > +	mr->hw_mr.key = 0;
> > +	mr->hw_mr.pd = pd->pd_id;
> 
> Do you have a real MR<->PD association in HW? If so, can you point me to the
> code that binds it? If not, any reason not to expose the local_dma_lkey?
> 
Yes, we send the pd id to the FW in function qed_rdma_register_tid. In any case, if we didn't have the association in HW
Wouldn't the local_dma_lkey be relevant only to dma_mr ? ( code snippet above refers to FMR) 

> > +struct ib_mr *qedr_get_dma_mr(struct ib_pd *ibpd, int acc) {
> > +	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
> > +	struct qedr_pd *pd = get_qedr_pd(ibpd);
> > +	struct qedr_mr *mr;
> > +	int rc;
> > +
> > +	if (acc & IB_ACCESS_MW_BIND) {
> > +		DP_ERR(dev, "Unsupported access flags received for dma
> mr\n");
> > +		return ERR_PTR(-EINVAL);
> > +	}
> 
> This check looks like it really belongs in the core, it would help everyone if you
> move it...
> 
> Although I know Christoph is trying to get rid of this API altogether...
Sure, will do.
 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body
> of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] MAINTAINERS: Remove myself from PA Semi entries
From: Michael Ellerman @ 2016-09-14  8:57 UTC (permalink / raw)
  To: Olof Johansson; +Cc: netdev, linux-i2c, davem, jdelvare, Olof Johansson
In-Reply-To: <1473803318-25197-1-git-send-email-olof@lixom.net>

Olof Johansson <olof@lixom.net> writes:

> The platform is old, very few users and I lack bandwidth to keep after
> it these days.
>
> Mark the base platform as well as the drivers as orphans, patches have
> been flowing through the fallback maintainers for a while already.

Sorry to see you go, but thanks for keeping an eye on it as long as you
did!

> Jean, Dave,
>
> I was hoping to have Michael merge this since the bulk of the platform is under him,
> cc:ing you mostly to be aware that I am orphaning a driver in your subsystems.

I'll merge it unless I hear otherwise from Dave.

Should we go the whole hog and just do as below? I think most folks use
get_maintainers.pl these days, so this should have basically the same
effect. Happy to go with your original version though if you prefer.

cheers


diff --git a/MAINTAINERS b/MAINTAINERS
index 0bbe4b105c34..8ca1c25d870d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7049,6 +7049,7 @@ N:	powermac
 N:	powernv
 N:	[^a-z0-9]ps3
 N:	pseries
+N:	pasemi
 
 LINUX FOR POWER MACINTOSH
 M:	Benjamin Herrenschmidt <benh@kernel.crashing.org>
@@ -7098,14 +7099,6 @@ S:	Maintained
 F:	arch/powerpc/platforms/83xx/
 F:	arch/powerpc/platforms/85xx/
 
-LINUX FOR POWERPC PA SEMI PWRFICIENT
-M:	Olof Johansson <olof@lixom.net>
-L:	linuxppc-dev@lists.ozlabs.org
-S:	Maintained
-F:	arch/powerpc/platforms/pasemi/
-F:	drivers/*/*pasemi*
-F:	drivers/*/*/*pasemi*
-
 LINUX SECURITY MODULE (LSM) FRAMEWORK
 M:	Chris Wright <chrisw@sous-sol.org>
 L:	linux-security-module@vger.kernel.org
@@ -8773,18 +8766,6 @@ W:	http://wireless.kernel.org/en/users/Drivers/p54
 S:	Maintained
 F:	drivers/net/wireless/intersil/p54/
 
-PA SEMI ETHERNET DRIVER
-M:	Olof Johansson <olof@lixom.net>
-L:	netdev@vger.kernel.org
-S:	Maintained
-F:	drivers/net/ethernet/pasemi/*
-
-PA SEMI SMBUS DRIVER
-M:	Olof Johansson <olof@lixom.net>
-L:	linux-i2c@vger.kernel.org
-S:	Maintained
-F:	drivers/i2c/busses/i2c-pasemi.c
-
 PADATA PARALLEL EXECUTION MECHANISM
 M:	Steffen Klassert <steffen.klassert@secunet.com>
 L:	linux-crypto@vger.kernel.org

^ permalink raw reply related

* Re: [PATCH v3 5/9] ARM: dts: sun8i-h3: add sun8i-emac ethernet driver
From: LABBE Corentin @ 2016-09-14  8:54 UTC (permalink / raw)
  To: Maxime Ripard
  Cc: robh+dt, mark.rutland, wens, linux, davem, netdev, devicetree,
	linux-arm-kernel, linux-kernel
In-Reply-To: <20160912072933.GC9449@lukather>

On Mon, Sep 12, 2016 at 09:29:33AM +0200, Maxime Ripard wrote:
> On Fri, Sep 09, 2016 at 02:45:13PM +0200, Corentin Labbe wrote:
> > The sun8i-emac is an ethernet MAC hardware that support 10/100/1000
> > speed.
> > 
> > This patch enable the sun8i-emac on the Allwinner H3 SoC Device-tree.
> > The SoC H3 have an internal PHY, so optionals syscon and ephy are set.
> > 
> > Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
> > ---
> >  arch/arm/boot/dts/sun8i-h3.dtsi | 19 +++++++++++++++++++
> >  1 file changed, 19 insertions(+)
> > 
> > diff --git a/arch/arm/boot/dts/sun8i-h3.dtsi b/arch/arm/boot/dts/sun8i-h3.dtsi
> > index a39da6f..a3ac476 100644
> > --- a/arch/arm/boot/dts/sun8i-h3.dtsi
> > +++ b/arch/arm/boot/dts/sun8i-h3.dtsi
> > @@ -50,6 +50,10 @@
> >  / {
> >  	interrupt-parent = <&gic>;
> >  
> > +	aliases {
> > +		ethernet0 = &emac;
> > +	};
> > +
> 
> This needs to be done at the board level.
> 

ok

> >  	cpus {
> >  		#address-cells = <1>;
> >  		#size-cells = <0>;
> > @@ -446,6 +450,21 @@
> >  			status = "disabled";
> >  		};
> >  
> > +		emac: ethernet@1c30000 {
> > +			compatible = "allwinner,sun8i-h3-emac";
> > +			syscon = <&syscon>;
> > +			reg = <0x01c30000 0x104>;
> > +			reg-names = "emac";
> 
> You don't need reg-names anymore.
> 

ok

> > +			interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
> > +			resets = <&ccu RST_BUS_EMAC>, <&ccu RST_BUS_EPHY>;
> > +			reset-names = "ahb", "ephy";
> > +			clocks = <&ccu CLK_BUS_EMAC>, <&ccu CLK_BUS_EPHY>;
> > +			clock-names = "ahb", "ephy";
> 
> I still believe that having the same node for both the PHY and the MAC
> is wrong.
> 

OK, I have moved the clock/reset of the ephy into its own node.

Thanks

Regards

Corentin Labbe

^ permalink raw reply

* Re: [PATCH] MAINTAINERS: Remove myself from PA Semi entries
From: Wolfram Sang @ 2016-09-14  8:36 UTC (permalink / raw)
  To: Olof Johansson; +Cc: Michael Ellerman, netdev, linux-i2c, davem, jdelvare
In-Reply-To: <20160914003157.GA1420@katana>

[-- Attachment #1: Type: text/plain, Size: 449 bytes --]


> > I was hoping to have Michael merge this since the bulk of the platform is under him,
> > cc:ing you mostly to be aware that I am orphaning a driver in your subsystems.
> 
> Let me answer for Jean since I took over I2C in November 2012 ;) I'd
> think the entry can go completely. The last 'F:' tag for the platform
> catches the I2C driver anyhow. But in general:

To make it crystal clear: I meant the I2C entry for PASEMI could go.


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [net-next PATCH 00/11] iw_cxgb4,cxgbit: remove duplicate code
From: Or Gerlitz @ 2016-09-14  8:32 UTC (permalink / raw)
  To: Varun Prakash
  Cc: David Miller, Linux Netdev List,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	target-devel-u79uwXL29TY76Z2rM5mHXA, Nicholas A. Bellinger,
	Doug Ledford, Steve Wise, indranil-ut6Up61K2wZBDgjK7y7TUQ
In-Reply-To: <cover.1473781521.git.varun-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

On Tue, Sep 13, 2016 at 6:53 PM, Varun Prakash <varun-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org> wrote:
> This patch series removes duplicate code from
> iw_cxgb4 and cxgbit by adding common function definitions in libcxgb.

Is that a bunch of miscellaneous functionality, or can you provide a more
high-level description of what you are cleaning out? Also, what other areas
are you planning to refactor following the review comments we had on the
target driver?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: [RFC 07/11] Add support for memory registeration verbs
From: Amrani, Ram @ 2016-09-14  8:02 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Yuval.Mintz@qlogic.com, Ariel.Elior@qlogic.com,
	Michal.Kalderon@qlogic.com, rajesh.borundia@qlogic.com,
	linux-rdma@vger.kernel.org, netdev@vger.kernel.org,
	dledford@redhat.com, davem@davemloft.net
In-Reply-To: <dfff3b6a-07da-38eb-e844-a48f713ff0f7@grimberg.me>

> > +static inline struct qedr_ah *get_qedr_ah(struct ib_ah *ibah) {
> > +	return container_of(ibah, struct qedr_ah, ibah); }
> 
> Little surprising to find that here... how is the ah related to this patch?

Thanks, Sagi. Will move into a proper location.

^ permalink raw reply

* Re: [RFC 00/11] QLogic RDMA Driver (qedr) RFC
From: Sagi Grimberg @ 2016-09-14  8:17 UTC (permalink / raw)
  To: Elior, Ariel, Ram Amrani,
	dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org
  Cc: Yuval.Mintz-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	Ariel.Elior-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	Michal.Kalderon-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	rajesh.borundia-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <CY1PR0701MB133732FA8478FC0B5003D97A90F10-UpKza+2NMNLi6bjPjkn3FE5OhdzP3rhOnBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>

>> > SRQ is not part of the RFC (but we do have it and NVMF was tested
> with it).
>
>>
>
>> Nice, I have plans to make SRQs better usable for our ULPs so it'd be
>
>> good to have it.
>
> That’s good to know. Are there plans on implementing XRC?
>
> Right now it looks like none of the ULPS make use of it.

The problem with XRC is that it needs to be reflected on the
wire protocol for it to actually be useful for our ULPs. Not
sure if it's worth the effort...
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC 02/11] Add RoCE driver framework
From: Mintz, Yuval @ 2016-09-14  8:15 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Yuval Mintz, Mark Bloch, Ram Amrani,
	dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org, David Miller,
	Ariel Elior, Michal Kalderon, Rajesh Borundia,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, netdev
In-Reply-To: <20160913101616.GT8812-2ukJVAZIZ/Y@public.gmane.org>

> > >> >> +uint debug;
> > >> >> +module_param(debug, uint, 0);
> > > >>> +MODULE_PARM_DESC(debug, "Default debug msglevel");
> > >>
> > >> >Why are you adding this as a module parameter?
> > >>
> > >>  I believe this is mostly to follow same line as qede which also defines
> > > > 'debug' module parameter for allowing easy user control of debug
> > > > prints [& specifically for probe prints, which can't be controlled
> > > > otherwise].
> >
> > > Can you give us an example where dynamic debug and tracing infrastructures
> > > are not enough?
> >
> > > AFAIK, most of these debug module parameters are legacy copy/paste
> > > code which is useless in real life scenarios.
> >
> > Define 'enough'; Using dynamic debug you can provide all the necessary
> > information and at an even better granularity that's achieved by suggested
> > infrastructure,  but is harder for an end-user to use. Same goes for tracing.
> >
> > The 'debug' option provides an easy grouping for prints related to a specific
> > area in the driver.
> 
> It is hard to agree with you that user which knows how-to load modules
> with parameters won't success to enable debug prints.

I think you're giving too much credit to the end-user. :-D

> In addition, global increase in debug level for whole driver will create
> printk storm in dmesg and give nothing to debuggability.

So basically, what you're claiming is that ethtool 'msglvl' setting for devices
is completely obsolete. While this *might* be true, we use it extensively
in our qede and qed drivers; The debug module parameter merely provides
a manner of setting the debug value prior to initial probe for all interfaces.
qedr follows the same practice.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next 0/4] rxrpc: Support IPv6
From: David Howells @ 2016-09-14  7:36 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <147380649153.30728.6717292274642860064.stgit@warthog.procyon.org.uk>

David Howells <dhowells@redhat.com> wrote:

> Here is a set of patches that add IPv6 support.  They need to be applied on
> top of the just-posted miscellaneous fix patches.  They are:

This subset needs to be made to depend on CONFIG_IPV6.

David

^ permalink raw reply

* RE: [RFC 02/11] Add RoCE driver framework
From: Amrani, Ram @ 2016-09-14  7:30 UTC (permalink / raw)
  To: Steve Wise
  Cc: Yuval.Mintz-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	Ariel.Elior-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	Michal.Kalderon-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	rajesh.borundia-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org
In-Reply-To: <018901d20dcd$98d0c040$ca7240c0$@opengridcomputing.com>

> > +	if ((event != NETDEV_CHANGENAME) && (event !=
> > NETDEV_CHANGEADDR))
> 
> nit: You don't really need the extra parens here.
> 
Sure, thanks. Will remove.
 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [RFC v3 22/22] samples/landlock: Add sandbox example
From: Mickaël Salaün @ 2016-09-14  7:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Arnd Bergmann, Casey Schaufler, Daniel Borkmann, Daniel Mack,
	David Drysdale, David S . Miller, Elena Reshetova,
	Eric W . Biederman, James Morris, Kees Cook, Paul Moore,
	Sargun Dhillon, Serge E . Hallyn, Tejun Heo, Will Drewry,
	kernel-hardening, linux-api
In-Reply-To: <20160914072415.26021-1-mic@digikod.net>

Add a basic sandbox tool to create a process isolated from some part of
the system. This can depend on the current cgroup.

Example with the current process hierarchy (seccomp):

  $ ls /home
  user1
  $ LANDLOCK_ALLOWED='/bin:/lib:/usr:/tmp:/proc/self/fd/0' \
      ./samples/landlock/sandbox /bin/sh -i
  Launching a new sandboxed process.
  $ ls /home
  ls: cannot open directory '/home': Permission denied

Example with a cgroup:

  $ mkdir /sys/fs/cgroup/sandboxed
  $ ls /home
  user1
  $ LANDLOCK_CGROUPS='/sys/fs/cgroup/sandboxed' \
      LANDLOCK_ALLOWED='/bin:/lib:/usr:/tmp:/proc/self/fd/0' \
      ./samples/landlock/sandbox
  Ready to sandbox with cgroups.
  $ ls /home
  user1
  $ echo $$ > /sys/fs/cgroup/sandboxed/cgroup.procs
  $ ls /home
  ls: cannot open directory '/home': Permission denied

Changes since v2:
* use BPF_PROG_ATTACH for cgroup handling

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
---
 samples/Makefile            |   2 +-
 samples/landlock/.gitignore |   1 +
 samples/landlock/Makefile   |  16 +++
 samples/landlock/sandbox.c  | 307 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 325 insertions(+), 1 deletion(-)
 create mode 100644 samples/landlock/.gitignore
 create mode 100644 samples/landlock/Makefile
 create mode 100644 samples/landlock/sandbox.c

diff --git a/samples/Makefile b/samples/Makefile
index 1a20169d85ac..a2dcd57ca7ac 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -2,4 +2,4 @@
 
 obj-$(CONFIG_SAMPLES)	+= kobject/ kprobes/ trace_events/ livepatch/ \
 			   hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \
-			   configfs/ connector/ v4l/ trace_printk/
+			   configfs/ connector/ v4l/ trace_printk/ landlock/
diff --git a/samples/landlock/.gitignore b/samples/landlock/.gitignore
new file mode 100644
index 000000000000..f6c6da930a30
--- /dev/null
+++ b/samples/landlock/.gitignore
@@ -0,0 +1 @@
+/sandbox
diff --git a/samples/landlock/Makefile b/samples/landlock/Makefile
new file mode 100644
index 000000000000..d1044b2afd27
--- /dev/null
+++ b/samples/landlock/Makefile
@@ -0,0 +1,16 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+hostprogs-$(CONFIG_SECURITY_LANDLOCK) := sandbox
+sandbox-objs := sandbox.o
+
+always := $(hostprogs-y)
+
+HOSTCFLAGS += -I$(objtree)/usr/include
+
+# Trick to allow make to be run from this directory
+all:
+	$(MAKE) -C ../../ $$PWD/
+
+clean:
+	$(MAKE) -C ../../ M=$$PWD clean
diff --git a/samples/landlock/sandbox.c b/samples/landlock/sandbox.c
new file mode 100644
index 000000000000..9d6ac00cdd23
--- /dev/null
+++ b/samples/landlock/sandbox.c
@@ -0,0 +1,307 @@
+/*
+ * Landlock LSM - Sandbox example
+ *
+ * Copyright (C) 2016  Mickaël Salaün <mic@digikod.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3, as
+ * published by the Free Software Foundation.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h> /* open() */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/prctl.h>
+#include <linux/seccomp.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "../../tools/include/linux/filter.h"
+
+#include "../bpf/libbpf.c"
+
+#ifndef seccomp
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+	errno = 0;
+	return syscall(__NR_seccomp, op, flags, args);
+}
+#endif
+
+static int landlock_prog_load(const struct bpf_insn *insns, int prog_len,
+		enum landlock_hook_id hook_id, __u64 access)
+{
+	union bpf_attr attr = {
+		.prog_type = BPF_PROG_TYPE_LANDLOCK,
+		.insns = ptr_to_u64((void *) insns),
+		.insn_cnt = prog_len / sizeof(struct bpf_insn),
+		.license = ptr_to_u64((void *) "GPL"),
+		.log_buf = ptr_to_u64(bpf_log_buf),
+		.log_size = LOG_BUF_SIZE,
+		.log_level = 1,
+		.prog_subtype.landlock_hook = {
+			.id = hook_id,
+			.origin = LANDLOCK_FLAG_ORIGIN_SECCOMP |
+				LANDLOCK_FLAG_ORIGIN_SYSCALL |
+				LANDLOCK_FLAG_ORIGIN_INTERRUPT,
+			.access = access,
+		},
+	};
+
+	/* assign one field outside of struct init to make sure any
+	 * padding is zero initialized
+	 */
+	attr.kern_version = 0;
+
+	bpf_log_buf[0] = 0;
+
+	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+
+#define ARRAY_SIZE(a)	(sizeof(a) / sizeof(a[0]))
+
+static int apply_sandbox(const char **allowed_paths, int path_nb, const char
+		**cgroup_paths, int cgroup_nb)
+{
+	__u32 key;
+	int i, ret = 0, map_fs = -1, offset;
+
+	/* set up the test sandbox */
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+		perror("prctl(no_new_priv)");
+		return 1;
+	}
+
+	/* register a new syscall filter */
+	struct sock_filter filter0[] = {
+		/* pass a cookie containing 5 to the LSM hook filter */
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LANDLOCK | 5),
+	};
+	struct sock_fprog prog0 = {
+		.len = (unsigned short)ARRAY_SIZE(filter0),
+		.filter = filter0,
+	};
+	if (!cgroup_nb) {
+		if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog0)) {
+			perror("seccomp(set_filter)");
+			return 1;
+		}
+	}
+
+	if (path_nb) {
+		map_fs = bpf_create_map(BPF_MAP_TYPE_LANDLOCK_ARRAY,
+				sizeof(key), sizeof(struct landlock_handle),
+				10, 0);
+		if (map_fs < 0) {
+			fprintf(stderr, "bpf_create_map(fs): %s\n",
+					strerror(errno));
+			return 1;
+		}
+		for (key = 0; key < path_nb; key++) {
+			int fd = open(allowed_paths[key],
+					O_RDONLY | O_CLOEXEC);
+			if (fd < 0) {
+				fprintf(stderr, "open(fs: \"%s\"): %s\n",
+						allowed_paths[key],
+						strerror(errno));
+				return 1;
+			}
+			struct landlock_handle handle = {
+				.type = BPF_MAP_HANDLE_TYPE_LANDLOCK_FS_FD,
+				.fd = (__u64)fd,
+			};
+
+			/* register a new LSM handle */
+			if (bpf_update_elem(map_fs, &key, &handle, BPF_ANY)) {
+				fprintf(stderr, "bpf_update_elem(fs: \"%s\"): %s\n",
+						allowed_paths[key],
+						strerror(errno));
+				close(fd);
+				return 1;
+			}
+			close(fd);
+		}
+	}
+
+	/* load a LSM filter hook (eBPF) */
+	struct bpf_insn hook_pre[] = {
+		/* save context */
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+#if 0
+		/* check our cookie (not used in this example) */
+		BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_6, offsetof(struct
+					landlock_data, cookie)),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 5, 2),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+#endif
+	};
+	struct bpf_insn hook_path[] = {
+		/* specify an option, if any */
+		BPF_MOV32_IMM(BPF_REG_1, 0),
+		/* handles to compare with */
+		BPF_LD_MAP_FD(BPF_REG_2, map_fs),
+		BPF_MOV64_IMM(BPF_REG_3, BPF_MAP_ARRAY_OP_OR),
+		/* hook argument (struct file) */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_6, offsetof(struct
+					landlock_data, args[0])),
+		/* checker function */
+		BPF_EMIT_CALL(BPF_FUNC_landlock_cmp_fs_beneath_with_struct_file),
+
+		/* if the checked path is beneath the handle */
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+		/* allow anonymous mapping */
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -ENOENT, 2),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+		/* deny by default, if any error */
+		BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 2),
+		BPF_MOV32_IMM(BPF_REG_0, EACCES),
+		BPF_EXIT_INSN(),
+	};
+	struct bpf_insn hook_post[] = {
+		BPF_MOV32_IMM(BPF_REG_0, EACCES),
+		BPF_EXIT_INSN(),
+	};
+
+	unsigned long hook_size = sizeof(hook_pre) + sizeof(hook_path) *
+		(path_nb ? 1 : 0) + sizeof(hook_post);
+
+	struct bpf_insn *hook0 = malloc(hook_size);
+	if (!hook0) {
+		perror("malloc");
+		ret = 1;
+		goto err_alloc;
+	}
+	memcpy(hook0, hook_pre, sizeof(hook_pre));
+	offset = sizeof(hook_pre) / sizeof(hook0[0]);
+	if (path_nb) {
+		memcpy(hook0 + offset, hook_path, sizeof(hook_path));
+		offset += sizeof(hook_path) / sizeof(hook0[0]);
+	}
+	memcpy(hook0 + offset, hook_post, sizeof(hook_post));
+
+	/* TODO: handle inode_permission hook (e.g. chdir) */
+	enum landlock_hook_id hooks[] = {
+		LANDLOCK_HOOK_FILE_OPEN,
+		LANDLOCK_HOOK_FILE_PERMISSION,
+		LANDLOCK_HOOK_MMAP_FILE,
+	};
+	for (i = 0; i < ARRAY_SIZE(hooks) && !ret; i++) {
+		int bpf0 = landlock_prog_load(hook0, hook_size, hooks[i], 0);
+		if (bpf0 == -1) {
+			perror("prog_load");
+			fprintf(stderr, "%s", bpf_log_buf);
+			ret = 1;
+			break;
+		}
+		if (!cgroup_nb) {
+			if (seccomp(SECCOMP_SET_LANDLOCK_HOOK, 0, &bpf0)) {
+				perror("seccomp(set_hook)");
+				ret = 1;
+			}
+		} else {
+			for (key = 0; key < cgroup_nb && !ret; key++) {
+				int fd = open(cgroup_paths[key],
+						O_DIRECTORY | O_CLOEXEC);
+				if (fd < 0) {
+					fprintf(stderr, "open(cgroup: \"%s\"): %s\n",
+							cgroup_paths[key], strerror(errno));
+					ret = 1;
+					break;
+				}
+				if (bpf_prog_attach(bpf0, fd, BPF_CGROUP_LANDLOCK)) {
+					fprintf(stderr, "bpf_prog_attach(cgroup: \"%s\"): %s\n",
+							cgroup_paths[key], strerror(errno));
+					ret = 1;
+				}
+				close(fd);
+			}
+		}
+		close(bpf0);
+	}
+
+	free(hook0);
+err_alloc:
+	if (path_nb) {
+		close(map_fs);
+	}
+	return ret;
+}
+
+#define ENV_FS_PATH_NAME "LANDLOCK_ALLOWED"
+#define ENV_CGROUP_PATH_NAME "LANDLOCK_CGROUPS"
+#define ENV_PATH_TOKEN ":"
+
+static int parse_path(char *env_path, const char ***path_list)
+{
+	int i, path_nb = 0;
+
+	if (env_path) {
+		path_nb++;
+		for (i = 0; env_path[i]; i++) {
+			if (env_path[i] == ENV_PATH_TOKEN[0]) {
+				path_nb++;
+			}
+		}
+	}
+	*path_list = malloc(path_nb * sizeof(**path_list));
+	for (i = 0; i < path_nb; i++) {
+		(*path_list)[i] = strsep(&env_path, ENV_PATH_TOKEN);
+	}
+
+	return path_nb;
+}
+
+int main(int argc, char * const argv[], char * const *envp)
+{
+	char *cmd_path;
+	char *env_path_allowed, *env_path_cgroup;
+	int path_nb, cgroup_nb;
+	const char **sb_paths = NULL;
+	const char **cg_paths = NULL;
+	char * const *cmd_argv;
+
+	env_path_allowed = getenv(ENV_FS_PATH_NAME);
+	if (env_path_allowed)
+		env_path_allowed = strdup(env_path_allowed);
+	env_path_cgroup = getenv(ENV_CGROUP_PATH_NAME);
+	if (env_path_cgroup)
+		env_path_cgroup = strdup(env_path_cgroup);
+
+	path_nb = parse_path(env_path_allowed, &sb_paths);
+	cgroup_nb = parse_path(env_path_cgroup, &cg_paths);
+	if (argc < 2 && !cgroup_nb) {
+		fprintf(stderr, "usage: %s <cmd> [args]...\n\n", argv[0]);
+		fprintf(stderr, "Environment variables containing paths, each separated by a colon:\n");
+		fprintf(stderr, "* %s (whitelist of allowed files and directories)\n",
+				ENV_FS_PATH_NAME);
+		fprintf(stderr, "* %s (optional cgroup paths for which the sandbox is enabled)\n",
+				ENV_CGROUP_PATH_NAME);
+		fprintf(stderr, "\nexample:\n%s='/bin:/lib:/usr:/tmp:/proc/self/fd/0' %s /bin/sh -i\n",
+				ENV_FS_PATH_NAME, argv[0]);
+		return 1;
+	}
+	if (apply_sandbox(sb_paths, path_nb, cg_paths, cgroup_nb))
+		return 1;
+	if (!cgroup_nb) {
+		cmd_path = argv[1];
+		cmd_argv = argv + 1;
+		fprintf(stderr, "Launching a new sandboxed process.\n");
+		execve(cmd_path, cmd_argv, envp);
+		perror("execve");
+		return 1;
+	}
+	fprintf(stderr, "Ready to sandbox with cgroups.\n");
+	return 0;
+}
-- 
2.9.3

^ permalink raw reply related

* [RFC v3 17/22] cgroup: Add access check for cgroup_get_from_fd()
From: Mickaël Salaün @ 2016-09-14  7:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Arnd Bergmann, Casey Schaufler, Daniel Borkmann, Daniel Mack,
	David Drysdale, David S . Miller, Elena Reshetova,
	Eric W . Biederman, James Morris, Kees Cook, Paul Moore,
	Sargun Dhillon, Serge E . Hallyn, Tejun Heo, Will Drewry,
	kernel-hardening, linux-api
In-Reply-To: <20160914072415.26021-1-mic@digikod.net>

Add security access check for cgroup backed FD. The "cgroup.procs" file
of the corresponding cgroup must be readable to identify the cgroup, and
writable to prove that the current process can manage this cgroup (e.g.
through delegation). This is similar to the check done by
cgroup_procs_write_permission().

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Mack <daniel@zonque.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |  2 +-
 kernel/bpf/arraymap.c  |  2 +-
 kernel/bpf/syscall.c   |  6 +++---
 kernel/cgroup.c        | 16 +++++++++++++++-
 4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c4688742ddc4..5767d471e292 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -87,7 +87,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 						       struct cgroup_subsys *ss);
 
 struct cgroup *cgroup_get_from_path(const char *path);
-struct cgroup *cgroup_get_from_fd(int fd);
+struct cgroup *cgroup_get_from_fd(int fd, int access_mask);
 
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index edaab4c87292..1d4de8e0ab13 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -552,7 +552,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 				     struct file *map_file /* not used */,
 				     int fd)
 {
-	return cgroup_get_from_fd(fd);
+	return cgroup_get_from_fd(fd, MAY_READ);
 }
 
 static void cgroup_fd_array_put_ptr(void *ptr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e9c5add327e6..f90225dbbb59 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -17,6 +17,7 @@
 #include <linux/license.h>
 #include <linux/filter.h>
 #include <linux/version.h>
+#include <linux/fs.h>
 
 DEFINE_PER_CPU(int, bpf_prog_active);
 
@@ -863,7 +864,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	cgrp = cgroup_get_from_fd(attr->target_fd);
+	cgrp = cgroup_get_from_fd(attr->target_fd, MAY_WRITE);
 	if (IS_ERR(cgrp)) {
 		bpf_prog_put(prog);
 		return PTR_ERR(cgrp);
@@ -891,10 +892,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
 
-		cgrp = cgroup_get_from_fd(attr->target_fd);
+		cgrp = cgroup_get_from_fd(attr->target_fd, MAY_WRITE);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
-
 		result = cgroup_bpf_update(cgrp, NULL, attr->attach_type);
 		cgroup_put(cgrp);
 		break;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 48b650a640a9..3bbaf3f02ed2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6241,17 +6241,20 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
 /**
  * cgroup_get_from_fd - get a cgroup pointer from a fd
  * @fd: fd obtained by open(cgroup2_dir)
+ * @access_mask: contains the permission mask
  *
  * Find the cgroup from a fd which should be obtained
  * by opening a cgroup directory.  Returns a pointer to the
  * cgroup on success. ERR_PTR is returned if the cgroup
  * cannot be found.
  */
-struct cgroup *cgroup_get_from_fd(int fd)
+struct cgroup *cgroup_get_from_fd(int fd, int access_mask)
 {
 	struct cgroup_subsys_state *css;
 	struct cgroup *cgrp;
 	struct file *f;
+	struct inode *inode;
+	int ret;
 
 	f = fget_raw(fd);
 	if (!f)
@@ -6268,6 +6271,17 @@ struct cgroup *cgroup_get_from_fd(int fd)
 		return ERR_PTR(-EBADF);
 	}
 
+	ret = -ENOMEM;
+	inode = kernfs_get_inode(f->f_path.dentry->d_sb, cgrp->procs_file.kn);
+	if (inode) {
+		ret = inode_permission(inode, access_mask);
+		iput(inode);
+	}
+	if (ret) {
+		cgroup_put(cgrp);
+		return ERR_PTR(ret);
+	}
+
 	return cgrp;
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
-- 
2.9.3

^ permalink raw reply related

* [RFC v3 21/22] bpf,landlock: Add optional skb pointer in the Landlock context
From: Mickaël Salaün @ 2016-09-14  7:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Arnd Bergmann, Casey Schaufler, Daniel Borkmann, Daniel Mack,
	David Drysdale, David S . Miller, Elena Reshetova,
	Eric W . Biederman, James Morris, Kees Cook, Paul Moore,
	Sargun Dhillon, Serge E . Hallyn, Tejun Heo, Will Drewry,
	kernel-hardening, linux-api
In-Reply-To: <20160914072415.26021-1-mic@digikod.net>

This is a proof of concept to expose optional values that could depend
on the process access rights.

There are two dedicated flags: LANDLOCK_FLAG_ACCESS_SKB_READ and
LANDLOCK_FLAG_ACCESS_SKB_WRITE. Each of them can be activated to access
eBPF functions manipulating a skb in a read or write way.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Sargun Dhillon <sargun@sargun.me>
---
 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h |  7 ++++++-
 kernel/bpf/verifier.c    |  6 ++++++
 security/landlock/lsm.c  | 26 ++++++++++++++++++++++++++
 4 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f7325c17f720..218973777612 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -88,6 +88,7 @@ enum bpf_arg_type {
 
 	ARG_PTR_TO_STRUCT_FILE,		/* pointer to struct file */
 	ARG_CONST_PTR_TO_LANDLOCK_HANDLE_FS,	/* pointer to Landlock FS handle */
+	ARG_PTR_TO_STRUCT_SKB,		/* pointer to struct skb */
 };
 
 /* type of values returned from helper functions */
@@ -150,6 +151,7 @@ enum bpf_reg_type {
 	/* Landlock */
 	PTR_TO_STRUCT_FILE,
 	CONST_PTR_TO_LANDLOCK_HANDLE_FS,
+	PTR_TO_STRUCT_SKB,
 };
 
 struct bpf_prog;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8cfc2de2ab76..7d9e56952ed9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -586,7 +586,9 @@ enum landlock_hook_id {
 /* context of function access flags */
 #define LANDLOCK_FLAG_ACCESS_UPDATE	(1 << 0)
 #define LANDLOCK_FLAG_ACCESS_DEBUG	(1 << 1)
-#define _LANDLOCK_FLAG_ACCESS_MASK	((1ULL << 2) - 1)
+#define LANDLOCK_FLAG_ACCESS_SKB_READ	(1 << 2)
+#define LANDLOCK_FLAG_ACCESS_SKB_WRITE	(1 << 3)
+#define _LANDLOCK_FLAG_ACCESS_MASK	((1ULL << 4) - 1)
 
 /* Handle check flags */
 #define LANDLOCK_FLAG_FS_DENTRY		(1 << 0)
@@ -619,12 +621,15 @@ struct landlock_handle {
  * @args: LSM hook arguments, see include/linux/lsm_hooks.h for there
  *        description and the LANDLOCK_HOOK* definitions from
  *        security/landlock/lsm.c for their types.
+ * @opt_skb: optional skb pointer, accessible with the
+ *           LANDLOCK_FLAG_ACCESS_SKB_* flags for network-related hooks.
  */
 struct landlock_data {
 	__u32 hook; /* enum landlock_hook_id */
 	__u16 origin; /* LANDLOCK_FLAG_ORIGIN_* */
 	__u16 cookie; /* seccomp RET_LANDLOCK */
 	__u64 args[6];
+	__u64 opt_skb;
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d7b18574f5a..a95154c1a60f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -247,6 +247,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET_END]	= "pkt_end",
 	[PTR_TO_STRUCT_FILE]	= "struct_file",
 	[CONST_PTR_TO_LANDLOCK_HANDLE_FS] = "landlock_handle_fs",
+	[PTR_TO_STRUCT_SKB]	= "struct_skb",
 };
 
 static void print_verifier_state(struct verifier_state *state)
@@ -559,6 +560,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_STRUCT_FILE:
 	case CONST_PTR_TO_LANDLOCK_HANDLE_FS:
+	case PTR_TO_STRUCT_SKB:
 		return true;
 	default:
 		return false;
@@ -984,6 +986,10 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		expected_type = CONST_PTR_TO_LANDLOCK_HANDLE_FS;
 		if (type != expected_type)
 			goto err_type;
+	} else if (arg_type == ARG_PTR_TO_STRUCT_SKB) {
+		expected_type = PTR_TO_STRUCT_SKB;
+		if (type != expected_type)
+			goto err_type;
 	} else if (arg_type == ARG_PTR_TO_STACK ||
 		   arg_type == ARG_PTR_TO_RAW_STACK) {
 		expected_type = PTR_TO_STACK;
diff --git a/security/landlock/lsm.c b/security/landlock/lsm.c
index 56c45abe979c..8b0e6f0eb6b7 100644
--- a/security/landlock/lsm.c
+++ b/security/landlock/lsm.c
@@ -281,6 +281,7 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type,
 		break;
 	case offsetof(struct landlock_data, args[0]) ...
 			offsetof(struct landlock_data, args[5]):
+	case offsetof(struct landlock_data, opt_skb):
 		expected_size = sizeof(__u64);
 		break;
 	default:
@@ -299,6 +300,13 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type,
 		if (*reg_type == NOT_INIT)
 			return false;
 		break;
+	case offsetof(struct landlock_data, opt_skb):
+		if (!(prog_subtype->landlock_hook.access &
+				(LANDLOCK_FLAG_ACCESS_SKB_READ |
+				 LANDLOCK_FLAG_ACCESS_SKB_WRITE)))
+			return false;
+		*reg_type = PTR_TO_STRUCT_SKB;
+		break;
 	}
 
 	return true;
@@ -401,6 +409,24 @@ static inline bool bpf_landlock_is_valid_subtype(
 	if (prog_subtype->landlock_hook.access & LANDLOCK_FLAG_ACCESS_DEBUG &&
 			!capable(CAP_SYS_ADMIN))
 		return false;
+	/*
+	 * Capability checks must be enforced for every landlocked process.
+	 * To support user namespaces/capabilities, we must then check the
+	 * namespaces of a task before putting it in a landlocked cgroup.
+	 * This could be implemented in the future.
+	 */
+	if (prog_subtype->landlock_hook.access & LANDLOCK_FLAG_ACCESS_SKB_READ &&
+			!capable(CAP_NET_ADMIN))
+		return false;
+	/*
+	 * It is interesting to differentiate read and write access to be able
+	 * to securely delegate some work to unprivileged (and potentially
+	 * compromised/untrusted) processes. This different type of access can
+	 * be checked for function calls or context accesses.
+	 */
+	if (prog_subtype->landlock_hook.access & LANDLOCK_FLAG_ACCESS_SKB_WRITE &&
+			!capable(CAP_NET_ADMIN))
+		return false;
 
 	return true;
 }
-- 
2.9.3

^ permalink raw reply related

* [RFC v3 20/22] landlock: Add update and debug access flags
From: Mickaël Salaün @ 2016-09-14  7:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Arnd Bergmann, Casey Schaufler, Daniel Borkmann, Daniel Mack,
	David Drysdale, David S . Miller, Elena Reshetova,
	Eric W . Biederman, James Morris, Kees Cook, Paul Moore,
	Sargun Dhillon, Serge E . Hallyn, Tejun Heo, Will Drewry,
	kernel-hardening, linux-api
In-Reply-To: <20160914072415.26021-1-mic@digikod.net>

For now, the update and debug accesses are only available to a process
with CAP_SYS_ADMIN. This could change in the future.

The capability check is statically done when loading an eBPF program,
according to the current process. If the process has enough rights and
sets the appropriate access flags, then the dedicated functions or data
will be accessible.

With the update access, the following functions are available:
* bpf_map_lookup_elem
* bpf_map_update_elem
* bpf_map_delete_elem
* bpf_tail_call

With the debug access, the following functions are available:
* bpf_trace_printk
* bpf_get_prandom_u32
* bpf_get_current_pid_tgid
* bpf_get_current_uid_gid
* bpf_get_current_comm

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Sargun Dhillon <sargun@sargun.me>
---
 include/uapi/linux/bpf.h |  4 +++-
 security/landlock/lsm.c  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3cc52e51357f..8cfc2de2ab76 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -584,7 +584,9 @@ enum landlock_hook_id {
 #define _LANDLOCK_FLAG_ORIGIN_MASK	((1 << 3) - 1)
 
 /* context of function access flags */
-#define _LANDLOCK_FLAG_ACCESS_MASK	((1ULL << 0) - 1)
+#define LANDLOCK_FLAG_ACCESS_UPDATE	(1 << 0)
+#define LANDLOCK_FLAG_ACCESS_DEBUG	(1 << 1)
+#define _LANDLOCK_FLAG_ACCESS_MASK	((1ULL << 2) - 1)
 
 /* Handle check flags */
 #define LANDLOCK_FLAG_FS_DENTRY		(1 << 0)
diff --git a/security/landlock/lsm.c b/security/landlock/lsm.c
index 2a15839a08c8..56c45abe979c 100644
--- a/security/landlock/lsm.c
+++ b/security/landlock/lsm.c
@@ -202,11 +202,57 @@ static int landlock_run_prog(enum landlock_hook_id hook_id, __u64 args[6])
 static const struct bpf_func_proto *bpf_landlock_func_proto(
 		enum bpf_func_id func_id, union bpf_prog_subtype *prog_subtype)
 {
+	bool access_update = !!(prog_subtype->landlock_hook.access &
+			LANDLOCK_FLAG_ACCESS_UPDATE);
+	bool access_debug = !!(prog_subtype->landlock_hook.access &
+			LANDLOCK_FLAG_ACCESS_DEBUG);
+
 	switch (func_id) {
 	case BPF_FUNC_landlock_cmp_fs_prop_with_struct_file:
 		return &bpf_landlock_cmp_fs_prop_with_struct_file_proto;
 	case BPF_FUNC_landlock_cmp_fs_beneath_with_struct_file:
 		return &bpf_landlock_cmp_fs_beneath_with_struct_file_proto;
+
+	/* access_update */
+	case BPF_FUNC_map_lookup_elem:
+		if (access_update)
+			return &bpf_map_lookup_elem_proto;
+		return NULL;
+	case BPF_FUNC_map_update_elem:
+		if (access_update)
+			return &bpf_map_update_elem_proto;
+		return NULL;
+	case BPF_FUNC_map_delete_elem:
+		if (access_update)
+			return &bpf_map_delete_elem_proto;
+		return NULL;
+	case BPF_FUNC_tail_call:
+		if (access_update)
+			return &bpf_tail_call_proto;
+		return NULL;
+
+	/* access_debug */
+	case BPF_FUNC_trace_printk:
+		if (access_debug)
+			return bpf_get_trace_printk_proto();
+		return NULL;
+	case BPF_FUNC_get_prandom_u32:
+		if (access_debug)
+			return &bpf_get_prandom_u32_proto;
+		return NULL;
+	case BPF_FUNC_get_current_pid_tgid:
+		if (access_debug)
+			return &bpf_get_current_pid_tgid_proto;
+		return NULL;
+	case BPF_FUNC_get_current_uid_gid:
+		if (access_debug)
+			return &bpf_get_current_uid_gid_proto;
+		return NULL;
+	case BPF_FUNC_get_current_comm:
+		if (access_debug)
+			return &bpf_get_current_comm_proto;
+		return NULL;
+
 	default:
 		return NULL;
 	}
@@ -348,6 +394,14 @@ static inline bool bpf_landlock_is_valid_subtype(
 	if (prog_subtype->landlock_hook.access & ~_LANDLOCK_FLAG_ACCESS_MASK)
 		return false;
 
+	/* check access flags */
+	if (prog_subtype->landlock_hook.access & LANDLOCK_FLAG_ACCESS_UPDATE &&
+			!capable(CAP_SYS_ADMIN))
+		return false;
+	if (prog_subtype->landlock_hook.access & LANDLOCK_FLAG_ACCESS_DEBUG &&
+			!capable(CAP_SYS_ADMIN))
+		return false;
+
 	return true;
 }
 
-- 
2.9.3

^ permalink raw reply related

* [RFC v3 19/22] landlock: Add interrupted origin
From: Mickaël Salaün @ 2016-09-14  7:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Arnd Bergmann, Casey Schaufler, Daniel Borkmann, Daniel Mack,
	David Drysdale, David S . Miller, Elena Reshetova,
	Eric W . Biederman, James Morris, Kees Cook, Paul Moore,
	Sargun Dhillon, Serge E . Hallyn, Tejun Heo, Will Drewry,
	kernel-hardening, linux-api, linux-security-module
In-Reply-To: <20160914072415.26021-1-mic@digikod.net>

This third origin of hook call should cover all possible trigger paths
(e.g. page fault). Landlock eBPF programs can then take decisions
accordingly.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Kees Cook <keescook@chromium.org>
---
 include/uapi/linux/bpf.h |  3 ++-
 security/landlock/lsm.c  | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 12e61508f879..3cc52e51357f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -580,7 +580,8 @@ enum landlock_hook_id {
 /* Trigger type */
 #define LANDLOCK_FLAG_ORIGIN_SYSCALL	(1 << 0)
 #define LANDLOCK_FLAG_ORIGIN_SECCOMP	(1 << 1)
-#define _LANDLOCK_FLAG_ORIGIN_MASK	((1 << 2) - 1)
+#define LANDLOCK_FLAG_ORIGIN_INTERRUPT	(1 << 2)
+#define _LANDLOCK_FLAG_ORIGIN_MASK	((1 << 3) - 1)
 
 /* context of function access flags */
 #define _LANDLOCK_FLAG_ACCESS_MASK	((1ULL << 0) - 1)
diff --git a/security/landlock/lsm.c b/security/landlock/lsm.c
index 000dd0c7ec3d..2a15839a08c8 100644
--- a/security/landlock/lsm.c
+++ b/security/landlock/lsm.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h> /* FIELD_SIZEOF() */
 #include <linux/landlock.h>
 #include <linux/lsm_hooks.h>
+#include <linux/preempt.h> /* in_interrupt() */
 #include <linux/seccomp.h> /* struct seccomp_* */
 #include <linux/types.h> /* uintptr_t */
 
@@ -109,6 +110,7 @@ static int landlock_run_prog(enum landlock_hook_id hook_id, __u64 args[6])
 #endif /* CONFIG_CGROUP_BPF */
 	struct landlock_rule *rule;
 	u32 hook_idx = get_index(hook_id);
+	u16 current_call;
 
 	struct landlock_data ctx = {
 		.hook = hook_id,
@@ -128,6 +130,16 @@ static int landlock_run_prog(enum landlock_hook_id hook_id, __u64 args[6])
 	 * prioritize fine-grained policies (i.e. per thread), and return early.
 	 */
 
+	if (unlikely(in_interrupt())) {
+		current_call = LANDLOCK_FLAG_ORIGIN_INTERRUPT;
+#ifdef CONFIG_SECCOMP_FILTER
+		/* bypass landlock_ret evaluation */
+		goto seccomp_int;
+#endif /* CONFIG_SECCOMP_FILTER */
+	} else {
+		current_call = LANDLOCK_FLAG_ORIGIN_SYSCALL;
+	}
+
 #ifdef CONFIG_SECCOMP_FILTER
 	/* seccomp triggers and landlock_ret cleanup */
 	ctx.origin = LANDLOCK_FLAG_ORIGIN_SECCOMP;
@@ -164,8 +176,9 @@ static int landlock_run_prog(enum landlock_hook_id hook_id, __u64 args[6])
 		return -ret;
 	ctx.cookie = 0;
 
+seccomp_int:
 	/* syscall trigger */
-	ctx.origin = LANDLOCK_FLAG_ORIGIN_SYSCALL;
+	ctx.origin = current_call;
 	ret = landlock_run_prog_for_syscall(hook_idx, &ctx,
 			current->seccomp.landlock_hooks);
 	if (ret)
@@ -175,7 +188,7 @@ static int landlock_run_prog(enum landlock_hook_id hook_id, __u64 args[6])
 #ifdef CONFIG_CGROUP_BPF
 	/* syscall trigger */
 	if (cgroup_bpf_enabled) {
-		ctx.origin = LANDLOCK_FLAG_ORIGIN_SYSCALL;
+		ctx.origin = current_call;
 		/* get the default cgroup associated with the current thread */
 		cgrp = task_css_set(current)->dfl_cgrp;
 		ret = landlock_run_prog_for_syscall(hook_idx, &ctx,
-- 
2.9.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox