Linux Security Modules development

Linux Security Modules development
 help / color / mirror / Atom feed

* [PATCH bpf-next v7 2/7] bpf: Generalize caching for sk_storage.
From: KP Singh @ 2020-07-30 14:07 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
	Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200730140716.404558-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

Provide the a ability to define local storage caches on a per-object
type basis. The caches and caching indices for different objects should
not be inter-mixed as suggested in:

  https://lore.kernel.org/bpf/20200630193441.kdwnkestulg5erii@kafai-mbp.dhcp.thefacebook.com/

  "Caching a sk-storage at idx=0 of a sk should not stop an
  inode-storage to be cached at the same idx of a inode."

Signed-off-by: KP Singh <kpsingh@google.com>
---
 include/net/bpf_sk_storage.h | 19 +++++++++++++++++++
 net/core/bpf_sk_storage.c    | 31 +++++++++++++++----------------
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index 5036c94c0503..950c5aaba15e 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -3,6 +3,9 @@
 #ifndef _BPF_SK_STORAGE_H
 #define _BPF_SK_STORAGE_H
 
+#include <linux/types.h>
+#include <linux/spinlock.h>
+
 struct sock;
 
 void bpf_sk_storage_free(struct sock *sk);
@@ -15,6 +18,22 @@ struct sk_buff;
 struct nlattr;
 struct sock;
 
+#define BPF_LOCAL_STORAGE_CACHE_SIZE	16
+
+struct bpf_local_storage_cache {
+	spinlock_t idx_lock;
+	u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE];
+};
+
+#define DEFINE_BPF_STORAGE_CACHE(name)				\
+static struct bpf_local_storage_cache name = {			\
+	.idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock),	\
+}
+
+u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache);
+void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+				      u16 idx);
+
 #ifdef CONFIG_BPF_SYSCALL
 int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
 struct bpf_sk_storage_diag *
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index a5cc218834ee..99dde7b74767 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -14,6 +14,8 @@
 
 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
 
+DEFINE_BPF_STORAGE_CACHE(sk_cache);
+
 struct bpf_local_storage_map_bucket {
 	struct hlist_head list;
 	raw_spinlock_t lock;
@@ -78,10 +80,6 @@ struct bpf_local_storage_elem {
 #define SELEM(_SDATA)							\
 	container_of((_SDATA), struct bpf_local_storage_elem, sdata)
 #define SDATA(_SELEM) (&(_SELEM)->sdata)
-#define BPF_LOCAL_STORAGE_CACHE_SIZE	16
-
-static DEFINE_SPINLOCK(cache_idx_lock);
-static u64 cache_idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE];
 
 struct bpf_local_storage {
 	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
@@ -517,16 +515,16 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
 	return 0;
 }
 
-static u16 cache_idx_get(void)
+u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
 {
 	u64 min_usage = U64_MAX;
 	u16 i, res = 0;
 
-	spin_lock(&cache_idx_lock);
+	spin_lock(&cache->idx_lock);
 
 	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
-		if (cache_idx_usage_counts[i] < min_usage) {
-			min_usage = cache_idx_usage_counts[i];
+		if (cache->idx_usage_counts[i] < min_usage) {
+			min_usage = cache->idx_usage_counts[i];
 			res = i;
 
 			/* Found a free cache_idx */
@@ -534,18 +532,19 @@ static u16 cache_idx_get(void)
 				break;
 		}
 	}
-	cache_idx_usage_counts[res]++;
+	cache->idx_usage_counts[res]++;
 
-	spin_unlock(&cache_idx_lock);
+	spin_unlock(&cache->idx_lock);
 
 	return res;
 }
 
-static void cache_idx_free(u16 idx)
+void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+				      u16 idx)
 {
-	spin_lock(&cache_idx_lock);
-	cache_idx_usage_counts[idx]--;
-	spin_unlock(&cache_idx_lock);
+	spin_lock(&cache->idx_lock);
+	cache->idx_usage_counts[idx]--;
+	spin_unlock(&cache->idx_lock);
 }
 
 /* Called by __sk_destruct() & bpf_sk_storage_clone() */
@@ -597,7 +596,7 @@ static void bpf_local_storage_map_free(struct bpf_map *map)
 
 	smap = (struct bpf_local_storage_map *)map;
 
-	cache_idx_free(smap->cache_idx);
+	bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
 
 	/* Note that this map might be concurrently cloned from
 	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
@@ -713,7 +712,7 @@ static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
 	}
 
 	smap->elem_size = sizeof(struct bpf_local_storage_elem) + attr->value_size;
-	smap->cache_idx = cache_idx_get();
+	smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache);
 
 	return &smap->map;
 }
-- 
2.28.0.rc0.142.g3c755180ce-goog


^ permalink raw reply related

* [PATCH bpf-next v7 3/7] bpf: Generalize bpf_sk_storage
From: KP Singh @ 2020-07-30 14:07 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Martin KaFai Lau, Alexei Starovoitov, Daniel Borkmann,
	Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200730140716.404558-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

Refactor the functionality in bpf_sk_storage.c so that concept of
storage linked to kernel objects can be extended to other objects like
inode, task_struct etc.

Each new local storage will still be a separate map and provide its own
set of helpers. This allows for future object specific extensions and
still share a lot of the underlying implementation.

This includes the changes suggested by Martin in:

  https://lore.kernel.org/bpf/20200725013047.4006241-1-kafai@fb.com/

which adds map_local_storage_charge, map_local_storage_uncharge,
and map_owner_storage_ptr.

Co-developed-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: KP Singh <kpsingh@google.com>
---
 include/linux/bpf.h            |   9 ++
 include/net/bpf_sk_storage.h   |  51 +++++++
 include/uapi/linux/bpf.h       |   8 +-
 net/core/bpf_sk_storage.c      | 246 +++++++++++++++++++++------------
 tools/include/uapi/linux/bpf.h |   8 +-
 5 files changed, 233 insertions(+), 89 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 40c5e206ecf2..1bf809449e7d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -34,6 +34,9 @@ struct btf_type;
 struct exception_table_entry;
 struct seq_operations;
 struct bpf_iter_aux_info;
+struct bpf_local_storage;
+struct bpf_local_storage_map;
+struct bpf_local_storage_elem;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -104,6 +107,12 @@ struct bpf_map_ops {
 	__poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
 			     struct poll_table_struct *pts);
 
+	/* Functions called by bpf_local_storage maps */
+	int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
+					void *owner, u32 size);
+	void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap,
+					   void *owner, u32 size);
+	struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
 	/* BTF name and id of struct allocated by map_alloc */
 	const char * const map_btf_name;
 	int *map_btf_id;
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index 950c5aaba15e..05b777950eb3 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -3,8 +3,15 @@
 #ifndef _BPF_SK_STORAGE_H
 #define _BPF_SK_STORAGE_H
 
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
 
 struct sock;
 
@@ -34,6 +41,50 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache);
 void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
 				      u16 idx);
 
+/* Helper functions for bpf_local_storage */
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
+
+struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr);
+
+struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+			 struct bpf_local_storage_map *smap,
+			 bool cacheit_lockit);
+
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap);
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+				    const struct btf *btf,
+				    const struct btf_type *key_type,
+				    const struct btf_type *value_type);
+
+void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
+			    struct bpf_local_storage_elem *selem);
+
+bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
+			      struct bpf_local_storage_elem *selem,
+			      bool uncharge_omem);
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem);
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+			struct bpf_local_storage_elem *selem);
+
+void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
+		bool charge_mem);
+
+int
+bpf_local_storage_alloc(void *owner,
+			struct bpf_local_storage_map *smap,
+			struct bpf_local_storage_elem *first_selem);
+
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_map *map, void *value,
+			 u64 map_flags);
+
 #ifdef CONFIG_BPF_SYSCALL
 int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
 struct bpf_sk_storage_diag *
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index eb5e0c38eb2c..cd5a47a74533 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3642,9 +3642,13 @@ enum {
 	BPF_F_SYSCTL_BASE_NAME		= (1ULL << 0),
 };
 
-/* BPF_FUNC_sk_storage_get flags */
+/* BPF_FUNC_<local>_storage_get flags */
 enum {
-	BPF_SK_STORAGE_GET_F_CREATE	= (1ULL << 0),
+	BPF_LOCAL_STORAGE_GET_F_CREATE	= (1ULL << 0),
+	/* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility
+	 * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead.
+	 */
+	BPF_SK_STORAGE_GET_F_CREATE  = BPF_LOCAL_STORAGE_GET_F_CREATE,
 };
 
 /* BPF_FUNC_read_branch_records flags. */
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 99dde7b74767..bb2375769ca1 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -84,7 +84,7 @@ struct bpf_local_storage_elem {
 struct bpf_local_storage {
 	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
 	struct hlist_head list; /* List of bpf_local_storage_elem */
-	struct sock *owner;	/* The object that owns the the above "list" of
+	void *owner;		/* The object that owns the the above "list" of
 				 * bpf_local_storage_elem.
 				 */
 	struct rcu_head rcu;
@@ -110,6 +110,33 @@ static int omem_charge(struct sock *sk, unsigned int size)
 	return -ENOMEM;
 }
 
+static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
+{
+	struct bpf_map *map = &smap->map;
+
+	if (!map->ops->map_local_storage_charge)
+		return 0;
+
+	return map->ops->map_local_storage_charge(smap, owner, size);
+}
+
+static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
+			 u32 size)
+{
+	struct bpf_map *map = &smap->map;
+
+	if (map->ops->map_local_storage_uncharge)
+		map->ops->map_local_storage_uncharge(smap, owner, size);
+}
+
+static struct bpf_local_storage __rcu **
+owner_storage(struct bpf_local_storage_map *smap, void *owner)
+{
+	struct bpf_map *map = &smap->map;
+
+	return map->ops->map_owner_storage_ptr(owner);
+}
+
 static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
 {
 	return !hlist_unhashed(&selem->snode);
@@ -120,13 +147,13 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
 	return !hlist_unhashed(&selem->map_node);
 }
 
-static struct bpf_local_storage_elem *
-bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk,
-		void *value, bool charge_omem)
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
+		void *value, bool charge_mem)
 {
 	struct bpf_local_storage_elem *selem;
 
-	if (charge_omem && omem_charge(sk, smap->elem_size))
+	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
 		return NULL;
 
 	selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
@@ -136,40 +163,42 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk,
 		return selem;
 	}
 
-	if (charge_omem)
-		atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
+	if (charge_mem)
+		mem_uncharge(smap, owner, smap->elem_size);
 
 	return NULL;
 }
 
-/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
+/* local_storage->lock must be held and selem->sk_storage == sk_storage.
  * The caller must ensure selem->smap is still valid to be
  * dereferenced for its smap->elem_size and smap->cache_idx.
  */
-static bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
-				     struct bpf_local_storage_elem *selem,
-				     bool uncharge_omem)
+bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
+			      struct bpf_local_storage_elem *selem,
+			      bool uncharge_mem)
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
-	struct sock *sk;
+	void *owner;
 
 	smap = rcu_dereference(SDATA(selem)->smap);
-	sk = local_storage->owner;
+	owner = local_storage->owner;
 
-	/* All uncharging on sk->sk_omem_alloc must be done first.
-	 * sk may be freed once the last selem is unlinked from local_storage.
+	/* All uncharging on owner must be done first.
+	 * The owner may be freed once the last selem is unlinked from
+	 * local_storage.
 	 */
-	if (uncharge_omem)
-		atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
+	if (uncharge_mem)
+		mem_uncharge(smap, owner, smap->elem_size);
 
 	free_local_storage = hlist_is_singular_node(&selem->snode,
 						    &local_storage->list);
 	if (free_local_storage) {
-		atomic_sub(sizeof(struct bpf_local_storage), &sk->sk_omem_alloc);
+		mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
 		local_storage->owner = NULL;
-		/* After this RCU_INIT, sk may be freed and cannot be used */
-		RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
+
+		/* After this RCU_INIT, owner may be freed and cannot be used */
+		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
 
 		/* local_storage is not freed now.  local_storage->lock is
 		 * still held and raw_spin_unlock_bh(&local_storage->lock)
@@ -215,14 +244,14 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
 		kfree_rcu(local_storage, rcu);
 }
 
-static void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
-				   struct bpf_local_storage_elem *selem)
+void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
+			    struct bpf_local_storage_elem *selem)
 {
 	RCU_INIT_POINTER(selem->local_storage, local_storage);
 	hlist_add_head(&selem->snode, &local_storage->list);
 }
 
-static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage_map *smap;
 	struct bpf_local_storage_map_bucket *b;
@@ -239,8 +268,8 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 	raw_spin_unlock_bh(&b->lock);
 }
 
-static void bpf_selem_link_map(struct bpf_local_storage_map *smap,
-			       struct bpf_local_storage_elem *selem)
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+			struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
 
@@ -250,7 +279,7 @@ static void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	raw_spin_unlock_bh(&b->lock);
 }
 
-static void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
 {
 	/* Always unlink from map before unlinking from local_storage
 	 * because selem will be freed after successfully unlinked from
@@ -260,7 +289,7 @@ static void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
 	__bpf_selem_unlink_storage(selem);
 }
 
-static struct bpf_local_storage_data *
+struct bpf_local_storage_data *
 bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 			 struct bpf_local_storage_map *smap,
 			 bool cacheit_lockit)
@@ -326,40 +355,45 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
 	return 0;
 }
 
-static int sk_storage_alloc(struct sock *sk,
+int bpf_local_storage_alloc(void *owner,
 			    struct bpf_local_storage_map *smap,
 			    struct bpf_local_storage_elem *first_selem)
 {
-	struct bpf_local_storage *prev_sk_storage, *sk_storage;
+	struct bpf_local_storage *prev_storage, *storage;
+	struct bpf_local_storage **owner_storage_ptr;
 	int err;
 
-	err = omem_charge(sk, sizeof(*sk_storage));
+	err = mem_charge(smap, owner, sizeof(*storage));
 	if (err)
 		return err;
 
-	sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
-	if (!sk_storage) {
+	storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN);
+	if (!storage) {
 		err = -ENOMEM;
 		goto uncharge;
 	}
-	INIT_HLIST_HEAD(&sk_storage->list);
-	raw_spin_lock_init(&sk_storage->lock);
-	sk_storage->owner = sk;
 
-	bpf_selem_link_storage(sk_storage, first_selem);
+	INIT_HLIST_HEAD(&storage->list);
+	raw_spin_lock_init(&storage->lock);
+	storage->owner = owner;
+
+	bpf_selem_link_storage(storage, first_selem);
 	bpf_selem_link_map(smap, first_selem);
-	/* Publish sk_storage to sk.  sk->sk_lock cannot be acquired.
-	 * Hence, atomic ops is used to set sk->sk_bpf_storage
-	 * from NULL to the newly allocated sk_storage ptr.
+
+	owner_storage_ptr =
+		(struct bpf_local_storage **)owner_storage(smap, owner);
+	/* Publish storage to the owner.
+	 * Instead of using any lock of the kernel object (i.e. owner),
+	 * cmpxchg will work with any kernel object regardless what
+	 * the running context is, bh, irq...etc.
 	 *
-	 * From now on, the sk->sk_bpf_storage pointer is protected
-	 * by the sk_storage->lock.  Hence,  when freeing
-	 * the sk->sk_bpf_storage, the sk_storage->lock must
-	 * be held before setting sk->sk_bpf_storage to NULL.
+	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
+	 * is protected by the storage->lock.  Hence, when freeing
+	 * the owner->storage, the storage->lock must be held before
+	 * setting owner->storage ptr to NULL.
 	 */
-	prev_sk_storage = cmpxchg((struct bpf_local_storage **)&sk->sk_bpf_storage,
-				  NULL, sk_storage);
-	if (unlikely(prev_sk_storage)) {
+	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
+	if (unlikely(prev_storage)) {
 		bpf_selem_unlink_map(first_selem);
 		err = -EAGAIN;
 		goto uncharge;
@@ -367,7 +401,7 @@ static int sk_storage_alloc(struct sock *sk,
 		/* Note that even first_selem was linked to smap's
 		 * bucket->list, first_selem can be freed immediately
 		 * (instead of kfree_rcu) because
-		 * bpf_sk_storage_map_free() does a
+		 * bpf_local_storage_map_free() does a
 		 * synchronize_rcu() before walking the bucket->list.
 		 * Hence, no one is accessing selem from the
 		 * bucket->list under rcu_read_lock().
@@ -377,8 +411,8 @@ static int sk_storage_alloc(struct sock *sk,
 	return 0;
 
 uncharge:
-	kfree(sk_storage);
-	atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
+	kfree(storage);
+	mem_uncharge(smap, owner, sizeof(*storage));
 	return err;
 }
 
@@ -387,9 +421,9 @@ static int sk_storage_alloc(struct sock *sk,
  * Otherwise, it will become a leak (and other memory issues
  * during map destruction).
  */
-static struct bpf_local_storage_data *
-bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value,
-			 u64 map_flags)
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_map *map,
+			 void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *old_sdata = NULL;
 	struct bpf_local_storage_elem *selem;
@@ -404,21 +438,21 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value,
 		return ERR_PTR(-EINVAL);
 
 	smap = (struct bpf_local_storage_map *)map;
-	local_storage = rcu_dereference(sk->sk_bpf_storage);
+	local_storage = rcu_dereference(*owner_storage(smap, owner));
 	if (!local_storage || hlist_empty(&local_storage->list)) {
-		/* Very first elem for this object */
+		/* Very first elem for the owner */
 		err = check_flags(NULL, map_flags);
 		if (err)
 			return ERR_PTR(err);
 
-		selem = bpf_selem_alloc(smap, sk, value, true);
+		selem = bpf_selem_alloc(smap, owner, value, true);
 		if (!selem)
 			return ERR_PTR(-ENOMEM);
 
-		err = sk_storage_alloc(sk, smap, selem);
+		err = bpf_local_storage_alloc(owner, smap, selem);
 		if (err) {
 			kfree(selem);
-			atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
+			mem_uncharge(smap, owner, smap->elem_size);
 			return ERR_PTR(err);
 		}
 
@@ -430,8 +464,8 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value,
 		 * such that it can avoid taking the local_storage->lock
 		 * and changing the lists.
 		 */
-		old_sdata =
-			bpf_local_storage_lookup(local_storage, smap, false);
+		old_sdata = bpf_local_storage_lookup(local_storage, smap,
+						     false);
 		err = check_flags(old_sdata, map_flags);
 		if (err)
 			return ERR_PTR(err);
@@ -475,7 +509,7 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value,
 	 * old_sdata will not be uncharged later during
 	 * bpf_selem_unlink_storage().
 	 */
-	selem = bpf_selem_alloc(smap, sk, value, !old_sdata);
+	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
 	if (!selem) {
 		err = -ENOMEM;
 		goto unlock_err;
@@ -567,7 +601,7 @@ void bpf_sk_storage_free(struct sock *sk)
 	 * Thus, no elem can be added-to or deleted-from the
 	 * sk_storage->list by the bpf_prog or by the bpf-map's syscall.
 	 *
-	 * It is racing with bpf_sk_storage_map_free() alone
+	 * It is racing with bpf_local_storage_map_free() alone
 	 * when unlinking elem from the sk_storage->list and
 	 * the map's bucket->list.
 	 */
@@ -587,17 +621,12 @@ void bpf_sk_storage_free(struct sock *sk)
 		kfree_rcu(sk_storage, rcu);
 }
 
-static void bpf_local_storage_map_free(struct bpf_map *map)
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
 {
 	struct bpf_local_storage_elem *selem;
-	struct bpf_local_storage_map *smap;
 	struct bpf_local_storage_map_bucket *b;
 	unsigned int i;
 
-	smap = (struct bpf_local_storage_map *)map;
-
-	bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
-
 	/* Note that this map might be concurrently cloned from
 	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
 	 * RCU read section to finish before proceeding. New RCU
@@ -607,11 +636,12 @@ static void bpf_local_storage_map_free(struct bpf_map *map)
 
 	/* bpf prog and the userspace can no longer access this map
 	 * now.  No new selem (of this map) can be added
-	 * to the sk->sk_bpf_storage or to the map bucket's list.
+	 * to the bpf_local_storage or to the map bucket's list.
 	 *
 	 * The elem of this map can be cleaned up here
 	 * or
-	 * by bpf_sk_storage_free() during __sk_destruct().
+	 * by bpf_local_storage_free() during the destruction of the
+	 * owner object. eg. __sk_destruct.
 	 */
 	for (i = 0; i < (1U << smap->bucket_log); i++) {
 		b = &smap->buckets[i];
@@ -627,22 +657,31 @@ static void bpf_local_storage_map_free(struct bpf_map *map)
 		rcu_read_unlock();
 	}
 
-	/* bpf_sk_storage_free() may still need to access the map.
-	 * e.g. bpf_sk_storage_free() has unlinked selem from the map
+	/* bpf_local_storage_free() may still need to access the map.
+	 * e.g. bpf_local_storage_free() has unlinked selem from the map
 	 * which then made the above while((selem = ...)) loop
 	 * exited immediately.
 	 *
-	 * However, the bpf_sk_storage_free() still needs to access
+	 * However, the bpf_local_storage_free() still needs to access
 	 * the smap->elem_size to do the uncharging in
 	 * bpf_selem_unlink_storage().
 	 *
 	 * Hence, wait another rcu grace period for the
-	 * bpf_sk_storage_free() to finish.
+	 * bpf_local_storage_free() to finish.
 	 */
 	synchronize_rcu();
 
 	kvfree(smap->buckets);
-	kfree(map);
+	kfree(smap);
+}
+
+static void sk_storage_map_free(struct bpf_map *map)
+{
+	struct bpf_local_storage_map *smap;
+
+	smap = (struct bpf_local_storage_map *)map;
+	bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
+	bpf_local_storage_map_free(smap);
 }
 
 /* U16_MAX is much more than enough for sk local storage
@@ -654,7 +693,7 @@ static void bpf_local_storage_map_free(struct bpf_map *map)
 	       sizeof(struct bpf_local_storage_elem)),			\
 	      (U16_MAX - sizeof(struct bpf_local_storage_elem)))
 
-static int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
 	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
@@ -673,7 +712,7 @@ static int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }
 
-static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
+struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_local_storage_map *smap;
 	unsigned int i;
@@ -711,9 +750,21 @@ static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&smap->buckets[i].lock);
 	}
 
-	smap->elem_size = sizeof(struct bpf_local_storage_elem) + attr->value_size;
-	smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache);
+	smap->elem_size =
+		sizeof(struct bpf_local_storage_elem) + attr->value_size;
+
+	return smap;
+}
+
+static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_local_storage_map *smap;
 
+	smap = bpf_local_storage_map_alloc(attr);
+	if (IS_ERR(smap))
+		return ERR_CAST(smap);
+
+	smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache);
 	return &smap->map;
 }
 
@@ -723,10 +774,10 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
 	return -ENOTSUPP;
 }
 
-static int bpf_local_storage_map_check_btf(const struct bpf_map *map,
-					   const struct btf *btf,
-					   const struct btf_type *key_type,
-					   const struct btf_type *value_type)
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+				    const struct btf *btf,
+				    const struct btf_type *key_type,
+				    const struct btf_type *value_type)
 {
 	u32 int_data;
 
@@ -857,7 +908,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 			bpf_selem_link_map(smap, copy_selem);
 			bpf_selem_link_storage(new_sk_storage, copy_selem);
 		} else {
-			ret = sk_storage_alloc(newsk, smap, copy_selem);
+			ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
 			if (ret) {
 				kfree(copy_selem);
 				atomic_sub(smap->elem_size,
@@ -926,11 +977,33 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
 	return -ENOENT;
 }
 
+static int sk_storage_charge(struct bpf_local_storage_map *smap,
+			     void *owner, u32 size)
+{
+	return omem_charge(owner, size);
+}
+
+static void sk_storage_uncharge(struct bpf_local_storage_map *smap,
+				void *owner, u32 size)
+{
+	struct sock *sk = owner;
+
+	atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+static struct bpf_local_storage __rcu **
+sk_storage_ptr(void *owner)
+{
+	struct sock *sk = owner;
+
+	return &sk->sk_bpf_storage;
+}
+
 static int sk_storage_map_btf_id;
 const struct bpf_map_ops sk_storage_map_ops = {
 	.map_alloc_check = bpf_local_storage_map_alloc_check,
-	.map_alloc = bpf_local_storage_map_alloc,
-	.map_free = bpf_local_storage_map_free,
+	.map_alloc = sk_storage_map_alloc,
+	.map_free = sk_storage_map_free,
 	.map_get_next_key = notsupp_get_next_key,
 	.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
 	.map_update_elem = bpf_fd_sk_storage_update_elem,
@@ -938,6 +1011,9 @@ const struct bpf_map_ops sk_storage_map_ops = {
 	.map_check_btf = bpf_local_storage_map_check_btf,
 	.map_btf_name = "bpf_local_storage_map",
 	.map_btf_id = &sk_storage_map_btf_id,
+	.map_local_storage_charge = sk_storage_charge,
+	.map_local_storage_uncharge = sk_storage_uncharge,
+	.map_owner_storage_ptr = sk_storage_ptr,
 };
 
 const struct bpf_func_proto bpf_sk_storage_get_proto = {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index eb5e0c38eb2c..cd5a47a74533 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3642,9 +3642,13 @@ enum {
 	BPF_F_SYSCTL_BASE_NAME		= (1ULL << 0),
 };
 
-/* BPF_FUNC_sk_storage_get flags */
+/* BPF_FUNC_<local>_storage_get flags */
 enum {
-	BPF_SK_STORAGE_GET_F_CREATE	= (1ULL << 0),
+	BPF_LOCAL_STORAGE_GET_F_CREATE	= (1ULL << 0),
+	/* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility
+	 * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead.
+	 */
+	BPF_SK_STORAGE_GET_F_CREATE  = BPF_LOCAL_STORAGE_GET_F_CREATE,
 };
 
 /* BPF_FUNC_read_branch_records flags. */
-- 
2.28.0.rc0.142.g3c755180ce-goog


^ permalink raw reply related

* [PATCH bpf-next v7 6/7] bpf: Allow local storage to be used from LSM programs
From: KP Singh @ 2020-07-30 14:07 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
	Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200730140716.404558-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

Adds support for both bpf_{sk, inode}_storage_{get, delete} to be used
in LSM programs. These helpers are not used for tracing programs
(currently) as their usage is tied to the life-cycle of the object and
should only be used where the owning object won't be freed (when the
owning object is passed as an argument to the LSM hook). Thus, they
are safer to use in LSM hooks than tracing. Usage of local storage in
tracing programs will probably follow a per function based whitelist
approach.

Since the UAPI helper signature for bpf_sk_storage expect a bpf_sock,
it, leads to a compilation warning for LSM programs, it's also updated
to accept a void * pointer instead.

Signed-off-by: KP Singh <kpsingh@google.com>
---
 include/net/bpf_sk_storage.h   |  2 ++
 include/uapi/linux/bpf.h       |  8 ++++++--
 kernel/bpf/bpf_lsm.c           | 21 ++++++++++++++++++++-
 net/core/bpf_sk_storage.c      | 27 +++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  8 ++++++--
 5 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index 847926cf2899..c5702d7baeaa 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -20,6 +20,8 @@ void bpf_sk_storage_free(struct sock *sk);
 
 extern const struct bpf_func_proto bpf_sk_storage_get_proto;
 extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
+extern const struct bpf_func_proto sk_storage_get_btf_proto;
+extern const struct bpf_func_proto sk_storage_delete_btf_proto;
 
 struct bpf_sk_storage_diag;
 struct sk_buff;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d3617f1ce3fc..c01211c8a7bc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2802,7 +2802,7 @@ union bpf_attr {
  *
  *		**-ERANGE** if resulting value was out of range.
  *
- * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags)
+ * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
  *	Description
  *		Get a bpf-local-storage from a *sk*.
  *
@@ -2818,6 +2818,10 @@ union bpf_attr {
  *		"type". The bpf-local-storage "type" (i.e. the *map*) is
  *		searched against all bpf-local-storages residing at *sk*.
  *
+ *		For socket programs, *sk* should be a **struct bpf_sock** pointer
+ *		and an **ARG_PTR_TO_BTF_ID** of type **struct sock** for LSM
+ *		programs.
+ *
  *		An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be
  *		used such that a new bpf-local-storage will be
  *		created if one does not exist.  *value* can be used
@@ -2830,7 +2834,7 @@ union bpf_attr {
  *		**NULL** if not found or there was an error in adding
  *		a new bpf-local-storage.
  *
- * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk)
+ * long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
  *	Description
  *		Delete a bpf-local-storage from a *sk*.
  *	Return
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index fb278144e9fd..9cd1428c7199 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -11,6 +11,8 @@
 #include <linux/bpf_lsm.h>
 #include <linux/kallsyms.h>
 #include <linux/bpf_verifier.h>
+#include <net/bpf_sk_storage.h>
+#include <linux/bpf_local_storage.h>
 
 /* For every LSM hook that allows attachment of BPF programs, declare a nop
  * function where a BPF program can be attached.
@@ -45,10 +47,27 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
 	return 0;
 }
 
+static const struct bpf_func_proto *
+bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_inode_storage_get:
+		return &bpf_inode_storage_get_proto;
+	case BPF_FUNC_inode_storage_delete:
+		return &bpf_inode_storage_delete_proto;
+	case BPF_FUNC_sk_storage_get:
+		return &sk_storage_get_btf_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &sk_storage_delete_btf_proto;
+	default:
+		return tracing_prog_func_proto(func_id, prog);
+	}
+}
+
 const struct bpf_prog_ops lsm_prog_ops = {
 };
 
 const struct bpf_verifier_ops lsm_verifier_ops = {
-	.get_func_proto = tracing_prog_func_proto,
+	.get_func_proto = bpf_lsm_func_proto,
 	.is_valid_access = btf_ctx_access,
 };
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 8f6a8d6549be..c547ddf69b91 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -12,6 +12,7 @@
 #include <net/sock.h>
 #include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
 
 DEFINE_BPF_STORAGE_CACHE(sk_cache);
 
@@ -374,6 +375,32 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
 	.arg2_type	= ARG_PTR_TO_SOCKET,
 };
 
+BTF_ID_LIST(sk_storage_get_btf_ids)
+BTF_ID(struct, sock)
+
+const struct bpf_func_proto sk_storage_get_btf_proto = {
+	.func		= bpf_sk_storage_get,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg4_type	= ARG_ANYTHING,
+	.btf_id		= sk_storage_get_btf_ids,
+};
+
+BTF_ID_LIST(sk_storage_delete_btf_ids)
+BTF_ID(struct, sock)
+
+const struct bpf_func_proto sk_storage_delete_btf_proto = {
+	.func		= bpf_sk_storage_delete,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.btf_id		= sk_storage_delete_btf_ids,
+};
+
 struct bpf_sk_storage_diag {
 	u32 nr_maps;
 	struct bpf_map *maps[];
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d3617f1ce3fc..c01211c8a7bc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2802,7 +2802,7 @@ union bpf_attr {
  *
  *		**-ERANGE** if resulting value was out of range.
  *
- * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags)
+ * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
  *	Description
  *		Get a bpf-local-storage from a *sk*.
  *
@@ -2818,6 +2818,10 @@ union bpf_attr {
  *		"type". The bpf-local-storage "type" (i.e. the *map*) is
  *		searched against all bpf-local-storages residing at *sk*.
  *
+ *		For socket programs, *sk* should be a **struct bpf_sock** pointer
+ *		and an **ARG_PTR_TO_BTF_ID** of type **struct sock** for LSM
+ *		programs.
+ *
  *		An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be
  *		used such that a new bpf-local-storage will be
  *		created if one does not exist.  *value* can be used
@@ -2830,7 +2834,7 @@ union bpf_attr {
  *		**NULL** if not found or there was an error in adding
  *		a new bpf-local-storage.
  *
- * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk)
+ * long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
  *	Description
  *		Delete a bpf-local-storage from a *sk*.
  *	Return
-- 
2.28.0.rc0.142.g3c755180ce-goog


^ permalink raw reply related

* [PATCH bpf-next v7 7/7] bpf: Add selftests for local_storage
From: KP Singh @ 2020-07-30 14:07 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Andrii Nakryiko, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200730140716.404558-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

inode_local_storage:

* Hook to the file_open and inode_unlink LSM hooks.
* Create and unlink a temporary file.
* Store some information in the inode's bpf_local_storage during
  file_open.
* Verify that this information exists when the file is unlinked.

sk_local_storage:

* Hook to the socket_post_create and socket_bind LSM hooks.
* Open and bind a socket and set the sk_storage in the
  socket_post_create hook using the start_server helper.
* Verify if the information is set in the socket_bind hook.

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: KP Singh <kpsingh@google.com>
---
 .../bpf/prog_tests/test_local_storage.c       |  60 ++++++++
 .../selftests/bpf/progs/local_storage.c       | 136 ++++++++++++++++++
 2 files changed, 196 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_local_storage.c
 create mode 100644 tools/testing/selftests/bpf/progs/local_storage.c

diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
new file mode 100644
index 000000000000..d4ba89195c43
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2020 Google LLC.
+ */
+
+#include <test_progs.h>
+#include <linux/limits.h>
+
+#include "local_storage.skel.h"
+#include "network_helpers.h"
+
+int create_and_unlink_file(void)
+{
+	char fname[PATH_MAX] = "/tmp/fileXXXXXX";
+	int fd;
+
+	fd = mkstemp(fname);
+	if (fd < 0)
+		return fd;
+
+	close(fd);
+	unlink(fname);
+	return 0;
+}
+
+void test_test_local_storage(void)
+{
+	struct local_storage *skel = NULL;
+	int err, duration = 0, serv_sk = -1;
+
+	skel = local_storage__open_and_load();
+	if (CHECK(!skel, "skel_load", "lsm skeleton failed\n"))
+		goto close_prog;
+
+	err = local_storage__attach(skel);
+	if (CHECK(err, "attach", "lsm attach failed: %d\n", err))
+		goto close_prog;
+
+	skel->bss->monitored_pid = getpid();
+
+	err = create_and_unlink_file();
+	if (CHECK(err < 0, "exec_cmd", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	CHECK(!skel->bss->inode_storage_result, "inode_storage_result",
+	      "inode_local_storage not set");
+
+	serv_sk = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0);
+	if (CHECK(serv_sk < 0, "start_server", "failed to start server\n"))
+		goto close_prog;
+
+	CHECK(!skel->bss->sk_storage_result, "sk_storage_result",
+	      "sk_local_storage not set");
+
+	close(serv_sk);
+
+close_prog:
+	local_storage__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c
new file mode 100644
index 000000000000..cb608b7b90f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/local_storage.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <errno.h>
+#include <linux/bpf.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define DUMMY_STORAGE_VALUE 0xdeadbeef
+
+int monitored_pid = 0;
+bool inode_storage_result = false;
+bool sk_storage_result = false;
+
+struct dummy_storage {
+	__u32 value;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_INODE_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct dummy_storage);
+} inode_storage_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+	__type(key, int);
+	__type(value, struct dummy_storage);
+} sk_storage_map SEC(".maps");
+
+/* TODO Use vmlinux.h once BTF pruning for embedded types is fixed.
+ */
+struct sock {} __attribute__((preserve_access_index));
+struct sockaddr {} __attribute__((preserve_access_index));
+struct socket {
+	struct sock *sk;
+} __attribute__((preserve_access_index));
+
+struct inode {} __attribute__((preserve_access_index));
+struct dentry {
+	struct inode *d_inode;
+} __attribute__((preserve_access_index));
+struct file {
+	struct inode *f_inode;
+} __attribute__((preserve_access_index));
+
+
+SEC("lsm/inode_unlink")
+int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
+{
+	__u32 pid = bpf_get_current_pid_tgid() >> 32;
+	struct dummy_storage *storage;
+
+	if (pid != monitored_pid)
+		return 0;
+
+	storage = bpf_inode_storage_get(&inode_storage_map, victim->d_inode, 0,
+				     BPF_SK_STORAGE_GET_F_CREATE);
+	if (!storage)
+		return 0;
+
+	if (storage->value == DUMMY_STORAGE_VALUE)
+		inode_storage_result = true;
+
+	return 0;
+}
+
+SEC("lsm/socket_bind")
+int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
+	     int addrlen)
+{
+	__u32 pid = bpf_get_current_pid_tgid() >> 32;
+	struct dummy_storage *storage;
+
+	if (pid != monitored_pid)
+		return 0;
+
+	storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0,
+				     BPF_SK_STORAGE_GET_F_CREATE);
+	if (!storage)
+		return 0;
+
+	if (storage->value == DUMMY_STORAGE_VALUE)
+		sk_storage_result = true;
+
+	return 0;
+}
+
+SEC("lsm/socket_post_create")
+int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
+	     int protocol, int kern)
+{
+	__u32 pid = bpf_get_current_pid_tgid() >> 32;
+	struct dummy_storage *storage;
+
+	if (pid != monitored_pid)
+		return 0;
+
+	storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0,
+				     BPF_SK_STORAGE_GET_F_CREATE);
+	if (!storage)
+		return 0;
+
+	storage->value = DUMMY_STORAGE_VALUE;
+
+	return 0;
+}
+
+SEC("lsm/file_open")
+int BPF_PROG(test_int_hook, struct file *file)
+{
+	__u32 pid = bpf_get_current_pid_tgid() >> 32;
+	struct dummy_storage *storage;
+
+	if (pid != monitored_pid)
+		return 0;
+
+	if (!file->f_inode)
+		return 0;
+
+	storage = bpf_inode_storage_get(&inode_storage_map, file->f_inode, 0,
+				     BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!storage)
+		return 0;
+
+	storage->value = DUMMY_STORAGE_VALUE;
+	return 0;
+}
-- 
2.28.0.rc0.142.g3c755180ce-goog


^ permalink raw reply related

* [PATCH bpf-next v7 4/7] bpf: Split bpf_local_storage to bpf_sk_storage
From: KP Singh @ 2020-07-30 14:07 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
	Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200730140716.404558-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

A purely mechanical change:

	bpf_sk_storage.c = bpf_sk_storage.c + bpf_local_storage.c
	bpf_sk_storage.h = bpf_sk_storage.h + bpf_local_storage.h

Signed-off-by: KP Singh <kpsingh@google.com>
---
 include/linux/bpf_local_storage.h | 163 ++++++++
 include/net/bpf_sk_storage.h      |  61 +--
 kernel/bpf/Makefile               |   1 +
 kernel/bpf/bpf_local_storage.c    | 600 ++++++++++++++++++++++++++
 net/core/bpf_sk_storage.c         | 672 +-----------------------------
 5 files changed, 766 insertions(+), 731 deletions(-)
 create mode 100644 include/linux/bpf_local_storage.h
 create mode 100644 kernel/bpf/bpf_local_storage.c

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
new file mode 100644
index 000000000000..3685f681a7fc
--- /dev/null
+++ b/include/linux/bpf_local_storage.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2019 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#ifndef _BPF_LOCAL_STORAGE_H
+#define _BPF_LOCAL_STORAGE_H
+
+#include <linux/bpf.h>
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <uapi/linux/btf.h>
+
+#define BPF_LOCAL_STORAGE_CACHE_SIZE	16
+
+struct bpf_local_storage_map_bucket {
+	struct hlist_head list;
+	raw_spinlock_t lock;
+};
+
+/* Thp map is not the primary owner of a bpf_local_storage_elem.
+ * Instead, the container object (eg. sk->sk_bpf_storage) is.
+ *
+ * The map (bpf_local_storage_map) is for two purposes
+ * 1. Define the size of the "local storage".  It is
+ *    the map's value_size.
+ *
+ * 2. Maintain a list to keep track of all elems such
+ *    that they can be cleaned up during the map destruction.
+ *
+ * When a bpf local storage is being looked up for a
+ * particular object,  the "bpf_map" pointer is actually used
+ * as the "key" to search in the list of elem in
+ * the respective bpf_local_storage owned by the object.
+ *
+ * e.g. sk->sk_bpf_storage is the mini-map with the "bpf_map" pointer
+ * as the searching key.
+ */
+struct bpf_local_storage_map {
+	struct bpf_map map;
+	/* Lookup elem does not require accessing the map.
+	 *
+	 * Updating/Deleting requires a bucket lock to
+	 * link/unlink the elem from the map.  Having
+	 * multiple buckets to improve contention.
+	 */
+	struct bpf_local_storage_map_bucket *buckets;
+	u32 bucket_log;
+	u16 elem_size;
+	u16 cache_idx;
+};
+
+struct bpf_local_storage_data {
+	/* smap is used as the searching key when looking up
+	 * from the object's bpf_local_storage.
+	 *
+	 * Put it in the same cacheline as the data to minimize
+	 * the number of cachelines access during the cache hit case.
+	 */
+	struct bpf_local_storage_map __rcu *smap;
+	u8 data[] __aligned(8);
+};
+
+/* Linked to bpf_local_storage and bpf_local_storage_map */
+struct bpf_local_storage_elem {
+	struct hlist_node map_node;	/* Linked to bpf_local_storage_map */
+	struct hlist_node snode;	/* Linked to bpf_local_storage */
+	struct bpf_local_storage __rcu *local_storage;
+	struct rcu_head rcu;
+	/* 8 bytes hole */
+	/* The data is stored in aother cacheline to minimize
+	 * the number of cachelines access during a cache hit.
+	 */
+	struct bpf_local_storage_data sdata ____cacheline_aligned;
+};
+
+struct bpf_local_storage {
+	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
+	struct hlist_head list; /* List of bpf_local_storage_elem */
+	void *owner;		/* The object that owns the the above "list" of
+				 * bpf_local_storage_elem.
+				 */
+	struct rcu_head rcu;
+	raw_spinlock_t lock;	/* Protect adding/removing from the "list" */
+};
+
+/* U16_MAX is much more than enough for sk local storage
+ * considering a tcp_sock is ~2k.
+ */
+#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE				       \
+	min_t(u32,                                                             \
+	      (KMALLOC_MAX_SIZE - MAX_BPF_STACK -                              \
+	       sizeof(struct bpf_local_storage_elem)),                         \
+	      (U16_MAX - sizeof(struct bpf_local_storage_elem)))
+
+#define SELEM(_SDATA)                                                          \
+	container_of((_SDATA), struct bpf_local_storage_elem, sdata)
+#define SDATA(_SELEM) (&(_SELEM)->sdata)
+
+#define BPF_LOCAL_STORAGE_CACHE_SIZE	16
+
+struct bpf_local_storage_cache {
+	spinlock_t idx_lock;
+	u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE];
+};
+
+#define DEFINE_BPF_STORAGE_CACHE(name)				\
+static struct bpf_local_storage_cache name = {			\
+	.idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock),	\
+}
+
+u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache);
+void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+				      u16 idx);
+
+/* Helper functions for bpf_local_storage */
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
+
+struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr);
+
+struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+			 struct bpf_local_storage_map *smap,
+			 bool cacheit_lockit);
+
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap);
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+				    const struct btf *btf,
+				    const struct btf_type *key_type,
+				    const struct btf_type *value_type);
+
+void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
+			    struct bpf_local_storage_elem *selem);
+
+bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
+			      struct bpf_local_storage_elem *selem,
+			      bool uncharge_omem);
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem);
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+			struct bpf_local_storage_elem *selem);
+
+void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
+		bool charge_mem);
+
+int
+bpf_local_storage_alloc(void *owner,
+			struct bpf_local_storage_map *smap,
+			struct bpf_local_storage_elem *first_selem);
+
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_map *map, void *value,
+			 u64 map_flags);
+
+#endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index 05b777950eb3..847926cf2899 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -12,6 +12,7 @@
 #include <net/sock.h>
 #include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
+#include <linux/bpf_local_storage.h>
 
 struct sock;
 
@@ -25,66 +26,6 @@ struct sk_buff;
 struct nlattr;
 struct sock;
 
-#define BPF_LOCAL_STORAGE_CACHE_SIZE	16
-
-struct bpf_local_storage_cache {
-	spinlock_t idx_lock;
-	u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE];
-};
-
-#define DEFINE_BPF_STORAGE_CACHE(name)				\
-static struct bpf_local_storage_cache name = {			\
-	.idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock),	\
-}
-
-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache);
-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
-				      u16 idx);
-
-/* Helper functions for bpf_local_storage */
-int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
-
-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr);
-
-struct bpf_local_storage_data *
-bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
-			 struct bpf_local_storage_map *smap,
-			 bool cacheit_lockit);
-
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap);
-
-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
-				    const struct btf *btf,
-				    const struct btf_type *key_type,
-				    const struct btf_type *value_type);
-
-void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
-			    struct bpf_local_storage_elem *selem);
-
-bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
-			      struct bpf_local_storage_elem *selem,
-			      bool uncharge_omem);
-
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem);
-
-void bpf_selem_link_map(struct bpf_local_storage_map *smap,
-			struct bpf_local_storage_elem *selem);
-
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
-
-struct bpf_local_storage_elem *
-bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
-		bool charge_mem);
-
-int
-bpf_local_storage_alloc(void *owner,
-			struct bpf_local_storage_map *smap,
-			struct bpf_local_storage_elem *first_selem);
-
-struct bpf_local_storage_data *
-bpf_local_storage_update(void *owner, struct bpf_map *map, void *value,
-			 u64 map_flags);
-
 #ifdef CONFIG_BPF_SYSCALL
 int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
 struct bpf_sk_storage_diag *
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6eb9c0402da..a9a147e18600 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
 endif
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
new file mode 100644
index 000000000000..79750579dd7a
--- /dev/null
+++ b/kernel/bpf/bpf_local_storage.c
@@ -0,0 +1,600 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+
+#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
+static struct bpf_local_storage_map_bucket *
+select_bucket(struct bpf_local_storage_map *smap,
+	      struct bpf_local_storage_elem *selem)
+{
+	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
+}
+
+static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
+{
+	struct bpf_map *map = &smap->map;
+
+	if (!map->ops->map_local_storage_charge)
+		return 0;
+
+	return map->ops->map_local_storage_charge(smap, owner, size);
+}
+
+static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
+			 u32 size)
+{
+	struct bpf_map *map = &smap->map;
+
+	if (map->ops->map_local_storage_uncharge)
+		map->ops->map_local_storage_uncharge(smap, owner, size);
+}
+
+static struct bpf_local_storage __rcu **
+owner_storage(struct bpf_local_storage_map *smap, void *owner)
+{
+	struct bpf_map *map = &smap->map;
+
+	return map->ops->map_owner_storage_ptr(owner);
+}
+
+static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
+{
+	return !hlist_unhashed(&selem->snode);
+}
+
+static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
+{
+	return !hlist_unhashed(&selem->map_node);
+}
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
+		void *value, bool charge_mem)
+{
+	struct bpf_local_storage_elem *selem;
+
+	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+		return NULL;
+
+	selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	if (selem) {
+		if (value)
+			memcpy(SDATA(selem)->data, value, smap->map.value_size);
+		return selem;
+	}
+
+	if (charge_mem)
+		mem_uncharge(smap, owner, smap->elem_size);
+
+	return NULL;
+}
+
+/* local_storage->lock must be held and selem->sk_storage == sk_storage.
+ * The caller must ensure selem->smap is still valid to be
+ * dereferenced for its smap->elem_size and smap->cache_idx.
+ */
+bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
+			      struct bpf_local_storage_elem *selem,
+			      bool uncharge_mem)
+{
+	struct bpf_local_storage_map *smap;
+	bool free_local_storage;
+	void *owner;
+
+	smap = rcu_dereference(SDATA(selem)->smap);
+	owner = local_storage->owner;
+
+	/* All uncharging on owner must be done first.
+	 * The owner may be freed once the last selem is unlinked from
+	 * local_storage.
+	 */
+	if (uncharge_mem)
+		mem_uncharge(smap, owner, smap->elem_size);
+
+	free_local_storage = hlist_is_singular_node(&selem->snode,
+						    &local_storage->list);
+	if (free_local_storage) {
+		mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
+		local_storage->owner = NULL;
+
+		/* After this RCU_INIT, owner may be freed and cannot be used */
+		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+
+		/* local_storage is not freed now.  local_storage->lock is
+		 * still held and raw_spin_unlock_bh(&local_storage->lock)
+		 * will be done by the caller.
+		 *
+		 * Although the unlock will be done under
+		 * rcu_read_lock(),  it is more intutivie to
+		 * read if kfree_rcu(local_storage, rcu) is done
+		 * after the raw_spin_unlock_bh(&local_storage->lock).
+		 *
+		 * Hence, a "bool free_local_storage" is returned
+		 * to the caller which then calls the kfree_rcu()
+		 * after unlock.
+		 */
+	}
+	hlist_del_init_rcu(&selem->snode);
+	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
+	    SDATA(selem))
+		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
+
+	kfree_rcu(selem, rcu);
+
+	return free_local_storage;
+}
+
+static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
+{
+	struct bpf_local_storage *local_storage;
+	bool free_local_storage = false;
+
+	if (unlikely(!selem_linked_to_storage(selem)))
+		/* selem has already been unlinked from sk */
+		return;
+
+	local_storage = rcu_dereference(selem->local_storage);
+	raw_spin_lock_bh(&local_storage->lock);
+	if (likely(selem_linked_to_storage(selem)))
+		free_local_storage =
+			bpf_selem_unlink_storage(local_storage, selem, true);
+	raw_spin_unlock_bh(&local_storage->lock);
+
+	if (free_local_storage)
+		kfree_rcu(local_storage, rcu);
+}
+
+void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
+			    struct bpf_local_storage_elem *selem)
+{
+	RCU_INIT_POINTER(selem->local_storage, local_storage);
+	hlist_add_head(&selem->snode, &local_storage->list);
+}
+
+void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+{
+	struct bpf_local_storage_map *smap;
+	struct bpf_local_storage_map_bucket *b;
+
+	if (unlikely(!selem_linked_to_map(selem)))
+		/* selem has already be unlinked from smap */
+		return;
+
+	smap = rcu_dereference(SDATA(selem)->smap);
+	b = select_bucket(smap, selem);
+	raw_spin_lock_bh(&b->lock);
+	if (likely(selem_linked_to_map(selem)))
+		hlist_del_init_rcu(&selem->map_node);
+	raw_spin_unlock_bh(&b->lock);
+}
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+			struct bpf_local_storage_elem *selem)
+{
+	struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+
+	raw_spin_lock_bh(&b->lock);
+	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+	hlist_add_head_rcu(&selem->map_node, &b->list);
+	raw_spin_unlock_bh(&b->lock);
+}
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+{
+	/* Always unlink from map before unlinking from local_storage
+	 * because selem will be freed after successfully unlinked from
+	 * the local_storage.
+	 */
+	bpf_selem_unlink_map(selem);
+	__bpf_selem_unlink_storage(selem);
+}
+
+struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+			 struct bpf_local_storage_map *smap,
+			 bool cacheit_lockit)
+{
+	struct bpf_local_storage_data *sdata;
+	struct bpf_local_storage_elem *selem;
+
+	/* Fast path (cache hit) */
+	sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
+	if (sdata && rcu_access_pointer(sdata->smap) == smap)
+		return sdata;
+
+	/* Slow path (cache miss) */
+	hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+		if (rcu_access_pointer(SDATA(selem)->smap) == smap)
+			break;
+
+	if (!selem)
+		return NULL;
+
+	sdata = SDATA(selem);
+	if (cacheit_lockit) {
+		/* spinlock is needed to avoid racing with the
+		 * parallel delete.  Otherwise, publishing an already
+		 * deleted sdata to the cache will become a use-after-free
+		 * problem in the next bpf_local_storage_lookup().
+		 */
+		raw_spin_lock_bh(&local_storage->lock);
+		if (selem_linked_to_storage(selem))
+			rcu_assign_pointer(local_storage->cache[smap->cache_idx],
+					   sdata);
+		raw_spin_unlock_bh(&local_storage->lock);
+	}
+
+	return sdata;
+}
+
+static int check_flags(const struct bpf_local_storage_data *old_sdata,
+		       u64 map_flags)
+{
+	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
+		/* elem already exists */
+		return -EEXIST;
+
+	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
+		/* elem doesn't exist, cannot update it */
+		return -ENOENT;
+
+	return 0;
+}
+
+int bpf_local_storage_alloc(void *owner,
+			    struct bpf_local_storage_map *smap,
+			    struct bpf_local_storage_elem *first_selem)
+{
+	struct bpf_local_storage *prev_storage, *storage;
+	struct bpf_local_storage **owner_storage_ptr;
+	int err;
+
+	err = mem_charge(smap, owner, sizeof(*storage));
+	if (err)
+		return err;
+
+	storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN);
+	if (!storage) {
+		err = -ENOMEM;
+		goto uncharge;
+	}
+
+	INIT_HLIST_HEAD(&storage->list);
+	raw_spin_lock_init(&storage->lock);
+	storage->owner = owner;
+
+	bpf_selem_link_storage(storage, first_selem);
+	bpf_selem_link_map(smap, first_selem);
+
+	owner_storage_ptr =
+		(struct bpf_local_storage **)owner_storage(smap, owner);
+	/* Publish storage to the owner.
+	 * Instead of using any lock of the kernel object (i.e. owner),
+	 * cmpxchg will work with any kernel object regardless what
+	 * the running context is, bh, irq...etc.
+	 *
+	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
+	 * is protected by the storage->lock.  Hence, when freeing
+	 * the owner->storage, the storage->lock must be held before
+	 * setting owner->storage ptr to NULL.
+	 */
+	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
+	if (unlikely(prev_storage)) {
+		bpf_selem_unlink_map(first_selem);
+		err = -EAGAIN;
+		goto uncharge;
+
+		/* Note that even first_selem was linked to smap's
+		 * bucket->list, first_selem can be freed immediately
+		 * (instead of kfree_rcu) because
+		 * bpf_local_storage_map_free() does a
+		 * synchronize_rcu() before walking the bucket->list.
+		 * Hence, no one is accessing selem from the
+		 * bucket->list under rcu_read_lock().
+		 */
+	}
+
+	return 0;
+
+uncharge:
+	kfree(storage);
+	mem_uncharge(smap, owner, sizeof(*storage));
+	return err;
+}
+
+/* sk cannot be going away because it is linking new elem
+ * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
+ * Otherwise, it will become a leak (and other memory issues
+ * during map destruction).
+ */
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_map *map,
+			 void *value, u64 map_flags)
+{
+	struct bpf_local_storage_data *old_sdata = NULL;
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage *local_storage;
+	struct bpf_local_storage_map *smap;
+	int err;
+
+	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
+	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
+	    /* BPF_F_LOCK can only be used in a value with spin_lock */
+	    unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+		return ERR_PTR(-EINVAL);
+
+	smap = (struct bpf_local_storage_map *)map;
+	local_storage = rcu_dereference(*owner_storage(smap, owner));
+	if (!local_storage || hlist_empty(&local_storage->list)) {
+		/* Very first elem for the owner */
+		err = check_flags(NULL, map_flags);
+		if (err)
+			return ERR_PTR(err);
+
+		selem = bpf_selem_alloc(smap, owner, value, true);
+		if (!selem)
+			return ERR_PTR(-ENOMEM);
+
+		err = bpf_local_storage_alloc(owner, smap, selem);
+		if (err) {
+			kfree(selem);
+			mem_uncharge(smap, owner, smap->elem_size);
+			return ERR_PTR(err);
+		}
+
+		return SDATA(selem);
+	}
+
+	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
+		/* Hoping to find an old_sdata to do inline update
+		 * such that it can avoid taking the local_storage->lock
+		 * and changing the lists.
+		 */
+		old_sdata = bpf_local_storage_lookup(local_storage, smap,
+						     false);
+		err = check_flags(old_sdata, map_flags);
+		if (err)
+			return ERR_PTR(err);
+		if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
+			copy_map_value_locked(map, old_sdata->data,
+					      value, false);
+			return old_sdata;
+		}
+	}
+
+	raw_spin_lock_bh(&local_storage->lock);
+
+	/* Recheck local_storage->list under local_storage->lock */
+	if (unlikely(hlist_empty(&local_storage->list))) {
+		/* A parallel del is happening and local_storage is going
+		 * away.  It has just been checked before, so very
+		 * unlikely.  Return instead of retry to keep things
+		 * simple.
+		 */
+		err = -EAGAIN;
+		goto unlock_err;
+	}
+
+	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
+	err = check_flags(old_sdata, map_flags);
+	if (err)
+		goto unlock_err;
+
+	if (old_sdata && (map_flags & BPF_F_LOCK)) {
+		copy_map_value_locked(map, old_sdata->data, value, false);
+		selem = SELEM(old_sdata);
+		goto unlock;
+	}
+
+	/* local_storage->lock is held.  Hence, we are sure
+	 * we can unlink and uncharge the old_sdata successfully
+	 * later.  Hence, instead of charging the new selem now
+	 * and then uncharge the old selem later (which may cause
+	 * a potential but unnecessary charge failure),  avoid taking
+	 * a charge at all here (the "!old_sdata" check) and the
+	 * old_sdata will not be uncharged later during
+	 * bpf_selem_unlink_storage().
+	 */
+	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
+	if (!selem) {
+		err = -ENOMEM;
+		goto unlock_err;
+	}
+
+	/* First, link the new selem to the map */
+	bpf_selem_link_map(smap, selem);
+
+	/* Second, link (and publish) the new selem to local_storage */
+	bpf_selem_link_storage(local_storage, selem);
+
+	/* Third, remove old selem, SELEM(old_sdata) */
+	if (old_sdata) {
+		bpf_selem_unlink_map(SELEM(old_sdata));
+		bpf_selem_unlink_storage(local_storage, SELEM(old_sdata), false);
+	}
+
+unlock:
+	raw_spin_unlock_bh(&local_storage->lock);
+	return SDATA(selem);
+
+unlock_err:
+	raw_spin_unlock_bh(&local_storage->lock);
+	return ERR_PTR(err);
+}
+
+u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+{
+	u64 min_usage = U64_MAX;
+	u16 i, res = 0;
+
+	spin_lock(&cache->idx_lock);
+
+	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
+		if (cache->idx_usage_counts[i] < min_usage) {
+			min_usage = cache->idx_usage_counts[i];
+			res = i;
+
+			/* Found a free cache_idx */
+			if (!min_usage)
+				break;
+		}
+	}
+	cache->idx_usage_counts[res]++;
+
+	spin_unlock(&cache->idx_lock);
+
+	return res;
+}
+
+void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+				      u16 idx)
+{
+	spin_lock(&cache->idx_lock);
+	cache->idx_usage_counts[idx]--;
+	spin_unlock(&cache->idx_lock);
+}
+
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
+{
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_map_bucket *b;
+	unsigned int i;
+
+	/* Note that this map might be concurrently cloned from
+	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+	 * RCU read section to finish before proceeding. New RCU
+	 * read sections should be prevented via bpf_map_inc_not_zero.
+	 */
+	synchronize_rcu();
+
+	/* bpf prog and the userspace can no longer access this map
+	 * now.  No new selem (of this map) can be added
+	 * to the bpf_local_storage or to the map bucket's list.
+	 *
+	 * The elem of this map can be cleaned up here
+	 * or
+	 * by bpf_local_storage_free() during the destruction of the
+	 * owner object. eg. __sk_destruct.
+	 */
+	for (i = 0; i < (1U << smap->bucket_log); i++) {
+		b = &smap->buckets[i];
+
+		rcu_read_lock();
+		/* No one is adding to b->list now */
+		while ((selem = hlist_entry_safe(
+				rcu_dereference_raw(hlist_first_rcu(&b->list)),
+				struct bpf_local_storage_elem, map_node))) {
+			bpf_selem_unlink(selem);
+			cond_resched_rcu();
+		}
+		rcu_read_unlock();
+	}
+
+	/* bpf_local_storage_free() may still need to access the map.
+	 * e.g. bpf_local_storage_free() has unlinked selem from the map
+	 * which then made the above while((selem = ...)) loop
+	 * exited immediately.
+	 *
+	 * However, the bpf_local_storage_free() still needs to access
+	 * the smap->elem_size to do the uncharging in
+	 * bpf_selem_unlink_storage().
+	 *
+	 * Hence, wait another rcu grace period for the
+	 * bpf_local_storage_free() to finish.
+	 */
+	synchronize_rcu();
+
+	kvfree(smap->buckets);
+	kfree(smap);
+}
+
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
+	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+	    attr->max_entries ||
+	    attr->key_size != sizeof(int) || !attr->value_size ||
+	    /* Enforce BTF for userspace sk dumping */
+	    !attr->btf_key_type_id || !attr->btf_value_type_id)
+		return -EINVAL;
+
+	if (!bpf_capable())
+		return -EPERM;
+
+	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
+		return -E2BIG;
+
+	return 0;
+}
+
+struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_local_storage_map *smap;
+	unsigned int i;
+	u32 nbuckets;
+	u64 cost;
+	int ret;
+
+	smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
+	if (!smap)
+		return ERR_PTR(-ENOMEM);
+	bpf_map_init_from_attr(&smap->map, attr);
+
+	nbuckets = roundup_pow_of_two(num_possible_cpus());
+	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+	nbuckets = max_t(u32, 2, nbuckets);
+	smap->bucket_log = ilog2(nbuckets);
+	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
+
+	ret = bpf_map_charge_init(&smap->map.memory, cost);
+	if (ret < 0) {
+		kfree(smap);
+		return ERR_PTR(ret);
+	}
+
+	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
+				 GFP_USER | __GFP_NOWARN);
+	if (!smap->buckets) {
+		bpf_map_charge_finish(&smap->map.memory);
+		kfree(smap);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < nbuckets; i++) {
+		INIT_HLIST_HEAD(&smap->buckets[i].list);
+		raw_spin_lock_init(&smap->buckets[i].lock);
+	}
+
+	smap->elem_size =
+		sizeof(struct bpf_local_storage_elem) + attr->value_size;
+
+	return smap;
+}
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+				    const struct btf *btf,
+				    const struct btf_type *key_type,
+				    const struct btf_type *value_type)
+{
+	u32 int_data;
+
+	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+		return -EINVAL;
+
+	int_data = *(u32 *)(key_type + 1);
+	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+		return -EINVAL;
+
+	return 0;
+}
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index bb2375769ca1..8f6a8d6549be 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -7,97 +7,14 @@
 #include <linux/spinlock.h>
 #include <linux/bpf.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
 #include <net/bpf_sk_storage.h>
 #include <net/sock.h>
 #include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
 
-#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
-
 DEFINE_BPF_STORAGE_CACHE(sk_cache);
 
-struct bpf_local_storage_map_bucket {
-	struct hlist_head list;
-	raw_spinlock_t lock;
-};
-
-/* Thp map is not the primary owner of a bpf_local_storage_elem.
- * Instead, the container object (eg. sk->sk_bpf_storage) is.
- *
- * The map (bpf_local_storage_map) is for two purposes
- * 1. Define the size of the "local storage".  It is
- *    the map's value_size.
- *
- * 2. Maintain a list to keep track of all elems such
- *    that they can be cleaned up during the map destruction.
- *
- * When a bpf local storage is being looked up for a
- * particular object,  the "bpf_map" pointer is actually used
- * as the "key" to search in the list of elem in
- * the respective bpf_local_storage owned by the object.
- *
- * e.g. sk->sk_bpf_storage is the mini-map with the "bpf_map" pointer
- * as the searching key.
- */
-struct bpf_local_storage_map {
-	struct bpf_map map;
-	/* Lookup elem does not require accessing the map.
-	 *
-	 * Updating/Deleting requires a bucket lock to
-	 * link/unlink the elem from the map.  Having
-	 * multiple buckets to improve contention.
-	 */
-	struct bpf_local_storage_map_bucket *buckets;
-	u32 bucket_log;
-	u16 elem_size;
-	u16 cache_idx;
-};
-
-struct bpf_local_storage_data {
-	/* smap is used as the searching key when looking up
-	 * from the object's bpf_local_storage.
-	 *
-	 * Put it in the same cacheline as the data to minimize
-	 * the number of cachelines access during the cache hit case.
-	 */
-	struct bpf_local_storage_map __rcu *smap;
-	u8 data[] __aligned(8);
-};
-
-/* Linked to bpf_local_storage and bpf_local_storage_map */
-struct bpf_local_storage_elem {
-	struct hlist_node map_node;	/* Linked to bpf_local_storage_map */
-	struct hlist_node snode;	/* Linked to bpf_local_storage */
-	struct bpf_local_storage __rcu *local_storage;
-	struct rcu_head rcu;
-	/* 8 bytes hole */
-	/* The data is stored in aother cacheline to minimize
-	 * the number of cachelines access during a cache hit.
-	 */
-	struct bpf_local_storage_data sdata ____cacheline_aligned;
-};
-
-#define SELEM(_SDATA)							\
-	container_of((_SDATA), struct bpf_local_storage_elem, sdata)
-#define SDATA(_SELEM) (&(_SELEM)->sdata)
-
-struct bpf_local_storage {
-	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
-	struct hlist_head list; /* List of bpf_local_storage_elem */
-	void *owner;		/* The object that owns the the above "list" of
-				 * bpf_local_storage_elem.
-				 */
-	struct rcu_head rcu;
-	raw_spinlock_t lock;	/* Protect adding/removing from the "list" */
-};
-
-static struct bpf_local_storage_map_bucket *
-select_bucket(struct bpf_local_storage_map *smap,
-	      struct bpf_local_storage_elem *selem)
-{
-	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
-}
-
 static int omem_charge(struct sock *sk, unsigned int size)
 {
 	/* same check as in sock_kmalloc() */
@@ -110,223 +27,6 @@ static int omem_charge(struct sock *sk, unsigned int size)
 	return -ENOMEM;
 }
 
-static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
-{
-	struct bpf_map *map = &smap->map;
-
-	if (!map->ops->map_local_storage_charge)
-		return 0;
-
-	return map->ops->map_local_storage_charge(smap, owner, size);
-}
-
-static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
-			 u32 size)
-{
-	struct bpf_map *map = &smap->map;
-
-	if (map->ops->map_local_storage_uncharge)
-		map->ops->map_local_storage_uncharge(smap, owner, size);
-}
-
-static struct bpf_local_storage __rcu **
-owner_storage(struct bpf_local_storage_map *smap, void *owner)
-{
-	struct bpf_map *map = &smap->map;
-
-	return map->ops->map_owner_storage_ptr(owner);
-}
-
-static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
-{
-	return !hlist_unhashed(&selem->snode);
-}
-
-static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
-{
-	return !hlist_unhashed(&selem->map_node);
-}
-
-struct bpf_local_storage_elem *
-bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
-		void *value, bool charge_mem)
-{
-	struct bpf_local_storage_elem *selem;
-
-	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
-		return NULL;
-
-	selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-	if (selem) {
-		if (value)
-			memcpy(SDATA(selem)->data, value, smap->map.value_size);
-		return selem;
-	}
-
-	if (charge_mem)
-		mem_uncharge(smap, owner, smap->elem_size);
-
-	return NULL;
-}
-
-/* local_storage->lock must be held and selem->sk_storage == sk_storage.
- * The caller must ensure selem->smap is still valid to be
- * dereferenced for its smap->elem_size and smap->cache_idx.
- */
-bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
-			      struct bpf_local_storage_elem *selem,
-			      bool uncharge_mem)
-{
-	struct bpf_local_storage_map *smap;
-	bool free_local_storage;
-	void *owner;
-
-	smap = rcu_dereference(SDATA(selem)->smap);
-	owner = local_storage->owner;
-
-	/* All uncharging on owner must be done first.
-	 * The owner may be freed once the last selem is unlinked from
-	 * local_storage.
-	 */
-	if (uncharge_mem)
-		mem_uncharge(smap, owner, smap->elem_size);
-
-	free_local_storage = hlist_is_singular_node(&selem->snode,
-						    &local_storage->list);
-	if (free_local_storage) {
-		mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
-		local_storage->owner = NULL;
-
-		/* After this RCU_INIT, owner may be freed and cannot be used */
-		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
-
-		/* local_storage is not freed now.  local_storage->lock is
-		 * still held and raw_spin_unlock_bh(&local_storage->lock)
-		 * will be done by the caller.
-		 *
-		 * Although the unlock will be done under
-		 * rcu_read_lock(),  it is more intutivie to
-		 * read if kfree_rcu(local_storage, rcu) is done
-		 * after the raw_spin_unlock_bh(&local_storage->lock).
-		 *
-		 * Hence, a "bool free_local_storage" is returned
-		 * to the caller which then calls the kfree_rcu()
-		 * after unlock.
-		 */
-	}
-	hlist_del_init_rcu(&selem->snode);
-	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
-	    SDATA(selem))
-		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
-
-	kfree_rcu(selem, rcu);
-
-	return free_local_storage;
-}
-
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
-{
-	struct bpf_local_storage *local_storage;
-	bool free_local_storage = false;
-
-	if (unlikely(!selem_linked_to_storage(selem)))
-		/* selem has already been unlinked from sk */
-		return;
-
-	local_storage = rcu_dereference(selem->local_storage);
-	raw_spin_lock_bh(&local_storage->lock);
-	if (likely(selem_linked_to_storage(selem)))
-		free_local_storage =
-			bpf_selem_unlink_storage(local_storage, selem, true);
-	raw_spin_unlock_bh(&local_storage->lock);
-
-	if (free_local_storage)
-		kfree_rcu(local_storage, rcu);
-}
-
-void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
-			    struct bpf_local_storage_elem *selem)
-{
-	RCU_INIT_POINTER(selem->local_storage, local_storage);
-	hlist_add_head(&selem->snode, &local_storage->list);
-}
-
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
-{
-	struct bpf_local_storage_map *smap;
-	struct bpf_local_storage_map_bucket *b;
-
-	if (unlikely(!selem_linked_to_map(selem)))
-		/* selem has already be unlinked from smap */
-		return;
-
-	smap = rcu_dereference(SDATA(selem)->smap);
-	b = select_bucket(smap, selem);
-	raw_spin_lock_bh(&b->lock);
-	if (likely(selem_linked_to_map(selem)))
-		hlist_del_init_rcu(&selem->map_node);
-	raw_spin_unlock_bh(&b->lock);
-}
-
-void bpf_selem_link_map(struct bpf_local_storage_map *smap,
-			struct bpf_local_storage_elem *selem)
-{
-	struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
-
-	raw_spin_lock_bh(&b->lock);
-	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
-	hlist_add_head_rcu(&selem->map_node, &b->list);
-	raw_spin_unlock_bh(&b->lock);
-}
-
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
-{
-	/* Always unlink from map before unlinking from local_storage
-	 * because selem will be freed after successfully unlinked from
-	 * the local_storage.
-	 */
-	bpf_selem_unlink_map(selem);
-	__bpf_selem_unlink_storage(selem);
-}
-
-struct bpf_local_storage_data *
-bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
-			 struct bpf_local_storage_map *smap,
-			 bool cacheit_lockit)
-{
-	struct bpf_local_storage_data *sdata;
-	struct bpf_local_storage_elem *selem;
-
-	/* Fast path (cache hit) */
-	sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
-	if (sdata && rcu_access_pointer(sdata->smap) == smap)
-		return sdata;
-
-	/* Slow path (cache miss) */
-	hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
-		if (rcu_access_pointer(SDATA(selem)->smap) == smap)
-			break;
-
-	if (!selem)
-		return NULL;
-
-	sdata = SDATA(selem);
-	if (cacheit_lockit) {
-		/* spinlock is needed to avoid racing with the
-		 * parallel delete.  Otherwise, publishing an already
-		 * deleted sdata to the cache will become a use-after-free
-		 * problem in the next bpf_local_storage_lookup().
-		 */
-		raw_spin_lock_bh(&local_storage->lock);
-		if (selem_linked_to_storage(selem))
-			rcu_assign_pointer(local_storage->cache[smap->cache_idx],
-					   sdata);
-		raw_spin_unlock_bh(&local_storage->lock);
-	}
-
-	return sdata;
-}
-
 static struct bpf_local_storage_data *
 sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
 {
@@ -341,201 +41,6 @@ sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
 	return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit);
 }
 
-static int check_flags(const struct bpf_local_storage_data *old_sdata,
-		       u64 map_flags)
-{
-	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
-		/* elem already exists */
-		return -EEXIST;
-
-	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
-		/* elem doesn't exist, cannot update it */
-		return -ENOENT;
-
-	return 0;
-}
-
-int bpf_local_storage_alloc(void *owner,
-			    struct bpf_local_storage_map *smap,
-			    struct bpf_local_storage_elem *first_selem)
-{
-	struct bpf_local_storage *prev_storage, *storage;
-	struct bpf_local_storage **owner_storage_ptr;
-	int err;
-
-	err = mem_charge(smap, owner, sizeof(*storage));
-	if (err)
-		return err;
-
-	storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN);
-	if (!storage) {
-		err = -ENOMEM;
-		goto uncharge;
-	}
-
-	INIT_HLIST_HEAD(&storage->list);
-	raw_spin_lock_init(&storage->lock);
-	storage->owner = owner;
-
-	bpf_selem_link_storage(storage, first_selem);
-	bpf_selem_link_map(smap, first_selem);
-
-	owner_storage_ptr =
-		(struct bpf_local_storage **)owner_storage(smap, owner);
-	/* Publish storage to the owner.
-	 * Instead of using any lock of the kernel object (i.e. owner),
-	 * cmpxchg will work with any kernel object regardless what
-	 * the running context is, bh, irq...etc.
-	 *
-	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
-	 * is protected by the storage->lock.  Hence, when freeing
-	 * the owner->storage, the storage->lock must be held before
-	 * setting owner->storage ptr to NULL.
-	 */
-	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
-	if (unlikely(prev_storage)) {
-		bpf_selem_unlink_map(first_selem);
-		err = -EAGAIN;
-		goto uncharge;
-
-		/* Note that even first_selem was linked to smap's
-		 * bucket->list, first_selem can be freed immediately
-		 * (instead of kfree_rcu) because
-		 * bpf_local_storage_map_free() does a
-		 * synchronize_rcu() before walking the bucket->list.
-		 * Hence, no one is accessing selem from the
-		 * bucket->list under rcu_read_lock().
-		 */
-	}
-
-	return 0;
-
-uncharge:
-	kfree(storage);
-	mem_uncharge(smap, owner, sizeof(*storage));
-	return err;
-}
-
-/* sk cannot be going away because it is linking new elem
- * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
- * Otherwise, it will become a leak (and other memory issues
- * during map destruction).
- */
-struct bpf_local_storage_data *
-bpf_local_storage_update(void *owner, struct bpf_map *map,
-			 void *value, u64 map_flags)
-{
-	struct bpf_local_storage_data *old_sdata = NULL;
-	struct bpf_local_storage_elem *selem;
-	struct bpf_local_storage *local_storage;
-	struct bpf_local_storage_map *smap;
-	int err;
-
-	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
-	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
-	    /* BPF_F_LOCK can only be used in a value with spin_lock */
-	    unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
-		return ERR_PTR(-EINVAL);
-
-	smap = (struct bpf_local_storage_map *)map;
-	local_storage = rcu_dereference(*owner_storage(smap, owner));
-	if (!local_storage || hlist_empty(&local_storage->list)) {
-		/* Very first elem for the owner */
-		err = check_flags(NULL, map_flags);
-		if (err)
-			return ERR_PTR(err);
-
-		selem = bpf_selem_alloc(smap, owner, value, true);
-		if (!selem)
-			return ERR_PTR(-ENOMEM);
-
-		err = bpf_local_storage_alloc(owner, smap, selem);
-		if (err) {
-			kfree(selem);
-			mem_uncharge(smap, owner, smap->elem_size);
-			return ERR_PTR(err);
-		}
-
-		return SDATA(selem);
-	}
-
-	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
-		/* Hoping to find an old_sdata to do inline update
-		 * such that it can avoid taking the local_storage->lock
-		 * and changing the lists.
-		 */
-		old_sdata = bpf_local_storage_lookup(local_storage, smap,
-						     false);
-		err = check_flags(old_sdata, map_flags);
-		if (err)
-			return ERR_PTR(err);
-		if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
-			copy_map_value_locked(map, old_sdata->data,
-					      value, false);
-			return old_sdata;
-		}
-	}
-
-	raw_spin_lock_bh(&local_storage->lock);
-
-	/* Recheck local_storage->list under local_storage->lock */
-	if (unlikely(hlist_empty(&local_storage->list))) {
-		/* A parallel del is happening and local_storage is going
-		 * away.  It has just been checked before, so very
-		 * unlikely.  Return instead of retry to keep things
-		 * simple.
-		 */
-		err = -EAGAIN;
-		goto unlock_err;
-	}
-
-	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
-	err = check_flags(old_sdata, map_flags);
-	if (err)
-		goto unlock_err;
-
-	if (old_sdata && (map_flags & BPF_F_LOCK)) {
-		copy_map_value_locked(map, old_sdata->data, value, false);
-		selem = SELEM(old_sdata);
-		goto unlock;
-	}
-
-	/* local_storage->lock is held.  Hence, we are sure
-	 * we can unlink and uncharge the old_sdata successfully
-	 * later.  Hence, instead of charging the new selem now
-	 * and then uncharge the old selem later (which may cause
-	 * a potential but unnecessary charge failure),  avoid taking
-	 * a charge at all here (the "!old_sdata" check) and the
-	 * old_sdata will not be uncharged later during
-	 * bpf_selem_unlink_storage().
-	 */
-	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
-	if (!selem) {
-		err = -ENOMEM;
-		goto unlock_err;
-	}
-
-	/* First, link the new selem to the map */
-	bpf_selem_link_map(smap, selem);
-
-	/* Second, link (and publish) the new selem to local_storage */
-	bpf_selem_link_storage(local_storage, selem);
-
-	/* Third, remove old selem, SELEM(old_sdata) */
-	if (old_sdata) {
-		bpf_selem_unlink_map(SELEM(old_sdata));
-		bpf_selem_unlink_storage(local_storage, SELEM(old_sdata), false);
-	}
-
-unlock:
-	raw_spin_unlock_bh(&local_storage->lock);
-	return SDATA(selem);
-
-unlock_err:
-	raw_spin_unlock_bh(&local_storage->lock);
-	return ERR_PTR(err);
-}
-
 static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
 {
 	struct bpf_local_storage_data *sdata;
@@ -549,38 +54,6 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
 	return 0;
 }
 
-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
-{
-	u64 min_usage = U64_MAX;
-	u16 i, res = 0;
-
-	spin_lock(&cache->idx_lock);
-
-	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
-		if (cache->idx_usage_counts[i] < min_usage) {
-			min_usage = cache->idx_usage_counts[i];
-			res = i;
-
-			/* Found a free cache_idx */
-			if (!min_usage)
-				break;
-		}
-	}
-	cache->idx_usage_counts[res]++;
-
-	spin_unlock(&cache->idx_lock);
-
-	return res;
-}
-
-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
-				      u16 idx)
-{
-	spin_lock(&cache->idx_lock);
-	cache->idx_usage_counts[idx]--;
-	spin_unlock(&cache->idx_lock);
-}
-
 /* Called by __sk_destruct() & bpf_sk_storage_clone() */
 void bpf_sk_storage_free(struct sock *sk)
 {
@@ -621,60 +94,6 @@ void bpf_sk_storage_free(struct sock *sk)
 		kfree_rcu(sk_storage, rcu);
 }
 
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
-{
-	struct bpf_local_storage_elem *selem;
-	struct bpf_local_storage_map_bucket *b;
-	unsigned int i;
-
-	/* Note that this map might be concurrently cloned from
-	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
-	 * RCU read section to finish before proceeding. New RCU
-	 * read sections should be prevented via bpf_map_inc_not_zero.
-	 */
-	synchronize_rcu();
-
-	/* bpf prog and the userspace can no longer access this map
-	 * now.  No new selem (of this map) can be added
-	 * to the bpf_local_storage or to the map bucket's list.
-	 *
-	 * The elem of this map can be cleaned up here
-	 * or
-	 * by bpf_local_storage_free() during the destruction of the
-	 * owner object. eg. __sk_destruct.
-	 */
-	for (i = 0; i < (1U << smap->bucket_log); i++) {
-		b = &smap->buckets[i];
-
-		rcu_read_lock();
-		/* No one is adding to b->list now */
-		while ((selem = hlist_entry_safe(
-				rcu_dereference_raw(hlist_first_rcu(&b->list)),
-				struct bpf_local_storage_elem, map_node))) {
-			bpf_selem_unlink(selem);
-			cond_resched_rcu();
-		}
-		rcu_read_unlock();
-	}
-
-	/* bpf_local_storage_free() may still need to access the map.
-	 * e.g. bpf_local_storage_free() has unlinked selem from the map
-	 * which then made the above while((selem = ...)) loop
-	 * exited immediately.
-	 *
-	 * However, the bpf_local_storage_free() still needs to access
-	 * the smap->elem_size to do the uncharging in
-	 * bpf_selem_unlink_storage().
-	 *
-	 * Hence, wait another rcu grace period for the
-	 * bpf_local_storage_free() to finish.
-	 */
-	synchronize_rcu();
-
-	kvfree(smap->buckets);
-	kfree(smap);
-}
-
 static void sk_storage_map_free(struct bpf_map *map)
 {
 	struct bpf_local_storage_map *smap;
@@ -684,78 +103,6 @@ static void sk_storage_map_free(struct bpf_map *map)
 	bpf_local_storage_map_free(smap);
 }
 
-/* U16_MAX is much more than enough for sk local storage
- * considering a tcp_sock is ~2k.
- */
-#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE				\
-	min_t(u32,							\
-	      (KMALLOC_MAX_SIZE - MAX_BPF_STACK -			\
-	       sizeof(struct bpf_local_storage_elem)),			\
-	      (U16_MAX - sizeof(struct bpf_local_storage_elem)))
-
-int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
-{
-	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
-	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
-	    attr->max_entries ||
-	    attr->key_size != sizeof(int) || !attr->value_size ||
-	    /* Enforce BTF for userspace sk dumping */
-	    !attr->btf_key_type_id || !attr->btf_value_type_id)
-		return -EINVAL;
-
-	if (!bpf_capable())
-		return -EPERM;
-
-	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
-		return -E2BIG;
-
-	return 0;
-}
-
-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
-{
-	struct bpf_local_storage_map *smap;
-	unsigned int i;
-	u32 nbuckets;
-	u64 cost;
-	int ret;
-
-	smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
-	if (!smap)
-		return ERR_PTR(-ENOMEM);
-	bpf_map_init_from_attr(&smap->map, attr);
-
-	nbuckets = roundup_pow_of_two(num_possible_cpus());
-	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
-	nbuckets = max_t(u32, 2, nbuckets);
-	smap->bucket_log = ilog2(nbuckets);
-	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
-
-	ret = bpf_map_charge_init(&smap->map.memory, cost);
-	if (ret < 0) {
-		kfree(smap);
-		return ERR_PTR(ret);
-	}
-
-	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
-				 GFP_USER | __GFP_NOWARN);
-	if (!smap->buckets) {
-		bpf_map_charge_finish(&smap->map.memory);
-		kfree(smap);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	for (i = 0; i < nbuckets; i++) {
-		INIT_HLIST_HEAD(&smap->buckets[i].list);
-		raw_spin_lock_init(&smap->buckets[i].lock);
-	}
-
-	smap->elem_size =
-		sizeof(struct bpf_local_storage_elem) + attr->value_size;
-
-	return smap;
-}
-
 static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_local_storage_map *smap;
@@ -774,23 +121,6 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
 	return -ENOTSUPP;
 }
 
-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
-				    const struct btf *btf,
-				    const struct btf_type *key_type,
-				    const struct btf_type *value_type)
-{
-	u32 int_data;
-
-	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
-		return -EINVAL;
-
-	int_data = *(u32 *)(key_type + 1);
-	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
-		return -EINVAL;
-
-	return 0;
-}
-
 static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_local_storage_data *sdata;
-- 
2.28.0.rc0.142.g3c755180ce-goog


^ permalink raw reply related

* [PATCH bpf-next v7 5/7] bpf: Implement bpf_local_storage for inodes
From: KP Singh @ 2020-07-30 14:07 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
	Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200730140716.404558-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

Similar to bpf_local_storage for sockets, add local storage for inodes.
The life-cycle of storage is managed with the life-cycle of the inode.
i.e. the storage is destroyed along with the owning inode.

The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
security blob which are now stackable and can co-exist with other LSMs.

Signed-off-by: KP Singh <kpsingh@google.com>
---
 include/linux/bpf_local_storage.h             |  10 +
 include/linux/bpf_lsm.h                       |  21 ++
 include/linux/bpf_types.h                     |   3 +
 include/uapi/linux/bpf.h                      |  38 +++
 kernel/bpf/Makefile                           |   1 +
 kernel/bpf/bpf_inode_storage.c                | 254 ++++++++++++++++++
 kernel/bpf/syscall.c                          |   3 +-
 kernel/bpf/verifier.c                         |  10 +
 security/bpf/hooks.c                          |   7 +
 .../bpf/bpftool/Documentation/bpftool-map.rst |   2 +-
 tools/bpf/bpftool/bash-completion/bpftool     |   3 +-
 tools/bpf/bpftool/map.c                       |   3 +-
 tools/include/uapi/linux/bpf.h                |  38 +++
 tools/lib/bpf/libbpf_probes.c                 |   5 +-
 14 files changed, 392 insertions(+), 6 deletions(-)
 create mode 100644 kernel/bpf/bpf_inode_storage.c

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 3685f681a7fc..75c76847fad5 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -160,4 +160,14 @@ struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_map *map, void *value,
 			 u64 map_flags);
 
+#ifdef CONFIG_BPF_LSM
+extern const struct bpf_func_proto bpf_inode_storage_get_proto;
+extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
+void bpf_inode_storage_free(struct inode *inode);
+#else
+static inline void bpf_inode_storage_free(struct inode *inode)
+{
+}
+#endif /* CONFIG_BPF_LSM */
+
 #endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index af74712af585..d0683ada1e49 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -17,9 +17,24 @@
 #include <linux/lsm_hook_defs.h>
 #undef LSM_HOOK
 
+struct bpf_storage_blob {
+	struct bpf_local_storage __rcu *storage;
+};
+
+extern struct lsm_blob_sizes bpf_lsm_blob_sizes;
+
 int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
 			const struct bpf_prog *prog);
 
+static inline struct bpf_storage_blob *bpf_inode(
+	const struct inode *inode)
+{
+	if (unlikely(!inode->i_security))
+		return NULL;
+
+	return inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
+}
+
 #else /* !CONFIG_BPF_LSM */
 
 static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
@@ -28,6 +43,12 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
 	return -EOPNOTSUPP;
 }
 
+static inline struct bpf_storage_blob *bpf_inode(
+	const struct inode *inode)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a52a5688418e..2e6f568377f1 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -107,6 +107,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 #endif
+#ifdef CONFIG_BPF_LSM
+BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
+#endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cd5a47a74533..d3617f1ce3fc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -148,6 +148,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP_HASH,
 	BPF_MAP_TYPE_STRUCT_OPS,
 	BPF_MAP_TYPE_RINGBUF,
+	BPF_MAP_TYPE_INODE_STORAGE,
 };
 
 /* Note that tracing related programs such as
@@ -3389,6 +3390,41 @@ union bpf_attr {
  *		A non-negative value equal to or less than *size* on success,
  *		or a negative error in case of failure.
  *
+ * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags)
+ *	Description
+ *		Get a bpf_local_storage from an *inode*.
+ *
+ *		Logically, it could be thought of as getting the value from
+ *		a *map* with *inode* as the **key**.  From this
+ *		perspective,  the usage is not much different from
+ *		**bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this
+ *		helper enforces the key must be an inode and the map must also
+ *		be a **BPF_MAP_TYPE_INODE_STORAGE**.
+ *
+ *		Underneath, the value is stored locally at *inode* instead of
+ *		the *map*.  The *map* is used as the bpf-local-storage
+ *		"type". The bpf-local-storage "type" (i.e. the *map*) is
+ *		searched against all bpf_local_storage residing at *inode*.
+ *
+ *		An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
+ *		used such that a new bpf_local_storage will be
+ *		created if one does not exist.  *value* can be used
+ *		together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
+ *		the initial value of a bpf_local_storage.  If *value* is
+ *		**NULL**, the new bpf_local_storage will be zero initialized.
+ *	Return
+ *		A bpf_local_storage pointer is returned on success.
+ *
+ *		**NULL** if not found or there was an error in adding
+ *		a new bpf_local_storage.
+ *
+ * int bpf_inode_storage_delete(struct bpf_map *map, void *inode)
+ *	Description
+ *		Delete a bpf_local_storage from an *inode*.
+ *	Return
+ *		0 on success.
+ *
+ *		**-ENOENT** if the bpf_local_storage cannot be found.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3533,6 +3569,8 @@ union bpf_attr {
 	FN(skc_to_tcp_request_sock),	\
 	FN(skc_to_udp6_sock),		\
 	FN(get_task_stack),		\
+	FN(inode_storage_get),		\
+	FN(inode_storage_delete),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a9a147e18600..25f40588dfbf 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,6 +5,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init)
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
new file mode 100644
index 000000000000..16c133f36d7e
--- /dev/null
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
+#include <linux/fdtable.h>
+
+DEFINE_BPF_STORAGE_CACHE(inode_cache);
+
+static struct bpf_local_storage __rcu **
+inode_storage_ptr(void *owner)
+{
+	struct inode *inode = owner;
+	struct bpf_storage_blob *bsb;
+	bsb = bpf_inode(inode);
+	if (!bsb)
+		return NULL;
+	return &bsb->storage;
+}
+
+static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
+							   struct bpf_map *map,
+							   bool cacheit_lockit)
+{
+	struct bpf_local_storage *inode_storage;
+	struct bpf_local_storage_map *smap;
+	struct bpf_storage_blob *bsb;
+
+	bsb = bpf_inode(inode);
+	if (!bsb)
+		return ERR_PTR(-ENOENT);
+
+	inode_storage = rcu_dereference(bsb->storage);
+	if (!inode_storage)
+		return NULL;
+
+	smap = (struct bpf_local_storage_map *)map;
+	return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit);
+}
+
+void bpf_inode_storage_free(struct inode *inode)
+{
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage *local_storage;
+	bool free_inode_storage = false;
+	struct bpf_storage_blob *bsb;
+	struct hlist_node *n;
+
+	bsb = bpf_inode(inode);
+	if (!bsb)
+		return;
+
+	rcu_read_lock();
+
+	local_storage = rcu_dereference(bsb->storage);
+	if (!local_storage) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* Netiher the bpf_prog nor the bpf-map's syscall
+	 * could be modifying the local_storage->list now.
+	 * Thus, no elem can be added-to or deleted-from the
+	 * local_storage->list by the bpf_prog or by the bpf-map's syscall.
+	 *
+	 * It is racing with bpf_local_storage_map_free() alone
+	 * when unlinking elem from the local_storage->list and
+	 * the map's bucket->list.
+	 */
+	raw_spin_lock_bh(&local_storage->lock);
+	hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+		/* Always unlink from map before unlinking from
+		 * local_storage.
+		 */
+		bpf_selem_unlink_map(selem);
+		free_inode_storage =
+			bpf_selem_unlink_storage(local_storage, selem, false);
+	}
+	raw_spin_unlock_bh(&local_storage->lock);
+	rcu_read_unlock();
+
+	/* free_inoode_storage should always be true as long as
+	 * local_storage->list was non-empty.
+	 */
+	if (free_inode_storage)
+		kfree_rcu(local_storage, rcu);
+}
+
+
+static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_local_storage_data *sdata;
+	struct file *f;
+	int fd;
+
+	fd = *(int *)key;
+	f = fcheck(fd);
+	if (!f)
+		return ERR_PTR(-EINVAL);
+
+	sdata = inode_storage_lookup(f->f_inode, map, true);
+	return sdata ? sdata->data : NULL;
+}
+
+static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+					 void *value, u64 map_flags)
+{
+	struct bpf_local_storage_data *sdata;
+	struct file *f;
+	int fd;
+
+	fd = *(int *)key;
+	f = fcheck(fd);
+	if (!f)
+		return -EINVAL;
+
+	sdata = bpf_local_storage_update(f->f_inode, map, value, map_flags);
+	return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
+{
+	struct bpf_local_storage_data *sdata;
+
+	sdata = inode_storage_lookup(inode, map, false);
+	if (!sdata)
+		return -ENOENT;
+
+	bpf_selem_unlink(SELEM(sdata));
+
+	return 0;
+}
+
+static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+{
+	struct file *f;
+	int fd;
+
+	fd = *(int *)key;
+	f = fcheck(fd);
+	if (!f)
+		return -EINVAL;
+
+	return inode_storage_delete(f->f_inode, map);
+}
+
+BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+	   void *, value, u64, flags)
+{
+	struct bpf_local_storage_data *sdata;
+
+	if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+		return (unsigned long)NULL;
+
+	sdata = inode_storage_lookup(inode, map, true);
+	if (sdata)
+		return (unsigned long)sdata->data;
+
+	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+		sdata = bpf_local_storage_update(inode, map, value,
+						 BPF_NOEXIST);
+		return IS_ERR(sdata) ? (unsigned long)NULL :
+					     (unsigned long)sdata->data;
+	}
+
+	return (unsigned long)NULL;
+}
+
+BPF_CALL_2(bpf_inode_storage_delete,
+	   struct bpf_map *, map, struct inode *, inode)
+{
+	return inode_storage_delete(inode, map);
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key,
+				void *next_key)
+{
+	return -ENOTSUPP;
+}
+
+static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_local_storage_map *smap;
+
+	smap = bpf_local_storage_map_alloc(attr);
+	if (IS_ERR(smap))
+		return ERR_CAST(smap);
+
+	smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
+	return &smap->map;
+}
+
+static void inode_storage_map_free(struct bpf_map *map)
+{
+	struct bpf_local_storage_map *smap;
+
+	smap = (struct bpf_local_storage_map *)map;
+	bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
+	bpf_local_storage_map_free(smap);
+}
+
+static int sk_storage_map_btf_id;
+const struct bpf_map_ops inode_storage_map_ops = {
+	.map_alloc_check = bpf_local_storage_map_alloc_check,
+	.map_alloc = inode_storage_map_alloc,
+	.map_free = inode_storage_map_free,
+	.map_get_next_key = notsupp_get_next_key,
+	.map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
+	.map_update_elem = bpf_fd_inode_storage_update_elem,
+	.map_delete_elem = bpf_fd_inode_storage_delete_elem,
+	.map_check_btf = bpf_local_storage_map_check_btf,
+	.map_btf_name = "bpf_local_storage_map",
+	.map_btf_id = &sk_storage_map_btf_id,
+	.map_owner_storage_ptr = inode_storage_ptr,
+};
+
+BTF_ID_LIST(bpf_inode_storage_get_btf_ids)
+BTF_ID(struct, inode)
+
+const struct bpf_func_proto bpf_inode_storage_get_proto = {
+	.func		= bpf_inode_storage_get,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg4_type	= ARG_ANYTHING,
+	.btf_id		= bpf_inode_storage_get_btf_ids,
+};
+
+BTF_ID_LIST(bpf_inode_storage_delete_btf_ids)
+BTF_ID(struct, inode)
+
+const struct bpf_func_proto bpf_inode_storage_delete_proto = {
+	.func		= bpf_inode_storage_delete,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_BTF_ID,
+	.btf_id		= bpf_inode_storage_delete_btf_ids,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd3d599e9e90..78795cd50b45 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -768,7 +768,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 		if (map->map_type != BPF_MAP_TYPE_HASH &&
 		    map->map_type != BPF_MAP_TYPE_ARRAY &&
 		    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
-		    map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+		    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+		    map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
 			return -ENOTSUPP;
 		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
 		    map->value_size) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b6ccfce3bf4c..c48378afbd95 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4242,6 +4242,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_sk_storage_delete)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_INODE_STORAGE:
+		if (func_id != BPF_FUNC_inode_storage_get &&
+		    func_id != BPF_FUNC_inode_storage_delete)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -4315,6 +4320,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
 			goto error;
 		break;
+	case BPF_FUNC_inode_storage_get:
+	case BPF_FUNC_inode_storage_delete:
+		if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
+			goto error;
+		break;
 	default:
 		break;
 	}
diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c
index 32d32d485451..35f9b19259e5 100644
--- a/security/bpf/hooks.c
+++ b/security/bpf/hooks.c
@@ -3,6 +3,7 @@
 /*
  * Copyright (C) 2020 Google LLC.
  */
+#include <linux/bpf_local_storage.h>
 #include <linux/lsm_hooks.h>
 #include <linux/bpf_lsm.h>
 
@@ -11,6 +12,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(NAME, bpf_lsm_##NAME),
 	#include <linux/lsm_hook_defs.h>
 	#undef LSM_HOOK
+	LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free),
 };
 
 static int __init bpf_lsm_init(void)
@@ -20,7 +22,12 @@ static int __init bpf_lsm_init(void)
 	return 0;
 }
 
+struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = {
+	.lbs_inode = sizeof(struct bpf_storage_blob),
+};
+
 DEFINE_LSM(bpf) = {
 	.name = "bpf",
 	.init = bpf_lsm_init,
+	.blobs = &bpf_lsm_blob_sizes
 };
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 41e2a74252d0..083db6c2fc67 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -49,7 +49,7 @@ MAP COMMANDS
 |		| **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
 |		| **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
 |		| **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
-|		| **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** }
+|		| **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** }
 
 DESCRIPTION
 ===========
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 257fa310ea2b..45a93ed5ed89 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -704,7 +704,8 @@ _bpftool()
                                 lru_percpu_hash lpm_trie array_of_maps \
                                 hash_of_maps devmap devmap_hash sockmap cpumap \
                                 xskmap sockhash cgroup_storage reuseport_sockarray \
-                                percpu_cgroup_storage queue stack' -- \
+                                percpu_cgroup_storage queue stack sk_storage \
+                                struct_ops inode_storage' -- \
                                                    "$cur" ) )
                             return 0
                             ;;
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 3a27d31a1856..bc0071228f88 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -50,6 +50,7 @@ const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_SK_STORAGE]		= "sk_storage",
 	[BPF_MAP_TYPE_STRUCT_OPS]		= "struct_ops",
 	[BPF_MAP_TYPE_RINGBUF]			= "ringbuf",
+	[BPF_MAP_TYPE_INODE_STORAGE]		= "inode_storage",
 };
 
 const size_t map_type_name_size = ARRAY_SIZE(map_type_name);
@@ -1442,7 +1443,7 @@ static int do_help(int argc, char **argv)
 		"                 lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n"
 		"                 devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
 		"                 cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
-		"                 queue | stack | sk_storage | struct_ops | ringbuf }\n"
+		"                 queue | stack | sk_storage | struct_ops | ringbuf | inode_storage }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, argv[-2]);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cd5a47a74533..d3617f1ce3fc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -148,6 +148,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP_HASH,
 	BPF_MAP_TYPE_STRUCT_OPS,
 	BPF_MAP_TYPE_RINGBUF,
+	BPF_MAP_TYPE_INODE_STORAGE,
 };
 
 /* Note that tracing related programs such as
@@ -3389,6 +3390,41 @@ union bpf_attr {
  *		A non-negative value equal to or less than *size* on success,
  *		or a negative error in case of failure.
  *
+ * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags)
+ *	Description
+ *		Get a bpf_local_storage from an *inode*.
+ *
+ *		Logically, it could be thought of as getting the value from
+ *		a *map* with *inode* as the **key**.  From this
+ *		perspective,  the usage is not much different from
+ *		**bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this
+ *		helper enforces the key must be an inode and the map must also
+ *		be a **BPF_MAP_TYPE_INODE_STORAGE**.
+ *
+ *		Underneath, the value is stored locally at *inode* instead of
+ *		the *map*.  The *map* is used as the bpf-local-storage
+ *		"type". The bpf-local-storage "type" (i.e. the *map*) is
+ *		searched against all bpf_local_storage residing at *inode*.
+ *
+ *		An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
+ *		used such that a new bpf_local_storage will be
+ *		created if one does not exist.  *value* can be used
+ *		together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
+ *		the initial value of a bpf_local_storage.  If *value* is
+ *		**NULL**, the new bpf_local_storage will be zero initialized.
+ *	Return
+ *		A bpf_local_storage pointer is returned on success.
+ *
+ *		**NULL** if not found or there was an error in adding
+ *		a new bpf_local_storage.
+ *
+ * int bpf_inode_storage_delete(struct bpf_map *map, void *inode)
+ *	Description
+ *		Delete a bpf_local_storage from an *inode*.
+ *	Return
+ *		0 on success.
+ *
+ *		**-ENOENT** if the bpf_local_storage cannot be found.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3533,6 +3569,8 @@ union bpf_attr {
 	FN(skc_to_tcp_request_sock),	\
 	FN(skc_to_udp6_sock),		\
 	FN(get_task_stack),		\
+	FN(inode_storage_get),		\
+	FN(inode_storage_delete),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 5a3d3f078408..daaad635d0ed 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -173,7 +173,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 	return btf_fd;
 }
 
-static int load_sk_storage_btf(void)
+static int load_local_storage_btf(void)
 {
 	const char strs[] = "\0bpf_spin_lock\0val\0cnt\0l";
 	/* struct bpf_spin_lock {
@@ -232,12 +232,13 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 		key_size	= 0;
 		break;
 	case BPF_MAP_TYPE_SK_STORAGE:
+	case BPF_MAP_TYPE_INODE_STORAGE:
 		btf_key_type_id = 1;
 		btf_value_type_id = 3;
 		value_size = 8;
 		max_entries = 0;
 		map_flags = BPF_F_NO_PREALLOC;
-		btf_fd = load_sk_storage_btf();
+		btf_fd = load_local_storage_btf();
 		if (btf_fd < 0)
 			return false;
 		break;
-- 
2.28.0.rc0.142.g3c755180ce-goog


^ permalink raw reply related

* [PATCH bpf-next v7 1/7] A purely mechanical change to split the renaming from the actual generalization.
From: KP Singh @ 2020-07-30 14:07 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
	Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200730140716.404558-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

Flags/consts:

  SK_STORAGE_CREATE_FLAG_MASK	BPF_LOCAL_STORAGE_CREATE_FLAG_MASK
  BPF_SK_STORAGE_CACHE_SIZE	BPF_LOCAL_STORAGE_CACHE_SIZE
  MAX_VALUE_SIZE		BPF_LOCAL_STORAGE_MAX_VALUE_SIZE

Structs:

  bucket			bpf_local_storage_map_bucket
  bpf_sk_storage_map		bpf_local_storage_map
  bpf_sk_storage_data		bpf_local_storage_data
  bpf_sk_storage_elem		bpf_local_storage_elem
  bpf_sk_storage		bpf_local_storage

The "sk" member in bpf_local_storage is also updated to "owner"
in preparation for changing the type to void * in a subsequent patch.

Functions:

  selem_linked_to_sk			selem_linked_to_storage
  selem_alloc				bpf_selem_alloc
  __selem_unlink_sk			bpf_selem_unlink_storage
  __selem_link_sk			bpf_selem_link_storage
  selem_unlink_sk			__bpf_selem_unlink_storage
  sk_storage_update			bpf_local_storage_update
  __sk_storage_lookup			bpf_local_storage_lookup
  bpf_sk_storage_map_free		bpf_local_storage_map_free
  bpf_sk_storage_map_alloc		bpf_local_storage_map_alloc
  bpf_sk_storage_map_alloc_check	bpf_local_storage_map_alloc_check
  bpf_sk_storage_map_check_btf		bpf_local_storage_map_check_btf

Signed-off-by: KP Singh <kpsingh@google.com>
---
 include/net/sock.h        |   4 +-
 net/core/bpf_sk_storage.c | 455 +++++++++++++++++++-------------------
 2 files changed, 233 insertions(+), 226 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 2cc3ba667908..685aee71b91a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -246,7 +246,7 @@ struct sock_common {
 	/* public: */
 };
 
-struct bpf_sk_storage;
+struct bpf_local_storage;
 
 /**
   *	struct sock - network layer representation of sockets
@@ -517,7 +517,7 @@ struct sock {
 	void                    (*sk_destruct)(struct sock *sk);
 	struct sock_reuseport __rcu	*sk_reuseport_cb;
 #ifdef CONFIG_BPF_SYSCALL
-	struct bpf_sk_storage __rcu	*sk_bpf_storage;
+	struct bpf_local_storage __rcu	*sk_bpf_storage;
 #endif
 	struct rcu_head		sk_rcu;
 };
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d3377c90a291..a5cc218834ee 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -12,33 +12,32 @@
 #include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
 
-#define SK_STORAGE_CREATE_FLAG_MASK					\
-	(BPF_F_NO_PREALLOC | BPF_F_CLONE)
+#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
 
-struct bucket {
+struct bpf_local_storage_map_bucket {
 	struct hlist_head list;
 	raw_spinlock_t lock;
 };
 
-/* Thp map is not the primary owner of a bpf_sk_storage_elem.
- * Instead, the sk->sk_bpf_storage is.
+/* Thp map is not the primary owner of a bpf_local_storage_elem.
+ * Instead, the container object (eg. sk->sk_bpf_storage) is.
  *
- * The map (bpf_sk_storage_map) is for two purposes
- * 1. Define the size of the "sk local storage".  It is
+ * The map (bpf_local_storage_map) is for two purposes
+ * 1. Define the size of the "local storage".  It is
  *    the map's value_size.
  *
  * 2. Maintain a list to keep track of all elems such
  *    that they can be cleaned up during the map destruction.
  *
  * When a bpf local storage is being looked up for a
- * particular sk,  the "bpf_map" pointer is actually used
+ * particular object,  the "bpf_map" pointer is actually used
  * as the "key" to search in the list of elem in
- * sk->sk_bpf_storage.
+ * the respective bpf_local_storage owned by the object.
  *
- * Hence, consider sk->sk_bpf_storage is the mini-map
- * with the "bpf_map" pointer as the searching key.
+ * e.g. sk->sk_bpf_storage is the mini-map with the "bpf_map" pointer
+ * as the searching key.
  */
-struct bpf_sk_storage_map {
+struct bpf_local_storage_map {
 	struct bpf_map map;
 	/* Lookup elem does not require accessing the map.
 	 *
@@ -46,55 +45,57 @@ struct bpf_sk_storage_map {
 	 * link/unlink the elem from the map.  Having
 	 * multiple buckets to improve contention.
 	 */
-	struct bucket *buckets;
+	struct bpf_local_storage_map_bucket *buckets;
 	u32 bucket_log;
 	u16 elem_size;
 	u16 cache_idx;
 };
 
-struct bpf_sk_storage_data {
+struct bpf_local_storage_data {
 	/* smap is used as the searching key when looking up
-	 * from sk->sk_bpf_storage.
+	 * from the object's bpf_local_storage.
 	 *
 	 * Put it in the same cacheline as the data to minimize
 	 * the number of cachelines access during the cache hit case.
 	 */
-	struct bpf_sk_storage_map __rcu *smap;
+	struct bpf_local_storage_map __rcu *smap;
 	u8 data[] __aligned(8);
 };
 
-/* Linked to bpf_sk_storage and bpf_sk_storage_map */
-struct bpf_sk_storage_elem {
-	struct hlist_node map_node;	/* Linked to bpf_sk_storage_map */
-	struct hlist_node snode;	/* Linked to bpf_sk_storage */
-	struct bpf_sk_storage __rcu *sk_storage;
+/* Linked to bpf_local_storage and bpf_local_storage_map */
+struct bpf_local_storage_elem {
+	struct hlist_node map_node;	/* Linked to bpf_local_storage_map */
+	struct hlist_node snode;	/* Linked to bpf_local_storage */
+	struct bpf_local_storage __rcu *local_storage;
 	struct rcu_head rcu;
 	/* 8 bytes hole */
 	/* The data is stored in aother cacheline to minimize
 	 * the number of cachelines access during a cache hit.
 	 */
-	struct bpf_sk_storage_data sdata ____cacheline_aligned;
+	struct bpf_local_storage_data sdata ____cacheline_aligned;
 };
 
-#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata)
+#define SELEM(_SDATA)							\
+	container_of((_SDATA), struct bpf_local_storage_elem, sdata)
 #define SDATA(_SELEM) (&(_SELEM)->sdata)
-#define BPF_SK_STORAGE_CACHE_SIZE	16
+#define BPF_LOCAL_STORAGE_CACHE_SIZE	16
 
 static DEFINE_SPINLOCK(cache_idx_lock);
-static u64 cache_idx_usage_counts[BPF_SK_STORAGE_CACHE_SIZE];
+static u64 cache_idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE];
 
-struct bpf_sk_storage {
-	struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE];
-	struct hlist_head list;	/* List of bpf_sk_storage_elem */
-	struct sock *sk;	/* The sk that owns the the above "list" of
-				 * bpf_sk_storage_elem.
+struct bpf_local_storage {
+	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
+	struct hlist_head list; /* List of bpf_local_storage_elem */
+	struct sock *owner;	/* The object that owns the the above "list" of
+				 * bpf_local_storage_elem.
 				 */
 	struct rcu_head rcu;
 	raw_spinlock_t lock;	/* Protect adding/removing from the "list" */
 };
 
-static struct bucket *select_bucket(struct bpf_sk_storage_map *smap,
-				    struct bpf_sk_storage_elem *selem)
+static struct bpf_local_storage_map_bucket *
+select_bucket(struct bpf_local_storage_map *smap,
+	      struct bpf_local_storage_elem *selem)
 {
 	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
 }
@@ -111,21 +112,21 @@ static int omem_charge(struct sock *sk, unsigned int size)
 	return -ENOMEM;
 }
 
-static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem)
+static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
 {
 	return !hlist_unhashed(&selem->snode);
 }
 
-static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem)
+static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
 {
 	return !hlist_unhashed(&selem->map_node);
 }
 
-static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
-					       struct sock *sk, void *value,
-					       bool charge_omem)
+static struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk,
+		void *value, bool charge_omem)
 {
-	struct bpf_sk_storage_elem *selem;
+	struct bpf_local_storage_elem *selem;
 
 	if (charge_omem && omem_charge(sk, smap->elem_size))
 		return NULL;
@@ -147,85 +148,86 @@ static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
  * The caller must ensure selem->smap is still valid to be
  * dereferenced for its smap->elem_size and smap->cache_idx.
  */
-static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage,
-			      struct bpf_sk_storage_elem *selem,
-			      bool uncharge_omem)
+static bool bpf_selem_unlink_storage(struct bpf_local_storage *local_storage,
+				     struct bpf_local_storage_elem *selem,
+				     bool uncharge_omem)
 {
-	struct bpf_sk_storage_map *smap;
-	bool free_sk_storage;
+	struct bpf_local_storage_map *smap;
+	bool free_local_storage;
 	struct sock *sk;
 
 	smap = rcu_dereference(SDATA(selem)->smap);
-	sk = sk_storage->sk;
+	sk = local_storage->owner;
 
 	/* All uncharging on sk->sk_omem_alloc must be done first.
-	 * sk may be freed once the last selem is unlinked from sk_storage.
+	 * sk may be freed once the last selem is unlinked from local_storage.
 	 */
 	if (uncharge_omem)
 		atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
 
-	free_sk_storage = hlist_is_singular_node(&selem->snode,
-						 &sk_storage->list);
-	if (free_sk_storage) {
-		atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc);
-		sk_storage->sk = NULL;
+	free_local_storage = hlist_is_singular_node(&selem->snode,
+						    &local_storage->list);
+	if (free_local_storage) {
+		atomic_sub(sizeof(struct bpf_local_storage), &sk->sk_omem_alloc);
+		local_storage->owner = NULL;
 		/* After this RCU_INIT, sk may be freed and cannot be used */
 		RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
 
-		/* sk_storage is not freed now.  sk_storage->lock is
-		 * still held and raw_spin_unlock_bh(&sk_storage->lock)
+		/* local_storage is not freed now.  local_storage->lock is
+		 * still held and raw_spin_unlock_bh(&local_storage->lock)
 		 * will be done by the caller.
 		 *
 		 * Although the unlock will be done under
 		 * rcu_read_lock(),  it is more intutivie to
-		 * read if kfree_rcu(sk_storage, rcu) is done
-		 * after the raw_spin_unlock_bh(&sk_storage->lock).
+		 * read if kfree_rcu(local_storage, rcu) is done
+		 * after the raw_spin_unlock_bh(&local_storage->lock).
 		 *
-		 * Hence, a "bool free_sk_storage" is returned
+		 * Hence, a "bool free_local_storage" is returned
 		 * to the caller which then calls the kfree_rcu()
 		 * after unlock.
 		 */
 	}
 	hlist_del_init_rcu(&selem->snode);
-	if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
+	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
 	    SDATA(selem))
-		RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);
+		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
 	kfree_rcu(selem, rcu);
 
-	return free_sk_storage;
+	return free_local_storage;
 }
 
-static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
+static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
 {
-	struct bpf_sk_storage *sk_storage;
-	bool free_sk_storage = false;
+	struct bpf_local_storage *local_storage;
+	bool free_local_storage = false;
 
-	if (unlikely(!selem_linked_to_sk(selem)))
+	if (unlikely(!selem_linked_to_storage(selem)))
 		/* selem has already been unlinked from sk */
 		return;
 
-	sk_storage = rcu_dereference(selem->sk_storage);
-	raw_spin_lock_bh(&sk_storage->lock);
-	if (likely(selem_linked_to_sk(selem)))
-		free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
-	raw_spin_unlock_bh(&sk_storage->lock);
+	local_storage = rcu_dereference(selem->local_storage);
+	raw_spin_lock_bh(&local_storage->lock);
+	if (likely(selem_linked_to_storage(selem)))
+		free_local_storage =
+			bpf_selem_unlink_storage(local_storage, selem, true);
+	raw_spin_unlock_bh(&local_storage->lock);
 
-	if (free_sk_storage)
-		kfree_rcu(sk_storage, rcu);
+	if (free_local_storage)
+		kfree_rcu(local_storage, rcu);
 }
 
-static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
-			    struct bpf_sk_storage_elem *selem)
+static void bpf_selem_link_storage(struct bpf_local_storage *local_storage,
+				   struct bpf_local_storage_elem *selem)
 {
-	RCU_INIT_POINTER(selem->sk_storage, sk_storage);
-	hlist_add_head(&selem->snode, &sk_storage->list);
+	RCU_INIT_POINTER(selem->local_storage, local_storage);
+	hlist_add_head(&selem->snode, &local_storage->list);
 }
 
-static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 {
-	struct bpf_sk_storage_map *smap;
-	struct bucket *b;
+	struct bpf_local_storage_map *smap;
+	struct bpf_local_storage_map_bucket *b;
 
 	if (unlikely(!selem_linked_to_map(selem)))
 		/* selem has already be unlinked from smap */
@@ -239,10 +241,10 @@ static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
 	raw_spin_unlock_bh(&b->lock);
 }
 
-static void selem_link_map(struct bpf_sk_storage_map *smap,
-			   struct bpf_sk_storage_elem *selem)
+static void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+			       struct bpf_local_storage_elem *selem)
 {
-	struct bucket *b = select_bucket(smap, selem);
+	struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
 
 	raw_spin_lock_bh(&b->lock);
 	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
@@ -250,31 +252,31 @@ static void selem_link_map(struct bpf_sk_storage_map *smap,
 	raw_spin_unlock_bh(&b->lock);
 }
 
-static void selem_unlink(struct bpf_sk_storage_elem *selem)
+static void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
 {
-	/* Always unlink from map before unlinking from sk_storage
+	/* Always unlink from map before unlinking from local_storage
 	 * because selem will be freed after successfully unlinked from
-	 * the sk_storage.
+	 * the local_storage.
 	 */
-	selem_unlink_map(selem);
-	selem_unlink_sk(selem);
+	bpf_selem_unlink_map(selem);
+	__bpf_selem_unlink_storage(selem);
 }
 
-static struct bpf_sk_storage_data *
-__sk_storage_lookup(struct bpf_sk_storage *sk_storage,
-		    struct bpf_sk_storage_map *smap,
-		    bool cacheit_lockit)
+static struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+			 struct bpf_local_storage_map *smap,
+			 bool cacheit_lockit)
 {
-	struct bpf_sk_storage_data *sdata;
-	struct bpf_sk_storage_elem *selem;
+	struct bpf_local_storage_data *sdata;
+	struct bpf_local_storage_elem *selem;
 
 	/* Fast path (cache hit) */
-	sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]);
+	sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
 	if (sdata && rcu_access_pointer(sdata->smap) == smap)
 		return sdata;
 
 	/* Slow path (cache miss) */
-	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
+	hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
 		if (rcu_access_pointer(SDATA(selem)->smap) == smap)
 			break;
 
@@ -286,33 +288,33 @@ __sk_storage_lookup(struct bpf_sk_storage *sk_storage,
 		/* spinlock is needed to avoid racing with the
 		 * parallel delete.  Otherwise, publishing an already
 		 * deleted sdata to the cache will become a use-after-free
-		 * problem in the next __sk_storage_lookup().
+		 * problem in the next bpf_local_storage_lookup().
 		 */
-		raw_spin_lock_bh(&sk_storage->lock);
-		if (selem_linked_to_sk(selem))
-			rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
+		raw_spin_lock_bh(&local_storage->lock);
+		if (selem_linked_to_storage(selem))
+			rcu_assign_pointer(local_storage->cache[smap->cache_idx],
 					   sdata);
-		raw_spin_unlock_bh(&sk_storage->lock);
+		raw_spin_unlock_bh(&local_storage->lock);
 	}
 
 	return sdata;
 }
 
-static struct bpf_sk_storage_data *
+static struct bpf_local_storage_data *
 sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
 {
-	struct bpf_sk_storage *sk_storage;
-	struct bpf_sk_storage_map *smap;
+	struct bpf_local_storage *sk_storage;
+	struct bpf_local_storage_map *smap;
 
 	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 	if (!sk_storage)
 		return NULL;
 
-	smap = (struct bpf_sk_storage_map *)map;
-	return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
+	smap = (struct bpf_local_storage_map *)map;
+	return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit);
 }
 
-static int check_flags(const struct bpf_sk_storage_data *old_sdata,
+static int check_flags(const struct bpf_local_storage_data *old_sdata,
 		       u64 map_flags)
 {
 	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
@@ -327,10 +329,10 @@ static int check_flags(const struct bpf_sk_storage_data *old_sdata,
 }
 
 static int sk_storage_alloc(struct sock *sk,
-			    struct bpf_sk_storage_map *smap,
-			    struct bpf_sk_storage_elem *first_selem)
+			    struct bpf_local_storage_map *smap,
+			    struct bpf_local_storage_elem *first_selem)
 {
-	struct bpf_sk_storage *prev_sk_storage, *sk_storage;
+	struct bpf_local_storage *prev_sk_storage, *sk_storage;
 	int err;
 
 	err = omem_charge(sk, sizeof(*sk_storage));
@@ -344,10 +346,10 @@ static int sk_storage_alloc(struct sock *sk,
 	}
 	INIT_HLIST_HEAD(&sk_storage->list);
 	raw_spin_lock_init(&sk_storage->lock);
-	sk_storage->sk = sk;
+	sk_storage->owner = sk;
 
-	__selem_link_sk(sk_storage, first_selem);
-	selem_link_map(smap, first_selem);
+	bpf_selem_link_storage(sk_storage, first_selem);
+	bpf_selem_link_map(smap, first_selem);
 	/* Publish sk_storage to sk.  sk->sk_lock cannot be acquired.
 	 * Hence, atomic ops is used to set sk->sk_bpf_storage
 	 * from NULL to the newly allocated sk_storage ptr.
@@ -357,10 +359,10 @@ static int sk_storage_alloc(struct sock *sk,
 	 * the sk->sk_bpf_storage, the sk_storage->lock must
 	 * be held before setting sk->sk_bpf_storage to NULL.
 	 */
-	prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
+	prev_sk_storage = cmpxchg((struct bpf_local_storage **)&sk->sk_bpf_storage,
 				  NULL, sk_storage);
 	if (unlikely(prev_sk_storage)) {
-		selem_unlink_map(first_selem);
+		bpf_selem_unlink_map(first_selem);
 		err = -EAGAIN;
 		goto uncharge;
 
@@ -387,15 +389,14 @@ static int sk_storage_alloc(struct sock *sk,
  * Otherwise, it will become a leak (and other memory issues
  * during map destruction).
  */
-static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
-						     struct bpf_map *map,
-						     void *value,
-						     u64 map_flags)
+static struct bpf_local_storage_data *
+bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value,
+			 u64 map_flags)
 {
-	struct bpf_sk_storage_data *old_sdata = NULL;
-	struct bpf_sk_storage_elem *selem;
-	struct bpf_sk_storage *sk_storage;
-	struct bpf_sk_storage_map *smap;
+	struct bpf_local_storage_data *old_sdata = NULL;
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage *local_storage;
+	struct bpf_local_storage_map *smap;
 	int err;
 
 	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
@@ -404,15 +405,15 @@ static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
 	    unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
 		return ERR_PTR(-EINVAL);
 
-	smap = (struct bpf_sk_storage_map *)map;
-	sk_storage = rcu_dereference(sk->sk_bpf_storage);
-	if (!sk_storage || hlist_empty(&sk_storage->list)) {
-		/* Very first elem for this sk */
+	smap = (struct bpf_local_storage_map *)map;
+	local_storage = rcu_dereference(sk->sk_bpf_storage);
+	if (!local_storage || hlist_empty(&local_storage->list)) {
+		/* Very first elem for this object */
 		err = check_flags(NULL, map_flags);
 		if (err)
 			return ERR_PTR(err);
 
-		selem = selem_alloc(smap, sk, value, true);
+		selem = bpf_selem_alloc(smap, sk, value, true);
 		if (!selem)
 			return ERR_PTR(-ENOMEM);
 
@@ -428,25 +429,26 @@ static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
 
 	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
 		/* Hoping to find an old_sdata to do inline update
-		 * such that it can avoid taking the sk_storage->lock
+		 * such that it can avoid taking the local_storage->lock
 		 * and changing the lists.
 		 */
-		old_sdata = __sk_storage_lookup(sk_storage, smap, false);
+		old_sdata =
+			bpf_local_storage_lookup(local_storage, smap, false);
 		err = check_flags(old_sdata, map_flags);
 		if (err)
 			return ERR_PTR(err);
-		if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) {
+		if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
 			copy_map_value_locked(map, old_sdata->data,
 					      value, false);
 			return old_sdata;
 		}
 	}
 
-	raw_spin_lock_bh(&sk_storage->lock);
+	raw_spin_lock_bh(&local_storage->lock);
 
-	/* Recheck sk_storage->list under sk_storage->lock */
-	if (unlikely(hlist_empty(&sk_storage->list))) {
-		/* A parallel del is happening and sk_storage is going
+	/* Recheck local_storage->list under local_storage->lock */
+	if (unlikely(hlist_empty(&local_storage->list))) {
+		/* A parallel del is happening and local_storage is going
 		 * away.  It has just been checked before, so very
 		 * unlikely.  Return instead of retry to keep things
 		 * simple.
@@ -455,7 +457,7 @@ static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
 		goto unlock_err;
 	}
 
-	old_sdata = __sk_storage_lookup(sk_storage, smap, false);
+	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
 	err = check_flags(old_sdata, map_flags);
 	if (err)
 		goto unlock_err;
@@ -466,50 +468,51 @@ static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
 		goto unlock;
 	}
 
-	/* sk_storage->lock is held.  Hence, we are sure
+	/* local_storage->lock is held.  Hence, we are sure
 	 * we can unlink and uncharge the old_sdata successfully
 	 * later.  Hence, instead of charging the new selem now
 	 * and then uncharge the old selem later (which may cause
 	 * a potential but unnecessary charge failure),  avoid taking
 	 * a charge at all here (the "!old_sdata" check) and the
-	 * old_sdata will not be uncharged later during __selem_unlink_sk().
+	 * old_sdata will not be uncharged later during
+	 * bpf_selem_unlink_storage().
 	 */
-	selem = selem_alloc(smap, sk, value, !old_sdata);
+	selem = bpf_selem_alloc(smap, sk, value, !old_sdata);
 	if (!selem) {
 		err = -ENOMEM;
 		goto unlock_err;
 	}
 
 	/* First, link the new selem to the map */
-	selem_link_map(smap, selem);
+	bpf_selem_link_map(smap, selem);
 
-	/* Second, link (and publish) the new selem to sk_storage */
-	__selem_link_sk(sk_storage, selem);
+	/* Second, link (and publish) the new selem to local_storage */
+	bpf_selem_link_storage(local_storage, selem);
 
 	/* Third, remove old selem, SELEM(old_sdata) */
 	if (old_sdata) {
-		selem_unlink_map(SELEM(old_sdata));
-		__selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
+		bpf_selem_unlink_map(SELEM(old_sdata));
+		bpf_selem_unlink_storage(local_storage, SELEM(old_sdata), false);
 	}
 
 unlock:
-	raw_spin_unlock_bh(&sk_storage->lock);
+	raw_spin_unlock_bh(&local_storage->lock);
 	return SDATA(selem);
 
 unlock_err:
-	raw_spin_unlock_bh(&sk_storage->lock);
+	raw_spin_unlock_bh(&local_storage->lock);
 	return ERR_PTR(err);
 }
 
 static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
 {
-	struct bpf_sk_storage_data *sdata;
+	struct bpf_local_storage_data *sdata;
 
 	sdata = sk_storage_lookup(sk, map, false);
 	if (!sdata)
 		return -ENOENT;
 
-	selem_unlink(SELEM(sdata));
+	bpf_selem_unlink(SELEM(sdata));
 
 	return 0;
 }
@@ -521,7 +524,7 @@ static u16 cache_idx_get(void)
 
 	spin_lock(&cache_idx_lock);
 
-	for (i = 0; i < BPF_SK_STORAGE_CACHE_SIZE; i++) {
+	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
 		if (cache_idx_usage_counts[i] < min_usage) {
 			min_usage = cache_idx_usage_counts[i];
 			res = i;
@@ -548,8 +551,8 @@ static void cache_idx_free(u16 idx)
 /* Called by __sk_destruct() & bpf_sk_storage_clone() */
 void bpf_sk_storage_free(struct sock *sk)
 {
-	struct bpf_sk_storage_elem *selem;
-	struct bpf_sk_storage *sk_storage;
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage *sk_storage;
 	bool free_sk_storage = false;
 	struct hlist_node *n;
 
@@ -574,8 +577,9 @@ void bpf_sk_storage_free(struct sock *sk)
 		/* Always unlink from map before unlinking from
 		 * sk_storage.
 		 */
-		selem_unlink_map(selem);
-		free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
+		bpf_selem_unlink_map(selem);
+		free_sk_storage =
+			bpf_selem_unlink_storage(sk_storage, selem, true);
 	}
 	raw_spin_unlock_bh(&sk_storage->lock);
 	rcu_read_unlock();
@@ -584,14 +588,14 @@ void bpf_sk_storage_free(struct sock *sk)
 		kfree_rcu(sk_storage, rcu);
 }
 
-static void bpf_sk_storage_map_free(struct bpf_map *map)
+static void bpf_local_storage_map_free(struct bpf_map *map)
 {
-	struct bpf_sk_storage_elem *selem;
-	struct bpf_sk_storage_map *smap;
-	struct bucket *b;
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_map *smap;
+	struct bpf_local_storage_map_bucket *b;
 	unsigned int i;
 
-	smap = (struct bpf_sk_storage_map *)map;
+	smap = (struct bpf_local_storage_map *)map;
 
 	cache_idx_free(smap->cache_idx);
 
@@ -615,10 +619,10 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 
 		rcu_read_lock();
 		/* No one is adding to b->list now */
-		while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
-						 struct bpf_sk_storage_elem,
-						 map_node))) {
-			selem_unlink(selem);
+		while ((selem = hlist_entry_safe(
+				rcu_dereference_raw(hlist_first_rcu(&b->list)),
+				struct bpf_local_storage_elem, map_node))) {
+			bpf_selem_unlink(selem);
 			cond_resched_rcu();
 		}
 		rcu_read_unlock();
@@ -631,7 +635,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 	 *
 	 * However, the bpf_sk_storage_free() still needs to access
 	 * the smap->elem_size to do the uncharging in
-	 * __selem_unlink_sk().
+	 * bpf_selem_unlink_storage().
 	 *
 	 * Hence, wait another rcu grace period for the
 	 * bpf_sk_storage_free() to finish.
@@ -645,14 +649,15 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 /* U16_MAX is much more than enough for sk local storage
  * considering a tcp_sock is ~2k.
  */
-#define MAX_VALUE_SIZE							\
+#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE				\
 	min_t(u32,							\
-	      (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
-	      (U16_MAX - sizeof(struct bpf_sk_storage_elem)))
+	      (KMALLOC_MAX_SIZE - MAX_BPF_STACK -			\
+	       sizeof(struct bpf_local_storage_elem)),			\
+	      (U16_MAX - sizeof(struct bpf_local_storage_elem)))
 
-static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
+static int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
 {
-	if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
+	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
 	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
 	    attr->max_entries ||
 	    attr->key_size != sizeof(int) || !attr->value_size ||
@@ -663,15 +668,15 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
 	if (!bpf_capable())
 		return -EPERM;
 
-	if (attr->value_size > MAX_VALUE_SIZE)
+	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
 		return -E2BIG;
 
 	return 0;
 }
 
-static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
+static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
 {
-	struct bpf_sk_storage_map *smap;
+	struct bpf_local_storage_map *smap;
 	unsigned int i;
 	u32 nbuckets;
 	u64 cost;
@@ -707,7 +712,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&smap->buckets[i].lock);
 	}
 
-	smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
+	smap->elem_size = sizeof(struct bpf_local_storage_elem) + attr->value_size;
 	smap->cache_idx = cache_idx_get();
 
 	return &smap->map;
@@ -719,10 +724,10 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
 	return -ENOTSUPP;
 }
 
-static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
-					const struct btf *btf,
-					const struct btf_type *key_type,
-					const struct btf_type *value_type)
+static int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+					   const struct btf *btf,
+					   const struct btf_type *key_type,
+					   const struct btf_type *value_type)
 {
 	u32 int_data;
 
@@ -738,7 +743,7 @@ static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
 
 static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
 {
-	struct bpf_sk_storage_data *sdata;
+	struct bpf_local_storage_data *sdata;
 	struct socket *sock;
 	int fd, err;
 
@@ -756,14 +761,15 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
 static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
 					 void *value, u64 map_flags)
 {
-	struct bpf_sk_storage_data *sdata;
+	struct bpf_local_storage_data *sdata;
 	struct socket *sock;
 	int fd, err;
 
 	fd = *(int *)key;
 	sock = sockfd_lookup(fd, &err);
 	if (sock) {
-		sdata = sk_storage_update(sock->sk, map, value, map_flags);
+		sdata = bpf_local_storage_update(sock->sk, map, value,
+						 map_flags);
 		sockfd_put(sock);
 		return PTR_ERR_OR_ZERO(sdata);
 	}
@@ -787,14 +793,14 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
 	return err;
 }
 
-static struct bpf_sk_storage_elem *
+static struct bpf_local_storage_elem *
 bpf_sk_storage_clone_elem(struct sock *newsk,
-			  struct bpf_sk_storage_map *smap,
-			  struct bpf_sk_storage_elem *selem)
+			  struct bpf_local_storage_map *smap,
+			  struct bpf_local_storage_elem *selem)
 {
-	struct bpf_sk_storage_elem *copy_selem;
+	struct bpf_local_storage_elem *copy_selem;
 
-	copy_selem = selem_alloc(smap, newsk, NULL, true);
+	copy_selem = bpf_selem_alloc(smap, newsk, NULL, true);
 	if (!copy_selem)
 		return NULL;
 
@@ -810,9 +816,9 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
 
 int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 {
-	struct bpf_sk_storage *new_sk_storage = NULL;
-	struct bpf_sk_storage *sk_storage;
-	struct bpf_sk_storage_elem *selem;
+	struct bpf_local_storage *new_sk_storage = NULL;
+	struct bpf_local_storage *sk_storage;
+	struct bpf_local_storage_elem *selem;
 	int ret = 0;
 
 	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
@@ -824,8 +830,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 		goto out;
 
 	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
-		struct bpf_sk_storage_elem *copy_selem;
-		struct bpf_sk_storage_map *smap;
+		struct bpf_local_storage_elem *copy_selem;
+		struct bpf_local_storage_map *smap;
 		struct bpf_map *map;
 
 		smap = rcu_dereference(SDATA(selem)->smap);
@@ -849,8 +855,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 		}
 
 		if (new_sk_storage) {
-			selem_link_map(smap, copy_selem);
-			__selem_link_sk(new_sk_storage, copy_selem);
+			bpf_selem_link_map(smap, copy_selem);
+			bpf_selem_link_storage(new_sk_storage, copy_selem);
 		} else {
 			ret = sk_storage_alloc(newsk, smap, copy_selem);
 			if (ret) {
@@ -861,7 +867,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 				goto out;
 			}
 
-			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+			new_sk_storage =
+				rcu_dereference(copy_selem->local_storage);
 		}
 		bpf_map_put(map);
 	}
@@ -879,7 +886,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 	   void *, value, u64, flags)
 {
-	struct bpf_sk_storage_data *sdata;
+	struct bpf_local_storage_data *sdata;
 
 	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
 		return (unsigned long)NULL;
@@ -895,7 +902,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 	     *  destruction).
 	     */
 	    refcount_inc_not_zero(&sk->sk_refcnt)) {
-		sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
+		sdata = bpf_local_storage_update(sk, map, value, BPF_NOEXIST);
 		/* sk must be a fullsock (guaranteed by verifier),
 		 * so sock_gen_put() is unnecessary.
 		 */
@@ -922,15 +929,15 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
 
 static int sk_storage_map_btf_id;
 const struct bpf_map_ops sk_storage_map_ops = {
-	.map_alloc_check = bpf_sk_storage_map_alloc_check,
-	.map_alloc = bpf_sk_storage_map_alloc,
-	.map_free = bpf_sk_storage_map_free,
+	.map_alloc_check = bpf_local_storage_map_alloc_check,
+	.map_alloc = bpf_local_storage_map_alloc,
+	.map_free = bpf_local_storage_map_free,
 	.map_get_next_key = notsupp_get_next_key,
 	.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
 	.map_update_elem = bpf_fd_sk_storage_update_elem,
 	.map_delete_elem = bpf_fd_sk_storage_delete_elem,
-	.map_check_btf = bpf_sk_storage_map_check_btf,
-	.map_btf_name = "bpf_sk_storage_map",
+	.map_check_btf = bpf_local_storage_map_check_btf,
+	.map_btf_name = "bpf_local_storage_map",
 	.map_btf_id = &sk_storage_map_btf_id,
 };
 
@@ -1022,7 +1029,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
 	u32 nr_maps = 0;
 	int rem, err;
 
-	/* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
+	/* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as
 	 * the map_alloc_check() side also does.
 	 */
 	if (!bpf_capable())
@@ -1072,13 +1079,13 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
 }
 EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
 
-static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
+static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
 {
 	struct nlattr *nla_stg, *nla_value;
-	struct bpf_sk_storage_map *smap;
+	struct bpf_local_storage_map *smap;
 
 	/* It cannot exceed max nlattr's payload */
-	BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);
+	BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
 
 	nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
 	if (!nla_stg)
@@ -1114,9 +1121,9 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
 {
 	/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
 	unsigned int diag_size = nla_total_size(0);
-	struct bpf_sk_storage *sk_storage;
-	struct bpf_sk_storage_elem *selem;
-	struct bpf_sk_storage_map *smap;
+	struct bpf_local_storage *sk_storage;
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_map *smap;
 	struct nlattr *nla_stgs;
 	unsigned int saved_len;
 	int err = 0;
@@ -1169,8 +1176,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
 {
 	/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
 	unsigned int diag_size = nla_total_size(0);
-	struct bpf_sk_storage *sk_storage;
-	struct bpf_sk_storage_data *sdata;
+	struct bpf_local_storage *sk_storage;
+	struct bpf_local_storage_data *sdata;
 	struct nlattr *nla_stgs;
 	unsigned int saved_len;
 	int err = 0;
@@ -1197,8 +1204,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
 
 	saved_len = skb->len;
 	for (i = 0; i < diag->nr_maps; i++) {
-		sdata = __sk_storage_lookup(sk_storage,
-				(struct bpf_sk_storage_map *)diag->maps[i],
+		sdata = bpf_local_storage_lookup(sk_storage,
+				(struct bpf_local_storage_map *)diag->maps[i],
 				false);
 
 		if (!sdata)
@@ -1235,19 +1242,19 @@ struct bpf_iter_seq_sk_storage_map_info {
 	unsigned skip_elems;
 };
 
-static struct bpf_sk_storage_elem *
+static struct bpf_local_storage_elem *
 bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
-				 struct bpf_sk_storage_elem *prev_selem)
+				 struct bpf_local_storage_elem *prev_selem)
 {
-	struct bpf_sk_storage *sk_storage;
-	struct bpf_sk_storage_elem *selem;
+	struct bpf_local_storage *sk_storage;
+	struct bpf_local_storage_elem *selem;
 	u32 skip_elems = info->skip_elems;
-	struct bpf_sk_storage_map *smap;
+	struct bpf_local_storage_map *smap;
 	u32 bucket_id = info->bucket_id;
 	u32 i, count, n_buckets;
-	struct bucket *b;
+	struct bpf_local_storage_map_bucket *b;
 
-	smap = (struct bpf_sk_storage_map *)info->map;
+	smap = (struct bpf_local_storage_map *)info->map;
 	n_buckets = 1U << smap->bucket_log;
 	if (bucket_id >= n_buckets)
 		return NULL;
@@ -1257,7 +1264,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 	count = 0;
 	while (selem) {
 		selem = hlist_entry_safe(selem->map_node.next,
-					 struct bpf_sk_storage_elem, map_node);
+					 struct bpf_local_storage_elem, map_node);
 		if (!selem) {
 			/* not found, unlock and go to the next bucket */
 			b = &smap->buckets[bucket_id++];
@@ -1265,7 +1272,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 			skip_elems = 0;
 			break;
 		}
-		sk_storage = rcu_dereference_raw(selem->sk_storage);
+		sk_storage = rcu_dereference_raw(selem->local_storage);
 		if (sk_storage) {
 			info->skip_elems = skip_elems + count;
 			return selem;
@@ -1278,7 +1285,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 		raw_spin_lock_bh(&b->lock);
 		count = 0;
 		hlist_for_each_entry(selem, &b->list, map_node) {
-			sk_storage = rcu_dereference_raw(selem->sk_storage);
+			sk_storage = rcu_dereference_raw(selem->local_storage);
 			if (sk_storage && count >= skip_elems) {
 				info->bucket_id = i;
 				info->skip_elems = count;
@@ -1297,7 +1304,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 
 static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct bpf_sk_storage_elem *selem;
+	struct bpf_local_storage_elem *selem;
 
 	selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL);
 	if (!selem)
@@ -1330,11 +1337,11 @@ DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta,
 		     void *value)
 
 static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
-					 struct bpf_sk_storage_elem *selem)
+					 struct bpf_local_storage_elem *selem)
 {
 	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
 	struct bpf_iter__bpf_sk_storage_map ctx = {};
-	struct bpf_sk_storage *sk_storage;
+	struct bpf_local_storage *sk_storage;
 	struct bpf_iter_meta meta;
 	struct bpf_prog *prog;
 	int ret = 0;
@@ -1345,8 +1352,8 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
 		ctx.meta = &meta;
 		ctx.map = info->map;
 		if (selem) {
-			sk_storage = rcu_dereference_raw(selem->sk_storage);
-			ctx.sk = sk_storage->sk;
+			sk_storage = rcu_dereference_raw(selem->local_storage);
+			ctx.sk = sk_storage->owner;
 			ctx.value = SDATA(selem)->data;
 		}
 		ret = bpf_iter_run_prog(prog, &ctx);
@@ -1363,13 +1370,13 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
 static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
 {
 	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
-	struct bpf_sk_storage_map *smap;
-	struct bucket *b;
+	struct bpf_local_storage_map *smap;
+	struct bpf_local_storage_map_bucket *b;
 
 	if (!v) {
 		(void)__bpf_sk_storage_map_seq_show(seq, v);
 	} else {
-		smap = (struct bpf_sk_storage_map *)info->map;
+		smap = (struct bpf_local_storage_map *)info->map;
 		b = &smap->buckets[info->bucket_id];
 		raw_spin_unlock_bh(&b->lock);
 	}
-- 
2.28.0.rc0.142.g3c755180ce-goog


^ permalink raw reply related

* Re: [PATCH v1 2/4] [RFC] x86/trampfd: Provide support for the trampoline file descriptor
From: Madhavan T. Venkataraman @ 2020-07-30 14:25 UTC (permalink / raw)
  To: Greg KH
  Cc: kernel-hardening, linux-api, linux-arm-kernel, linux-fsdevel,
	linux-integrity, linux-kernel, linux-security-module, oleg, x86
In-Reply-To: <20200730090612.GA900546@kroah.com>

Yes. I will fix this.

Thanks.

Madhavan

On 7/30/20 4:06 AM, Greg KH wrote:
> On Tue, Jul 28, 2020 at 08:10:48AM -0500, madvenka@linux.microsoft.com wrote:
>> +EXPORT_SYMBOL_GPL(trampfd_valid_regs);
> Why are all of these exported?  I don't see a module user in this
> series, or did I miss it somehow?
>
> EXPORT_SYMBOL* is only needed for symbols to be used by modules, not by
> code that is built into the kernel.
>
> thanks,
>
> greg k-h


^ permalink raw reply

* Re: [PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor
From: Madhavan T. Venkataraman @ 2020-07-30 14:42 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Kernel Hardening, Linux API, linux-arm-kernel, Linux FS Devel,
	linux-integrity, LKML, LSM List, Oleg Nesterov, X86 ML
In-Reply-To: <CALCETrVy5OMuUx04-wWk9FJbSxkrT2vMfN_kANinudrDwC4Cig@mail.gmail.com>

For some reason my email program is not delivering to all the
recipients because of some formatting issues. I am resending.
I apologize. I will try to get this fixed.

Sorry for the delay. I just needed to think about it a little.
I will respond to your first suggestion in this email. I will
respond to the others in separate emails if that is alright
with you.

On 7/28/20 12:31 PM, Andy Lutomirski wrote:
>> On Jul 28, 2020, at 6:11 AM, madvenka@linux.microsoft.com wrote:
>>
>> From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
>>
>> The kernel creates the trampoline mapping without any permissions. When
>> the trampoline is executed by user code, a page fault happens and the
>> kernel gets control. The kernel recognizes that this is a trampoline
>> invocation. It sets up the user registers based on the specified
>> register context, and/or pushes values on the user stack based on the
>> specified stack context, and sets the user PC to the requested target
>> PC. When the kernel returns, execution continues at the target PC.
>> So, the kernel does the work of the trampoline on behalf of the
>> application.
> This is quite clever, but now I’m wondering just how much kernel help
> is really needed. In your series, the trampoline is an non-executable
> page.  I can think of at least two alternative approaches, and I'd
> like to know the pros and cons.
>
> 1. Entirely userspace: a return trampoline would be something like:
>
> 1:
> pushq %rax
> pushq %rbc
> pushq %rcx
> ...
> pushq %r15
> movq %rsp, %rdi # pointer to saved regs
> leaq 1b(%rip), %rsi # pointer to the trampoline itself
> callq trampoline_handler # see below
>
> You would fill a page with a bunch of these, possibly compacted to get
> more per page, and then you would remap as many copies as needed.  The
> 'callq trampoline_handler' part would need to be a bit clever to make
> it continue to work despite this remapping.  This will be *much*
> faster than trampfd. How much of your use case would it cover?  For
> the inverse, it's not too hard to write a bit of asm to set all
> registers and jump somewhere.

Let me state my understanding of what you are suggesting. Correct me if
I get anything wrong. If you don't mind, I will also take the liberty
of generalizing and paraphrasing your suggestion.

The goal is to create two page mappings that are adjacent to each other:

- a code page that contains template code for a trampoline. Since the
  template code would tend to be small in size, pack as many of them
  as possible within a page to conserve memory. In other words, create
  an array of the template code fragments. Each element in the array
  would be used for one trampoline instance.

- a data page that contains an array of data elements. Corresponding
  to each code element in the code page, there would be a data element
  in the data page that would contain data that is specific to a
  trampoline instance.

- Code will access data using PC-relative addressing.

The management of the code pages and allocation for each trampoline
instance would all be done in user space.

Is this the general idea?

Creating a code page
----------------------------

We can do this in one of the following ways:
- Allocate a writable page at run time, write the template code into
  the page and have execute permissions on the page.

- Allocate a writable page at run time, write the template code into
  the page and remap the page with just execute permissions.

- Allocate a writable page at run time, write the template code into
  the page, write the page into a temporary file and map the file with
  execute permissions.

- Include the template code in a code page at build time itself and
  just remap the code page each time you need a code page.

Pros and Cons
-------------------

As long as the OS provides the functionality to do this and the security
subsystem in the OS allows the actions, this is totally feasible. If not,
we need something like trampfd.

As Floren mentioned, libffi does implement something like this for MACH.

In fact, in my libffi changes, I use trampfd only after all the other methods
have failed because of security settings.

But the above approach only solves the problem for this simple type of
trampoline. It does not provide a framework for addressing more complex types
or even other forms of dynamic code.

Also, each application would need to implement this solution for itself
as opposed to relying on one implementation provided by the kernel.

Trampfd-based solution
-------------------------------

I outlined an enhancement to trampfd in a response to David Laight. In this
enhancement, the kernel is the one that would set up the code page.

The kernel would call an arch-specific support function to generate the
code required to load registers, push values on the stack and jump to a PC
for a trampoline instance based on its current context. The trampoline
instance data could be baked into the code.

My initial idea was to only have one trampoline instance per page. But I
think I can implement multiple instances per page. I just have to manage
the trampfd file private data and VMA private data accordingly to map an
element in a code page to its trampoline object.

The two approaches are similar except for the detail about who sets up
and manages the trampoline pages. In both approaches, the performance problem
is addressed. But trampfd can be used even when security settings are
restrictive.

Is my solution acceptable?

A couple of things
------------------------

- In the current trampfd implementation, no physical pages are actually
  allocated. It is just a virtual mapping. From a memory footprint
  perspective, this is good. May be, we can let the user specify if
  he wants a fast trampoline that consumes memory or a slow one that doesn't?

- In the future, we may define additional types that need the kernel to do
  the job. Examples:

    - The kernel may have a trampoline type for which it is not willing
       or able to generate code

    - The kernel could emulate dynamic code for the user

     - The kernel could interpret dynamic code for the user

     - The kernel could allow the user to access some kernel functionality
        using the framework

  In such cases, there isn't any physical code page that gets mapped into
  the user address space. We need the kernel to handle the address fault
  and provide the functionality.

One question for the reviewers
----------------------------------------

Do you think that the file descriptor based approach is fine? Or, does this
need a regular system call based implementation? There are some advantages
with a regular system call:

- We don't consume file descriptors. E.g., in libffi, we have to
  keep the file descriptor open for a closure until the closure
  is freed.

- Trampoline operations can be performed based on the trampoline
  address instead of an fd.

- Sharing of objects across processes can be implemented through
  a regular ID based method rather than sending the file descriptor
  over a unix domain socket.

- Shared objects can be persistent.

- An fd based API does structure parsing in read()/write() calls
  to obtain arguments. With a regular system call, that is not
  necessary.

Please let me know your thoughts.

Madhavan

^ permalink raw reply

* Re: [PATCH v5 1/4] IMA: Add func to measure LSM state and policy
From: Tyler Hicks @ 2020-07-30 15:02 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: zohar, stephen.smalley.work, casey, sashal, jmorris,
	linux-integrity, selinux, linux-security-module, linux-kernel
In-Reply-To: <20200730034724.3298-2-nramas@linux.microsoft.com>

On 2020-07-29 20:47:21, Lakshmi Ramasubramanian wrote:
> Critical data structures of security modules need to be measured to
> enable an attestation service to verify if the configuration and
> policies for the security modules have been setup correctly and
> that they haven't been tampered with at runtime. A new IMA policy is
> required for handling this measurement.
> 
> Define two new IMA policy func namely LSM_STATE and LSM_POLICY to
> measure the state and the policy provided by the security modules.
> Update ima_match_rules() and ima_validate_rule() to check for
> the new func and ima_parse_rule() to handle the new func.
> 
> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
> ---
>  Documentation/ABI/testing/ima_policy |  9 ++++++++
>  security/integrity/ima/ima.h         |  2 ++
>  security/integrity/ima/ima_api.c     |  2 +-
>  security/integrity/ima/ima_policy.c  | 31 ++++++++++++++++++++++++----
>  4 files changed, 39 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
> index cd572912c593..b7c7fb548c0c 100644
> --- a/Documentation/ABI/testing/ima_policy
> +++ b/Documentation/ABI/testing/ima_policy
> @@ -30,6 +30,7 @@ Description:
>  				[FIRMWARE_CHECK]
>  				[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
>  				[KEXEC_CMDLINE] [KEY_CHECK]
> +				[LSM_STATE] [LSM_POLICY]
>  			mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
>  			       [[^]MAY_EXEC]
>  			fsmagic:= hex value
> @@ -125,3 +126,11 @@ Description:
>  		keys added to .builtin_trusted_keys or .ima keyring:
>  
>  			measure func=KEY_CHECK keyrings=.builtin_trusted_keys|.ima
> +
> +		Example of measure rule using LSM_STATE to measure LSM state:
> +
> +			measure func=LSM_STATE template=ima-buf
> +
> +		Example of measure rule using LSM_POLICY to measure LSM policy:
> +
> +			measure func=LSM_POLICY template=ima-ng
> diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
> index 38043074ce5e..1b5f4b2f17d0 100644
> --- a/security/integrity/ima/ima.h
> +++ b/security/integrity/ima/ima.h
> @@ -200,6 +200,8 @@ static inline unsigned int ima_hash_key(u8 *digest)
>  	hook(POLICY_CHECK, policy)			\
>  	hook(KEXEC_CMDLINE, kexec_cmdline)		\
>  	hook(KEY_CHECK, key)				\
> +	hook(LSM_STATE, lsm_state)			\
> +	hook(LSM_POLICY, lsm_policy)			\
>  	hook(MAX_CHECK, none)
>  
>  #define __ima_hook_enumify(ENUM, str)	ENUM,
> diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
> index 4f39fb93f278..8c8b4e4a6493 100644
> --- a/security/integrity/ima/ima_api.c
> +++ b/security/integrity/ima/ima_api.c
> @@ -176,7 +176,7 @@ void ima_add_violation(struct file *file, const unsigned char *filename,
>   *		subj=, obj=, type=, func=, mask=, fsmagic=
>   *	subj,obj, and type: are LSM specific.
>   *	func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
> - *	| KEXEC_CMDLINE | KEY_CHECK
> + *	| KEXEC_CMDLINE | KEY_CHECK | LSM_STATE | LSM_POLICY
>   *	mask: contains the permission mask
>   *	fsmagic: hex value
>   *
> diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
> index 07f033634b27..a0f5c39d9084 100644
> --- a/security/integrity/ima/ima_policy.c
> +++ b/security/integrity/ima/ima_policy.c
> @@ -442,13 +442,20 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
>  {
>  	int i;
>  
> -	if (func == KEY_CHECK) {
> -		return (rule->flags & IMA_FUNC) && (rule->func == func) &&
> -		       ima_match_keyring(rule, keyring, cred);
> -	}
>  	if ((rule->flags & IMA_FUNC) &&
>  	    (rule->func != func && func != POST_SETATTR))
>  		return false;
> +
> +	switch (func) {
> +	case KEY_CHECK:
> +		return ima_match_keyring(rule, keyring, cred);
> +	case LSM_STATE:
> +	case LSM_POLICY:
> +		return true;
> +	default:
> +		break;
> +	}
> +
>  	if ((rule->flags & IMA_MASK) &&
>  	    (rule->mask != mask && func != POST_SETATTR))
>  		return false;
> @@ -1044,6 +1051,18 @@ static bool ima_validate_rule(struct ima_rule_entry *entry)
>  		if (ima_rule_contains_lsm_cond(entry))
>  			return false;
>  
> +		break;
> +	case LSM_STATE:
> +	case LSM_POLICY:
> +		if (entry->action & ~(MEASURE | DONT_MEASURE))
> +			return false;
> +
> +		if (entry->flags & ~(IMA_FUNC | IMA_PCR))
> +			return false;
> +
> +		if (ima_rule_contains_lsm_cond(entry))
> +			return false;
> +
>  		break;
>  	default:
>  		return false;
> @@ -1176,6 +1195,10 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
>  				entry->func = KEXEC_CMDLINE;
>  			else if (strcmp(args[0].from, "KEY_CHECK") == 0)
>  				entry->func = KEY_CHECK;
> +			else if (strcmp(args[0].from, "LSM_STATE") == 0)
> +				entry->func = LSM_STATE;
> +			else if (strcmp(args[0].from, "LSM_POLICY") == 0)
> +				entry->func = LSM_POLICY;

This patch generally looks really good to me with the exception of one
thing...

We should only accept rules with these specified hook functions when an
LSM that has measurement support is enabled. This messes up the ordering
of your patch series but it could be as simple as doing this:

			else if (IS_ENABLED(CONFIG_SECURITY_SELINUX) &&
				 strcmp(args[0].from, "LSM_STATE") == 0)
				 entry->func = LSM_STATE;

Or you could do something a little more complex, like what's done with
CONFIG_IMA_LSM_RULES. You could create a CONFIG_IMA_MEASURE_LSM option
that's default enabled but depends on CONFIG_SECURITY_SELINUX and then
check for IS_ENABLED(CONFIG_IMA_MEASURE_LSM) in ima_parse_rule().

I'd personally opt for just placing the
IS_ENABLED(CONFIG_SECURITY_SELINUX) check directly into
ima_parse_rule().

Tyler

>  			else
>  				result = -EINVAL;
>  			if (!result)
> -- 
> 2.27.0

^ permalink raw reply

* Re: [PATCH v5 2/4] IMA: Define IMA hooks to measure LSM state and policy
From: Tyler Hicks @ 2020-07-30 15:04 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: zohar, stephen.smalley.work, casey, sashal, jmorris,
	linux-integrity, selinux, linux-security-module, linux-kernel
In-Reply-To: <20200730034724.3298-3-nramas@linux.microsoft.com>

On 2020-07-29 20:47:22, Lakshmi Ramasubramanian wrote:
> IMA subsystem needs to define IMA hooks that the security modules can
> call to measure state and policy data.
> 
> Define two new IMA hooks, namely ima_lsm_state() and ima_lsm_policy(),
> that the security modules can call to measure LSM state and LSM policy
> respectively. Return the status of the measurement operation from these
> two IMA hooks.
> 
> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>

Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>

Tyler

> ---
>  include/linux/ima.h               | 14 +++++++++
>  security/integrity/ima/ima.h      |  6 ++--
>  security/integrity/ima/ima_main.c | 50 ++++++++++++++++++++++++++-----
>  3 files changed, 60 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/ima.h b/include/linux/ima.h
> index d15100de6cdd..442ca0dce3c8 100644
> --- a/include/linux/ima.h
> +++ b/include/linux/ima.h
> @@ -26,6 +26,10 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
>  extern void ima_post_path_mknod(struct dentry *dentry);
>  extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
>  extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
> +extern int ima_measure_lsm_state(const char *lsm_event_name, const void *buf,
> +				 int size);
> +extern int ima_measure_lsm_policy(const char *lsm_event_name, const void *buf,
> +				  int size);
>  
>  #ifdef CONFIG_IMA_KEXEC
>  extern void ima_add_kexec_buffer(struct kimage *image);
> @@ -104,6 +108,16 @@ static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size)
>  }
>  
>  static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {}
> +static inline int ima_measure_lsm_state(const char *lsm_event_name,
> +					const void *buf, int size)
> +{
> +	return -EOPNOTSUPP;
> +}
> +static inline int ima_measure_lsm_policy(const char *lsm_event_name,
> +					 const void *buf, int size)
> +{
> +	return -EOPNOTSUPP;
> +}
>  #endif /* CONFIG_IMA */
>  
>  #ifndef CONFIG_IMA_KEXEC
> diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
> index 1b5f4b2f17d0..8ed9f5e1dd40 100644
> --- a/security/integrity/ima/ima.h
> +++ b/security/integrity/ima/ima.h
> @@ -267,9 +267,9 @@ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file,
>  			   struct evm_ima_xattr_data *xattr_value,
>  			   int xattr_len, const struct modsig *modsig, int pcr,
>  			   struct ima_template_desc *template_desc);
> -void process_buffer_measurement(struct inode *inode, const void *buf, int size,
> -				const char *eventname, enum ima_hooks func,
> -				int pcr, const char *keyring);
> +int process_buffer_measurement(struct inode *inode, const void *buf, int size,
> +			       const char *eventname, enum ima_hooks func,
> +			       int pcr, const char *keyring);
>  void ima_audit_measurement(struct integrity_iint_cache *iint,
>  			   const unsigned char *filename);
>  int ima_alloc_init_template(struct ima_event_data *event_data,
> diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
> index 8a91711ca79b..74d421e40c8f 100644
> --- a/security/integrity/ima/ima_main.c
> +++ b/security/integrity/ima/ima_main.c
> @@ -736,9 +736,9 @@ int ima_load_data(enum kernel_load_data_id id)
>   *
>   * Based on policy, the buffer is measured into the ima log.
>   */
> -void process_buffer_measurement(struct inode *inode, const void *buf, int size,
> -				const char *eventname, enum ima_hooks func,
> -				int pcr, const char *keyring)
> +int process_buffer_measurement(struct inode *inode, const void *buf, int size,
> +			       const char *eventname, enum ima_hooks func,
> +			       int pcr, const char *keyring)
>  {
>  	int ret = 0;
>  	const char *audit_cause = "ENOMEM";
> @@ -758,7 +758,7 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size,
>  	u32 secid;
>  
>  	if (!ima_policy_flag)
> -		return;
> +		return 0;
>  
>  	/*
>  	 * Both LSM hooks and auxilary based buffer measurements are
> @@ -772,7 +772,7 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size,
>  		action = ima_get_action(inode, current_cred(), secid, 0, func,
>  					&pcr, &template, keyring);
>  		if (!(action & IMA_MEASURE))
> -			return;
> +			return 0;
>  	}
>  
>  	if (!pcr)
> @@ -787,7 +787,7 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size,
>  			pr_err("template %s init failed, result: %d\n",
>  			       (strlen(template->name) ?
>  				template->name : template->fmt), ret);
> -			return;
> +			return ret;
>  		}
>  	}
>  
> @@ -819,7 +819,7 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size,
>  					func_measure_str(func),
>  					audit_cause, ret, 0, ret);
>  
> -	return;
> +	return ret;
>  }
>  
>  /**
> @@ -846,6 +846,42 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
>  	fdput(f);
>  }
>  
> +/**
> + * ima_measure_lsm_state - measure LSM specific state
> + * @lsm_event_name: LSM event
> + * @buf: pointer to buffer containing LSM specific state
> + * @size: Number of bytes in buf
> + *
> + * Buffers can only be measured, not appraised.
> + */
> +int ima_measure_lsm_state(const char *lsm_event_name, const void *buf,
> +			  int size)
> +{
> +	if (!lsm_event_name || !buf || !size)
> +		return -EINVAL;
> +
> +	return process_buffer_measurement(NULL, buf, size, lsm_event_name,
> +					  LSM_STATE, 0, NULL);
> +}
> +
> +/**
> + * ima_measure_lsm_policy - measure LSM specific policy
> + * @lsm_event_name: LSM event
> + * @buf: pointer to buffer containing LSM specific policy
> + * @size: Number of bytes in buf
> + *
> + * Buffers can only be measured, not appraised.
> + */
> +int ima_measure_lsm_policy(const char *lsm_event_name, const void *buf,
> +			   int size)
> +{
> +	if (!lsm_event_name || !buf || !size)
> +		return -EINVAL;
> +
> +	return process_buffer_measurement(NULL, buf, size, lsm_event_name,
> +					  LSM_POLICY, 0, NULL);
> +}
> +
>  static int __init init_ima(void)
>  {
>  	int error;
> -- 
> 2.27.0

^ permalink raw reply

* Re: [PATCH v5 1/4] IMA: Add func to measure LSM state and policy
From: Lakshmi Ramasubramanian @ 2020-07-30 15:15 UTC (permalink / raw)
  To: Tyler Hicks
  Cc: zohar, stephen.smalley.work, casey, sashal, jmorris,
	linux-integrity, selinux, linux-security-module, linux-kernel
In-Reply-To: <20200730150228.GV4181@sequoia>

On 7/30/20 8:02 AM, Tyler Hicks wrote:

>> diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
>> index 07f033634b27..a0f5c39d9084 100644
>> --- a/security/integrity/ima/ima_policy.c
>> +++ b/security/integrity/ima/ima_policy.c
>> @@ -442,13 +442,20 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
>>   {
>>   	int i;
>>   
>> -	if (func == KEY_CHECK) {
>> -		return (rule->flags & IMA_FUNC) && (rule->func == func) &&
>> -		       ima_match_keyring(rule, keyring, cred);
>> -	}
>>   	if ((rule->flags & IMA_FUNC) &&
>>   	    (rule->func != func && func != POST_SETATTR))
>>   		return false;
>> +
>> +	switch (func) {
>> +	case KEY_CHECK:
>> +		return ima_match_keyring(rule, keyring, cred);
>> +	case LSM_STATE:
>> +	case LSM_POLICY:
>> +		return true;
>> +	default:
>> +		break;
>> +	}
>> +
>>   	if ((rule->flags & IMA_MASK) &&
>>   	    (rule->mask != mask && func != POST_SETATTR))
>>   		return false;
>> @@ -1044,6 +1051,18 @@ static bool ima_validate_rule(struct ima_rule_entry *entry)
>>   		if (ima_rule_contains_lsm_cond(entry))
>>   			return false;
>>   
>> +		break;
>> +	case LSM_STATE:
>> +	case LSM_POLICY:
>> +		if (entry->action & ~(MEASURE | DONT_MEASURE))
>> +			return false;
>> +
>> +		if (entry->flags & ~(IMA_FUNC | IMA_PCR))
>> +			return false;
>> +
>> +		if (ima_rule_contains_lsm_cond(entry))
>> +			return false;
>> +
>>   		break;
>>   	default:
>>   		return false;
>> @@ -1176,6 +1195,10 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
>>   				entry->func = KEXEC_CMDLINE;
>>   			else if (strcmp(args[0].from, "KEY_CHECK") == 0)
>>   				entry->func = KEY_CHECK;
>> +			else if (strcmp(args[0].from, "LSM_STATE") == 0)
>> +				entry->func = LSM_STATE;
>> +			else if (strcmp(args[0].from, "LSM_POLICY") == 0)
>> +				entry->func = LSM_POLICY;
> 
> This patch generally looks really good to me with the exception of one
> thing...
> 
> We should only accept rules with these specified hook functions when an
> LSM that has measurement support is enabled. This messes up the ordering
> of your patch series but it could be as simple as doing this:
> 
> 			else if (IS_ENABLED(CONFIG_SECURITY_SELINUX) &&
> 				 strcmp(args[0].from, "LSM_STATE") == 0)
> 				 entry->func = LSM_STATE;
> 
> Or you could do something a little more complex, like what's done with
> CONFIG_IMA_LSM_RULES. You could create a CONFIG_IMA_MEASURE_LSM option
> that's default enabled but depends on CONFIG_SECURITY_SELINUX and then
> check for IS_ENABLED(CONFIG_IMA_MEASURE_LSM) in ima_parse_rule().
> 
> I'd personally opt for just placing the
> IS_ENABLED(CONFIG_SECURITY_SELINUX) check directly into
> ima_parse_rule().
> 

The LSM hook can be used by any security module (not just SELinux) to 
measure their data.

I have implemented measurement in SELinux to illustrate the usage. 
Maybe, I can add the check you have suggested for now and when more 
security modules start using this IMA policy additional checks can be 
added as appropriate.

thanks,
  -lakshmi


^ permalink raw reply

* Re: [PATCH v5 1/4] IMA: Add func to measure LSM state and policy
From: Tyler Hicks @ 2020-07-30 15:17 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: zohar, stephen.smalley.work, casey, sashal, jmorris,
	linux-integrity, selinux, linux-security-module, linux-kernel
In-Reply-To: <c2f0c4cc-67a9-d467-1b2c-7edaea47c9d6@linux.microsoft.com>

On 2020-07-30 08:15:34, Lakshmi Ramasubramanian wrote:
> On 7/30/20 8:02 AM, Tyler Hicks wrote:
> 
> > > diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
> > > index 07f033634b27..a0f5c39d9084 100644
> > > --- a/security/integrity/ima/ima_policy.c
> > > +++ b/security/integrity/ima/ima_policy.c
> > > @@ -442,13 +442,20 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
> > >   {
> > >   	int i;
> > > -	if (func == KEY_CHECK) {
> > > -		return (rule->flags & IMA_FUNC) && (rule->func == func) &&
> > > -		       ima_match_keyring(rule, keyring, cred);
> > > -	}
> > >   	if ((rule->flags & IMA_FUNC) &&
> > >   	    (rule->func != func && func != POST_SETATTR))
> > >   		return false;
> > > +
> > > +	switch (func) {
> > > +	case KEY_CHECK:
> > > +		return ima_match_keyring(rule, keyring, cred);
> > > +	case LSM_STATE:
> > > +	case LSM_POLICY:
> > > +		return true;
> > > +	default:
> > > +		break;
> > > +	}
> > > +
> > >   	if ((rule->flags & IMA_MASK) &&
> > >   	    (rule->mask != mask && func != POST_SETATTR))
> > >   		return false;
> > > @@ -1044,6 +1051,18 @@ static bool ima_validate_rule(struct ima_rule_entry *entry)
> > >   		if (ima_rule_contains_lsm_cond(entry))
> > >   			return false;
> > > +		break;
> > > +	case LSM_STATE:
> > > +	case LSM_POLICY:
> > > +		if (entry->action & ~(MEASURE | DONT_MEASURE))
> > > +			return false;
> > > +
> > > +		if (entry->flags & ~(IMA_FUNC | IMA_PCR))
> > > +			return false;
> > > +
> > > +		if (ima_rule_contains_lsm_cond(entry))
> > > +			return false;
> > > +
> > >   		break;
> > >   	default:
> > >   		return false;
> > > @@ -1176,6 +1195,10 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
> > >   				entry->func = KEXEC_CMDLINE;
> > >   			else if (strcmp(args[0].from, "KEY_CHECK") == 0)
> > >   				entry->func = KEY_CHECK;
> > > +			else if (strcmp(args[0].from, "LSM_STATE") == 0)
> > > +				entry->func = LSM_STATE;
> > > +			else if (strcmp(args[0].from, "LSM_POLICY") == 0)
> > > +				entry->func = LSM_POLICY;
> > 
> > This patch generally looks really good to me with the exception of one
> > thing...
> > 
> > We should only accept rules with these specified hook functions when an
> > LSM that has measurement support is enabled. This messes up the ordering
> > of your patch series but it could be as simple as doing this:
> > 
> > 			else if (IS_ENABLED(CONFIG_SECURITY_SELINUX) &&
> > 				 strcmp(args[0].from, "LSM_STATE") == 0)
> > 				 entry->func = LSM_STATE;
> > 
> > Or you could do something a little more complex, like what's done with
> > CONFIG_IMA_LSM_RULES. You could create a CONFIG_IMA_MEASURE_LSM option
> > that's default enabled but depends on CONFIG_SECURITY_SELINUX and then
> > check for IS_ENABLED(CONFIG_IMA_MEASURE_LSM) in ima_parse_rule().
> > 
> > I'd personally opt for just placing the
> > IS_ENABLED(CONFIG_SECURITY_SELINUX) check directly into
> > ima_parse_rule().
> > 
> 
> The LSM hook can be used by any security module (not just SELinux) to
> measure their data.
> 
> I have implemented measurement in SELinux to illustrate the usage. Maybe, I
> can add the check you have suggested for now and when more security modules
> start using this IMA policy additional checks can be added as appropriate.

Yes, that's what I envision.

The main idea is that there's negative feedback to userspace when IMA
can't possibly do anything with an LSM_STATE/LSM_POLICY rule.

Tyler

> 
> thanks,
>  -lakshmi

^ permalink raw reply

* Re: [PATCH v5 1/4] IMA: Add func to measure LSM state and policy
From: Casey Schaufler @ 2020-07-30 16:19 UTC (permalink / raw)
  To: Tyler Hicks, Lakshmi Ramasubramanian
  Cc: zohar, stephen.smalley.work, sashal, jmorris, linux-integrity,
	selinux, linux-security-module, linux-kernel, Casey Schaufler
In-Reply-To: <20200730150228.GV4181@sequoia>

On 7/30/2020 8:02 AM, Tyler Hicks wrote:
> On 2020-07-29 20:47:21, Lakshmi Ramasubramanian wrote:
>> Critical data structures of security modules need to be measured to
>> enable an attestation service to verify if the configuration and
>> policies for the security modules have been setup correctly and
>> that they haven't been tampered with at runtime. A new IMA policy is
>> required for handling this measurement.
>>
>> Define two new IMA policy func namely LSM_STATE and LSM_POLICY to
>> measure the state and the policy provided by the security modules.

If, as you suggest below, this is SELinux specific,
these should be SELINUX_STATE and SELINUX_POLICY.
It makes me very uncomfortable when I see LSM used
in cases where SELinux is required. The LSM is supposed
to be an agnostic interface, so if you need to throw

	if (IS_ENABLED(CONFIG_SECURITY_SELINUX) &&

into the IMA code you're clearly not thinking in terms
of the LSM layer. I have no problem with seeing SELinux
oriented and/or specific code in IMA if that's what you want.
Just don't call it LSM.

>> Update ima_match_rules() and ima_validate_rule() to check for
>> the new func and ima_parse_rule() to handle the new func.
>>
>> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
>> ---
>>  Documentation/ABI/testing/ima_policy |  9 ++++++++
>>  security/integrity/ima/ima.h         |  2 ++
>>  security/integrity/ima/ima_api.c     |  2 +-
>>  security/integrity/ima/ima_policy.c  | 31 ++++++++++++++++++++++++----
>>  4 files changed, 39 insertions(+), 5 deletions(-)
>>
>> diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
>> index cd572912c593..b7c7fb548c0c 100644
>> --- a/Documentation/ABI/testing/ima_policy
>> +++ b/Documentation/ABI/testing/ima_policy
>> @@ -30,6 +30,7 @@ Description:
>>  				[FIRMWARE_CHECK]
>>  				[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
>>  				[KEXEC_CMDLINE] [KEY_CHECK]
>> +				[LSM_STATE] [LSM_POLICY]
>>  			mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
>>  			       [[^]MAY_EXEC]
>>  			fsmagic:= hex value
>> @@ -125,3 +126,11 @@ Description:
>>  		keys added to .builtin_trusted_keys or .ima keyring:
>>  
>>  			measure func=KEY_CHECK keyrings=.builtin_trusted_keys|.ima
>> +
>> +		Example of measure rule using LSM_STATE to measure LSM state:
>> +
>> +			measure func=LSM_STATE template=ima-buf
>> +
>> +		Example of measure rule using LSM_POLICY to measure LSM policy:
>> +
>> +			measure func=LSM_POLICY template=ima-ng
>> diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
>> index 38043074ce5e..1b5f4b2f17d0 100644
>> --- a/security/integrity/ima/ima.h
>> +++ b/security/integrity/ima/ima.h
>> @@ -200,6 +200,8 @@ static inline unsigned int ima_hash_key(u8 *digest)
>>  	hook(POLICY_CHECK, policy)			\
>>  	hook(KEXEC_CMDLINE, kexec_cmdline)		\
>>  	hook(KEY_CHECK, key)				\
>> +	hook(LSM_STATE, lsm_state)			\
>> +	hook(LSM_POLICY, lsm_policy)			\
>>  	hook(MAX_CHECK, none)
>>  
>>  #define __ima_hook_enumify(ENUM, str)	ENUM,
>> diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
>> index 4f39fb93f278..8c8b4e4a6493 100644
>> --- a/security/integrity/ima/ima_api.c
>> +++ b/security/integrity/ima/ima_api.c
>> @@ -176,7 +176,7 @@ void ima_add_violation(struct file *file, const unsigned char *filename,
>>   *		subj=, obj=, type=, func=, mask=, fsmagic=
>>   *	subj,obj, and type: are LSM specific.
>>   *	func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
>> - *	| KEXEC_CMDLINE | KEY_CHECK
>> + *	| KEXEC_CMDLINE | KEY_CHECK | LSM_STATE | LSM_POLICY
>>   *	mask: contains the permission mask
>>   *	fsmagic: hex value
>>   *
>> diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
>> index 07f033634b27..a0f5c39d9084 100644
>> --- a/security/integrity/ima/ima_policy.c
>> +++ b/security/integrity/ima/ima_policy.c
>> @@ -442,13 +442,20 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
>>  {
>>  	int i;
>>  
>> -	if (func == KEY_CHECK) {
>> -		return (rule->flags & IMA_FUNC) && (rule->func == func) &&
>> -		       ima_match_keyring(rule, keyring, cred);
>> -	}
>>  	if ((rule->flags & IMA_FUNC) &&
>>  	    (rule->func != func && func != POST_SETATTR))
>>  		return false;
>> +
>> +	switch (func) {
>> +	case KEY_CHECK:
>> +		return ima_match_keyring(rule, keyring, cred);
>> +	case LSM_STATE:
>> +	case LSM_POLICY:
>> +		return true;
>> +	default:
>> +		break;
>> +	}
>> +
>>  	if ((rule->flags & IMA_MASK) &&
>>  	    (rule->mask != mask && func != POST_SETATTR))
>>  		return false;
>> @@ -1044,6 +1051,18 @@ static bool ima_validate_rule(struct ima_rule_entry *entry)
>>  		if (ima_rule_contains_lsm_cond(entry))
>>  			return false;
>>  
>> +		break;
>> +	case LSM_STATE:
>> +	case LSM_POLICY:
>> +		if (entry->action & ~(MEASURE | DONT_MEASURE))
>> +			return false;
>> +
>> +		if (entry->flags & ~(IMA_FUNC | IMA_PCR))
>> +			return false;
>> +
>> +		if (ima_rule_contains_lsm_cond(entry))
>> +			return false;
>> +
>>  		break;
>>  	default:
>>  		return false;
>> @@ -1176,6 +1195,10 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
>>  				entry->func = KEXEC_CMDLINE;
>>  			else if (strcmp(args[0].from, "KEY_CHECK") == 0)
>>  				entry->func = KEY_CHECK;
>> +			else if (strcmp(args[0].from, "LSM_STATE") == 0)
>> +				entry->func = LSM_STATE;
>> +			else if (strcmp(args[0].from, "LSM_POLICY") == 0)
>> +				entry->func = LSM_POLICY;
> This patch generally looks really good to me with the exception of one
> thing...
>
> We should only accept rules with these specified hook functions when an
> LSM that has measurement support is enabled. This messes up the ordering
> of your patch series but it could be as simple as doing this:
>
> 			else if (IS_ENABLED(CONFIG_SECURITY_SELINUX) &&
> 				 strcmp(args[0].from, "LSM_STATE") == 0)
> 				 entry->func = LSM_STATE;
>
> Or you could do something a little more complex, like what's done with
> CONFIG_IMA_LSM_RULES. You could create a CONFIG_IMA_MEASURE_LSM option
> that's default enabled but depends on CONFIG_SECURITY_SELINUX and then
> check for IS_ENABLED(CONFIG_IMA_MEASURE_LSM) in ima_parse_rule().
>
> I'd personally opt for just placing the
> IS_ENABLED(CONFIG_SECURITY_SELINUX) check directly into
> ima_parse_rule().
>
> Tyler
>
>>  			else
>>  				result = -EINVAL;
>>  			if (!result)
>> -- 
>> 2.27.0

^ permalink raw reply

* Re: [PATCH v5 1/4] IMA: Add func to measure LSM state and policy
From: Lakshmi Ramasubramanian @ 2020-07-30 16:33 UTC (permalink / raw)
  To: Casey Schaufler, Tyler Hicks
  Cc: zohar, stephen.smalley.work, sashal, jmorris, linux-integrity,
	selinux, linux-security-module, linux-kernel
In-Reply-To: <b4428195-7a68-365d-a792-2855609c2221@schaufler-ca.com>

On 7/30/20 9:19 AM, Casey Schaufler wrote:

>>> Critical data structures of security modules need to be measured to
>>> enable an attestation service to verify if the configuration and
>>> policies for the security modules have been setup correctly and
>>> that they haven't been tampered with at runtime. A new IMA policy is
>>> required for handling this measurement.
>>>
>>> Define two new IMA policy func namely LSM_STATE and LSM_POLICY to
>>> measure the state and the policy provided by the security modules.
> 
> If, as you suggest below, this is SELinux specific,
> these should be SELINUX_STATE and SELINUX_POLICY.
> It makes me very uncomfortable when I see LSM used
> in cases where SELinux is required. The LSM is supposed
> to be an agnostic interface, so if you need to throw
> 
> 	if (IS_ENABLED(CONFIG_SECURITY_SELINUX) &&
> 
> into the IMA code you're clearly not thinking in terms
> of the LSM layer. I have no problem with seeing SELinux
> oriented and/or specific code in IMA if that's what you want.
> Just don't call it LSM.

The hook defined in IMA is not SELinux specific - it is generic enough 
to be used by any security module to measure their STATE and POLICY.

I have implemented the measurement for SELinux to illustrate the usage.

Tyler's suggestion was to allow this IMA policy only when component(s) 
that are using it are also enabled.

  -lakshmi



^ permalink raw reply

* Re: [PATCH] watch_queue: Limit the number of watches a user can hold
From: David Howells @ 2020-07-30 17:19 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, jarkko.sakkinen, keyrings, linux-security-module,
	linux-fsdevel, linux-kernel
In-Reply-To: <439876.1596106009@warthog.procyon.org.uk>

David Howells <dhowells@redhat.com> wrote:

> Could you consider taking this patch as a bugfix since the problem exists
> already in upstream code?

Alternatively, I can include it in a set with the mount notifications.

David


^ permalink raw reply

* Re: [PATCH v5 4/4] IMA: Handle early boot data measurement
From: Lakshmi Ramasubramanian @ 2020-07-30 18:02 UTC (permalink / raw)
  To: zohar, stephen.smalley.work, casey
  Cc: tyhicks, sashal, jmorris, linux-integrity, selinux,
	linux-security-module, linux-kernel
In-Reply-To: <20200730034724.3298-5-nramas@linux.microsoft.com>

On 7/29/20 8:47 PM, Lakshmi Ramasubramanian wrote:

Hi Tyler,

> diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig
> index 080c53545ff0..86cba844f73c 100644
> --- a/security/integrity/ima/Kconfig
> +++ b/security/integrity/ima/Kconfig
> @@ -322,10 +322,9 @@ config IMA_MEASURE_ASYMMETRIC_KEYS
>   	depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
>   	default y
>   
> -config IMA_QUEUE_EARLY_BOOT_KEYS
> +config IMA_QUEUE_EARLY_BOOT_DATA
>   	bool
> -	depends on IMA_MEASURE_ASYMMETRIC_KEYS
> -	depends on SYSTEM_TRUSTED_KEYRING
> +	depends on SECURITY || (IMA_MEASURE_ASYMMETRIC_KEYS && SYSTEM_TRUSTED_KEYRING)
>   	default y
>   
Similar to the change you'd suggested for validating LSM_STATE and 
LSM_POLICY func, I think IMA_QUEUE_EARLY_BOOT_DATA config should be 
enabled for SECURITY_SELINUX.

depends on SECURITY_SELINUX ||
            (IMA_MEASURE_ASYMMETRIC_KEYS && SYSTEM_TRUSTED_KEYRING)

And, when more security modules are added update this CONFIG as appropriate.

Does that sound okay?

  -lakshmi

^ permalink raw reply

* Re: [PATCH v5 4/4] IMA: Handle early boot data measurement
From: Tyler Hicks @ 2020-07-30 20:04 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: zohar, stephen.smalley.work, casey, sashal, jmorris,
	linux-integrity, selinux, linux-security-module, linux-kernel
In-Reply-To: <ea3bba66-9b21-b842-990b-2bf1e4ac2179@linux.microsoft.com>

On 2020-07-30 11:02:50, Lakshmi Ramasubramanian wrote:
> On 7/29/20 8:47 PM, Lakshmi Ramasubramanian wrote:
> 
> Hi Tyler,
> 
> > diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig
> > index 080c53545ff0..86cba844f73c 100644
> > --- a/security/integrity/ima/Kconfig
> > +++ b/security/integrity/ima/Kconfig
> > @@ -322,10 +322,9 @@ config IMA_MEASURE_ASYMMETRIC_KEYS
> >   	depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
> >   	default y
> > -config IMA_QUEUE_EARLY_BOOT_KEYS
> > +config IMA_QUEUE_EARLY_BOOT_DATA
> >   	bool
> > -	depends on IMA_MEASURE_ASYMMETRIC_KEYS
> > -	depends on SYSTEM_TRUSTED_KEYRING
> > +	depends on SECURITY || (IMA_MEASURE_ASYMMETRIC_KEYS && SYSTEM_TRUSTED_KEYRING)
> >   	default y
> Similar to the change you'd suggested for validating LSM_STATE and
> LSM_POLICY func, I think IMA_QUEUE_EARLY_BOOT_DATA config should be enabled
> for SECURITY_SELINUX.
> 
> depends on SECURITY_SELINUX ||
>            (IMA_MEASURE_ASYMMETRIC_KEYS && SYSTEM_TRUSTED_KEYRING)
> 
> And, when more security modules are added update this CONFIG as appropriate.
> 
> Does that sound okay?

Yes, I think so.

Tyler

> 
>  -lakshmi

^ permalink raw reply

* Re: [PATCH v19 22/23] LSM: Add /proc attr entry for full LSM context
From: Casey Schaufler @ 2020-07-30 20:44 UTC (permalink / raw)
  To: John Johansen, casey.schaufler, jmorris, linux-security-module,
	selinux
  Cc: linux-audit, keescook, penguin-kernel, paul, sds, linux-api,
	Casey Schaufler
In-Reply-To: <e885d90d-c873-5ab4-235d-6171f49f4ee4@canonical.com>

On 7/30/2020 3:03 AM, John Johansen wrote:
> On 7/24/20 1:32 PM, Casey Schaufler wrote:
>> Add an entry /proc/.../attr/context which displays the full
>> process security "context" in compound format:
>>         lsm1\0value\0lsm2\0value\0...
>> This entry is not writable.
>>
>> A security module may decide that its policy does not allow
>> this information to be displayed. In this case none of the
>> information will be displayed.
>>
>> Reviewed-by: Kees Cook <keescook@chromium.org>
>> Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
>> Cc: linux-api@vger.kernel.org
>> ---
>>  Documentation/security/lsm.rst       | 28 +++++++++++
>>  fs/proc/base.c                       |  1 +
>>  include/linux/lsm_hooks.h            |  6 +++
>>  security/apparmor/include/procattr.h |  2 +-
>>  security/apparmor/lsm.c              |  8 +++-
>>  security/apparmor/procattr.c         | 22 +++++----
>>  security/security.c                  | 70 ++++++++++++++++++++++++++++
>>  security/selinux/hooks.c             |  2 +-
>>  security/smack/smack_lsm.c           |  2 +-
>>  9 files changed, 126 insertions(+), 15 deletions(-)

<snip>

>>  
>>  /**
>> diff --git a/security/security.c b/security/security.c
>> index d35e578fa45b..bce6be720401 100644
>> --- a/security/security.c
>> +++ b/security/security.c
>> @@ -754,6 +754,48 @@ static void __init lsm_early_task(struct task_struct *task)
>>  		panic("%s: Early task alloc failed.\n", __func__);
>>  }
>>  
>> +/**
>> + * append_ctx - append a lsm/context pair to a compound context
>> + * @ctx: the existing compound context
>> + * @ctxlen: size of the old context, including terminating nul byte
>> + * @lsm: new lsm name, nul terminated
>> + * @new: new context, possibly nul terminated
>> + * @newlen: maximum size of @new
>> + *
>> + * replace @ctx with a new compound context, appending @newlsm and @new
>> + * to @ctx. On exit the new data replaces the old, which is freed.
>> + * @ctxlen is set to the new size, which includes a trailing nul byte.
>> + *
>> + * Returns 0 on success, -ENOMEM if no memory is available.
>> + */
>> +static int append_ctx(char **ctx, int *ctxlen, const char *lsm, char *new,
>> +		      int newlen)
>> +{
>> +	char *final;
>> +	size_t llen;
>> +
>> +	llen = strlen(lsm) + 1;
>> +	/*
>> +	 * A security module may or may not provide a trailing nul on
>> +	 * when returning a security context. There is no definition
>> +	 * of which it should be, and there are modules that do it
>> +	 * each way.
>> +	 */
>> +	newlen = strnlen(new, newlen) + 1;
>> +
>> +	final = kzalloc(*ctxlen + llen + newlen, GFP_KERNEL);
>> +	if (final == NULL)
>> +		return -ENOMEM;
>> +	if (*ctxlen)
>> +		memcpy(final, *ctx, *ctxlen);
>> +	memcpy(final + *ctxlen, lsm, llen);
>> +	memcpy(final + *ctxlen + llen, new, newlen);
> if @new doesn't have a newline appended at its end this will read 1 byte
> passed the end of the @new buffer. Nor will the result have a trailing
> \0 as expected unless we get lucky.

@new will never have a newline at the end. The trailing nul comes
from the allocation being done with kzalloc(). This function has to
be considered in the context of its caller.

>
>
>> +	kfree(*ctx);
>> +	*ctx = final;
>> +	*ctxlen = *ctxlen + llen + newlen;
>> +	return 0;
>> +}
>> +
>>  /*
>>   * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
>>   * can be accessed with:
>> @@ -2124,6 +2166,10 @@ int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
>>  				char **value)
>>  {
>>  	struct security_hook_list *hp;
>> +	char *final = NULL;
>> +	char *cp;
>> +	int rc = 0;
>> +	int finallen = 0;
> these are only used by context so they could be moved under its if, this
> is really just a style comment and I'll leave it up to you

Old coding habits die hard. Unless there's value to gain, I'll leave it
as is.

>
>>  	int display = lsm_task_display(current);
>>  	int slot = 0;
>>  
>> @@ -2151,6 +2197,30 @@ int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
>>  		return -ENOMEM;
>>  	}
>>  
>> +	if (!strcmp(name, "context")) {
>> +		hlist_for_each_entry(hp, &security_hook_heads.getprocattr,
>> +				     list) {
>> +			rc = hp->hook.getprocattr(p, "context", &cp);
>> +			if (rc == -EINVAL)
>> +				continue;
>> +			if (rc < 0) {
>> +				kfree(final);
>> +				return rc;
>> +			}
>> +			rc = append_ctx(&final, &finallen, hp->lsmid->lsm,
>> +					cp, rc);
>> +			kfree(cp);
>> +			if (rc < 0) {
>> +				kfree(final);
>> +				return rc;
>> +			}
>> +		}
>> +		if (final == NULL)
>> +			return -EINVAL;
>> +		*value = final;
>> +		return finallen;
>> +	}
>> +
>>  	hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
>>  		if (lsm != NULL && strcmp(lsm, hp->lsmid->lsm))
>>  			continue;
>> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
>> index c13c207c5da1..43d5c09b9a9e 100644
>> --- a/security/selinux/hooks.c
>> +++ b/security/selinux/hooks.c
>> @@ -6288,7 +6288,7 @@ static int selinux_getprocattr(struct task_struct *p,
>>  			goto bad;
>>  	}
>>  
>> -	if (!strcmp(name, "current"))
>> +	if (!strcmp(name, "current") || !strcmp(name, "context"))
>>  		sid = __tsec->sid;
>>  	else if (!strcmp(name, "prev"))
>>  		sid = __tsec->osid;
>> diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
>> index 6f0cdb40addc..d7bb6442f192 100644
>> --- a/security/smack/smack_lsm.c
>> +++ b/security/smack/smack_lsm.c
>> @@ -3463,7 +3463,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
>>  	char *cp;
>>  	int slen;
>>  
>> -	if (strcmp(name, "current") != 0)
>> +	if (strcmp(name, "current") != 0 && strcmp(name, "context") != 0)
>>  		return -EINVAL;
>>  
>>  	cp = kstrdup(skp->smk_known, GFP_KERNEL);
>>

^ permalink raw reply

* Re: [PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor
From: Andy Lutomirski @ 2020-07-30 20:54 UTC (permalink / raw)
  To: Madhavan T. Venkataraman
  Cc: Andy Lutomirski, Kernel Hardening, Linux API, linux-arm-kernel,
	Linux FS Devel, linux-integrity, LKML, LSM List, Oleg Nesterov,
	X86 ML
In-Reply-To: <6540b4b7-3f70-adbf-c922-43886599713a@linux.microsoft.com>

On Thu, Jul 30, 2020 at 7:24 AM Madhavan T. Venkataraman
<madvenka@linux.microsoft.com> wrote:
>
> Sorry for the delay. I just wanted to think about this a little.
> In this email, I will respond to your first suggestion. I will
> respond to the rest in separate emails if that is alright with
> you.
>
> On 7/28/20 12:31 PM, Andy Lutomirski wrote:
>
> On Jul 28, 2020, at 6:11 AM, madvenka@linux.microsoft.com wrote:
>
> From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
>
> The kernel creates the trampoline mapping without any permissions. When
> the trampoline is executed by user code, a page fault happens and the
> kernel gets control. The kernel recognizes that this is a trampoline
> invocation. It sets up the user registers based on the specified
> register context, and/or pushes values on the user stack based on the
> specified stack context, and sets the user PC to the requested target
> PC. When the kernel returns, execution continues at the target PC.
> So, the kernel does the work of the trampoline on behalf of the
> application.
>
> This is quite clever, but now I’m wondering just how much kernel help
> is really needed. In your series, the trampoline is an non-executable
> page.  I can think of at least two alternative approaches, and I'd
> like to know the pros and cons.
>
> 1. Entirely userspace: a return trampoline would be something like:
>
> 1:
> pushq %rax
> pushq %rbc
> pushq %rcx
> ...
> pushq %r15
> movq %rsp, %rdi # pointer to saved regs
> leaq 1b(%rip), %rsi # pointer to the trampoline itself
> callq trampoline_handler # see below
>
> You would fill a page with a bunch of these, possibly compacted to get
> more per page, and then you would remap as many copies as needed.  The
> 'callq trampoline_handler' part would need to be a bit clever to make
> it continue to work despite this remapping.  This will be *much*
> faster than trampfd. How much of your use case would it cover?  For
> the inverse, it's not too hard to write a bit of asm to set all
> registers and jump somewhere.
>
> Let me state what I have understood about this suggestion. Correct me if
> I get anything wrong. If you don't mind, I will also take the liberty
> of generalizing and paraphrasing your suggestion.
>
> The goal is to create two page mappings that are adjacent to each other:
>
> - a code page that contains template code for a trampoline. Since the
>  template code would tend to be small in size, pack as many of them
>  as possible within a page to conserve memory. In other words, create
>  an array of the template code fragments. Each element in the array
>  would be used for one trampoline instance.
>
> - a data page that contains an array of data elements. Corresponding
>  to each code element in the code page, there would be a data element
>  in the data page that would contain data that is specific to a
>  trampoline instance.
>
> - Code will access data using PC-relative addressing.
>
> The management of the code pages and allocation for each trampoline
> instance would all be done in user space.
>
> Is this the general idea?

Yes.

>
> Creating a code page
> --------------------
>
> We can do this in one of the following ways:
>
> - Allocate a writable page at run time, write the template code into
>   the page and have execute permissions on the page.
>
> - Allocate a writable page at run time, write the template code into
>   the page and remap the page with just execute permissions.
>
> - Allocate a writable page at run time, write the template code into
>   the page, write the page into a temporary file and map the file with
>   execute permissions.
>
> - Include the template code in a code page at build time itself and
>   just remap the code page each time you need a code page.

This latter part shouldn't need any special permissions as far as I know.

>
> Pros and Cons
> -------------
>
> As long as the OS provides the functionality to do this and the security
> subsystem in the OS allows the actions, this is totally feasible. If not,
> we need something like trampfd.
>
> As Floren mentioned, libffi does implement something like this for MACH.
>
> In fact, in my libffi changes, I use trampfd only after all the other methods
> have failed because of security settings.
>
> But the above approach only solves the problem for this simple type of
> trampoline. It does not provide a framework for addressing more complex types
> or even other forms of dynamic code.
>
> Also, each application would need to implement this solution for itself
> as opposed to relying on one implementation provided by the kernel.

I would argue this is a benefit.  If the whole implementation is in
userspace, there is no ABI compatibility issue.  The user program
contains the trampoline code and the code that uses it.

>
> Trampfd-based solution
> ----------------------
>
> I outlined an enhancement to trampfd in a response to David Laight. In this
> enhancement, the kernel is the one that would set up the code page.
>
> The kernel would call an arch-specific support function to generate the
> code required to load registers, push values on the stack and jump to a PC
> for a trampoline instance based on its current context. The trampoline
> instance data could be baked into the code.
>
> My initial idea was to only have one trampoline instance per page. But I
> think I can implement multiple instances per page. I just have to manage
> the trampfd file private data and VMA private data accordingly to map an
> element in a code page to its trampoline object.
>
> The two approaches are similar except for the detail about who sets up
> and manages the trampoline pages. In both approaches, the performance problem
> is addressed. But trampfd can be used even when security settings are
> restrictive.
>
> Is my solution acceptable?

Perhaps.  In general, before adding a new ABI to the kernel, it's nice
to understand how it's better than doing the same thing in userspace.
Saying that it's easier for user code to work with if it's in the
kernel isn't necessarily an adequate justification.

Why would remapping two pages of actual application text ever fail?

^ permalink raw reply

* Re: [PATCH v19 22/23] LSM: Add /proc attr entry for full LSM context
From: John Johansen @ 2020-07-30 20:57 UTC (permalink / raw)
  To: Casey Schaufler, casey.schaufler, jmorris, linux-security-module,
	selinux
  Cc: linux-audit, keescook, penguin-kernel, paul, sds, linux-api
In-Reply-To: <705fb82d-ad7a-2874-59ed-ba6bc7ae3722@schaufler-ca.com>

On 7/30/20 1:44 PM, Casey Schaufler wrote:
> On 7/30/2020 3:03 AM, John Johansen wrote:
>> On 7/24/20 1:32 PM, Casey Schaufler wrote:
>>> Add an entry /proc/.../attr/context which displays the full
>>> process security "context" in compound format:
>>>         lsm1\0value\0lsm2\0value\0...
>>> This entry is not writable.
>>>
>>> A security module may decide that its policy does not allow
>>> this information to be displayed. In this case none of the
>>> information will be displayed.
>>>
>>> Reviewed-by: Kees Cook <keescook@chromium.org>
>>> Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
>>> Cc: linux-api@vger.kernel.org
>>> ---
>>>  Documentation/security/lsm.rst       | 28 +++++++++++
>>>  fs/proc/base.c                       |  1 +
>>>  include/linux/lsm_hooks.h            |  6 +++
>>>  security/apparmor/include/procattr.h |  2 +-
>>>  security/apparmor/lsm.c              |  8 +++-
>>>  security/apparmor/procattr.c         | 22 +++++----
>>>  security/security.c                  | 70 ++++++++++++++++++++++++++++
>>>  security/selinux/hooks.c             |  2 +-
>>>  security/smack/smack_lsm.c           |  2 +-
>>>  9 files changed, 126 insertions(+), 15 deletions(-)
> 
> <snip>
> 
>>>  
>>>  /**
>>> diff --git a/security/security.c b/security/security.c
>>> index d35e578fa45b..bce6be720401 100644
>>> --- a/security/security.c
>>> +++ b/security/security.c
>>> @@ -754,6 +754,48 @@ static void __init lsm_early_task(struct task_struct *task)
>>>  		panic("%s: Early task alloc failed.\n", __func__);
>>>  }
>>>  
>>> +/**
>>> + * append_ctx - append a lsm/context pair to a compound context
>>> + * @ctx: the existing compound context
>>> + * @ctxlen: size of the old context, including terminating nul byte
>>> + * @lsm: new lsm name, nul terminated
>>> + * @new: new context, possibly nul terminated
>>> + * @newlen: maximum size of @new
>>> + *
>>> + * replace @ctx with a new compound context, appending @newlsm and @new
>>> + * to @ctx. On exit the new data replaces the old, which is freed.
>>> + * @ctxlen is set to the new size, which includes a trailing nul byte.
>>> + *
>>> + * Returns 0 on success, -ENOMEM if no memory is available.
>>> + */
>>> +static int append_ctx(char **ctx, int *ctxlen, const char *lsm, char *new,
>>> +		      int newlen)
>>> +{
>>> +	char *final;
>>> +	size_t llen;
>>> +
>>> +	llen = strlen(lsm) + 1;
>>> +	/*
>>> +	 * A security module may or may not provide a trailing nul on
>>> +	 * when returning a security context. There is no definition
>>> +	 * of which it should be, and there are modules that do it
>>> +	 * each way.
>>> +	 */
>>> +	newlen = strnlen(new, newlen) + 1;
>>> +
>>> +	final = kzalloc(*ctxlen + llen + newlen, GFP_KERNEL);
>>> +	if (final == NULL)
>>> +		return -ENOMEM;
>>> +	if (*ctxlen)
>>> +		memcpy(final, *ctx, *ctxlen);
>>> +	memcpy(final + *ctxlen, lsm, llen);
>>> +	memcpy(final + *ctxlen + llen, new, newlen);
>> if @new doesn't have a newline appended at its end this will read 1 byte
>> passed the end of the @new buffer. Nor will the result have a trailing
>> \0 as expected unless we get lucky.
> 
> @new will never have a newline at the end. The trailing nul comes
> from the allocation being done with kzalloc(). This function has to
> be considered in the context of its caller.
> 

ugh, sorry not trailing newline, I meant trailing \0. The problem isn't
the kzalloc, the target has the space. It is the source @new. It is
dangerous to assume that the @new buffer has a null byte after its
declared length. Which is potentially what we are doing if @new
doesn't have an embedded null byte. In that case strlen(new, newlen)
will then return newlen and we add 1 to it.

which means in the memcpy we are copying an extra byte beyond what
was declared to exist in @new.

>>
>>
>>> +	kfree(*ctx);
>>> +	*ctx = final;
>>> +	*ctxlen = *ctxlen + llen + newlen;
>>> +	return 0;
>>> +}
>>> +
>>>  /*
>>>   * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
>>>   * can be accessed with:
>>> @@ -2124,6 +2166,10 @@ int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
>>>  				char **value)
>>>  {
>>>  	struct security_hook_list *hp;
>>> +	char *final = NULL;
>>> +	char *cp;
>>> +	int rc = 0;
>>> +	int finallen = 0;
>> these are only used by context so they could be moved under its if, this
>> is really just a style comment and I'll leave it up to you
> 
> Old coding habits die hard. Unless there's value to gain, I'll leave it
> as is.
> 
>>
>>>  	int display = lsm_task_display(current);
>>>  	int slot = 0;
>>>  
>>> @@ -2151,6 +2197,30 @@ int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
>>>  		return -ENOMEM;
>>>  	}
>>>  
>>> +	if (!strcmp(name, "context")) {
>>> +		hlist_for_each_entry(hp, &security_hook_heads.getprocattr,
>>> +				     list) {
>>> +			rc = hp->hook.getprocattr(p, "context", &cp);
>>> +			if (rc == -EINVAL)
>>> +				continue;
>>> +			if (rc < 0) {
>>> +				kfree(final);
>>> +				return rc;
>>> +			}
>>> +			rc = append_ctx(&final, &finallen, hp->lsmid->lsm,
>>> +					cp, rc);
>>> +			kfree(cp);
>>> +			if (rc < 0) {
>>> +				kfree(final);
>>> +				return rc;
>>> +			}
>>> +		}
>>> +		if (final == NULL)
>>> +			return -EINVAL;
>>> +		*value = final;
>>> +		return finallen;
>>> +	}
>>> +
>>>  	hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
>>>  		if (lsm != NULL && strcmp(lsm, hp->lsmid->lsm))
>>>  			continue;
>>> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
>>> index c13c207c5da1..43d5c09b9a9e 100644
>>> --- a/security/selinux/hooks.c
>>> +++ b/security/selinux/hooks.c
>>> @@ -6288,7 +6288,7 @@ static int selinux_getprocattr(struct task_struct *p,
>>>  			goto bad;
>>>  	}
>>>  
>>> -	if (!strcmp(name, "current"))
>>> +	if (!strcmp(name, "current") || !strcmp(name, "context"))
>>>  		sid = __tsec->sid;
>>>  	else if (!strcmp(name, "prev"))
>>>  		sid = __tsec->osid;
>>> diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
>>> index 6f0cdb40addc..d7bb6442f192 100644
>>> --- a/security/smack/smack_lsm.c
>>> +++ b/security/smack/smack_lsm.c
>>> @@ -3463,7 +3463,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
>>>  	char *cp;
>>>  	int slen;
>>>  
>>> -	if (strcmp(name, "current") != 0)
>>> +	if (strcmp(name, "current") != 0 && strcmp(name, "context") != 0)
>>>  		return -EINVAL;
>>>  
>>>  	cp = kstrdup(skp->smk_known, GFP_KERNEL);
>>>


^ permalink raw reply

* Re: [PATCH v19 22/23] LSM: Add /proc attr entry for full LSM context
From: Casey Schaufler @ 2020-07-30 22:22 UTC (permalink / raw)
  To: John Johansen, casey.schaufler, jmorris, linux-security-module,
	selinux
  Cc: linux-audit, keescook, penguin-kernel, paul, sds, linux-api,
	Casey Schaufler
In-Reply-To: <97330b2d-5447-cfef-b6d0-444249e671b7@canonical.com>

On 7/30/2020 1:57 PM, John Johansen wrote:
> On 7/30/20 1:44 PM, Casey Schaufler wrote:
>> On 7/30/2020 3:03 AM, John Johansen wrote:
>>> On 7/24/20 1:32 PM, Casey Schaufler wrote:
>>>> Add an entry /proc/.../attr/context which displays the full
>>>> process security "context" in compound format:
>>>>         lsm1\0value\0lsm2\0value\0...
>>>> This entry is not writable.
>>>>
>>>> A security module may decide that its policy does not allow
>>>> this information to be displayed. In this case none of the
>>>> information will be displayed.
>>>>
>>>> Reviewed-by: Kees Cook <keescook@chromium.org>
>>>> Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
>>>> Cc: linux-api@vger.kernel.org
>>>> ---
>>>>  Documentation/security/lsm.rst       | 28 +++++++++++
>>>>  fs/proc/base.c                       |  1 +
>>>>  include/linux/lsm_hooks.h            |  6 +++
>>>>  security/apparmor/include/procattr.h |  2 +-
>>>>  security/apparmor/lsm.c              |  8 +++-
>>>>  security/apparmor/procattr.c         | 22 +++++----
>>>>  security/security.c                  | 70 ++++++++++++++++++++++++++++
>>>>  security/selinux/hooks.c             |  2 +-
>>>>  security/smack/smack_lsm.c           |  2 +-
>>>>  9 files changed, 126 insertions(+), 15 deletions(-)
>> <snip>
>>
>>>>  
>>>>  /**
>>>> diff --git a/security/security.c b/security/security.c
>>>> index d35e578fa45b..bce6be720401 100644
>>>> --- a/security/security.c
>>>> +++ b/security/security.c
>>>> @@ -754,6 +754,48 @@ static void __init lsm_early_task(struct task_struct *task)
>>>>  		panic("%s: Early task alloc failed.\n", __func__);
>>>>  }
>>>>  
>>>> +/**
>>>> + * append_ctx - append a lsm/context pair to a compound context
>>>> + * @ctx: the existing compound context
>>>> + * @ctxlen: size of the old context, including terminating nul byte
>>>> + * @lsm: new lsm name, nul terminated
>>>> + * @new: new context, possibly nul terminated
>>>> + * @newlen: maximum size of @new
>>>> + *
>>>> + * replace @ctx with a new compound context, appending @newlsm and @new
>>>> + * to @ctx. On exit the new data replaces the old, which is freed.
>>>> + * @ctxlen is set to the new size, which includes a trailing nul byte.
>>>> + *
>>>> + * Returns 0 on success, -ENOMEM if no memory is available.
>>>> + */
>>>> +static int append_ctx(char **ctx, int *ctxlen, const char *lsm, char *new,
>>>> +		      int newlen)
>>>> +{
>>>> +	char *final;
>>>> +	size_t llen;
>>>> +
>>>> +	llen = strlen(lsm) + 1;
>>>> +	/*
>>>> +	 * A security module may or may not provide a trailing nul on
>>>> +	 * when returning a security context. There is no definition
>>>> +	 * of which it should be, and there are modules that do it
>>>> +	 * each way.
>>>> +	 */
>>>> +	newlen = strnlen(new, newlen) + 1;
>>>> +
>>>> +	final = kzalloc(*ctxlen + llen + newlen, GFP_KERNEL);
>>>> +	if (final == NULL)
>>>> +		return -ENOMEM;
>>>> +	if (*ctxlen)
>>>> +		memcpy(final, *ctx, *ctxlen);
>>>> +	memcpy(final + *ctxlen, lsm, llen);
>>>> +	memcpy(final + *ctxlen + llen, new, newlen);
>>> if @new doesn't have a newline appended at its end this will read 1 byte
>>> passed the end of the @new buffer. Nor will the result have a trailing
>>> \0 as expected unless we get lucky.
>> @new will never have a newline at the end. The trailing nul comes
>> from the allocation being done with kzalloc(). This function has to
>> be considered in the context of its caller.
>>
> ugh, sorry not trailing newline, I meant trailing \0. The problem isn't
> the kzalloc, the target has the space. It is the source @new. It is
> dangerous to assume that the @new buffer has a null byte after its
> declared length. Which is potentially what we are doing if @new
> doesn't have an embedded null byte. In that case strlen(new, newlen)
> will then return newlen and we add 1 to it.
>
> which means in the memcpy we are copying an extra byte beyond what
> was declared to exist in @new.

You're right. Good point. Fix coming.
??


^ permalink raw reply

* Re: [PATCH bpf-next v7 5/7] bpf: Implement bpf_local_storage for inodes
From: Martin KaFai Lau @ 2020-07-31  1:08 UTC (permalink / raw)
  To: KP Singh
  Cc: linux-kernel, bpf, linux-security-module, Alexei Starovoitov,
	Daniel Borkmann, Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200730140716.404558-6-kpsingh@chromium.org>

On Thu, Jul 30, 2020 at 04:07:14PM +0200, KP Singh wrote:
> From: KP Singh <kpsingh@google.com>
> 
> Similar to bpf_local_storage for sockets, add local storage for inodes.
> The life-cycle of storage is managed with the life-cycle of the inode.
> i.e. the storage is destroyed along with the owning inode.
> 
> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
> security blob which are now stackable and can co-exist with other LSMs.
> 

[ ... ]

> +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
> +					 void *value, u64 map_flags)
> +{
> +	struct bpf_local_storage_data *sdata;
> +	struct file *f;
> +	int fd;
> +
> +	fd = *(int *)key;
> +	f = fcheck(fd);
> +	if (!f)
> +		return -EINVAL;
> +
> +	sdata = bpf_local_storage_update(f->f_inode, map, value, map_flags);
n00b question.  inode will not be going away here (like another
task calls close(fd))?  and there is no chance that bpf_local_storage_update()
will be adding new storage after bpf_inode_storage_free() was called?

A few comments will be useful here.

> +	return PTR_ERR_OR_ZERO(sdata);
> +}
> +
> +static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
> +{
> +	struct bpf_local_storage_data *sdata;
> +
> +	sdata = inode_storage_lookup(inode, map, false);
> +	if (!sdata)
> +		return -ENOENT;
> +
> +	bpf_selem_unlink(SELEM(sdata));
> +
> +	return 0;
> +}
> +
> +static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
> +{
> +	struct file *f;
> +	int fd;
> +
> +	fd = *(int *)key;
> +	f = fcheck(fd);
> +	if (!f)
> +		return -EINVAL;
> +
> +	return inode_storage_delete(f->f_inode, map);
> +}
> +
> +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
> +	   void *, value, u64, flags)
> +{
> +	struct bpf_local_storage_data *sdata;
> +
> +	if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
> +		return (unsigned long)NULL;
> +
> +	sdata = inode_storage_lookup(inode, map, true);
> +	if (sdata)
> +		return (unsigned long)sdata->data;
> +
> +	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> +		sdata = bpf_local_storage_update(inode, map, value,
> +						 BPF_NOEXIST);
The same question here

> +		return IS_ERR(sdata) ? (unsigned long)NULL :
> +					     (unsigned long)sdata->data;
> +	}
> +
> +	return (unsigned long)NULL;
> +}
> +
> +BPF_CALL_2(bpf_inode_storage_delete,
> +	   struct bpf_map *, map, struct inode *, inode)
> +{
> +	return inode_storage_delete(inode, map);
> +}
> +
> +static int notsupp_get_next_key(struct bpf_map *map, void *key,
> +				void *next_key)
> +{
> +	return -ENOTSUPP;
> +}
> +
> +static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
> +{
> +	struct bpf_local_storage_map *smap;
> +
> +	smap = bpf_local_storage_map_alloc(attr);
> +	if (IS_ERR(smap))
> +		return ERR_CAST(smap);
> +
> +	smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
> +	return &smap->map;
> +}
> +
> +static void inode_storage_map_free(struct bpf_map *map)
> +{
> +	struct bpf_local_storage_map *smap;
> +
> +	smap = (struct bpf_local_storage_map *)map;
> +	bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
> +	bpf_local_storage_map_free(smap);
> +}
> +
> +static int sk_storage_map_btf_id;
> +const struct bpf_map_ops inode_storage_map_ops = {
> +	.map_alloc_check = bpf_local_storage_map_alloc_check,
> +	.map_alloc = inode_storage_map_alloc,
> +	.map_free = inode_storage_map_free,
> +	.map_get_next_key = notsupp_get_next_key,
> +	.map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
> +	.map_update_elem = bpf_fd_inode_storage_update_elem,
> +	.map_delete_elem = bpf_fd_inode_storage_delete_elem,
> +	.map_check_btf = bpf_local_storage_map_check_btf,
> +	.map_btf_name = "bpf_local_storage_map",
> +	.map_btf_id = &sk_storage_map_btf_id,
> +	.map_owner_storage_ptr = inode_storage_ptr,
> +};
> +
> +BTF_ID_LIST(bpf_inode_storage_get_btf_ids)
> +BTF_ID(struct, inode)
The ARG_PTR_TO_BTF_ID is the second arg instead of the first
arg in bpf_inode_storage_get_proto.
Does it just happen to work here without needing BTF_ID_UNUSED?

> +
> +const struct bpf_func_proto bpf_inode_storage_get_proto = {
> +	.func		= bpf_inode_storage_get,
> +	.gpl_only	= false,
> +	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
> +	.arg1_type	= ARG_CONST_MAP_PTR,
> +	.arg2_type	= ARG_PTR_TO_BTF_ID,
> +	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
> +	.arg4_type	= ARG_ANYTHING,
> +	.btf_id		= bpf_inode_storage_get_btf_ids,
> +};
> +
> +BTF_ID_LIST(bpf_inode_storage_delete_btf_ids)
> +BTF_ID(struct, inode)
> +
> +const struct bpf_func_proto bpf_inode_storage_delete_proto = {
> +	.func		= bpf_inode_storage_delete,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_CONST_MAP_PTR,
> +	.arg2_type	= ARG_PTR_TO_BTF_ID,
> +	.btf_id		= bpf_inode_storage_delete_btf_ids,
> +};

^ permalink raw reply

* Re: [PATCH bpf-next v7 5/7] bpf: Implement bpf_local_storage for inodes
From: KP Singh @ 2020-07-31 12:08 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: linux-kernel, bpf, linux-security-module, Alexei Starovoitov,
	Daniel Borkmann, Paul Turner, Jann Horn, Florent Revest
In-Reply-To: <20200731010822.fctk5lawnr3p7blf@kafai-mbp.dhcp.thefacebook.com>



On 31.07.20 03:08, Martin KaFai Lau wrote:
> On Thu, Jul 30, 2020 at 04:07:14PM +0200, KP Singh wrote:
>> From: KP Singh <kpsingh@google.com>
>>
>> Similar to bpf_local_storage for sockets, add local storage for inodes.
>> The life-cycle of storage is managed with the life-cycle of the inode.
>> i.e. the storage is destroyed along with the owning inode.
>>
>> The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the
>> security blob which are now stackable and can co-exist with other LSMs.
>>
> 
> [ ... ]
> 
>> +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
>> +					 void *value, u64 map_flags)
>> +{
>> +	struct bpf_local_storage_data *sdata;
>> +	struct file *f;
>> +	int fd;
>> +
>> +	fd = *(int *)key;
>> +	f = fcheck(fd);
>> +	if (!f)
>> +		return -EINVAL;
>> +
>> +	sdata = bpf_local_storage_update(f->f_inode, map, value, map_flags);
> n00b question.  inode will not be going away here (like another
> task calls close(fd))?  and there is no chance that bpf_local_storage_update()
> will be adding new storage after bpf_inode_storage_free() was called?

Good catch, I think we need to guard this update by grabbing a reference 
to the file here.

> 
> A few comments will be useful here.
> 
>> +	return PTR_ERR_OR_ZERO(sdata);
>> +}
>> +
>> +static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
>> +{
>> +	struct bpf_local_storage_data *sdata;
>> +
>> +	sdata = inode_storage_lookup(inode, map, false);
>> +	if (!sdata)
>> +		return -ENOENT;
>> +
>> +	bpf_selem_unlink(SELEM(sdata));
>> +
>> +	return 0;
>> +}
>> +
>> +static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
>> +{
>> +	struct file *f;
>> +	int fd;
>> +
>> +	fd = *(int *)key;
>> +	f = fcheck(fd);
>> +	if (!f)
>> +		return -EINVAL;
>> +
>> +	return inode_storage_delete(f->f_inode, map);
>> +}
>> +
>> +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
>> +	   void *, value, u64, flags)
>> +{
>> +	struct bpf_local_storage_data *sdata;
>> +
>> +	if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
>> +		return (unsigned long)NULL;
>> +
>> +	sdata = inode_storage_lookup(inode, map, true);
>> +	if (sdata)
>> +		return (unsigned long)sdata->data;
>> +
>> +	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>> +		sdata = bpf_local_storage_update(inode, map, value,
>> +						 BPF_NOEXIST);
> The same question here

This is slightly different. The helper gets called from the BPF program.
We are only allowing this from LSM hooks which have better guarantees
w.r.t the lifetime of the object unlike tracing programs.

I will add a comment that explains this. Once we have sleepable BPF we can
also grab a reference to the inode here.

> 
>> +		return IS_ERR(sdata) ? (unsigned long)NULL :
>> +					     (unsigned long)sdata->data;
>> +	}
>> +
>> +	return (unsigned long)NULL;
>> +}
>> +
>> +BPF_CALL_2(bpf_inode_storage_delete,
>> +	   struct bpf_map *, map, struct inode *, inode)
>> +{
>> +	return inode_storage_delete(inode, map);
>> +}
>> +
>> +static int notsupp_get_next_key(struct bpf_map *map, void *key,
>> +				void *next_key)
>> +{
>> +	return -ENOTSUPP;
>> +}
>> +
>> +static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
>> +{
>> +	struct bpf_local_storage_map *smap;
>> +
>> +	smap = bpf_local_storage_map_alloc(attr);
>> +	if (IS_ERR(smap))
>> +		return ERR_CAST(smap);
>> +
>> +	smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
>> +	return &smap->map;
>> +}
>> +
>> +static void inode_storage_map_free(struct bpf_map *map)
>> +{
>> +	struct bpf_local_storage_map *smap;
>> +
>> +	smap = (struct bpf_local_storage_map *)map;
>> +	bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
>> +	bpf_local_storage_map_free(smap);
>> +}
>> +
>> +static int sk_storage_map_btf_id;

This name needs to be fixed as well.

>> +const struct bpf_map_ops inode_storage_map_ops = {
>> +	.map_alloc_check = bpf_local_storage_map_alloc_check,
>> +	.map_alloc = inode_storage_map_alloc,
>> +	.map_free = inode_storage_map_free,
>> +	.map_get_next_key = notsupp_get_next_key,
>> +	.map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
>> +	.map_update_elem = bpf_fd_inode_storage_update_elem,
>> +	.map_delete_elem = bpf_fd_inode_storage_delete_elem,
>> +	.map_check_btf = bpf_local_storage_map_check_btf,
>> +	.map_btf_name = "bpf_local_storage_map",
>> +	.map_btf_id = &sk_storage_map_btf_id,
>> +	.map_owner_storage_ptr = inode_storage_ptr,
>> +};
>> +
>> +BTF_ID_LIST(bpf_inode_storage_get_btf_ids)
>> +BTF_ID(struct, inode)
> The ARG_PTR_TO_BTF_ID is the second arg instead of the first
> arg in bpf_inode_storage_get_proto.
> Does it just happen to work here without needing BTF_ID_UNUSED?


Yeah, this surprised me as to why it worked, so I did some debugging:


diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 9cd1428c7199..95e84bcf1a74 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -52,6 +52,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
        switch (func_id) {
        case BPF_FUNC_inode_storage_get:
+               pr_err("btf_ids[0]=%d\n", bpf_inode_storage_get_proto.btf_id[0]);
+               pr_err("btf_ids[1]=%d\n", bpf_inode_storage_get_proto.btf_id[1]);
                return &bpf_inode_storage_get_proto;
        case BPF_FUNC_inode_storage_delete:
                return &bpf_inode_storage_delete_proto;

./test_progs -t test_local_storage

[   21.694473] btf_ids[0]=915
[   21.694974] btf_ids[1]=915

btf  dump file /sys/kernel/btf/vmlinux | grep "STRUCT 'inode'"
"[915] STRUCT 'inode' size=984 vlen=48

So it seems like btf_id[0] and btf_id[1] are set to the BTF ID
for inode. Now I think this might just be a coincidence as
the next helper (bpf_inode_storage_delete) 
also has a BTF argument of type inode.

and sure enough if I call:

bpf_inode_storage_delete from my selftests program, 
it does not load:

arg#0 type is not a struct
Unrecognized arg#0 type PTR
; int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
0: (79) r6 = *(u64 *)(r1 +8)
func 'bpf_lsm_inode_unlink' arg1 has btf_id 912 type STRUCT 'dentry'
; __u32 pid = bpf_get_current_pid_tgid() >> 32;

[...]

So, The BTF_ID_UNUSED is actually needed here. I also added a call to
bpf_inode_storage_delete in my test to catch any issues with it.

After adding BTF_ID_UNUSED, the result is what we expect:

./test_progs -t test_local_storage
[   20.577223] btf_ids[0]=0
[   20.577702] btf_ids[1]=915

Thanks for noticing this! 

- KP

> 
>> +
>> +const struct bpf_func_proto bpf_inode_storage_get_proto = {
>> +	.func		= bpf_inode_storage_get,
>> +	.gpl_only	= false,
>> +	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
>> +	.arg1_type	= ARG_CONST_MAP_PTR,
>> +	.arg2_type	= ARG_PTR_TO_BTF_ID,
>> +	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
>> +	.arg4_type	= ARG_ANYTHING,
>> +	.btf_id		= bpf_inode_storage_get_btf_ids,
>> +};
>> +
>> +BTF_ID_LIST(bpf_inode_storage_delete_btf_ids)
>> +BTF_ID(struct, inode)
>> +
>> +const struct bpf_func_proto bpf_inode_storage_delete_proto = {
>> +	.func		= bpf_inode_storage_delete,
>> +	.gpl_only	= false,
>> +	.ret_type	= RET_INTEGER,
>> +	.arg1_type	= ARG_CONST_MAP_PTR,
>> +	.arg2_type	= ARG_PTR_TO_BTF_ID,
>> +	.btf_id		= bpf_inode_storage_delete_btf_ids,
>> +};

^ permalink raw reply related

* Re: [PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor
From: Madhavan T. Venkataraman @ 2020-07-31 17:13 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Kernel Hardening, Linux API, linux-arm-kernel, Linux FS Devel,
	linux-integrity, LKML, LSM List, Oleg Nesterov, X86 ML
In-Reply-To: <CALCETrWnNR5v3ZCLfBVQGYK8M0jAvQMaAc9uuO05kfZuh-4d6w@mail.gmail.com>

On 7/30/20 3:54 PM, Andy Lutomirski wrote:
> On Thu, Jul 30, 2020 at 7:24 AM Madhavan T. Venkataraman
> <madvenka@linux.microsoft.com> wrote:
>> ...
>> Creating a code page
>> --------------------
>>
>> We can do this in one of the following ways:
>>
>> - Allocate a writable page at run time, write the template code into
>>   the page and have execute permissions on the page.
>>
>> - Allocate a writable page at run time, write the template code into
>>   the page and remap the page with just execute permissions.
>>
>> - Allocate a writable page at run time, write the template code into
>>   the page, write the page into a temporary file and map the file with
>>   execute permissions.
>>
>> - Include the template code in a code page at build time itself and
>>   just remap the code page each time you need a code page.
> This latter part shouldn't need any special permissions as far as I know.

Agreed.
>
>> Pros and Cons
>> -------------
>>
>> As long as the OS provides the functionality to do this and the security
>> subsystem in the OS allows the actions, this is totally feasible. If not,
>> we need something like trampfd.
>>
>> As Floren mentioned, libffi does implement something like this for MACH.
>>
>> In fact, in my libffi changes, I use trampfd only after all the other methods
>> have failed because of security settings.
>>
>> But the above approach only solves the problem for this simple type of
>> trampoline. It does not provide a framework for addressing more complex types
>> or even other forms of dynamic code.
>>
>> Also, each application would need to implement this solution for itself
>> as opposed to relying on one implementation provided by the kernel.
> I would argue this is a benefit.  If the whole implementation is in
> userspace, there is no ABI compatibility issue.  The user program
> contains the trampoline code and the code that uses it.

The current trampfd implementation also does not have an ABI issue.
ABI details are to be handled in user land. In the case of libffi, they
are. Trampfd only addresses the trampoline required to jump to the
ABI handler.

>
>> Trampfd-based solution
>> ----------------------
>>
>> I outlined an enhancement to trampfd in a response to David Laight. In this
>> enhancement, the kernel is the one that would set up the code page.
>>
>> The kernel would call an arch-specific support function to generate the
>> code required to load registers, push values on the stack and jump to a PC
>> for a trampoline instance based on its current context. The trampoline
>> instance data could be baked into the code.
>>
>> My initial idea was to only have one trampoline instance per page. But I
>> think I can implement multiple instances per page. I just have to manage
>> the trampfd file private data and VMA private data accordingly to map an
>> element in a code page to its trampoline object.
>>
>> The two approaches are similar except for the detail about who sets up
>> and manages the trampoline pages. In both approaches, the performance problem
>> is addressed. But trampfd can be used even when security settings are
>> restrictive.
>>
>> Is my solution acceptable?
> Perhaps.  In general, before adding a new ABI to the kernel, it's nice
> to understand how it's better than doing the same thing in userspace.
> Saying that it's easier for user code to work with if it's in the
> kernel isn't necessarily an adequate justification.

Fair enough.

Dealing with multiple architectures
-----------------------------------------------

One good reason to use trampfd is multiple architecture support. The
trampoline table in a code page approach is neat. I don't deny that at
all. But my question is - can it be used in all cases?

It requires PC-relative data references. I have not worked on all architectures.
So, I need to study this. But do all ISAs support PC-relative data references?

Even in an ISA that supports it, there would be a maximum supported offset
from the current PC that can be reached for a data reference. That maximum
needs to be at least the size of a base page in the architecture. This is because
the code page and the data page need to be separate for security reasons.
Do all ISAs support a sufficiently large offset?

When the kernel generates the code for a trampoline, it can hard code data values
in the generated code itself so it does not need PC-relative data referencing.

And, for ISAs that do support the large offset, we do have to implement and
maintain the code page stuff for different ISAs for each application and library
if we did not use trampfd.

If you look at the libffi reference patch that I have linked in the cover letter,
I have added functions in common code that wrap trampfd calls. From architecture
specific code, there is just one function call to one of those wrapper functions
to set the register context for the trampoline. This is a very small C code change
in each architecture. So, support can be extended to all architectures without
exception easily.

Runtime generated trampolines
-------------------------------------------

libffi trampolines are simple. But there may be many cases out there where the
trampoline code cannot be statically defined at build time. It may have to be
generated at runtime. For this, we will need trampfd.

Security
-----------

With the user level trampoline table approach, the data part of the trampoline table
can be hacked by an attacker if an application has a vulnerability. Specifically, the
target PC can be altered to some arbitrary location. Trampfd implements an
"Allowed PCS" context. In the libffi changes, I have created a read-only array of
all ABI handlers used in closures for each architecture. This read-only array
can be used to restrict the PC values for libffi trampolines to prevent hacking.

To generalize, we can implement security rules/features if the trampoline
object is in the kernel.

Standardization
---------------------

Trampfd is a framework that can be used to implement multiple things. May be,
a few of those things can also be implemented in user land itself. But I think having
just one mechanism to execute dynamic code objects is preferable to having
multiple mechanisms not standardized across all applications.

As an example, let us say that I am able to implement support for JIT code. Let us
say that an interpreter uses libffi to execute a generated function. The interpreter
would use trampfd for the JIT code object and get an address. Then, it would pass
that to libffi which would then use trampfd for the trampoline. So, trampfd based
code objects can be chained.

> Why would remapping two pages of actual application text ever fail?

Remapping a page may not be available on all OSes. However, that is not a problem
for the code page approach. One can always memory map the code page from the
binary file directly. So, yes, this would not fail.

Madhavan

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox