Linux Security Modules development
 help / color / mirror / Atom feed
* [PATCH v3 1/9] KEYS: Defined an IMA hook to measure keys on key create or update
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

Asymmetric keys used for verifying file signatures or certificates
are currently not included in the IMA measurement list.

This patch defines a new IMA hook namely ima_post_key_create_or_update()
to measure asymmetric keys.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 security/integrity/ima/ima.h      |  2 ++
 security/integrity/ima/ima_main.c | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 997a57137351..22d0628faf56 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -21,6 +21,8 @@
 #include <linux/tpm.h>
 #include <linux/audit.h>
 #include <crypto/hash_info.h>
+#include <crypto/public_key.h>
+#include <keys/asymmetric-type.h>
 
 #include "../integrity.h"
 
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 492b8f241d39..18e1bc105be7 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -635,6 +635,9 @@ void process_buffer_measurement(const void *buf, int size,
 	int action = 0;
 	u32 secid;
 
+	if (!ima_policy_flag)
+		return;
+
 	if (func) {
 		security_task_getsecid(current, &secid);
 		action = ima_get_action(NULL, current_cred(), secid, 0, func,
@@ -695,6 +698,29 @@ void ima_kexec_cmdline(const void *buf, int size)
 	}
 }
 
+/**
+ * ima_post_key_create_or_update - measure asymmetric keys
+ * @keyring: keyring to which the key is linked to
+ * @key: created or updated key
+ * @flags: key flags
+ * @create: flag indicating whether the key was created or updated
+ *
+ * Keys can only be measured, not appraised.
+ */
+void ima_post_key_create_or_update(struct key *keyring, struct key *key,
+				   unsigned long flags, bool create)
+{
+	const struct public_key *pk;
+
+	if (key->type != &key_type_asymmetric)
+		return;
+
+	pk = key->payload.data[asym_crypto];
+	process_buffer_measurement(pk->key, pk->keylen,
+				   keyring->description,
+				   NONE, 0);
+}
+
 static int __init init_ima(void)
 {
 	int error;
-- 
2.17.1


^ permalink raw reply related

* [PATCH v3 4/9] KEYS: Updated IMA policy functions for handling key measurement
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

Information regarding what keyrings need to be measured is missing.

A new field in the IMA policy, namely, keyrings is added to
convey what keyrings need to be measured.

This patch updates the IMA function to retrieve keyrings from the policy.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 security/integrity/ima/ima.h          |  6 ++--
 security/integrity/ima/ima_api.c      |  3 +-
 security/integrity/ima/ima_appraise.c |  2 +-
 security/integrity/ima/ima_policy.c   | 40 +++++++++++++++++++++++++--
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 12e9ec6847b5..3539a159a7ac 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -240,8 +240,10 @@ void ima_measure_queued_keys(void);
 
 /* IMA policy related functions */
 int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
-		     enum ima_hooks func, int mask, int flags, int *pcr,
-		     struct ima_template_desc **template_desc);
+		     enum ima_hooks func, int mask,
+		     int flags, int *pcr,
+		     struct ima_template_desc **template_desc,
+		     char **keyrings);
 void ima_init_policy(void);
 void ima_update_policy(void);
 void ima_update_policy_flag(void);
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index f614e22bf39f..f488d1cead79 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -175,6 +175,7 @@ void ima_add_violation(struct file *file, const unsigned char *filename,
  *	subj,obj, and type: are LSM specific.
  *	func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
  *	| KEXEC_CMDLINE
+ *      | KEYRING_CHECK
  *	mask: contains the permission mask
  *	fsmagic: hex value
  *
@@ -190,7 +191,7 @@ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
 	flags &= ima_policy_flag;
 
 	return ima_match_policy(inode, cred, secid, func, mask, flags, pcr,
-				template_desc);
+				template_desc, NULL);
 }
 
 /*
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 89b83194d1dc..5bed19be0f6a 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -54,7 +54,7 @@ int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func)
 
 	security_task_getsecid(current, &secid);
 	return ima_match_policy(inode, current_cred(), secid, func, mask,
-				IMA_APPRAISE | IMA_HASH, NULL, NULL);
+				IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
 }
 
 static int ima_fix_xattr(struct dentry *dentry,
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 0cc49f2d5233..b972a32ccb1b 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -31,6 +31,7 @@
 #define IMA_EUID	0x0080
 #define IMA_PCR		0x0100
 #define IMA_FSNAME	0x0200
+#define IMA_KEYRINGS	0x0400
 
 #define UNKNOWN		0
 #define MEASURE		0x0001	/* same as IMA_MEASURE */
@@ -76,6 +77,7 @@ struct ima_rule_entry {
 		int type;	/* audit type */
 	} lsm[MAX_LSM_RULES];
 	char *fsname;
+	char *keyrings; /* Keyrings to measure */
 	struct ima_template_desc *template;
 };
 
@@ -476,6 +478,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
  * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
  * @pcr: set the pcr to extend
  * @template_desc: the template that should be used for this rule
+ * @keyrings: keyrings for this rule, if specified
  *
  * Measure decision based on func/mask/fsmagic and LSM(subj/obj/type)
  * conditions.
@@ -486,7 +489,8 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
  */
 int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
 		     enum ima_hooks func, int mask, int flags, int *pcr,
-		     struct ima_template_desc **template_desc)
+		     struct ima_template_desc **template_desc,
+		     char **keyrings)
 {
 	struct ima_rule_entry *entry;
 	int action = 0, actmask = flags | (flags << 1);
@@ -518,6 +522,9 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
 		if ((pcr) && (entry->flags & IMA_PCR))
 			*pcr = entry->pcr;
 
+		if ((keyrings) && (entry->flags & IMA_KEYRINGS))
+			*keyrings = entry->keyrings;
+
 		if (template_desc && entry->template)
 			*template_desc = entry->template;
 		else if (template_desc)
@@ -761,7 +768,7 @@ enum {
 	Opt_uid_gt, Opt_euid_gt, Opt_fowner_gt,
 	Opt_uid_lt, Opt_euid_lt, Opt_fowner_lt,
 	Opt_appraise_type, Opt_permit_directio,
-	Opt_pcr, Opt_template, Opt_err
+	Opt_pcr, Opt_template, Opt_keyrings, Opt_err
 };
 
 static const match_table_t policy_tokens = {
@@ -796,6 +803,7 @@ static const match_table_t policy_tokens = {
 	{Opt_permit_directio, "permit_directio"},
 	{Opt_pcr, "pcr=%s"},
 	{Opt_template, "template=%s"},
+	{Opt_keyrings, "keyrings=%s"},
 	{Opt_err, NULL}
 };
 
@@ -959,6 +967,8 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
 				entry->func = POLICY_CHECK;
 			else if (strcmp(args[0].from, "KEXEC_CMDLINE") == 0)
 				entry->func = KEXEC_CMDLINE;
+			else if (strcmp(args[0].from, "KEYRING_CHECK") == 0)
+				entry->func = KEYRING_CHECK;
 			else
 				result = -EINVAL;
 			if (!result)
@@ -1011,6 +1021,23 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
 			result = 0;
 			entry->flags |= IMA_FSNAME;
 			break;
+		case Opt_keyrings:
+			ima_log_string(ab, "keyrings", args[0].from);
+
+			if ((entry->keyrings) ||
+			    (entry->action != MEASURE) ||
+			    (entry->func != KEYRING_CHECK)) {
+				result = -EINVAL;
+				break;
+			}
+			entry->keyrings = kstrdup(args[0].from, GFP_KERNEL);
+			if (!entry->keyrings) {
+				result = -ENOMEM;
+				break;
+			}
+			result = 0;
+			entry->flags |= IMA_KEYRINGS;
+			break;
 		case Opt_fsuuid:
 			ima_log_string(ab, "fsuuid", args[0].from);
 
@@ -1371,6 +1398,15 @@ int ima_policy_show(struct seq_file *m, void *v)
 		seq_puts(m, " ");
 	}
 
+	if (entry->flags & IMA_KEYRINGS) {
+		if (entry->keyrings != NULL)
+			snprintf(tbuf, sizeof(tbuf), "%s", entry->keyrings);
+		else
+			snprintf(tbuf, sizeof(tbuf), "%s", "All keyrings");
+		seq_printf(m, pt(Opt_keyrings), tbuf);
+		seq_puts(m, " ");
+	}
+
 	if (entry->flags & IMA_PCR) {
 		snprintf(tbuf, sizeof(tbuf), "%d", entry->pcr);
 		seq_printf(m, pt(Opt_pcr), tbuf);
-- 
2.17.1


^ permalink raw reply related

* [PATCH v3 2/9] KEYS: Defined functions to queue and dequeue keys for measurement
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

Key measurements cannot be done if the IMA hook to measure keys is
called before IMA is initialized. Key measurement needs to be deferred
if IMA is not yet initialized. Queued keys need to be processed when
IMA initialization is completed.

This patch defines functions to queue and de-queue keys for measurement.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 security/integrity/ima/ima.h       | 12 ++++
 security/integrity/ima/ima_queue.c | 92 ++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 22d0628faf56..b9600070e415 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -198,6 +198,16 @@ enum ima_hooks {
 	__ima_hooks(__ima_hook_enumify)
 };
 
+/*
+ * To track keys that need to be measured.
+ */
+struct ima_measure_key_entry {
+	struct list_head list;
+	void *public_key;
+	u32  public_key_len;
+	char *keyring_name;
+};
+
 /* LIM API function definitions */
 int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
 		   int mask, enum ima_hooks func, int *pcr,
@@ -224,6 +234,8 @@ int ima_store_template(struct ima_template_entry *entry, int violation,
 		       const unsigned char *filename, int pcr);
 void ima_free_template_entry(struct ima_template_entry *entry);
 const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);
+int ima_queue_key_for_measurement(struct key *keyring, struct key *key);
+void ima_measure_queued_keys(void);
 
 /* IMA policy related functions */
 int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c
index 1ce8b1701566..f2503f10abf4 100644
--- a/security/integrity/ima/ima_queue.c
+++ b/security/integrity/ima/ima_queue.c
@@ -46,6 +46,12 @@ struct ima_h_table ima_htable = {
  */
 static DEFINE_MUTEX(ima_extend_list_mutex);
 
+/*
+ * To synchronize access to the list of keys that need to be measured
+ */
+static DEFINE_MUTEX(ima_measure_keys_mutex);
+static LIST_HEAD(ima_measure_keys);
+
 /* lookup up the digest value in the hash table, and return the entry */
 static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value,
 						       int pcr)
@@ -232,3 +238,89 @@ int __init ima_init_digests(void)
 
 	return 0;
 }
+
+static void ima_free_measure_key_entry(struct ima_measure_key_entry *entry)
+{
+	if (entry != NULL) {
+		if (entry->public_key != NULL)
+			kzfree(entry->public_key);
+		if (entry->keyring_name != NULL)
+			kzfree(entry->keyring_name);
+		kzfree(entry);
+	}
+}
+
+static struct ima_measure_key_entry *ima_alloc_measure_key_entry(
+	struct key *keyring,
+	struct key *key)
+{
+	int rc = 0;
+	const struct public_key *pk;
+	size_t keyring_name_len;
+	struct ima_measure_key_entry *entry = NULL;
+
+	pk = key->payload.data[asym_crypto];
+	keyring_name_len = strlen(keyring->description) + 1;
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry != NULL) {
+		entry->public_key = kzalloc(pk->keylen, GFP_KERNEL);
+		entry->keyring_name =
+			kzalloc(keyring_name_len, GFP_KERNEL);
+	}
+
+	if ((entry == NULL) || (entry->public_key == NULL) ||
+	    (entry->keyring_name == NULL)) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	strcpy(entry->keyring_name, keyring->description);
+	memcpy(entry->public_key, pk->key, pk->keylen);
+	entry->public_key_len = pk->keylen;
+	rc = 0;
+
+out:
+	if (rc) {
+		ima_free_measure_key_entry(entry);
+		entry = NULL;
+	}
+
+	return entry;
+}
+
+int ima_queue_key_for_measurement(struct key *keyring, struct key *key)
+{
+	int rc = 0;
+	struct ima_measure_key_entry *entry = NULL;
+
+	mutex_lock(&ima_measure_keys_mutex);
+
+	entry = ima_alloc_measure_key_entry(keyring, key);
+	if (entry != NULL) {
+		INIT_LIST_HEAD(&entry->list);
+		list_add_tail(&entry->list, &ima_measure_keys);
+	} else
+		rc = -ENOMEM;
+
+	mutex_unlock(&ima_measure_keys_mutex);
+
+	return rc;
+}
+
+void ima_measure_queued_keys(void)
+{
+	struct ima_measure_key_entry *entry, *tmp;
+
+	mutex_lock(&ima_measure_keys_mutex);
+
+	list_for_each_entry_safe(entry, tmp, &ima_measure_keys, list) {
+		process_buffer_measurement(entry->public_key,
+					   entry->public_key_len,
+					   entry->keyring_name,
+					   NONE, 0);
+		list_del(&entry->list);
+		ima_free_measure_key_entry(entry);
+	}
+
+	mutex_unlock(&ima_measure_keys_mutex);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH v3 0/9] KEYS: Measure keys when they are created or updated
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva

Problem Statement:

Keys created or updated in the system are currently not being measured.
Therefore an attestation service, for instance, would not be able to
attest whether or not the trusted keys keyring(s), for instance, contain
only known good (trusted) keys.

IMA measures system files, command line arguments passed to kexec,
boot aggregate, etc. It can be used to measure keys as well.
But there is no mechanism available in the kernel for IMA to
know when a key is created or updated.

This change aims to address measuring keys created or updated
in the system.

To achieve the above the following changes have been made:

 - Added a new IMA hook namely, ima_post_key_create_or_update, which
   measures the key. This IMA hook is called from key_create_or_update
   function. The key measurement can be controlled through IMA policy.

   In this change set a new IMA policy function KEYRING_CHECK has been
   added to measure keys. The policy can optionally specify a set of
   keyrings to measure. By default all keyrings are included in
   the measurement when KEYRING_CHECK policy is specified.

   # measure all keys
   measure func=KEYRING_CHECK

   # measure keys on the IMA keyring
   measure func=KEYRING_CHECK keyring=".ima"

   # measure keys on the BUILTIN and IMA keyrings into a different PCR
   measure func=KEYRING_CHECK keyring=".builtin_trusted_keys|.ima" pcr=11

Testing performed:

  * Booted the kernel with this change.
  * Executed keyctl tests from the Linux Test Project (LTP)
  * All keys are measured when only KEYRING_CHECK is set.
  * Only keys added to the given keyrings are measured
    when keyrings option is set.
  * Keys are not measured when KEYRING_CHECK is not set.
  * Key is queued for measurement if IMA is not yet initialized
    and processed when IMA is initialized.
  * Key is measured rightaway when IMA is initialized.
  * Added a new key to a keyring and verified "key create" code path.
    => In this case added a key to .ima keyring.
  * Added the same key again and verified "key update" code path.
    => Add the same key to .ima keyring.

Change Log:

  v3:

  => Added KEYRING_CHECK for measuring keys. This can optionally specify
     keyrings to measure.
  => Updated ima_get_action() and related functions to return
     the keyrings if specified in the policy.
  => process_buffer_measurement() function is updated to take keyring
     as a parameter. The key will be measured if the policy includes
     the keyring in the list of measured keyrings. If the policy does not
     specify any keyrings then all keys are measured.

  v2:

  => Per suggestion from Mimi reordered the patch set to first
     enable measuring keys added or updated in the system.
     And, then scope the measurement to keys added to 
     builtin_trusted_keys keyring through ima policy.
  => Removed security_key_create_or_update function and instead
     call ima hook, to measure the key, directly from 
     key_create_or_update function.

  v1:

  => LSM function for key_create_or_update. It calls ima.
  => Added ima hook for measuring keys
  => ima measures keys based on ima policy.

  v0:

  => Added LSM hook for key_create_or_update.
  => Measure keys added to builtin or secondary trusted keys keyring.

Background:

Currently ima measures file hashes and .ima signatures. ima signatures
are validated against keys in the ".ima" keyring. If the kernel is built
with CONFIG_IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY enabled,
then all keys in ".ima" keyring must be signed by a key in
".builtin_trusted_keys" or ".secondary_trusted_keys" keyrings.

Although ima supports the above configuration, not having an insight
into what keys are present in these trusted keys keyrings would prevent
an attestation service from validating a client machine.
 
On systems with CONFIG_IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY
enabled, measuring keys in the  ".builtin_trusted_keys" keyring provides
a mechanism to attest that the client's system binaries are indeed signed
by signers that chain to known trusted keys.

Without this change, to attest the clients one needs to maintain
an "allowed list" of file hashes of all versions of all client binaries
that are deployed on the clients in the enterprise. That is a huge
operational challenge in a large scale environment of clients with
heterogenous builds. This also limits scalability and agility of
rolling out frequent client binary updates.

Questions and concerns raised by reviewers on this patch set:

Question 1:
Is "Signed with a trusted key" equal to "Trusted file"?
Doesn't the service need the hashes of the system files to determine
whether a file is trusted or not?
"Signed with a trusted key" does not equal "Trusted"

Answer:
Agree "Signed with a trusted key" may not equal "Trusted".
To address this, the attesting service can maintain a small
manageable set of bad hashes (a "Blocked list") and a list of
trusted keys expected in client's .builtin_trusted_keys" keyring.
Using this data, the service can detect the presence of
"Disallowed (untrusted) version of client binaries".

Question 2:
Providing more data to the service (such as ".builtin_trusted_keys"),
empowers the service  to deny access to clients (block clients).
IMA walks a fine line in enforcing and measuring file integrity.
This patchset breaches that fine line and in doing so brings back
the fears of trusted computing.

Answer:
Any new measurement we add in IMA will provide more data to service
and can enable it to deny access to clients. It is not clear why this patch
set would breach the fine line between measuring and enforcing.
Since this patch set is disabled by default and enabled through
CONFIG_IMA_MEASURE_TRUSTED_KEYS, only those enterprises that
require this new measurement can opt-in for it. Since it is disabled
by default, it does not restrict the autonomy of independent users
who are unaffected by attestation.

Question 3:
IMA log already contains a pointer to the IMA keys used for signature
verification. Why does the service need to care what keys were used
to sign (install) the IMA keys? What is gained by measuring the keys
in the ".builtin_trusted_keys"


Answer:
To attest the clients using the current IMA log, service needs to maintain
hashes of all the deployed versions of all the system binaries for their
enterprise. This will introduce a very high operational overhead in
a large scale environment of clients with heterogenous builds.
This limits scalability and agility of rolling out frequent client
binary updates.


On the other hand, with the current patch set, we will have IMA
validate the file signature on the clients and the service validate
that the IMA keys were installed using trusted keys.


This provides a chain of trust:
    => IMA Key validates file signature on the client
    => Built-In trusted key attests IMA key on the client
    => Attestation service attests the Built-In trusted keys
         reported by the client in the IMA log


This approach, therefore, would require the service to maintain
a manageble set of trusted keys that it receives from a trusted source.
And, verify if the clients only have keys from that set of trusted keys.

Question 4:
Where will the attestation service receive the keys to validate against?

Answer:
Attestation service will receive the keys from a trusted source such as
the enterprise build services that provides the client builds.
The service will use this set of keys to verify that the keys reported by
the clients in the IMA log contains only keys from this trusted list.


Question 5:
What is changing in the IMA log through this patch set?


Answer:
This patch set does not remove any data that is currently included
in the IMA log. It only adds more data to the IMA log - the data on
".builtin_trusted_keys"

Lakshmi Ramasubramanian (9):
  KEYS: Defined an IMA hook to measure keys on key create or update
  KEYS: Defined functions to queue and dequeue keys for measurement
  KEYS: Added KEYRING_CHECK policy for key measurement
  KEYS: Updated IMA policy functions for handling key measurement
  KEYS: Updated ima_get_action() to return keyrings if specified in the
    policy
  KEYS: Measure key if the IMA policy allows measurement for the given
    keyring
  KEYS: Queue key for measurement if IMA is not yet initialized. Measure
    queued keys when IMA initialization is completed
  KEYS: Added a boolean flag for IMA initialization status.
    ima_policy_flag cannot be relied upon for knowing IMA initialization
    status because ima_policy_flag can be set to 0 when IMA is
    initialized, but there is no IMA policy configured.
  KEYS: Call the IMA hook to measure key when a new key is created or an
    existing key is updated

 Documentation/ABI/testing/ima_policy  | 15 +++++
 include/linux/ima.h                   |  7 ++
 security/integrity/ima/ima.h          | 27 ++++++--
 security/integrity/ima/ima_api.c      |  7 +-
 security/integrity/ima/ima_appraise.c |  2 +-
 security/integrity/ima/ima_init.c     | 10 ++-
 security/integrity/ima/ima_main.c     | 57 ++++++++++++++--
 security/integrity/ima/ima_policy.c   | 42 +++++++++++-
 security/integrity/ima/ima_queue.c    | 93 +++++++++++++++++++++++++++
 security/keys/key.c                   |  9 +++
 10 files changed, 254 insertions(+), 15 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH v3 3/9] KEYS: Added KEYRING_CHECK policy for key measurement
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

An IMA policy to manage measurement of keys is not supported.
A new IMA policy is needed to manage the measurement of keys.
A policy option is also needed to allow measurement of keys
linked to a given set of keyrings only.

This patch defines KEYRING_CHECK and keyrings in IMA policy
for this purpose. 

KEYRING_CHECK can be added in the IMA policy to measure keys.
keyrings can be, optionally, set to only measure keys
added or updated to a given set of keyrings. If keyrings is not
specified for KEYRING_CHECK, keys added or updated in
all keyrings are measured.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 Documentation/ABI/testing/ima_policy | 15 +++++++++++++++
 security/integrity/ima/ima.h         |  1 +
 security/integrity/ima/ima_main.c    |  2 +-
 security/integrity/ima/ima_policy.c  |  2 +-
 security/integrity/ima/ima_queue.c   |  2 +-
 5 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
index fc376a323908..757faf1a1a27 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -25,10 +25,12 @@ Description:
 			lsm:	[[subj_user=] [subj_role=] [subj_type=]
 				 [obj_user=] [obj_role=] [obj_type=]]
 			option:	[[appraise_type=]] [template=] [permit_directio]
+				[keyrings=]
 		base: 	func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
 				[FIRMWARE_CHECK]
 				[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
 				[KEXEC_CMDLINE]
+				[KEYRING_CHECK]
 			mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
 			       [[^]MAY_EXEC]
 			fsmagic:= hex value
@@ -38,6 +40,9 @@ Description:
 			fowner:= decimal value
 		lsm:  	are LSM specific
 		option:	appraise_type:= [imasig]
+			keyrings: = list of keyrings to measure
+			(eg, .builtin_trusted_keys|.ima). Only valid
+			when action is "measure" and func is KEYRING_CHECK.
 			template:= name of a defined IMA template type
 			(eg, ima-ng). Only valid when action is "measure".
 			pcr:= decimal value
@@ -105,3 +110,13 @@ Description:
 
 			measure func=KEXEC_KERNEL_CHECK pcr=4
 			measure func=KEXEC_INITRAMFS_CHECK pcr=5
+
+		Example of measure rules using KEYRING_CHECK
+			To measure keys added to
+			.builtin_trusted_keys or .ima keyring:
+
+			measure func=KEYRING_CHECK keyrings=.builtin_trusted_keys|.ima
+
+			To measure keys added to all keyrings:
+
+			measure func=KEYRING_CHECK
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index b9600070e415..12e9ec6847b5 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -191,6 +191,7 @@ static inline unsigned long ima_hash_key(u8 *digest)
 	hook(KEXEC_INITRAMFS_CHECK)	\
 	hook(POLICY_CHECK)		\
 	hook(KEXEC_CMDLINE)		\
+	hook(KEYRING_CHECK)		\
 	hook(MAX_CHECK)
 #define __ima_hook_enumify(ENUM)	ENUM,
 
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 18e1bc105be7..72ae0878ec5d 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -718,7 +718,7 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key,
 	pk = key->payload.data[asym_crypto];
 	process_buffer_measurement(pk->key, pk->keylen,
 				   keyring->description,
-				   NONE, 0);
+				   KEYRING_CHECK, 0);
 }
 
 static int __init init_ima(void)
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 6df7f641ff66..0cc49f2d5233 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -370,7 +370,7 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
 {
 	int i;
 
-	if (func == KEXEC_CMDLINE) {
+	if ((func == KEXEC_CMDLINE) || (func == KEYRING_CHECK)) {
 		if ((rule->flags & IMA_FUNC) && (rule->func == func))
 			return true;
 		return false;
diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c
index f2503f10abf4..5625381c5a97 100644
--- a/security/integrity/ima/ima_queue.c
+++ b/security/integrity/ima/ima_queue.c
@@ -317,7 +317,7 @@ void ima_measure_queued_keys(void)
 		process_buffer_measurement(entry->public_key,
 					   entry->public_key_len,
 					   entry->keyring_name,
-					   NONE, 0);
+					   KEYRING_CHECK, 0);
 		list_del(&entry->list);
 		ima_free_measure_key_entry(entry);
 	}
-- 
2.17.1


^ permalink raw reply related

* [PATCH v3 9/9] KEYS: Call the IMA hook to measure key when a new key is created or an existing key is updated
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

key_create_or_update function needs to call the IMA hook to measure
the key when a new key is created or an existing key is updated.

This patch adds the call to the IMA hook from key_create_or_update
function.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 include/linux/ima.h | 7 +++++++
 security/keys/key.c | 9 +++++++++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/ima.h b/include/linux/ima.h
index a20ad398d260..f085f1c6ef34 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -24,6 +24,9 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
 extern void ima_post_path_mknod(struct dentry *dentry);
 extern void ima_kexec_cmdline(const void *buf, int size);
+extern void ima_post_key_create_or_update(struct key *keyring,
+					  struct key *key,
+					  unsigned long flags, bool create);
 
 #ifdef CONFIG_IMA_KEXEC
 extern void ima_add_kexec_buffer(struct kimage *image);
@@ -91,6 +94,10 @@ static inline void ima_post_path_mknod(struct dentry *dentry)
 }
 
 static inline void ima_kexec_cmdline(const void *buf, int size) {}
+static inline void ima_post_key_create_or_update(struct key *keyring,
+						 struct key *key,
+						 unsigned long flags,
+						 bool create) {}
 #endif /* CONFIG_IMA */
 
 #ifndef CONFIG_IMA_KEXEC
diff --git a/security/keys/key.c b/security/keys/key.c
index 764f4c57913e..7c39054d8da6 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -13,6 +13,7 @@
 #include <linux/security.h>
 #include <linux/workqueue.h>
 #include <linux/random.h>
+#include <linux/ima.h>
 #include <linux/err.h>
 #include "internal.h"
 
@@ -936,6 +937,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 		goto error_link_end;
 	}
 
+	/* let the ima module know about the created key. */
+	ima_post_key_create_or_update(keyring, key, flags, true);
+
 	key_ref = make_key_ref(key, is_key_possessed(keyring_ref));
 
 error_link_end:
@@ -965,6 +969,11 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	}
 
 	key_ref = __key_update(key_ref, &prep);
+	if (!IS_ERR(key_ref)) {
+		/* let the ima module know about the updated key. */
+		ima_post_key_create_or_update(keyring, key, flags, false);
+	}
+
 	goto error_free_prep;
 }
 EXPORT_SYMBOL(key_create_or_update);
-- 
2.17.1


^ permalink raw reply related

* [PATCH v3 8/9] KEYS: Added a boolean flag for IMA initialization status.
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

IMA hook does not know whether a key can be measured right away or
the key needs to be queued to be measured at a later time.

This patch defines a flag to indicate the IMA initialization status.
IMA hook will use this flag to determine if a key can be measured
right away or the key needs to be queued to be measured at a later time.

ima_policy_flag cannot be relied upon for knowing IMA initialization
status because ima_policy_flag will be set to 0 when either IMA
is not initialized or the IMA policy itself is empty.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 security/integrity/ima/ima.h      | 1 +
 security/integrity/ima/ima_init.c | 3 +++
 security/integrity/ima/ima_main.c | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index f8bf5c24e0d0..5abc5a0b4591 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -54,6 +54,7 @@ extern int ima_policy_flag;
 extern int ima_hash_algo;
 extern int ima_appraise;
 extern struct tpm_chip *ima_tpm_chip;
+extern bool ima_initialized;
 
 /* IMA event related data */
 struct ima_event_data {
diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c
index 91eaa5f2d008..8734ed5322c7 100644
--- a/security/integrity/ima/ima_init.c
+++ b/security/integrity/ima/ima_init.c
@@ -23,6 +23,7 @@
 /* name for boot aggregate entry */
 static const char boot_aggregate_name[] = "boot_aggregate";
 struct tpm_chip *ima_tpm_chip;
+bool ima_initialized;
 
 /* Add the boot aggregate to the IMA measurement list and extend
  * the PCR register.
@@ -135,6 +136,8 @@ int __init ima_init(void)
 	if (rc != 0)
 		return rc;
 
+	ima_initialized = true;
+
 	ima_measure_queued_keys();
 	return 0;
 }
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 2ad05563542c..e4c5e7150611 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -732,7 +732,7 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key,
 	if (key->type != &key_type_asymmetric)
 		return;
 
-	if (!ima_policy_flag) {
+	if (!ima_initialized) {
 		ima_queue_key_for_measurement(keyring, key);
 		return;
 	}
-- 
2.17.1


^ permalink raw reply related

* [PATCH v3 7/9] KEYS: Queue key for measurement if IMA is not yet initialized. Measure queued keys when IMA initialization is completed
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

Keys need to be queued when the IMA hook to measure keys is called before
IMA is initialized. Keys queued for measurement need to be processed when
IMA initialization is completed.

This patch adds the call to queue and de-queue keys for measurement.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 security/integrity/ima/ima_init.c | 7 ++++++-
 security/integrity/ima/ima_main.c | 5 +++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c
index 5d55ade5f3b9..91eaa5f2d008 100644
--- a/security/integrity/ima/ima_init.c
+++ b/security/integrity/ima/ima_init.c
@@ -131,5 +131,10 @@ int __init ima_init(void)
 
 	ima_init_policy();
 
-	return ima_fs_init();
+	rc = ima_fs_init();
+	if (rc != 0)
+		return rc;
+
+	ima_measure_queued_keys();
+	return 0;
 }
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index bd835ec89ead..2ad05563542c 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -732,6 +732,11 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key,
 	if (key->type != &key_type_asymmetric)
 		return;
 
+	if (!ima_policy_flag) {
+		ima_queue_key_for_measurement(keyring, key);
+		return;
+	}
+
 	pk = key->payload.data[asym_crypto];
 	process_buffer_measurement(pk->key, pk->keylen,
 				   keyring->description,
-- 
2.17.1


^ permalink raw reply related

* [PATCH v3 6/9] KEYS: Measure key if the IMA policy allows measurement for the given keyring
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

process_buffer_measurement() does not know which keyring the given key
is being linked to. It needs the keyring name to determine whether 
or not the given key needs to be measured.

This patch adds a new parameter "keyring" to process_buffer_measurement()
to convey which keyring the given key is linked to.

If KEYRING_CHECK alone is set in the policy, all keys are measured.
If a list of keyrings is also specified in the policy then only keys
linked to those keyrings will be measured.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 security/integrity/ima/ima.h       |  2 +-
 security/integrity/ima/ima_main.c  | 24 +++++++++++++++++++-----
 security/integrity/ima/ima_queue.c |  3 ++-
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index ded78af94e69..f8bf5c24e0d0 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -225,7 +225,7 @@ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file,
 			   struct ima_template_desc *template_desc);
 void process_buffer_measurement(const void *buf, int size,
 				const char *eventname, enum ima_hooks func,
-				int pcr);
+				int pcr, const char *keyring);
 void ima_audit_measurement(struct integrity_iint_cache *iint,
 			   const unsigned char *filename);
 int ima_alloc_init_template(struct ima_event_data *event_data,
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index cbc7de87106f..bd835ec89ead 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -612,12 +612,22 @@ int ima_load_data(enum kernel_load_data_id id)
  * @eventname: event name to be used for the buffer entry.
  * @func: IMA hook
  * @pcr: pcr to extend the measurement
+ * @keyring: keyring for the measurement
+ *
+ *	The following scenarios are possible with respect to
+ *	the parameter "keyring":
+ *	1, keyring is NULL. In this case buffer is measured.
+ *	2, keyring is not NULL, but ima_get_action returned
+ *	   a NULL keyrings. In this case also the buffer is measured.
+ *	3, keyring is not NULL and ima_get_action returned
+ *	   a non-NULL keyrings. In this case measure the buffer
+ *	   only if the given keyring is present in the keyrings.
  *
  * Based on policy, the buffer is measured into the ima log.
  */
 void process_buffer_measurement(const void *buf, int size,
 				const char *eventname, enum ima_hooks func,
-				int pcr)
+				int pcr, const char *keyring)
 {
 	int ret = 0;
 	struct ima_template_entry *entry = NULL;
@@ -647,8 +657,10 @@ void process_buffer_measurement(const void *buf, int size,
 			return;
 	}
 
-	if (keyrings != NULL)
-		keyrings = NULL;
+	if ((keyring != NULL) && (keyrings != NULL)
+	    && (strstr(keyrings, keyring) == NULL)) {
+		return;
+	}
 
 	if (!pcr)
 		pcr = CONFIG_IMA_MEASURE_PCR_IDX;
@@ -698,7 +710,8 @@ void ima_kexec_cmdline(const void *buf, int size)
 {
 	if (buf && size != 0) {
 		process_buffer_measurement(buf, size, "kexec-cmdline",
-					   KEXEC_CMDLINE, 0);
+					   KEXEC_CMDLINE, 0,
+					   NULL);
 	}
 }
 
@@ -722,7 +735,8 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key,
 	pk = key->payload.data[asym_crypto];
 	process_buffer_measurement(pk->key, pk->keylen,
 				   keyring->description,
-				   KEYRING_CHECK, 0);
+				   KEYRING_CHECK, 0,
+				   keyring->description);
 }
 
 static int __init init_ima(void)
diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c
index 5625381c5a97..805dcacb48e6 100644
--- a/security/integrity/ima/ima_queue.c
+++ b/security/integrity/ima/ima_queue.c
@@ -317,7 +317,8 @@ void ima_measure_queued_keys(void)
 		process_buffer_measurement(entry->public_key,
 					   entry->public_key_len,
 					   entry->keyring_name,
-					   KEYRING_CHECK, 0);
+					   KEYRING_CHECK, 0,
+					   entry->keyring_name);
 		list_del(&entry->list);
 		ima_free_measure_key_entry(entry);
 	}
-- 
2.17.1


^ permalink raw reply related

* [PATCH v3 5/9] KEYS: Updated ima_get_action() to return keyrings if specified in the policy
From: Lakshmi Ramasubramanian @ 2019-10-31  1:19 UTC (permalink / raw)
  To: zohar, dhowells, matthewgarrett, sashal, jamorris, linux-kernel,
	linux-integrity, linux-security-module, keyrings
  Cc: prsriva
In-Reply-To: <20191031011910.2574-1-nramas@linux.microsoft.com>

Information regarding what keyrings need to be measured is missing.
ima_get_action() needs to retrieve the keyrings, if specified for
KEYRING_CHECK. 

This patch adds a new out parameter to ima_get_action() to return
keyrings read from the policy.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
---
 security/integrity/ima/ima.h      | 3 ++-
 security/integrity/ima/ima_api.c  | 6 ++++--
 security/integrity/ima/ima_main.c | 8 ++++++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 3539a159a7ac..ded78af94e69 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -212,7 +212,8 @@ struct ima_measure_key_entry {
 /* LIM API function definitions */
 int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
 		   int mask, enum ima_hooks func, int *pcr,
-		   struct ima_template_desc **template_desc);
+		   struct ima_template_desc **template_desc,
+		   char **keyrings);
 int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func);
 int ima_collect_measurement(struct integrity_iint_cache *iint,
 			    struct file *file, void *buf, loff_t size,
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index f488d1cead79..77ac076672e1 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -169,6 +169,7 @@ void ima_add_violation(struct file *file, const unsigned char *filename,
  * @func: caller identifier
  * @pcr: pointer filled in if matched measure policy sets pcr=
  * @template_desc: pointer filled in if matched measure policy sets template=
+ * @keyrings: pointer filled in if matched measure policy sets keyrings=
  *
  * The policy is defined in terms of keypairs:
  *		subj=, obj=, type=, func=, mask=, fsmagic=
@@ -184,14 +185,15 @@ void ima_add_violation(struct file *file, const unsigned char *filename,
  */
 int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
 		   int mask, enum ima_hooks func, int *pcr,
-		   struct ima_template_desc **template_desc)
+		   struct ima_template_desc **template_desc,
+		   char **keyrings)
 {
 	int flags = IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH;
 
 	flags &= ima_policy_flag;
 
 	return ima_match_policy(inode, cred, secid, func, mask, flags, pcr,
-				template_desc, NULL);
+				template_desc, keyrings);
 }
 
 /*
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 72ae0878ec5d..cbc7de87106f 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -214,7 +214,7 @@ static int process_measurement(struct file *file, const struct cred *cred,
 	 * Included is the appraise submask.
 	 */
 	action = ima_get_action(inode, cred, secid, mask, func, &pcr,
-				&template_desc);
+				&template_desc, NULL);
 	violation_check = ((func == FILE_CHECK || func == MMAP_CHECK) &&
 			   (ima_policy_flag & IMA_MEASURE));
 	if (!action && !violation_check)
@@ -627,6 +627,7 @@ void process_buffer_measurement(const void *buf, int size,
 					    .buf = buf,
 					    .buf_len = size};
 	struct ima_template_desc *template = NULL;
+	char *keyrings = NULL;
 	struct {
 		struct ima_digest_data hdr;
 		char digest[IMA_MAX_DIGEST_SIZE];
@@ -641,11 +642,14 @@ void process_buffer_measurement(const void *buf, int size,
 	if (func) {
 		security_task_getsecid(current, &secid);
 		action = ima_get_action(NULL, current_cred(), secid, 0, func,
-					&pcr, &template);
+					&pcr, &template, &keyrings);
 		if (!(action & IMA_MEASURE))
 			return;
 	}
 
+	if (keyrings != NULL)
+		keyrings = NULL;
+
 	if (!pcr)
 		pcr = CONFIG_IMA_MEASURE_PCR_IDX;
 
-- 
2.17.1


^ permalink raw reply related

* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Rasmus Villemoes @ 2019-10-30 22:38 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: David Howells, Linus Torvalds, Greg Kroah-Hartman, Peter Zijlstra,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, linux-security-module, linux-fsdevel, linux-api,
	LKML
In-Reply-To: <CAOi1vP9paV2-2_S0NgfbZDE6+5kqHXVc9xabHVC-2Ss1MmXkCg@mail.gmail.com>

On 30/10/2019 23.16, Ilya Dryomov wrote:
> On Wed, Oct 30, 2019 at 9:35 PM Rasmus Villemoes
> <linux@rasmusvillemoes.dk> wrote:
>>
>> On 30/10/2019 17.19, Ilya Dryomov wrote:
>>> On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
>>>>  /*
>>>> - * We use a start+len construction, which provides full use of the
>>>> - * allocated memory.
>>>> - * -- Florian Coosmann (FGC)
>>>> - *
>>>> + * We use head and tail indices that aren't masked off, except at the point of
>>>> + * dereference, but rather they're allowed to wrap naturally.  This means there
>>>> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
>>>> + * -- David Howells 2019-09-23.
>>>
>>> Hi David,
>>>
>>> Is "ring size < INT_MAX" constraint correct?
>>
>> No. As long as one always uses a[idx % size] to access the array, the
>> only requirement is that size is representable in an unsigned int. Then
>> because one also wants to do the % using simple bitmasking, that further
>> restricts one to sizes that are a power of 2, so the end result is that
>> the max size is 2^31 (aka INT_MAX+1).
> 
> I think the fact that indices are free running and wrap at a power of
> two already restricts you to sizes the are a power of two,

Ah, yes, of course. When reducing indices mod n that may already have
been implicitly reduced mod N, N must be a multiple of n for the result
to be well-defined.

Rasmus

^ permalink raw reply

* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Ilya Dryomov @ 2019-10-30 22:16 UTC (permalink / raw)
  To: Rasmus Villemoes
  Cc: David Howells, Linus Torvalds, Greg Kroah-Hartman, Peter Zijlstra,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, linux-security-module, linux-fsdevel, linux-api,
	LKML
In-Reply-To: <4892d186-8eb0-a282-e7e6-e79958431a54@rasmusvillemoes.dk>

On Wed, Oct 30, 2019 at 9:35 PM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
>
> On 30/10/2019 17.19, Ilya Dryomov wrote:
> > On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
> >>  /*
> >> - * We use a start+len construction, which provides full use of the
> >> - * allocated memory.
> >> - * -- Florian Coosmann (FGC)
> >> - *
> >> + * We use head and tail indices that aren't masked off, except at the point of
> >> + * dereference, but rather they're allowed to wrap naturally.  This means there
> >> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
> >> + * -- David Howells 2019-09-23.
> >
> > Hi David,
> >
> > Is "ring size < INT_MAX" constraint correct?
>
> No. As long as one always uses a[idx % size] to access the array, the
> only requirement is that size is representable in an unsigned int. Then
> because one also wants to do the % using simple bitmasking, that further
> restricts one to sizes that are a power of 2, so the end result is that
> the max size is 2^31 (aka INT_MAX+1).

I think the fact that indices are free running and wrap at a power of
two already restricts you to sizes the are a power of two, independent
of how you do masking.  If you switch to a[idx % size], size still has
to be a power of two for things to work when idx wraps.  Consider:

  size = 6
  head = tail = 4294967292, empty buffer

  push  4294967292 % 6 = 0
  push  4294967293 % 6 = 1
  push  4294967294 % 6 = 2
  push  4294967295 % 6 = 3
  push           0 % 6 = 0  <-- expected 4, overwrote a[0]

>
> > I've never had to implement this free running indices scheme, but
> > the way I've always visualized it is that the top bit of the index is
> > used as a lap (as in a race) indicator, leaving 31 bits to work with
> > (in case of unsigned ints).  Should that be
> >
> >   ring size <= 2^31
> >
> > or more precisely
> >
> >   ring size is a power of two <= 2^31
>
> Exactly. But it's kind of moot since the ring size would never be
> allowed to grow anywhere near that.

Thanks for confirming.  Even if it's kind of moot, I think it should be
corrected to avoid confusion.

                Ilya

^ permalink raw reply

* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Rasmus Villemoes @ 2019-10-30 20:35 UTC (permalink / raw)
  To: Ilya Dryomov, David Howells
  Cc: Linus Torvalds, Rasmus Villemoes, Greg Kroah-Hartman,
	Peter Zijlstra, nicolas.dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, linux-security-module,
	linux-fsdevel, linux-api, LKML
In-Reply-To: <CAOi1vP97DMX8zweOLfBDOFstrjC78=6RgxK3PPj_mehCOSeoaw@mail.gmail.com>

On 30/10/2019 17.19, Ilya Dryomov wrote:
> On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
>>  /*
>> - * We use a start+len construction, which provides full use of the
>> - * allocated memory.
>> - * -- Florian Coosmann (FGC)
>> - *
>> + * We use head and tail indices that aren't masked off, except at the point of
>> + * dereference, but rather they're allowed to wrap naturally.  This means there
>> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
>> + * -- David Howells 2019-09-23.
> 
> Hi David,
> 
> Is "ring size < INT_MAX" constraint correct?

No. As long as one always uses a[idx % size] to access the array, the
only requirement is that size is representable in an unsigned int. Then
because one also wants to do the % using simple bitmasking, that further
restricts one to sizes that are a power of 2, so the end result is that
the max size is 2^31 (aka INT_MAX+1).

> I've never had to implement this free running indices scheme, but
> the way I've always visualized it is that the top bit of the index is
> used as a lap (as in a race) indicator, leaving 31 bits to work with
> (in case of unsigned ints).  Should that be
> 
>   ring size <= 2^31
> 
> or more precisely
> 
>   ring size is a power of two <= 2^31

Exactly. But it's kind of moot since the ring size would never be
allowed to grow anywhere near that.

Rasmus

^ permalink raw reply

* Re: [PATCH linux-kselftest/test v1] apparmor: add AppArmor KUnit tests for policy unpack
From: Iurii Zaikin @ 2019-10-30 20:11 UTC (permalink / raw)
  To: Kees Cook
  Cc: Luis Chamberlain, Brendan Higgins, Alan Maguire,
	Matthias Maennich, shuah, John Johansen, jmorris, serge,
	David Gow, Theodore Ts'o, Linux Kernel Mailing List,
	linux-security-module, KUnit Development,
	open list:KERNEL SELFTEST FRAMEWORK, Mike Salvatore
In-Reply-To: <201910301205.74EC2A226D@keescook>

> Why can't unit tests live with the code they're testing? They're already
> logically tied together; what's the harm there? This needn't be the case
> for ALL tests, etc. The test driver could still live externally. The
> test in the other .c would just have exported functions... ?
>
Curiously enough, this approach has been adopted by D 2.0 where unittests are
members of the class under test:  https://digitalmars.com/d/2.0/unittest.html
but such approach is not mainstream.
I personally like the idea of testing the lowest level bits in isolation even if
they are not a part of any interface. I think that specifying the
interface using
unit tests and ensuring implementation correctness are complementary but
I haven't had much luck arguing this with our esteemed colleagues.

^ permalink raw reply

* Re: [PATCH linux-kselftest/test v1] apparmor: add AppArmor KUnit tests for policy unpack
From: Kees Cook @ 2019-10-30 19:09 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: Brendan Higgins, Alan Maguire, Matthias Maennich, shuah,
	John Johansen, jmorris, serge, Iurii Zaikin, David Gow,
	Theodore Ts'o, Linux Kernel Mailing List,
	linux-security-module, KUnit Development,
	open list:KERNEL SELFTEST FRAMEWORK, Mike Salvatore
In-Reply-To: <20191024101529.GK11244@42.do-not-panic.com>

On Thu, Oct 24, 2019 at 10:15:29AM +0000, Luis Chamberlain wrote:
> On Wed, Oct 23, 2019 at 05:42:18PM -0700, Brendan Higgins wrote:
> > With that, I think the best solution in this case will be the
> > "__visible_for_testing" route. It has no overhead when testing is
> > turned off (in fact it is no different in anyway when testing is
> > turned off). The downsides I see are:
> > 
> > 1) You may not be able to test non-module code not compiled for
> > testing later with the test modules that Alan is working on (But the
> > only way I think that will work is by preventing the symbol from being
> > inlined, right?).
> > 
> > 2) I think "__visible_for_testing" will be prone to abuse. Here, I
> > think there are reasons why we might want to expose these symbols for
> > testing, but not otherwise. Nevertheless, I think most symbols that
> > should be tested should probably be made visible by default. Since you
> > usually only want to test your public interfaces. I could very well
> > see this getting used as a kludge that gets used far too frequently.
> 
> There are two parts to your statement on 2):
> 
>   a) possible abuse of say __visible_for_testing

I really don't like the idea of littering the kernel with these. It'll
also require chunks in header files wrapped in #ifdefs. This is really
ugly.

>   b) you typically only want to test your public interfaces

True, but being able to test the little helper functions is a nice
starting point and a good building block.

Why can't unit tests live with the code they're testing? They're already
logically tied together; what's the harm there? This needn't be the case
for ALL tests, etc. The test driver could still live externally. The
test in the other .c would just have exported functions... ?

-- 
Kees Cook

^ permalink raw reply

* Re: [PATCH linux-kselftest/test v1] apparmor: add AppArmor KUnit tests for policy unpack
From: Kees Cook @ 2019-10-30 19:02 UTC (permalink / raw)
  To: Brendan Higgins
  Cc: Theodore Y. Ts'o, shuah, John Johansen, jmorris, serge,
	Alan Maguire, Iurii Zaikin, David Gow, Luis Chamberlain,
	Linux Kernel Mailing List, linux-security-module,
	KUnit Development, open list:KERNEL SELFTEST FRAMEWORK,
	Mike Salvatore
In-Reply-To: <CAFd5g45LmnbD7L4LqdbfBV5YR377e81m61+z==RKCGjWBFqDGQ@mail.gmail.com>

On Fri, Oct 18, 2019 at 02:41:38PM -0700, Brendan Higgins wrote:
> On Fri, Oct 18, 2019 at 9:25 AM Theodore Y. Ts'o <tytso@mit.edu> wrote:
> >
> > On Thu, Oct 17, 2019 at 05:43:07PM -0700, Brendan Higgins wrote:
> > > > +config SECURITY_APPARMOR_TEST
> > > > +   bool "Build KUnit tests for policy_unpack.c"
> > > > +   default n
> > > > +   depends on KUNIT && SECURITY_APPARMOR
> > >
> > > Ted, here is an example where doing select on direct dependencies is
> > > tricky because SECURITY_APPARMOR has a number of indirect dependencies.
> >
> > Well, that could be solved by adding a select on all of the indirect
> > dependencies.  I did get your point about the fact that we could have
> 
> In this particular case that would work.
> 
> > cases where the indirect dependencies might conflict with one another.
> > That's going to be a tough situation regardless of whether we have a
> > sat-solver or a human who has to struggle with that situation.
> 
> But yeah, that's the real problem.

I think at this stage we want to make it _possible_ to write tests
sanely without causing all kinds of headaches. I think "build all the
tests" can just be a function of "allmodconfig" and leave it at that
until we have cases we really need to deal with.

-- 
Kees Cook

^ permalink raw reply

* Re: [PATCH linux-kselftest/test v1] apparmor: add AppArmor KUnit tests for policy unpack
From: Kees Cook @ 2019-10-30 18:59 UTC (permalink / raw)
  To: Iurii Zaikin
  Cc: Brendan Higgins, shuah, john.johansen, jmorris, serge,
	alan.maguire, davidgow, Luis Chamberlain, Theodore Ts'o,
	Linux Kernel Mailing List, linux-security-module,
	KUnit Development, open list:KERNEL SELFTEST FRAMEWORK,
	Mike Salvatore
In-Reply-To: <CAAXuY3rLEt9nqOBSNaWjLMHNg6pDHdjtg7hFiYx-KCDhyfnkcg@mail.gmail.com>

On Thu, Oct 17, 2019 at 05:33:56PM -0700, Iurii Zaikin wrote:
> On Thu, Oct 17, 2019 at 5:19 PM Brendan Higgins
> <brendanhiggins@google.com> wrote:
> 
> > +config SECURITY_APPARMOR_TEST
> > +       bool "Build KUnit tests for policy_unpack.c"
> > +       default n

New options already already default n, this can be left off.

> > +       depends on KUNIT && SECURITY_APPARMOR
> > +       help
> >
> select SECURITY_APPARMOR ?

"select" doesn't enforce dependencies, so just a "depends ..." is
correct.

> > +       KUNIT_EXPECT_EQ(test, size, TEST_BLOB_DATA_SIZE);
> > +       KUNIT_EXPECT_TRUE(test,
> > +               memcmp(blob, TEST_BLOB_DATA, TEST_BLOB_DATA_SIZE) == 0);
> I think this must be  KUNIT_ASSERT_EQ(test, size, TEST_BLOB_DATA_SIZE);,
> otherwise there could be a buffer overflow in memcmp. All tests that
> follow such pattern

Agreed.

> are suspect. Also, not sure about your stylistic preference for
> KUNIT_EXPECT_TRUE(test,
>                memcmp(blob, TEST_BLOB_DATA, TEST_BLOB_DATA_SIZE) == 0);
> vs
> KUNIT_EXPECT_EQ(test,
>                0,
>                memcmp(blob, TEST_BLOB_DATA, TEST_BLOB_DATA_SIZE));

I like == 0.

-- 
Kees Cook

^ permalink raw reply

* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Ilya Dryomov @ 2019-10-30 16:19 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Rasmus Villemoes, Greg Kroah-Hartman,
	Peter Zijlstra, nicolas.dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, linux-security-module,
	linux-fsdevel, linux-api, LKML
In-Reply-To: <157186186167.3995.7568100174393739543.stgit@warthog.procyon.org.uk>

On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
>
> Convert pipes to use head and tail pointers for the buffer ring rather than
> pointer and length as the latter requires two atomic ops to update (or a
> combined op) whereas the former only requires one.
>
>  (1) The head pointer is the point at which production occurs and points to
>      the slot in which the next buffer will be placed.  This is equivalent
>      to pipe->curbuf + pipe->nrbufs.
>
>      The head pointer belongs to the write-side.
>
>  (2) The tail pointer is the point at which consumption occurs.  It points
>      to the next slot to be consumed.  This is equivalent to pipe->curbuf.
>
>      The tail pointer belongs to the read-side.
>
>  (3) head and tail are allowed to run to UINT_MAX and wrap naturally.  They
>      are only masked off when the array is being accessed, e.g.:
>
>         pipe->bufs[head & mask]
>
>      This means that it is not necessary to have a dead slot in the ring as
>      head == tail isn't ambiguous.
>
>  (4) The ring is empty if "head == tail".
>
>      A helper, pipe_empty(), is provided for this.
>
>  (5) The occupancy of the ring is "head - tail".
>
>      A helper, pipe_occupancy(), is provided for this.
>
>  (6) The number of free slots in the ring is "pipe->ring_size - occupancy".
>
>      A helper, pipe_space_for_user() is provided to indicate how many slots
>      userspace may use.
>
>  (7) The ring is full if "head - tail >= pipe->ring_size".
>
>      A helper, pipe_full(), is provided for this.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
>
>  fs/fuse/dev.c             |   31 +++--
>  fs/pipe.c                 |  169 ++++++++++++++++-------------
>  fs/splice.c               |  188 ++++++++++++++++++++------------
>  include/linux/pipe_fs_i.h |   86 ++++++++++++++-
>  include/linux/uio.h       |    4 -
>  lib/iov_iter.c            |  266 +++++++++++++++++++++++++--------------------
>  6 files changed, 464 insertions(+), 280 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index dadd617d826c..1e4bc27573cc 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -703,7 +703,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
>                         cs->pipebufs++;
>                         cs->nr_segs--;
>                 } else {
> -                       if (cs->nr_segs == cs->pipe->buffers)
> +                       if (cs->nr_segs >= cs->pipe->ring_size)
>                                 return -EIO;
>
>                         page = alloc_page(GFP_HIGHUSER);
> @@ -879,7 +879,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
>         struct pipe_buffer *buf;
>         int err;
>
> -       if (cs->nr_segs == cs->pipe->buffers)
> +       if (cs->nr_segs >= cs->pipe->ring_size)
>                 return -EIO;
>
>         err = unlock_request(cs->req);
> @@ -1341,7 +1341,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
>         if (!fud)
>                 return -EPERM;
>
> -       bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer),
> +       bufs = kvmalloc_array(pipe->ring_size, sizeof(struct pipe_buffer),
>                               GFP_KERNEL);
>         if (!bufs)
>                 return -ENOMEM;
> @@ -1353,7 +1353,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
>         if (ret < 0)
>                 goto out;
>
> -       if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
> +       if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->ring_size) {
>                 ret = -EIO;
>                 goto out;
>         }
> @@ -1935,6 +1935,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>                                      struct file *out, loff_t *ppos,
>                                      size_t len, unsigned int flags)
>  {
> +       unsigned int head, tail, mask, count;
>         unsigned nbuf;
>         unsigned idx;
>         struct pipe_buffer *bufs;
> @@ -1949,8 +1950,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>
>         pipe_lock(pipe);
>
> -       bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer),
> -                             GFP_KERNEL);
> +       head = pipe->head;
> +       tail = pipe->tail;
> +       mask = pipe->ring_size - 1;
> +       count = head - tail;
> +
> +       bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
>         if (!bufs) {
>                 pipe_unlock(pipe);
>                 return -ENOMEM;
> @@ -1958,8 +1963,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>
>         nbuf = 0;
>         rem = 0;
> -       for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
> -               rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
> +       for (idx = tail; idx < head && rem < len; idx++)
> +               rem += pipe->bufs[idx & mask].len;
>
>         ret = -EINVAL;
>         if (rem < len)
> @@ -1970,16 +1975,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>                 struct pipe_buffer *ibuf;
>                 struct pipe_buffer *obuf;
>
> -               BUG_ON(nbuf >= pipe->buffers);
> -               BUG_ON(!pipe->nrbufs);
> -               ibuf = &pipe->bufs[pipe->curbuf];
> +               BUG_ON(nbuf >= pipe->ring_size);
> +               BUG_ON(tail == head);
> +               ibuf = &pipe->bufs[tail & mask];
>                 obuf = &bufs[nbuf];
>
>                 if (rem >= ibuf->len) {
>                         *obuf = *ibuf;
>                         ibuf->ops = NULL;
> -                       pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
> -                       pipe->nrbufs--;
> +                       tail++;
> +                       pipe_commit_read(pipe, tail);
>                 } else {
>                         if (!pipe_buf_get(pipe, ibuf))
>                                 goto out_free;
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 8a2ab2f974bd..8a0806fe12d3 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -43,10 +43,11 @@ unsigned long pipe_user_pages_hard;
>  unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
>
>  /*
> - * We use a start+len construction, which provides full use of the
> - * allocated memory.
> - * -- Florian Coosmann (FGC)
> - *
> + * We use head and tail indices that aren't masked off, except at the point of
> + * dereference, but rather they're allowed to wrap naturally.  This means there
> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
> + * -- David Howells 2019-09-23.

Hi David,

Is "ring size < INT_MAX" constraint correct?

I've never had to implement this free running indices scheme, but
the way I've always visualized it is that the top bit of the index is
used as a lap (as in a race) indicator, leaving 31 bits to work with
(in case of unsigned ints).  Should that be

  ring size <= 2^31

or more precisely

  ring size is a power of two <= 2^31

or am I missing something?

Thanks,

                Ilya

^ permalink raw reply

* Re: [PATCH bpf-next v11 2/7] landlock: Add the management of domains
From: Mickaël Salaün @ 2019-10-30 14:03 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Shuah Khan, Stephen Smalley,
	Tejun Heo, Tetsuo Handa, Tycho Andersen, Will Drewry, bpf,
	kernel-hardening, linux-api, linux-security-module
In-Reply-To: <20191030025621.GA27626@mail.hallyn.com>


On 30/10/2019 03:56, Serge E. Hallyn wrote:
> On Tue, Oct 29, 2019 at 06:15:00PM +0100, Mickaël Salaün wrote:
>> A Landlock domain is a set of eBPF programs.  There is a list for each
>> different program types that can be run on a specific Landlock hook
>> (e.g. ptrace).  A domain is tied to a set of subjects (i.e. tasks).  A
>> Landlock program should not try (nor be able) to infer which subject is
>> currently enforced, but to have a unique security policy for all
>> subjects tied to the same domain.  This make the reasoning much easier
>> and help avoid pitfalls.
>>
>> The next commits tie a domain to a task's credentials thanks to
>> seccomp(2), but we could use cgroups or a security file-system to
>> enforce a sysadmin-defined policy .
>>
>> Signed-off-by: Mickaël Salaün <mic@digikod.net>
>> Cc: Alexei Starovoitov <ast@kernel.org>
>> Cc: Andy Lutomirski <luto@amacapital.net>
>> Cc: Daniel Borkmann <daniel@iogearbox.net>
>> Cc: James Morris <jmorris@namei.org>
>> Cc: Kees Cook <keescook@chromium.org>
>> Cc: Serge E. Hallyn <serge@hallyn.com>
>> Cc: Will Drewry <wad@chromium.org>
>> ---
>>
>> Changes since v10:
>> * rename files and names to clearly define a domain
>> * create a standalone patch to ease review
>> ---
> 
> [...]
> 
>> +/**
>> + * store_landlock_prog - prepend and deduplicate a Landlock prog_list
>> + *
>> + * Prepend @prog to @init_domain while ignoring @prog if they are already in
>> + * @ref_domain.  Whatever is the result of this function call, you can call
>> + * bpf_prog_put(@prog) after.
>> + *
>> + * @init_domain: empty domain to prepend to
>> + * @ref_domain: domain to check for duplicate programs
>> + * @prog: program to prepend
>> + *
>> + * Return -errno on error or 0 if @prog was successfully stored.
>> + */
>> +static int store_landlock_prog(struct landlock_domain *init_domain,
>> +		const struct landlock_domain *ref_domain,
>> +		struct bpf_prog *prog)
>> +{
>> +	struct landlock_prog_list *tmp_list = NULL;
>> +	int err;
>> +	size_t hook;
>> +	enum landlock_hook_type last_type;
>> +	struct bpf_prog *new = prog;
>> +
>> +	/* allocate all the memory we need */
>> +	struct landlock_prog_list *new_list;
>> +
>> +	last_type = get_hook_type(new);
>> +
>> +	/* ignore duplicate programs */
> 
> This comment should be "don't allow" rather than "ignore", right?

Exactly, fixed.

> 
>> +	if (ref_domain) {
>> +		struct landlock_prog_list *ref;
>> +
>> +		hook = get_hook_index(get_hook_type(new));
>> +		for (ref = ref_domain->programs[hook]; ref;
>> +				ref = ref->prev) {
>> +			if (ref->prog == new)
>> +				return -EINVAL;
>> +		}
>> +	}
>> +
>> +	new = bpf_prog_inc(new);
>> +	if (IS_ERR(new)) {
>> +		err = PTR_ERR(new);
>> +		goto put_tmp_list;
>> +	}
>> +	new_list = kzalloc(sizeof(*new_list), GFP_KERNEL);
>> +	if (!new_list) {
>> +		bpf_prog_put(new);
>> +		err = -ENOMEM;
>> +		goto put_tmp_list;
>> +	}
>> +	/* ignore Landlock types in this tmp_list */
>> +	new_list->prog = new;
>> +	new_list->prev = tmp_list;
>> +	refcount_set(&new_list->usage, 1);
>> +	tmp_list = new_list;
>> +
>> +	if (!tmp_list)
>> +		/* inform user space that this program was already added */
> 
> I'm not following this.  You just kzalloc'd new_list, pointed
> tmp_list to new_list, so how could tmp_list be NULL?  Was there
> a bad code reorg here, or am i being dense?

Indeed, this was introduce with a code refactoring (while removing a
program "chaining" concept) where this code snippet was in a loop, hence
the weird use of tmp_list. I'm cleaning this up (and simplifying this
whole code), and replacing the -EINVAL for the duplicate program check
with the -EEXIST.

Thanks!


> 
>> +		return -EEXIST;
>> +
>> +	/* properly store the list (without error cases) */
>> +	while (tmp_list) {
>> +		struct landlock_prog_list *new_list;
>> +
>> +		new_list = tmp_list;
>> +		tmp_list = tmp_list->prev;
>> +		/* do not increment the previous prog list usage */
>> +		hook = get_hook_index(get_hook_type(new_list->prog));
>> +		new_list->prev = init_domain->programs[hook];
>> +		/* no need to add from the last program to the first because
>> +		 * each of them are a different Landlock type */
>> +		smp_store_release(&init_domain->programs[hook], new_list);
>> +	}
>> +	return 0;
>> +
>> +put_tmp_list:
>> +	put_landlock_prog_list(tmp_list);
>> +	return err;
>> +}
>> +
>> +/* limit Landlock programs set to 256KB */
>> +#define LANDLOCK_PROGRAMS_MAX_PAGES (1 << 6)
>> +
>> +/**
>> + * landlock_prepend_prog - attach a Landlock prog_list to @current_domain
>> + *
>> + * Whatever is the result of this function call, you can call
>> + * bpf_prog_put(@prog) after.
>> + *
>> + * @current_domain: landlock_domain pointer, must be (RCU-)locked (if needed)
>> + *                  to prevent a concurrent put/free. This pointer must not be
>> + *                  freed after the call.
>> + * @prog: non-NULL Landlock prog_list to prepend to @current_domain. @prog will
>> + *        be owned by landlock_prepend_prog() and freed if an error happened.
>> + *
>> + * Return @current_domain or a new pointer when OK. Return a pointer error
>> + * otherwise.
>> + */
>> +struct landlock_domain *landlock_prepend_prog(
>> +		struct landlock_domain *current_domain,
>> +		struct bpf_prog *prog)
>> +{
>> +	struct landlock_domain *new_domain = current_domain;
>> +	unsigned long pages;
>> +	int err;
>> +	size_t i;
>> +	struct landlock_domain tmp_domain = {};
>> +
>> +	if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	/* validate memory size allocation */
>> +	pages = prog->pages;
>> +	if (current_domain) {
>> +		size_t i;
>> +
>> +		for (i = 0; i < ARRAY_SIZE(current_domain->programs); i++) {
>> +			struct landlock_prog_list *walker_p;
>> +
>> +			for (walker_p = current_domain->programs[i];
>> +					walker_p; walker_p = walker_p->prev)
>> +				pages += walker_p->prog->pages;
>> +		}
>> +		/* count a struct landlock_domain if we need to allocate one */
>> +		if (refcount_read(&current_domain->usage) != 1)
>> +			pages += round_up(sizeof(*current_domain), PAGE_SIZE)
>> +				/ PAGE_SIZE;
>> +	}
>> +	if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
>> +		return ERR_PTR(-E2BIG);
>> +
>> +	/* ensure early that we can allocate enough memory for the new
>> +	 * prog_lists */
>> +	err = store_landlock_prog(&tmp_domain, current_domain, prog);
>> +	if (err)
>> +		return ERR_PTR(err);
>> +
>> +	/*
>> +	 * Each task_struct points to an array of prog list pointers.  These
>> +	 * tables are duplicated when additions are made (which means each
>> +	 * table needs to be refcounted for the processes using it). When a new
>> +	 * table is created, all the refcounters on the prog_list are bumped
>> +	 * (to track each table that references the prog). When a new prog is
>> +	 * added, it's just prepended to the list for the new table to point
>> +	 * at.
>> +	 *
>> +	 * Manage all the possible errors before this step to not uselessly
>> +	 * duplicate current_domain and avoid a rollback.
>> +	 */
>> +	if (!new_domain) {
>> +		/*
>> +		 * If there is no Landlock domain used by the current task,
>> +		 * then create a new one.
>> +		 */
>> +		new_domain = new_landlock_domain();
>> +		if (IS_ERR(new_domain))
>> +			goto put_tmp_lists;
>> +	} else if (refcount_read(&current_domain->usage) > 1) {
>> +		/*
>> +		 * If the current task is not the sole user of its Landlock
>> +		 * domain, then duplicate it.
>> +		 */
>> +		new_domain = new_landlock_domain();
>> +		if (IS_ERR(new_domain))
>> +			goto put_tmp_lists;
>> +		for (i = 0; i < ARRAY_SIZE(new_domain->programs); i++) {
>> +			new_domain->programs[i] =
>> +				READ_ONCE(current_domain->programs[i]);
>> +			if (new_domain->programs[i])
>> +				refcount_inc(&new_domain->programs[i]->usage);
>> +		}
>> +
>> +		/*
>> +		 * Landlock domain from the current task will not be freed here
>> +		 * because the usage is strictly greater than 1. It is only
>> +		 * prevented to be freed by another task thanks to the caller
>> +		 * of landlock_prepend_prog() which should be locked if needed.
>> +		 */
>> +		landlock_put_domain(current_domain);
>> +	}
>> +
>> +	/* prepend tmp_domain to new_domain */
>> +	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++) {
>> +		/* get the last new list */
>> +		struct landlock_prog_list *last_list =
>> +			tmp_domain.programs[i];
>> +
>> +		if (last_list) {
>> +			while (last_list->prev)
>> +				last_list = last_list->prev;
>> +			/* no need to increment usage (pointer replacement) */
>> +			last_list->prev = new_domain->programs[i];
>> +			new_domain->programs[i] = tmp_domain.programs[i];
>> +		}
>> +	}
>> +	return new_domain;
>> +
>> +put_tmp_lists:
>> +	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++)
>> +		put_landlock_prog_list(tmp_domain.programs[i]);
>> +	return new_domain;
>> +}
>> diff --git a/security/landlock/domain_manage.h b/security/landlock/domain_manage.h
>> new file mode 100644
>> index 000000000000..5b5b49f6e3e8
>> --- /dev/null
>> +++ b/security/landlock/domain_manage.h
>> @@ -0,0 +1,23 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Landlock LSM - domain management headers
>> + *
>> + * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
>> + * Copyright © 2018-2019 ANSSI
>> + */
>> +
>> +#ifndef _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
>> +#define _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
>> +
>> +#include <linux/filter.h>
>> +
>> +#include "common.h"
>> +
>> +void landlock_get_domain(struct landlock_domain *dom);
>> +void landlock_put_domain(struct landlock_domain *dom);
>> +
>> +struct landlock_domain *landlock_prepend_prog(
>> +		struct landlock_domain *current_domain,
>> +		struct bpf_prog *prog);
>> +
>> +#endif /* _SECURITY_LANDLOCK_DOMAIN_MANAGE_H */
>> -- 
>> 2.23.0
> 

^ permalink raw reply

* Re: [PATCH v23 12/24] x86/sgx: Linux Enclave Driver
From: Stephen Smalley @ 2019-10-30 13:45 UTC (permalink / raw)
  To: Jarkko Sakkinen, linux-kernel, x86, linux-sgx
  Cc: akpm, dave.hansen, sean.j.christopherson, nhorman, npmccallum,
	serge.ayoun, shay.katz-zamir, haitao.huang, andriy.shevchenko,
	tglx, kai.svahn, bp, josh, luto, kai.huang, rientjes, cedric.xing,
	puiterwijk, linux-security-module, Suresh Siddha
In-Reply-To: <20191028210324.12475-13-jarkko.sakkinen@linux.intel.com>

On 10/28/19 5:03 PM, Jarkko Sakkinen wrote:
> Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> can be used by applications to set aside private regions of code and
> data. The code outside the SGX hosted software entity is disallowed to
> access the memory inside the enclave enforced by the CPU. We call these
> entities as enclaves.
> 
> This commit implements a driver that provides an ioctl API to construct
> and run enclaves. Enclaves are constructed from pages residing in
> reserved physical memory areas. The contents of these pages can only be
> accessed when they are mapped as part of an enclave, by a hardware
> thread running inside the enclave.
> 
> The starting state of an enclave consists of a fixed measured set of
> pages that are copied to the EPC during the construction process by
> using ENCLS leaf functions and Software Enclave Control Structure (SECS)
> that defines the enclave properties.
> 
> Enclave are constructed by using ENCLS leaf functions ECREATE, EADD and
> EINIT. ECREATE initializes SECS, EADD copies pages from system memory to
> the EPC and EINIT check a given signed measurement and moves the enclave
> into a state ready for execution.
> 
> An initialized enclave can only be accessed through special Thread Control
> Structure (TCS) pages by using ENCLU (ring-3 only) leaf EENTER.  This leaf
> function converts a thread into enclave mode and continues the execution in
> the offset defined by the TCS provided to EENTER. An enclave is exited
> through syscall, exception, interrupts or by explicitly calling another
> ENCLU leaf EEXIT.
> 
> The permissions, which enclave page is added will set the limit for maximum
> permissions that can be set for mmap() and mprotect(). This will
> effectively allow to build different security schemes between producers and
> consumers of enclaves. Later on we can increase granularity with LSM hooks
> for page addition (i.e. for producers) and mapping of the enclave (i.e. for
> consumers)

Where do things stand wrt to ensuring that SGX cannot be used to 
introduce executable mappings that were never authorized by the LSM (or 
never measured by IMA)?

> 
> Cc: linux-security-module@vger.kernel.org
> Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> ---
>   Documentation/ioctl/ioctl-number.rst |   1 +
>   arch/x86/include/uapi/asm/sgx.h      |  64 +++
>   arch/x86/kernel/cpu/sgx/Makefile     |   3 +
>   arch/x86/kernel/cpu/sgx/driver.c     | 252 ++++++++++
>   arch/x86/kernel/cpu/sgx/driver.h     |  32 ++
>   arch/x86/kernel/cpu/sgx/encl.c       | 329 ++++++++++++++
>   arch/x86/kernel/cpu/sgx/encl.h       |  87 ++++
>   arch/x86/kernel/cpu/sgx/ioctl.c      | 656 +++++++++++++++++++++++++++
>   arch/x86/kernel/cpu/sgx/main.c       |  10 +
>   arch/x86/kernel/cpu/sgx/reclaim.c    |   1 +
>   10 files changed, 1435 insertions(+)
>   create mode 100644 arch/x86/include/uapi/asm/sgx.h
>   create mode 100644 arch/x86/kernel/cpu/sgx/driver.c
>   create mode 100644 arch/x86/kernel/cpu/sgx/driver.h
>   create mode 100644 arch/x86/kernel/cpu/sgx/encl.c
>   create mode 100644 arch/x86/kernel/cpu/sgx/encl.h
>   create mode 100644 arch/x86/kernel/cpu/sgx/ioctl.c
> 
> diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst
> index bef79cd4c6b4..f9f3ea9606fc 100644
> --- a/Documentation/ioctl/ioctl-number.rst
> +++ b/Documentation/ioctl/ioctl-number.rst
> @@ -321,6 +321,7 @@ Code  Seq#    Include File                                           Comments
>                                                                        <mailto:tlewis@mindspring.com>
>   0xA3  90-9F  linux/dtlk.h
>   0xA4  00-1F  uapi/linux/tee.h                                        Generic TEE subsystem
> +0xA4  00-1F  uapi/asm/sgx.h                                          Intel SGX subsystem (a legit conflict as TEE and SGX do not co-exist)
>   0xAA  00-3F  linux/uapi/linux/userfaultfd.h
>   0xAB  00-1F  linux/nbd.h
>   0xAC  00-1F  linux/raw.h
> diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> new file mode 100644
> index 000000000000..e0f79ebdfd8a
> --- /dev/null
> +++ b/arch/x86/include/uapi/asm/sgx.h
> @@ -0,0 +1,64 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) WITH Linux-syscall-note */
> +/*
> + * Copyright(c) 2016-19 Intel Corporation.
> + */
> +#ifndef _UAPI_ASM_X86_SGX_H
> +#define _UAPI_ASM_X86_SGX_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +/**
> + * enum sgx_epage_flags - page control flags
> + * %SGX_PAGE_MEASURE:	Measure the page contents with a sequence of
> + *			ENCLS[EEXTEND] operations.
> + */
> +enum sgx_page_flags {
> +	SGX_PAGE_MEASURE	= 0x01,
> +};
> +
> +#define SGX_MAGIC 0xA4
> +
> +#define SGX_IOC_ENCLAVE_CREATE \
> +	_IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
> +#define SGX_IOC_ENCLAVE_ADD_PAGES \
> +	_IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
> +#define SGX_IOC_ENCLAVE_INIT \
> +	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
> +
> +/**
> + * struct sgx_enclave_create - parameter structure for the
> + *                             %SGX_IOC_ENCLAVE_CREATE ioctl
> + * @src:	address for the SECS page data
> + */
> +struct sgx_enclave_create  {
> +	__u64	src;
> +};
> +
> +/**
> + * struct sgx_enclave_add_pages - parameter structure for the
> + *                                %SGX_IOC_ENCLAVE_ADD_PAGE ioctl
> + * @src:	start address for the page data
> + * @offset:	starting page offset
> + * @length:	length of the data (multiple of the page size)
> + * @secinfo:	address for the SECINFO data
> + * @flags:	page control flags
> + */
> +struct sgx_enclave_add_pages {
> +	__u64	src;
> +	__u64	offset;
> +	__u64	length;
> +	__u64	secinfo;
> +	__u64	flags;
> +};
> +
> +/**
> + * struct sgx_enclave_init - parameter structure for the
> + *                           %SGX_IOC_ENCLAVE_INIT ioctl
> + * @sigstruct:	address for the SIGSTRUCT data
> + */
> +struct sgx_enclave_init {
> +	__u64 sigstruct;
> +};
> +
> +#endif /* _UAPI_ASM_X86_SGX_H */
> diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> index 874492d9e3bd..3ddcdabab081 100644
> --- a/arch/x86/kernel/cpu/sgx/Makefile
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -1,4 +1,7 @@
>   obj-y += \
> +	driver.o \
> +	encl.o \
>   	encls.o \
> +	ioctl.o \
>   	main.o \
>   	reclaim.o
> diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> new file mode 100644
> index 000000000000..c724dcccf2e2
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/driver.c
> @@ -0,0 +1,252 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +// Copyright(c) 2016-18 Intel Corporation.
> +
> +#include <linux/acpi.h>
> +#include <linux/cdev.h>
> +#include <linux/mman.h>
> +#include <linux/platform_device.h>
> +#include <linux/security.h>
> +#include <linux/suspend.h>
> +#include <asm/traps.h>
> +#include "driver.h"
> +#include "encl.h"
> +
> +MODULE_DESCRIPTION("Intel SGX Enclave Driver");
> +MODULE_AUTHOR("Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>");
> +MODULE_LICENSE("Dual BSD/GPL");
> +
> +struct workqueue_struct *sgx_encl_wq;
> +u64 sgx_encl_size_max_32;
> +u64 sgx_encl_size_max_64;
> +u32 sgx_misc_reserved_mask;
> +u64 sgx_attributes_reserved_mask;
> +u64 sgx_xfrm_reserved_mask = ~0x3;
> +u32 sgx_xsave_size_tbl[64];
> +
> +static int sgx_open(struct inode *inode, struct file *file)
> +{
> +	struct sgx_encl *encl;
> +	int ret;
> +
> +	encl = kzalloc(sizeof(*encl), GFP_KERNEL);
> +	if (!encl)
> +		return -ENOMEM;
> +
> +	atomic_set(&encl->flags, 0);
> +	kref_init(&encl->refcount);
> +	INIT_RADIX_TREE(&encl->page_tree, GFP_KERNEL);
> +	mutex_init(&encl->lock);
> +	INIT_LIST_HEAD(&encl->mm_list);
> +	spin_lock_init(&encl->mm_lock);
> +
> +	ret = init_srcu_struct(&encl->srcu);
> +	if (ret) {
> +		kfree(encl);
> +		return ret;
> +	}
> +
> +	file->private_data = encl;
> +
> +	return 0;
> +}
> +
> +static int sgx_release(struct inode *inode, struct file *file)
> +{
> +	struct sgx_encl *encl = file->private_data;
> +	struct sgx_encl_mm *encl_mm;
> +
> +	for ( ; ; )  {
> +		spin_lock(&encl->mm_lock);
> +
> +		if (list_empty(&encl->mm_list)) {
> +			encl_mm = NULL;
> +		} else {
> +			encl_mm = list_first_entry(&encl->mm_list,
> +						   struct sgx_encl_mm, list);
> +			list_del_rcu(&encl_mm->list);
> +		}
> +
> +		spin_unlock(&encl->mm_lock);
> +
> +		/* The list is empty, ready to go. */
> +		if (!encl_mm)
> +			break;
> +
> +		synchronize_srcu(&encl->srcu);
> +		mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
> +		kfree(encl_mm);
> +	};
> +
> +	mutex_lock(&encl->lock);
> +	atomic_or(SGX_ENCL_DEAD, &encl->flags);
> +	mutex_unlock(&encl->lock);
> +
> +	kref_put(&encl->refcount, sgx_encl_release);
> +	return 0;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long sgx_compat_ioctl(struct file *filep, unsigned int cmd,
> +			      unsigned long arg)
> +{
> +	return sgx_ioctl(filep, cmd, arg);
> +}
> +#endif
> +
> +static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	struct sgx_encl *encl = file->private_data;
> +	int ret;
> +
> +	ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end,
> +			       vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC));
> +	if (ret)
> +		return ret;
> +
> +	ret = sgx_encl_mm_add(encl, vma->vm_mm);
> +	if (ret)
> +		return ret;
> +
> +	vma->vm_ops = &sgx_vm_ops;
> +	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> +	vma->vm_private_data = encl;
> +
> +	return 0;
> +}
> +
> +static unsigned long sgx_get_unmapped_area(struct file *file,
> +					   unsigned long addr,
> +					   unsigned long len,
> +					   unsigned long pgoff,
> +					   unsigned long flags)
> +{
> +	if (flags & MAP_PRIVATE)
> +		return -EINVAL;
> +
> +	if (flags & MAP_FIXED)
> +		return addr;
> +
> +	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
> +}
> +
> +static const struct file_operations sgx_encl_fops = {
> +	.owner			= THIS_MODULE,
> +	.open			= sgx_open,
> +	.release		= sgx_release,
> +	.unlocked_ioctl		= sgx_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl		= sgx_compat_ioctl,
> +#endif
> +	.mmap			= sgx_mmap,
> +	.get_unmapped_area	= sgx_get_unmapped_area,
> +};
> +
> +static struct bus_type sgx_bus_type = {
> +	.name	= "sgx",
> +};
> +
> +static struct device sgx_encl_dev;
> +static struct cdev sgx_encl_cdev;
> +static dev_t sgx_devt;
> +
> +static void sgx_dev_release(struct device *dev)
> +{
> +}
> +
> +static __init int sgx_dev_init(const char *name, struct device *dev,
> +			       struct cdev *cdev,
> +			       const struct file_operations *fops, int minor)
> +{
> +	int ret;
> +
> +	device_initialize(dev);
> +
> +	dev->bus = &sgx_bus_type;
> +	dev->devt = MKDEV(MAJOR(sgx_devt), minor);
> +	dev->release = sgx_dev_release;
> +
> +	ret = dev_set_name(dev, name);
> +	if (ret) {
> +		put_device(dev);
> +		return ret;
> +	}
> +
> +	cdev_init(cdev, fops);
> +	cdev->owner = THIS_MODULE;
> +	return 0;
> +}
> +
> +int __init sgx_drv_init(void)
> +{
> +	unsigned int eax, ebx, ecx, edx;
> +	u64 attr_mask, xfrm_mask;
> +	int ret;
> +	int i;
> +
> +	if (!boot_cpu_has(X86_FEATURE_SGX_LC)) {
> +		pr_info("The public key MSRs are not writable\n");
> +		return -ENODEV;
> +	}
> +
> +	ret = bus_register(&sgx_bus_type);
> +	if (ret)
> +		return ret;
> +
> +	ret = alloc_chrdev_region(&sgx_devt, 0, SGX_DRV_NR_DEVICES, "sgx");
> +	if (ret < 0)
> +		goto err_bus;
> +
> +	cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
> +	sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
> +	sgx_encl_size_max_64 = 1ULL << ((edx >> 8) & 0xFF);
> +	sgx_encl_size_max_32 = 1ULL << (edx & 0xFF);
> +
> +	cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
> +
> +	attr_mask = (((u64)ebx) << 32) + (u64)eax;
> +	sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
> +
> +	if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
> +		xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
> +
> +		for (i = 2; i < 64; i++) {
> +			cpuid_count(0x0D, i, &eax, &ebx, &ecx, &edx);
> +			if ((1 << i) & xfrm_mask)
> +				sgx_xsave_size_tbl[i] = eax + ebx;
> +		}
> +
> +		sgx_xfrm_reserved_mask = ~xfrm_mask;
> +	}
> +
> +	ret = sgx_dev_init("sgx/enclave", &sgx_encl_dev, &sgx_encl_cdev,
> +			   &sgx_encl_fops, 0);
> +	if (ret)
> +		goto err_chrdev_region;
> +
> +	sgx_encl_wq = alloc_workqueue("sgx-encl-wq",
> +				      WQ_UNBOUND | WQ_FREEZABLE, 1);
> +	if (!sgx_encl_wq) {
> +		ret = -ENOMEM;
> +		goto err_encl_dev;
> +	}
> +
> +	ret = cdev_device_add(&sgx_encl_cdev, &sgx_encl_dev);
> +	if (ret)
> +		goto err_encl_wq;
> +
> +	return 0;
> +
> +err_encl_wq:
> +	destroy_workqueue(sgx_encl_wq);
> +
> +err_encl_dev:
> +	put_device(&sgx_encl_dev);
> +
> +err_chrdev_region:
> +	unregister_chrdev_region(sgx_devt, SGX_DRV_NR_DEVICES);
> +
> +err_bus:
> +	bus_unregister(&sgx_bus_type);
> +
> +	return ret;
> +}
> diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
> new file mode 100644
> index 000000000000..e95c6e86c0c6
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/driver.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +#ifndef __ARCH_SGX_DRIVER_H__
> +#define __ARCH_SGX_DRIVER_H__
> +
> +#include <crypto/hash.h>
> +#include <linux/kref.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/radix-tree.h>
> +#include <linux/rwsem.h>
> +#include <linux/sched.h>
> +#include <linux/workqueue.h>
> +#include <uapi/asm/sgx.h>
> +#include "sgx.h"
> +
> +#define SGX_DRV_NR_DEVICES	2
> +#define SGX_EINIT_SPIN_COUNT	20
> +#define SGX_EINIT_SLEEP_COUNT	50
> +#define SGX_EINIT_SLEEP_TIME	20
> +
> +extern struct workqueue_struct *sgx_encl_wq;
> +extern u64 sgx_encl_size_max_32;
> +extern u64 sgx_encl_size_max_64;
> +extern u32 sgx_misc_reserved_mask;
> +extern u64 sgx_attributes_reserved_mask;
> +extern u64 sgx_xfrm_reserved_mask;
> +extern u32 sgx_xsave_size_tbl[64];
> +
> +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
> +
> +int sgx_drv_init(void);
> +
> +#endif /* __ARCH_X86_SGX_DRIVER_H__ */
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> new file mode 100644
> index 000000000000..cd2b8dbb0eca
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -0,0 +1,329 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +// Copyright(c) 2016-18 Intel Corporation.
> +
> +#include <linux/lockdep.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/shmem_fs.h>
> +#include <linux/suspend.h>
> +#include <linux/sched/mm.h>
> +#include "arch.h"
> +#include "encl.h"
> +#include "sgx.h"
> +
> +static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
> +						unsigned long addr)
> +{
> +	struct sgx_encl_page *entry;
> +	unsigned int flags;
> +
> +	/* If process was forked, VMA is still there but vm_private_data is set
> +	 * to NULL.
> +	 */
> +	if (!encl)
> +		return ERR_PTR(-EFAULT);
> +
> +	flags = atomic_read(&encl->flags);
> +
> +	if ((flags & SGX_ENCL_DEAD) || !(flags & SGX_ENCL_INITIALIZED))
> +		return ERR_PTR(-EFAULT);
> +
> +	entry = radix_tree_lookup(&encl->page_tree, addr >> PAGE_SHIFT);
> +	if (!entry)
> +		return ERR_PTR(-EFAULT);
> +
> +	/* Page is already resident in the EPC. */
> +	if (entry->epc_page)
> +		return entry;
> +
> +	return ERR_PTR(-EFAULT);
> +}
> +
> +static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
> +				     struct mm_struct *mm)
> +{
> +	struct sgx_encl_mm *encl_mm =
> +		container_of(mn, struct sgx_encl_mm, mmu_notifier);
> +	struct sgx_encl_mm *tmp = NULL;
> +
> +	/*
> +	 * The enclave itself can remove encl_mm.  Note, objects can't be moved
> +	 * off an RCU protected list, but deletion is ok.
> +	 */
> +	spin_lock(&encl_mm->encl->mm_lock);
> +	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
> +		if (tmp == encl_mm) {
> +			list_del_rcu(&encl_mm->list);
> +			break;
> +		}
> +	}
> +	spin_unlock(&encl_mm->encl->mm_lock);
> +
> +	if (tmp == encl_mm) {
> +		synchronize_srcu(&encl_mm->encl->srcu);
> +		mmu_notifier_put(mn);
> +	}
> +}
> +
> +static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
> +{
> +	struct sgx_encl_mm *encl_mm =
> +		container_of(mn, struct sgx_encl_mm, mmu_notifier);
> +
> +	kfree(encl_mm);
> +}
> +
> +static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
> +	.release		= sgx_mmu_notifier_release,
> +	.free_notifier		= sgx_mmu_notifier_free,
> +};
> +
> +static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
> +					    struct mm_struct *mm)
> +{
> +	struct sgx_encl_mm *encl_mm = NULL;
> +	struct sgx_encl_mm *tmp;
> +	int idx;
> +
> +	idx = srcu_read_lock(&encl->srcu);
> +
> +	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
> +		if (tmp->mm == mm) {
> +			encl_mm = tmp;
> +			break;
> +		}
> +	}
> +
> +	srcu_read_unlock(&encl->srcu, idx);
> +
> +	return encl_mm;
> +}
> +
> +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
> +{
> +	struct sgx_encl_mm *encl_mm;
> +	int ret;
> +
> +	if (atomic_read(&encl->flags) & SGX_ENCL_DEAD)
> +		return -EINVAL;
> +
> +	/*
> +	 * mm_structs are kept on mm_list until the mm or the enclave dies,
> +	 * i.e. once an mm is off the list, it's gone for good, therefore it's
> +	 * impossible to get a false positive on @mm due to a stale mm_list.
> +	 */
> +	if (sgx_encl_find_mm(encl, mm))
> +		return 0;
> +
> +	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
> +	if (!encl_mm)
> +		return -ENOMEM;
> +
> +	encl_mm->encl = encl;
> +	encl_mm->mm = mm;
> +	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
> +
> +	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
> +	if (ret) {
> +		kfree(encl_mm);
> +		return ret;
> +	}
> +
> +	spin_lock(&encl->mm_lock);
> +	list_add_rcu(&encl_mm->list, &encl->mm_list);
> +	spin_unlock(&encl->mm_lock);
> +
> +	synchronize_srcu(&encl->srcu);
> +
> +	return 0;
> +}
> +
> +static void sgx_vma_open(struct vm_area_struct *vma)
> +{
> +	struct sgx_encl *encl = vma->vm_private_data;
> +
> +	if (!encl)
> +		return;
> +
> +	if (sgx_encl_mm_add(encl, vma->vm_mm))
> +		vma->vm_private_data = NULL;
> +}
> +
> +static unsigned int sgx_vma_fault(struct vm_fault *vmf)
> +{
> +	unsigned long addr = (unsigned long)vmf->address;
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct sgx_encl *encl = vma->vm_private_data;
> +	struct sgx_encl_page *entry;
> +	int ret = VM_FAULT_NOPAGE;
> +	unsigned long pfn;
> +
> +	if (!encl)
> +		return VM_FAULT_SIGBUS;
> +
> +	mutex_lock(&encl->lock);
> +
> +	entry = sgx_encl_load_page(encl, addr);
> +	if (IS_ERR(entry)) {
> +		if (unlikely(PTR_ERR(entry) != -EBUSY))
> +			ret = VM_FAULT_SIGBUS;
> +
> +		goto out;
> +	}
> +
> +	if (!follow_pfn(vma, addr, &pfn))
> +		goto out;
> +
> +	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(entry->epc_page->desc));
> +	if (ret != VM_FAULT_NOPAGE) {
> +		ret = VM_FAULT_SIGBUS;
> +		goto out;
> +	}
> +
> +out:
> +	mutex_unlock(&encl->lock);
> +	return ret;
> +}
> +
> +/**
> + * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
> + * @encl:		an enclave
> + * @start:		lower bound of the address range, inclusive
> + * @end:		upper bound of the address range, exclusive
> + * @vm_prot_bits:	requested protections of the address range
> + *
> + * Iterate through the enclave pages contained within [@start, @end) to verify
> + * the permissions requested by @vm_prot_bits do not exceed that of any enclave
> + * page to be mapped.  Page addresses that do not have an associated enclave
> + * page are interpreted to zero permissions.
> + *
> + * Return:
> + *   0 on success,
> + *   -EACCES if VMA permissions exceed enclave page permissions
> + */
> +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
> +		     unsigned long end, unsigned long vm_prot_bits)
> +{
> +	unsigned long idx, idx_start, idx_end;
> +	struct sgx_encl_page *page;
> +
> +	/* PROT_NONE always succeeds. */
> +	if (!vm_prot_bits)
> +		return 0;
> +
> +	idx_start = PFN_DOWN(start);
> +	idx_end = PFN_DOWN(end - 1);
> +
> +	for (idx = idx_start; idx <= idx_end; ++idx) {
> +		mutex_lock(&encl->lock);
> +		page = radix_tree_lookup(&encl->page_tree, idx);
> +		mutex_unlock(&encl->lock);
> +
> +		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> +			return -EACCES;
> +	}
> +
> +	return 0;
> +}
> +
> +static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
> +			    unsigned long end, unsigned long prot)
> +{
> +	return sgx_encl_may_map(vma->vm_private_data, start, end,
> +				calc_vm_prot_bits(prot, 0));
> +}
> +
> +const struct vm_operations_struct sgx_vm_ops = {
> +	.open = sgx_vma_open,
> +	.fault = sgx_vma_fault,
> +	.may_mprotect = sgx_vma_mprotect,
> +};
> +
> +/**
> + * sgx_encl_find - find an enclave
> + * @mm:		mm struct of the current process
> + * @addr:	address in the ELRANGE
> + * @vma:	the resulting VMA
> + *
> + * Find an enclave identified by the given address. Give back a VMA that is
> + * part of the enclave and located in that address. The VMA is given back if it
> + * is a proper enclave VMA even if an &sgx_encl instance does not exist yet
> + * (enclave creation has not been performed).
> + *
> + * Return:
> + *   0 on success,
> + *   -EINVAL if an enclave was not found,
> + *   -ENOENT if the enclave has not been created yet
> + */
> +int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
> +		  struct vm_area_struct **vma)
> +{
> +	struct vm_area_struct *result;
> +	struct sgx_encl *encl;
> +
> +	result = find_vma(mm, addr);
> +	if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
> +		return -EINVAL;
> +
> +	encl = result->vm_private_data;
> +	*vma = result;
> +
> +	return encl ? 0 : -ENOENT;
> +}
> +
> +/**
> + * sgx_encl_destroy() - destroy enclave resources
> + * @encl:	an &sgx_encl instance
> + */
> +void sgx_encl_destroy(struct sgx_encl *encl)
> +{
> +	struct sgx_encl_page *entry;
> +	struct radix_tree_iter iter;
> +	void **slot;
> +
> +	atomic_or(SGX_ENCL_DEAD, &encl->flags);
> +
> +	radix_tree_for_each_slot(slot, &encl->page_tree, &iter, 0) {
> +		entry = *slot;
> +
> +		if (entry->epc_page) {
> +			sgx_free_page(entry->epc_page);
> +			encl->secs_child_cnt--;
> +			entry->epc_page = NULL;
> +		}
> +
> +		radix_tree_delete(&entry->encl->page_tree,
> +				  PFN_DOWN(entry->desc));
> +		kfree(entry);
> +	}
> +
> +	if (!encl->secs_child_cnt && encl->secs.epc_page) {
> +		sgx_free_page(encl->secs.epc_page);
> +		encl->secs.epc_page = NULL;
> +	}
> +}
> +
> +/**
> + * sgx_encl_release - Destroy an enclave instance
> + * @kref:	address of a kref inside &sgx_encl
> + *
> + * Used together with kref_put(). Frees all the resources associated with the
> + * enclave and the instance itself.
> + */
> +void sgx_encl_release(struct kref *ref)
> +{
> +	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
> +
> +	sgx_encl_destroy(encl);
> +
> +	if (encl->backing)
> +		fput(encl->backing);
> +
> +	WARN_ON_ONCE(!list_empty(&encl->mm_list));
> +
> +	/* Detect EPC page leak's. */
> +	WARN_ON_ONCE(encl->secs_child_cnt);
> +	WARN_ON_ONCE(encl->secs.epc_page);
> +
> +	kfree(encl);
> +}
> diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
> new file mode 100644
> index 000000000000..1d1bc5d590ee
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/encl.h
> @@ -0,0 +1,87 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +/**
> + * Copyright(c) 2016-19 Intel Corporation.
> + */
> +#ifndef _X86_ENCL_H
> +#define _X86_ENCL_H
> +
> +#include <linux/cpumask.h>
> +#include <linux/kref.h>
> +#include <linux/list.h>
> +#include <linux/mm_types.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <linux/radix-tree.h>
> +#include <linux/srcu.h>
> +#include <linux/workqueue.h>
> +#include "sgx.h"
> +
> +/**
> + * enum sgx_encl_page_desc - defines bits for an enclave page's descriptor
> + * %SGX_ENCL_PAGE_ADDR_MASK:		Holds the virtual address of the page.
> + *
> + * The page address for SECS is zero and is used by the subsystem to recognize
> + * the SECS page.
> + */
> +enum sgx_encl_page_desc {
> +	/* Bits 11:3 are available when the page is not swapped. */
> +	SGX_ENCL_PAGE_ADDR_MASK		= PAGE_MASK,
> +};
> +
> +#define SGX_ENCL_PAGE_ADDR(page) \
> +	((page)->desc & SGX_ENCL_PAGE_ADDR_MASK)
> +
> +struct sgx_encl_page {
> +	unsigned long desc;
> +	unsigned long vm_max_prot_bits;
> +	struct sgx_epc_page *epc_page;
> +	struct sgx_encl *encl;
> +};
> +
> +enum sgx_encl_flags {
> +	SGX_ENCL_CREATED	= BIT(0),
> +	SGX_ENCL_INITIALIZED	= BIT(1),
> +	SGX_ENCL_DEBUG		= BIT(2),
> +	SGX_ENCL_DEAD		= BIT(3),
> +	SGX_ENCL_IOCTL		= BIT(4),
> +};
> +
> +struct sgx_encl_mm {
> +	struct sgx_encl *encl;
> +	struct mm_struct *mm;
> +	struct list_head list;
> +	struct mmu_notifier mmu_notifier;
> +};
> +
> +struct sgx_encl {
> +	atomic_t flags;
> +	u64 secs_attributes;
> +	u64 allowed_attributes;
> +	unsigned int page_cnt;
> +	unsigned int secs_child_cnt;
> +	struct mutex lock;
> +	struct list_head mm_list;
> +	spinlock_t mm_lock;
> +	struct file *backing;
> +	struct kref refcount;
> +	struct srcu_struct srcu;
> +	unsigned long base;
> +	unsigned long size;
> +	unsigned long ssaframesize;
> +	struct radix_tree_root page_tree;
> +	struct sgx_encl_page secs;
> +	cpumask_t cpumask;
> +};
> +
> +extern const struct vm_operations_struct sgx_vm_ops;
> +
> +int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
> +		  struct vm_area_struct **vma);
> +void sgx_encl_destroy(struct sgx_encl *encl);
> +void sgx_encl_release(struct kref *ref);
> +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
> +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
> +		     unsigned long end, unsigned long vm_prot_bits);
> +
> +#endif /* _X86_ENCL_H */
> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
> new file mode 100644
> index 000000000000..691efac24ed7
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/ioctl.c
> @@ -0,0 +1,656 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +// Copyright(c) 2016-19 Intel Corporation.
> +
> +#include <asm/mman.h>
> +#include <linux/mman.h>
> +#include <linux/delay.h>
> +#include <linux/file.h>
> +#include <linux/hashtable.h>
> +#include <linux/highmem.h>
> +#include <linux/ratelimit.h>
> +#include <linux/sched/signal.h>
> +#include <linux/shmem_fs.h>
> +#include <linux/slab.h>
> +#include <linux/suspend.h>
> +#include "driver.h"
> +#include "encl.h"
> +#include "encls.h"
> +
> +static u32 sgx_calc_ssaframesize(u32 miscselect, u64 xfrm)
> +{
> +	u32 size_max = PAGE_SIZE;
> +	u32 size;
> +	int i;
> +
> +	for (i = 2; i < 64; i++) {
> +		if (!((1 << i) & xfrm))
> +			continue;
> +
> +		size = SGX_SSA_GPRS_SIZE + sgx_xsave_size_tbl[i];
> +		if (miscselect & SGX_MISC_EXINFO)
> +			size += SGX_SSA_MISC_EXINFO_SIZE;
> +
> +		if (size > size_max)
> +			size_max = size;
> +	}
> +
> +	return PFN_UP(size_max);
> +}
> +
> +static int sgx_validate_secs(const struct sgx_secs *secs,
> +			     unsigned long ssaframesize)
> +{
> +	if (secs->size < (2 * PAGE_SIZE) || !is_power_of_2(secs->size))
> +		return -EINVAL;
> +
> +	if (secs->base & (secs->size - 1))
> +		return -EINVAL;
> +
> +	if (secs->miscselect & sgx_misc_reserved_mask ||
> +	    secs->attributes & sgx_attributes_reserved_mask ||
> +	    secs->xfrm & sgx_xfrm_reserved_mask)
> +		return -EINVAL;
> +
> +	if (secs->attributes & SGX_ATTR_MODE64BIT) {
> +		if (secs->size > sgx_encl_size_max_64)
> +			return -EINVAL;
> +	} else if (secs->size > sgx_encl_size_max_32)
> +		return -EINVAL;
> +
> +	if (!(secs->xfrm & XFEATURE_MASK_FP) ||
> +	    !(secs->xfrm & XFEATURE_MASK_SSE) ||
> +	    (((secs->xfrm >> XFEATURE_BNDREGS) & 1) !=
> +	     ((secs->xfrm >> XFEATURE_BNDCSR) & 1)))
> +		return -EINVAL;
> +
> +	if (!secs->ssa_frame_size || ssaframesize > secs->ssa_frame_size)
> +		return -EINVAL;
> +
> +	if (memchr_inv(secs->reserved1, 0, sizeof(secs->reserved1)) ||
> +	    memchr_inv(secs->reserved2, 0, sizeof(secs->reserved2)) ||
> +	    memchr_inv(secs->reserved3, 0, sizeof(secs->reserved3)) ||
> +	    memchr_inv(secs->reserved4, 0, sizeof(secs->reserved4)))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
> +						 unsigned long offset,
> +						 u64 secinfo_flags)
> +{
> +	struct sgx_encl_page *encl_page;
> +	unsigned long prot;
> +
> +	encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
> +	if (!encl_page)
> +		return ERR_PTR(-ENOMEM);
> +
> +	encl_page->desc = encl->base + offset;
> +	encl_page->encl = encl;
> +
> +	prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ)  |
> +	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
> +	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
> +
> +	/*
> +	 * TCS pages must always RW set for CPU access while the SECINFO
> +	 * permissions are *always* zero - the CPU ignores the user provided
> +	 * values and silently overwrites them with zero permissions.
> +	 */
> +	if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
> +		prot |= PROT_READ | PROT_WRITE;
> +
> +	/* Calculate maximum of the VM flags for the page. */
> +	encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
> +
> +	return encl_page;
> +}
> +
> +static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
> +{
> +	unsigned long encl_size = secs->size + PAGE_SIZE;
> +	struct sgx_epc_page *secs_epc;
> +	unsigned long ssaframesize;
> +	struct sgx_pageinfo pginfo;
> +	struct sgx_secinfo secinfo;
> +	struct file *backing;
> +	long ret;
> +
> +	if (atomic_read(&encl->flags) & SGX_ENCL_CREATED)
> +		return -EINVAL;
> +
> +	ssaframesize = sgx_calc_ssaframesize(secs->miscselect, secs->xfrm);
> +	if (sgx_validate_secs(secs, ssaframesize)) {
> +		pr_debug("invalid SECS\n");
> +		return -EINVAL;
> +	}
> +
> +	backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
> +				   VM_NORESERVE);
> +	if (IS_ERR(backing))
> +		return PTR_ERR(backing);
> +
> +	encl->backing = backing;
> +
> +	secs_epc = sgx_try_alloc_page();
> +	if (IS_ERR(secs_epc)) {
> +		ret = PTR_ERR(secs_epc);
> +		goto err_out_backing;
> +	}
> +
> +	encl->secs.epc_page = secs_epc;
> +
> +	pginfo.addr = 0;
> +	pginfo.contents = (unsigned long)secs;
> +	pginfo.metadata = (unsigned long)&secinfo;
> +	pginfo.secs = 0;
> +	memset(&secinfo, 0, sizeof(secinfo));
> +
> +	ret = __ecreate((void *)&pginfo, sgx_epc_addr(secs_epc));
> +	if (ret) {
> +		pr_debug("ECREATE returned %ld\n", ret);
> +		goto err_out;
> +	}
> +
> +	if (secs->attributes & SGX_ATTR_DEBUG)
> +		atomic_or(SGX_ENCL_DEBUG, &encl->flags);
> +
> +	encl->secs.encl = encl;
> +	encl->secs_attributes = secs->attributes;
> +	encl->allowed_attributes |= SGX_ATTR_ALLOWED_MASK;
> +	encl->base = secs->base;
> +	encl->size = secs->size;
> +	encl->ssaframesize = secs->ssa_frame_size;
> +
> +	/*
> +	 * Set SGX_ENCL_CREATED only after the enclave is fully prepped.  This
> +	 * allows setting and checking enclave creation without having to take
> +	 * encl->lock.
> +	 */
> +	atomic_or(SGX_ENCL_CREATED, &encl->flags);
> +
> +	return 0;
> +
> +err_out:
> +	sgx_free_page(encl->secs.epc_page);
> +	encl->secs.epc_page = NULL;
> +
> +err_out_backing:
> +	fput(encl->backing);
> +	encl->backing = NULL;
> +
> +	return ret;
> +}
> +
> +/**
> + * sgx_ioc_enclave_create - handler for %SGX_IOC_ENCLAVE_CREATE
> + * @filep:	open file to /dev/sgx
> + * @arg:	userspace pointer to a struct sgx_enclave_create instance
> + *
> + * Allocate kernel data structures for a new enclave and execute ECREATE after
> + * verifying the correctness of the provided SECS.
> + *
> + * Note, enforcement of restricted and disallowed attributes is deferred until
> + * sgx_ioc_enclave_init(), only the architectural correctness of the SECS is
> + * checked by sgx_ioc_enclave_create().
> + *
> + * Return:
> + *   0 on success,
> + *   -errno otherwise
> + */
> +static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
> +{
> +	struct sgx_enclave_create ecreate;
> +	struct page *secs_page;
> +	struct sgx_secs *secs;
> +	int ret;
> +
> +	if (copy_from_user(&ecreate, arg, sizeof(ecreate)))
> +		return -EFAULT;
> +
> +	secs_page = alloc_page(GFP_HIGHUSER);
> +	if (!secs_page)
> +		return -ENOMEM;
> +
> +	secs = kmap(secs_page);
> +	if (copy_from_user(secs, (void __user *)ecreate.src, sizeof(*secs))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	ret = sgx_encl_create(encl, secs);
> +
> +out:
> +	kunmap(secs_page);
> +	__free_page(secs_page);
> +	return ret;
> +}
> +
> +static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
> +{
> +	u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
> +	u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
> +
> +	if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
> +		return -EINVAL;
> +
> +	if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
> +		return -EINVAL;
> +
> +	/*
> +	 * CPU will silently overwrite the permissions as zero, which means
> +	 * that we need to validate it ourselves.
> +	 */
> +	if (pt == SGX_SECINFO_TCS && perm)
> +		return -EINVAL;
> +
> +	if (secinfo->flags & SGX_SECINFO_RESERVED_MASK)
> +		return -EINVAL;
> +
> +	if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved)))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static int __sgx_encl_add_page(struct sgx_encl *encl,
> +			       struct sgx_encl_page *encl_page,
> +			       struct sgx_epc_page *epc_page,
> +			       struct sgx_secinfo *secinfo, unsigned long src)
> +{
> +	struct sgx_pageinfo pginfo;
> +	struct vm_area_struct *vma;
> +	struct page *src_page;
> +	int ret;
> +
> +	/* Query vma's VM_MAYEXEC as an indirect path_noexec() check. */
> +	if (encl_page->vm_max_prot_bits & VM_EXEC) {
> +		vma = find_vma(current->mm, src);
> +		if (!vma)
> +			return -EFAULT;
> +
> +		if (!(vma->vm_flags & VM_MAYEXEC))
> +			return -EACCES;
> +	}
> +
> +	ret = get_user_pages(src, 1, 0, &src_page, NULL);
> +	if (ret < 1)
> +		return ret;
> +
> +	pginfo.secs = (unsigned long)sgx_epc_addr(encl->secs.epc_page);
> +	pginfo.addr = SGX_ENCL_PAGE_ADDR(encl_page);
> +	pginfo.metadata = (unsigned long)secinfo;
> +	pginfo.contents = (unsigned long)kmap_atomic(src_page);
> +
> +	ret = __eadd(&pginfo, sgx_epc_addr(epc_page));
> +
> +	kunmap_atomic((void *)pginfo.contents);
> +	put_page(src_page);
> +
> +	return ret ? -EFAULT : 0;
> +}
> +
> +static int __sgx_encl_extend(struct sgx_encl *encl,
> +			     struct sgx_epc_page *epc_page)
> +{
> +	int ret;
> +	int i;
> +
> +	for (i = 0; i < 16; i++) {
> +		ret = __eextend(sgx_epc_addr(encl->secs.epc_page),
> +				sgx_epc_addr(epc_page) + (i * 0x100));
> +		if (ret) {
> +			if (encls_failed(ret))
> +				ENCLS_WARN(ret, "EEXTEND");
> +			return -EFAULT;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int sgx_encl_add_page(struct sgx_encl *encl,
> +			     struct sgx_enclave_add_pages *addp,
> +			     struct sgx_secinfo *secinfo)
> +{
> +	struct sgx_encl_page *encl_page;
> +	struct sgx_epc_page *epc_page;
> +	int ret;
> +
> +	encl_page = sgx_encl_page_alloc(encl, addp->offset, secinfo->flags);
> +	if (IS_ERR(encl_page))
> +		return PTR_ERR(encl_page);
> +
> +	epc_page = sgx_try_alloc_page();
> +	if (IS_ERR(epc_page)) {
> +		kfree(encl_page);
> +		return PTR_ERR(epc_page);
> +	}
> +
> +	if (atomic_read(&encl->flags) &
> +	    (SGX_ENCL_INITIALIZED | SGX_ENCL_DEAD)) {
> +		ret = -EFAULT;
> +		goto err_out_free;
> +	}
> +
> +	down_read(&current->mm->mmap_sem);
> +	mutex_lock(&encl->lock);
> +
> +	/*
> +	 * Insert prior to EADD in case of OOM.  EADD modifies MRENCLAVE, i.e.
> +	 * can't be gracefully unwound, while failure on EADD/EXTEND is limited
> +	 * to userspace errors (or kernel/hardware bugs).
> +	 */
> +	ret = radix_tree_insert(&encl->page_tree, PFN_DOWN(encl_page->desc),
> +				encl_page);
> +	if (ret)
> +		goto err_out_unlock;
> +
> +	ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
> +				  addp->src);
> +	if (ret)
> +		goto err_out;
> +
> +	/*
> +	 * Complete the "add" before doing the "extend" so that the "add"
> +	 * isn't in a half-baked state in the extremely unlikely scenario the
> +	 * the enclave will be destroyed in response to EEXTEND failure.
> +	 */
> +	encl_page->encl = encl;
> +	encl_page->epc_page = epc_page;
> +	encl->secs_child_cnt++;
> +
> +	if (addp->flags & SGX_PAGE_MEASURE) {
> +		ret = __sgx_encl_extend(encl, epc_page);
> +		if (ret)
> +			sgx_encl_destroy(encl);
> +	}
> +
> +	mutex_unlock(&encl->lock);
> +	up_read(&current->mm->mmap_sem);
> +	return ret;
> +
> +err_out:
> +	radix_tree_delete(&encl_page->encl->page_tree,
> +			  PFN_DOWN(encl_page->desc));
> +
> +err_out_unlock:
> +	mutex_unlock(&encl->lock);
> +	up_read(&current->mm->mmap_sem);
> +
> +err_out_free:
> +	sgx_free_page(epc_page);
> +	kfree(encl_page);
> +
> +	return ret;
> +}
> +
> +/**
> + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
> + * @encl:       pointer to an enclave instance (via ioctl() file pointer)
> + * @arg:	a user pointer to a struct sgx_enclave_add_pages instance
> + *
> + * Add (EADD) one or more pages to an uninitialized enclave, and optionally
> + * extend (EEXTEND) the measurement with the contents of the page. The range of
> + * pages must be virtually contiguous. The SECINFO and measurement mask are
> + * applied to all pages, i.e. pages with different properties must be added in
> + * separate calls.
> + *
> + * A SECINFO for a TCS is required to always contain zero permissions because
> + * CPU silently zeros them. Allowing anything else would cause a mismatch in
> + * the measurement.
> + *
> + * mmap()'s protection bits are capped by the page permissions. For each page
> + * address, the maximum protection bits are computed with the following
> + * heuristics:
> + *
> + * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
> + * 2. A TCS page: PROT_R | PROT_W.
> + * 3. No page: PROT_NONE.
> + *
> + * mmap() is not allowed to surpass the minimum of the maximum protection bits
> + * within the given address range.
> + *
> + * As stated above, a non-existent page is interpreted as a page with no
> + * permissions. In effect, this allows mmap() with PROT_NONE to be used to seek
> + * an address range for the enclave that can be then populated into SECS.
> + *
> + * @arg->addr, @arg->src and @arg->length are adjusted to reflect the
> + * remaining pages that need to be added to the enclave, e.g. userspace can
> + * re-invoke SGX_IOC_ENCLAVE_ADD_PAGES using the same struct in response to an
> + * ERESTARTSYS error.
> + *
> + * Return:
> + *   0 on success,
> + *   -EINVAL if any input param or the SECINFO contains invalid data,
> + *   -EACCES if an executable source page is located in a noexec partition,
> + *   -ENOMEM if any memory allocation, including EPC, fails,
> + *   -ERESTARTSYS if a pending signal is recognized
> + */
> +static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
> +{
> +	struct sgx_enclave_add_pages addp;
> +	struct sgx_secinfo secinfo;
> +	int ret;
> +
> +	if (!(atomic_read(&encl->flags) & SGX_ENCL_CREATED))
> +		return -EINVAL;
> +
> +	if (copy_from_user(&addp, arg, sizeof(addp)))
> +		return -EFAULT;
> +
> +	if (!IS_ALIGNED(addp.offset, PAGE_SIZE) ||
> +	    !IS_ALIGNED(addp.src, PAGE_SIZE))
> +		return -EINVAL;
> +
> +	if (!(access_ok(addp.src, PAGE_SIZE)))
> +		return -EFAULT;
> +
> +	if (addp.length & (PAGE_SIZE - 1))
> +		return -EINVAL;
> +
> +	if (addp.offset + addp.length - PAGE_SIZE >= encl->size)
> +		return -EINVAL;
> +
> +	if (copy_from_user(&secinfo, (void __user *)addp.secinfo,
> +			   sizeof(secinfo)))
> +		return -EFAULT;
> +
> +	if (sgx_validate_secinfo(&secinfo))
> +		return -EINVAL;
> +
> +	for ( ; addp.length > 0; addp.length -= PAGE_SIZE) {
> +		if (signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			break;
> +		}
> +
> +		if (need_resched())
> +			cond_resched();
> +
> +		ret = sgx_encl_add_page(encl, &addp, &secinfo);
> +		if (ret)
> +			break;
> +
> +		addp.offset += PAGE_SIZE;
> +		addp.src += PAGE_SIZE;
> +	}
> +
> +	if (copy_to_user(arg, &addp, sizeof(addp)))
> +		return -EFAULT;
> +
> +	return ret;
> +}
> +
> +static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
> +			      void *hash)
> +{
> +	SHASH_DESC_ON_STACK(shash, tfm);
> +
> +	shash->tfm = tfm;
> +
> +	return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
> +}
> +
> +static int sgx_get_key_hash(const void *modulus, void *hash)
> +{
> +	struct crypto_shash *tfm;
> +	int ret;
> +
> +	tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
> +	if (IS_ERR(tfm))
> +		return PTR_ERR(tfm);
> +
> +	ret = __sgx_get_key_hash(tfm, modulus, hash);
> +
> +	crypto_free_shash(tfm);
> +	return ret;
> +}
> +
> +static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
> +			 struct sgx_einittoken *token)
> +{
> +	u64 mrsigner[4];
> +	int ret;
> +	int i;
> +	int j;
> +
> +	/* Check that the required attributes have been authorized. */
> +	if (encl->secs_attributes & ~encl->allowed_attributes)
> +		return -EINVAL;
> +
> +	ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&encl->lock);
> +
> +	if (atomic_read(&encl->flags) & SGX_ENCL_INITIALIZED) {
> +		ret = -EFAULT;
> +		goto err_out;
> +	}
> +
> +	for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
> +		for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
> +			ret = sgx_einit(sigstruct, token, encl->secs.epc_page,
> +					mrsigner);
> +			if (ret == SGX_UNMASKED_EVENT)
> +				continue;
> +			else
> +				break;
> +		}
> +
> +		if (ret != SGX_UNMASKED_EVENT)
> +			break;
> +
> +		msleep_interruptible(SGX_EINIT_SLEEP_TIME);
> +
> +		if (signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			goto err_out;
> +		}
> +	}
> +
> +	if (ret & ENCLS_FAULT_FLAG) {
> +		if (encls_failed(ret))
> +			ENCLS_WARN(ret, "EINIT");
> +
> +		sgx_encl_destroy(encl);
> +		ret = -EFAULT;
> +	} else if (ret) {
> +		pr_debug("EINIT returned %d\n", ret);
> +		ret = -EPERM;
> +	} else {
> +		atomic_or(SGX_ENCL_INITIALIZED, &encl->flags);
> +	}
> +
> +err_out:
> +	mutex_unlock(&encl->lock);
> +	return ret;
> +}
> +
> +/**
> + * sgx_ioc_enclave_init - handler for %SGX_IOC_ENCLAVE_INIT
> + *
> + * @filep:	open file to /dev/sgx
> + * @arg:	userspace pointer to a struct sgx_enclave_init instance
> + *
> + * Flush any outstanding enqueued EADD operations and perform EINIT.  The
> + * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
> + * the enclave's MRSIGNER, which is caculated from the provided sigstruct.
> + *
> + * Return:
> + *   0 on success,
> + *   SGX error code on EINIT failure,
> + *   -errno otherwise
> + */
> +static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
> +{
> +	struct sgx_einittoken *einittoken;
> +	struct sgx_sigstruct *sigstruct;
> +	struct sgx_enclave_init einit;
> +	struct page *initp_page;
> +	int ret;
> +
> +	if (!(atomic_read(&encl->flags) & SGX_ENCL_CREATED))
> +		return -EINVAL;
> +
> +	if (copy_from_user(&einit, arg, sizeof(einit)))
> +		return -EFAULT;
> +
> +	initp_page = alloc_page(GFP_HIGHUSER);
> +	if (!initp_page)
> +		return -ENOMEM;
> +
> +	sigstruct = kmap(initp_page);
> +	einittoken = (struct sgx_einittoken *)
> +		((unsigned long)sigstruct + PAGE_SIZE / 2);
> +	memset(einittoken, 0, sizeof(*einittoken));
> +
> +	if (copy_from_user(sigstruct, (void __user *)einit.sigstruct,
> +			   sizeof(*sigstruct))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	ret = sgx_encl_init(encl, sigstruct, einittoken);
> +
> +out:
> +	kunmap(initp_page);
> +	__free_page(initp_page);
> +	return ret;
> +}
> +
> +
> +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
> +{
> +	struct sgx_encl *encl = filep->private_data;
> +	int ret, encl_flags;
> +
> +	encl_flags = atomic_fetch_or(SGX_ENCL_IOCTL, &encl->flags);
> +	if (encl_flags & SGX_ENCL_IOCTL)
> +		return -EBUSY;
> +
> +	if (encl_flags & SGX_ENCL_DEAD)
> +		return -EFAULT;
> +
> +	switch (cmd) {
> +	case SGX_IOC_ENCLAVE_CREATE:
> +		ret = sgx_ioc_enclave_create(encl, (void __user *)arg);
> +		break;
> +	case SGX_IOC_ENCLAVE_ADD_PAGES:
> +		ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg);
> +		break;
> +	case SGX_IOC_ENCLAVE_INIT:
> +		ret = sgx_ioc_enclave_init(encl, (void __user *)arg);
> +		break;
> +	default:
> +		ret = -ENOIOCTLCMD;
> +		break;
> +	}
> +
> +	atomic_andnot(SGX_ENCL_IOCTL, &encl->flags);
> +
> +	return ret;
> +}
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 6a37df61ae32..36a295a0272b 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -8,6 +8,7 @@
>   #include <linux/ratelimit.h>
>   #include <linux/sched/signal.h>
>   #include <linux/slab.h>
> +#include "driver.h"
>   #include "encls.h"
>   
>   struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> @@ -196,6 +197,8 @@ static bool __init sgx_page_cache_init(void)
>   
>   static void __init sgx_init(void)
>   {
> +	int ret;
> +
>   	if (!boot_cpu_has(X86_FEATURE_SGX))
>   		return;
>   
> @@ -205,8 +208,15 @@ static void __init sgx_init(void)
>   	if (!sgx_page_reclaimer_init())
>   		goto err_page_cache;
>   
> +	ret = sgx_drv_init();
> +	if (ret)
> +		goto err_kthread;
> +
>   	return;
>   
> +err_kthread:
> +	kthread_stop(ksgxswapd_tsk);
> +
>   err_page_cache:
>   	sgx_page_cache_teardown();
>   }
> diff --git a/arch/x86/kernel/cpu/sgx/reclaim.c b/arch/x86/kernel/cpu/sgx/reclaim.c
> index f071158d34f6..bdb42f4326aa 100644
> --- a/arch/x86/kernel/cpu/sgx/reclaim.c
> +++ b/arch/x86/kernel/cpu/sgx/reclaim.c
> @@ -10,6 +10,7 @@
>   #include <linux/sched/mm.h>
>   #include <linux/sched/signal.h>
>   #include "encls.h"
> +#include "driver.h"
>   
>   struct task_struct *ksgxswapd_tsk;
>   
> 


^ permalink raw reply

* Re: [PATCH v23 12/24] x86/sgx: Linux Enclave Driver
From: Sean Christopherson @ 2019-10-30  9:30 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: linux-kernel, x86, linux-sgx, akpm, dave.hansen, nhorman,
	npmccallum, serge.ayoun, shay.katz-zamir, haitao.huang,
	andriy.shevchenko, tglx, kai.svahn, bp, josh, luto, kai.huang,
	rientjes, cedric.xing, puiterwijk, linux-security-module,
	Suresh Siddha
In-Reply-To: <20191029092920.GA14494@linux.intel.com>

On Tue, Oct 29, 2019 at 11:29:20AM +0200, Jarkko Sakkinen wrote:
> On Mon, Oct 28, 2019 at 11:03:12PM +0200, Jarkko Sakkinen wrote:
> > +/**
> > + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
> > + * @encl:       pointer to an enclave instance (via ioctl() file pointer)
> > + * @arg:	a user pointer to a struct sgx_enclave_add_pages instance
> > + *
> > + * Add (EADD) one or more pages to an uninitialized enclave, and optionally
> > + * extend (EEXTEND) the measurement with the contents of the page. The range of
> > + * pages must be virtually contiguous. The SECINFO and measurement mask are
> > + * applied to all pages, i.e. pages with different properties must be added in
> > + * separate calls.
> > + *
> > + * A SECINFO for a TCS is required to always contain zero permissions because
> > + * CPU silently zeros them. Allowing anything else would cause a mismatch in
> > + * the measurement.
> > + *
> > + * mmap()'s protection bits are capped by the page permissions. For each page
> > + * address, the maximum protection bits are computed with the following
> > + * heuristics:
> > + *
> > + * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
> > + * 2. A TCS page: PROT_R | PROT_W.
> > + * 3. No page: PROT_NONE.
> > + *
> > + * mmap() is not allowed to surpass the minimum of the maximum protection bits
> > + * within the given address range.
> > + *
> > + * As stated above, a non-existent page is interpreted as a page with no
> > + * permissions. In effect, this allows mmap() with PROT_NONE to be used to seek
> > + * an address range for the enclave that can be then populated into SECS.
> > + *
> > + * @arg->addr, @arg->src and @arg->length are adjusted to reflect the
> > + * remaining pages that need to be added to the enclave, e.g. userspace can
> > + * re-invoke SGX_IOC_ENCLAVE_ADD_PAGES using the same struct in response to an
> > + * ERESTARTSYS error.
> > + *
> > + * Return:
> > + *   0 on success,
> > + *   -EINVAL if any input param or the SECINFO contains invalid data,
> > + *   -EACCES if an executable source page is located in a noexec partition,
> > + *   -ENOMEM if any memory allocation, including EPC, fails,
> > + *   -ERESTARTSYS if a pending signal is recognized
> > + */
> > +static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
> 
> This should return the number of pages processed instead of zero on
> success. Kernel needs to be able to cap the amount it will process.

Why?  The number of pages processed is effectively returned via the params
on any error, e.g. wouldn't it be more appropriate to return -ERESTARTSYS?
And I don't see any reason to add an arbitrary cap on the number of pages,
e.g. SGX plays nice with the scheduler and signals, and restricting the
number of EPC pages available to a process via cgroups (returning -ENOMEM)
is a better solution for managing EPC.

^ permalink raw reply

* Re: load_uefi_certs: Couldn't get size: 0x800000000000000e
From: Paul Menzel @ 2019-10-30  8:57 UTC (permalink / raw)
  To: Josh Boyer; +Cc: linux-security-module, Linux Kernel Mailing List
In-Reply-To: <3f48aab2-7d8e-ee9b-c362-3b7f84609abe@molgen.mpg.de>

Dear Josh,


On 04.10.19 14:24, Paul Menzel wrote:

> On a Dell Latitude E7250 with a self-built Linux 5.4-rc1 the error message
> below is printed on the screen.
> 
>      [    0.658664] Couldn't get size: 0x800000000000000e
> 
> Here are the message from the ring buffer (`dmesg`).
> 
> ```
> [    0.658329] calling  load_uefi_certs+0x0/0x217 @ 1
> [    0.658472] integrity: Loading X.509 certificate: UEFI:db
> [    0.658544] integrity: Loaded X.509 cert 'Microsoft Windows Production PCA 2011: a92902398e16c49778cd90f99e4f9ae17c55af53'
> [    0.658545] integrity: Loading X.509 certificate: UEFI:db
> [    0.658606] integrity: Loaded X.509 cert 'Microsoft Corporation UEFI CA 2011: 13adbf4309bd82709c8cd54f316ed522988a1bd4'
> [    0.658664] Couldn't get size: 0x800000000000000e
> [    0.658689] Couldn't get UEFI MokListRT
> [    0.658768] initcall load_uefi_certs+0x0/0x217 returned 0 after 426 usecs
> ```
> 
> Could this error message be improved, to give the user a hint, what is wrong,
> what the consequences are, and what should be done?
> 
> Should the message below be `pr_error`, and the other error debug?
> 
>      pr_info("Couldn't get UEFI MokListRT\n");
> 
> Please find the Linux messages and (`make savedefconfig`) configuration
> attached.
> 
> Do you know more about the reason, and if the Linux code can handle it more
> gracefully?
> 
> Commit 15ea0e1e (efi: Import certificates from UEFI Secure Boot) was already
> added in Linux 5.0, and I do not remember seeing it with Debian’s default
> Linux kernel. No idea, what is configured differently, or I just missed it.

I am still getting this with Linux 5.4-rc5.


Kind regards,

Paul

^ permalink raw reply

* Re: [PATCH bpf-next v11 2/7] landlock: Add the management of domains
From: Serge E. Hallyn @ 2019-10-30  2:56 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-3-mic@digikod.net>

On Tue, Oct 29, 2019 at 06:15:00PM +0100, Mickaël Salaün wrote:
> A Landlock domain is a set of eBPF programs.  There is a list for each
> different program types that can be run on a specific Landlock hook
> (e.g. ptrace).  A domain is tied to a set of subjects (i.e. tasks).  A
> Landlock program should not try (nor be able) to infer which subject is
> currently enforced, but to have a unique security policy for all
> subjects tied to the same domain.  This make the reasoning much easier
> and help avoid pitfalls.
> 
> The next commits tie a domain to a task's credentials thanks to
> seccomp(2), but we could use cgroups or a security file-system to
> enforce a sysadmin-defined policy .
> 
> Signed-off-by: Mickaël Salaün <mic@digikod.net>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Andy Lutomirski <luto@amacapital.net>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: James Morris <jmorris@namei.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Serge E. Hallyn <serge@hallyn.com>
> Cc: Will Drewry <wad@chromium.org>
> ---
> 
> Changes since v10:
> * rename files and names to clearly define a domain
> * create a standalone patch to ease review
> ---

[...]

> +/**
> + * store_landlock_prog - prepend and deduplicate a Landlock prog_list
> + *
> + * Prepend @prog to @init_domain while ignoring @prog if they are already in
> + * @ref_domain.  Whatever is the result of this function call, you can call
> + * bpf_prog_put(@prog) after.
> + *
> + * @init_domain: empty domain to prepend to
> + * @ref_domain: domain to check for duplicate programs
> + * @prog: program to prepend
> + *
> + * Return -errno on error or 0 if @prog was successfully stored.
> + */
> +static int store_landlock_prog(struct landlock_domain *init_domain,
> +		const struct landlock_domain *ref_domain,
> +		struct bpf_prog *prog)
> +{
> +	struct landlock_prog_list *tmp_list = NULL;
> +	int err;
> +	size_t hook;
> +	enum landlock_hook_type last_type;
> +	struct bpf_prog *new = prog;
> +
> +	/* allocate all the memory we need */
> +	struct landlock_prog_list *new_list;
> +
> +	last_type = get_hook_type(new);
> +
> +	/* ignore duplicate programs */

This comment should be "don't allow" rather than "ignore", right?

> +	if (ref_domain) {
> +		struct landlock_prog_list *ref;
> +
> +		hook = get_hook_index(get_hook_type(new));
> +		for (ref = ref_domain->programs[hook]; ref;
> +				ref = ref->prev) {
> +			if (ref->prog == new)
> +				return -EINVAL;
> +		}
> +	}
> +
> +	new = bpf_prog_inc(new);
> +	if (IS_ERR(new)) {
> +		err = PTR_ERR(new);
> +		goto put_tmp_list;
> +	}
> +	new_list = kzalloc(sizeof(*new_list), GFP_KERNEL);
> +	if (!new_list) {
> +		bpf_prog_put(new);
> +		err = -ENOMEM;
> +		goto put_tmp_list;
> +	}
> +	/* ignore Landlock types in this tmp_list */
> +	new_list->prog = new;
> +	new_list->prev = tmp_list;
> +	refcount_set(&new_list->usage, 1);
> +	tmp_list = new_list;
> +
> +	if (!tmp_list)
> +		/* inform user space that this program was already added */

I'm not following this.  You just kzalloc'd new_list, pointed
tmp_list to new_list, so how could tmp_list be NULL?  Was there
a bad code reorg here, or am i being dense?

> +		return -EEXIST;
> +
> +	/* properly store the list (without error cases) */
> +	while (tmp_list) {
> +		struct landlock_prog_list *new_list;
> +
> +		new_list = tmp_list;
> +		tmp_list = tmp_list->prev;
> +		/* do not increment the previous prog list usage */
> +		hook = get_hook_index(get_hook_type(new_list->prog));
> +		new_list->prev = init_domain->programs[hook];
> +		/* no need to add from the last program to the first because
> +		 * each of them are a different Landlock type */
> +		smp_store_release(&init_domain->programs[hook], new_list);
> +	}
> +	return 0;
> +
> +put_tmp_list:
> +	put_landlock_prog_list(tmp_list);
> +	return err;
> +}
> +
> +/* limit Landlock programs set to 256KB */
> +#define LANDLOCK_PROGRAMS_MAX_PAGES (1 << 6)
> +
> +/**
> + * landlock_prepend_prog - attach a Landlock prog_list to @current_domain
> + *
> + * Whatever is the result of this function call, you can call
> + * bpf_prog_put(@prog) after.
> + *
> + * @current_domain: landlock_domain pointer, must be (RCU-)locked (if needed)
> + *                  to prevent a concurrent put/free. This pointer must not be
> + *                  freed after the call.
> + * @prog: non-NULL Landlock prog_list to prepend to @current_domain. @prog will
> + *        be owned by landlock_prepend_prog() and freed if an error happened.
> + *
> + * Return @current_domain or a new pointer when OK. Return a pointer error
> + * otherwise.
> + */
> +struct landlock_domain *landlock_prepend_prog(
> +		struct landlock_domain *current_domain,
> +		struct bpf_prog *prog)
> +{
> +	struct landlock_domain *new_domain = current_domain;
> +	unsigned long pages;
> +	int err;
> +	size_t i;
> +	struct landlock_domain tmp_domain = {};
> +
> +	if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* validate memory size allocation */
> +	pages = prog->pages;
> +	if (current_domain) {
> +		size_t i;
> +
> +		for (i = 0; i < ARRAY_SIZE(current_domain->programs); i++) {
> +			struct landlock_prog_list *walker_p;
> +
> +			for (walker_p = current_domain->programs[i];
> +					walker_p; walker_p = walker_p->prev)
> +				pages += walker_p->prog->pages;
> +		}
> +		/* count a struct landlock_domain if we need to allocate one */
> +		if (refcount_read(&current_domain->usage) != 1)
> +			pages += round_up(sizeof(*current_domain), PAGE_SIZE)
> +				/ PAGE_SIZE;
> +	}
> +	if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
> +		return ERR_PTR(-E2BIG);
> +
> +	/* ensure early that we can allocate enough memory for the new
> +	 * prog_lists */
> +	err = store_landlock_prog(&tmp_domain, current_domain, prog);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	/*
> +	 * Each task_struct points to an array of prog list pointers.  These
> +	 * tables are duplicated when additions are made (which means each
> +	 * table needs to be refcounted for the processes using it). When a new
> +	 * table is created, all the refcounters on the prog_list are bumped
> +	 * (to track each table that references the prog). When a new prog is
> +	 * added, it's just prepended to the list for the new table to point
> +	 * at.
> +	 *
> +	 * Manage all the possible errors before this step to not uselessly
> +	 * duplicate current_domain and avoid a rollback.
> +	 */
> +	if (!new_domain) {
> +		/*
> +		 * If there is no Landlock domain used by the current task,
> +		 * then create a new one.
> +		 */
> +		new_domain = new_landlock_domain();
> +		if (IS_ERR(new_domain))
> +			goto put_tmp_lists;
> +	} else if (refcount_read(&current_domain->usage) > 1) {
> +		/*
> +		 * If the current task is not the sole user of its Landlock
> +		 * domain, then duplicate it.
> +		 */
> +		new_domain = new_landlock_domain();
> +		if (IS_ERR(new_domain))
> +			goto put_tmp_lists;
> +		for (i = 0; i < ARRAY_SIZE(new_domain->programs); i++) {
> +			new_domain->programs[i] =
> +				READ_ONCE(current_domain->programs[i]);
> +			if (new_domain->programs[i])
> +				refcount_inc(&new_domain->programs[i]->usage);
> +		}
> +
> +		/*
> +		 * Landlock domain from the current task will not be freed here
> +		 * because the usage is strictly greater than 1. It is only
> +		 * prevented to be freed by another task thanks to the caller
> +		 * of landlock_prepend_prog() which should be locked if needed.
> +		 */
> +		landlock_put_domain(current_domain);
> +	}
> +
> +	/* prepend tmp_domain to new_domain */
> +	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++) {
> +		/* get the last new list */
> +		struct landlock_prog_list *last_list =
> +			tmp_domain.programs[i];
> +
> +		if (last_list) {
> +			while (last_list->prev)
> +				last_list = last_list->prev;
> +			/* no need to increment usage (pointer replacement) */
> +			last_list->prev = new_domain->programs[i];
> +			new_domain->programs[i] = tmp_domain.programs[i];
> +		}
> +	}
> +	return new_domain;
> +
> +put_tmp_lists:
> +	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++)
> +		put_landlock_prog_list(tmp_domain.programs[i]);
> +	return new_domain;
> +}
> diff --git a/security/landlock/domain_manage.h b/security/landlock/domain_manage.h
> new file mode 100644
> index 000000000000..5b5b49f6e3e8
> --- /dev/null
> +++ b/security/landlock/domain_manage.h
> @@ -0,0 +1,23 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Landlock LSM - domain management headers
> + *
> + * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
> + * Copyright © 2018-2019 ANSSI
> + */
> +
> +#ifndef _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
> +#define _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
> +
> +#include <linux/filter.h>
> +
> +#include "common.h"
> +
> +void landlock_get_domain(struct landlock_domain *dom);
> +void landlock_put_domain(struct landlock_domain *dom);
> +
> +struct landlock_domain *landlock_prepend_prog(
> +		struct landlock_domain *current_domain,
> +		struct bpf_prog *prog);
> +
> +#endif /* _SECURITY_LANDLOCK_DOMAIN_MANAGE_H */
> -- 
> 2.23.0

^ permalink raw reply

* [PATCH bpf-next v11 6/7] bpf,landlock: Add tests for the Landlock ptrace program type
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

Test eBPF program context access and ptrace hooks semantic.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* rework tests with new Landlock ptrace programs which restrict ptrace
  thanks to the task_landlock_ptrace_ancestor() helper
* simplify ptrace tests (make expect_ptrace implicit)
* add tests:
  * check a child process tracing its parent
  * check Landlock domain without ptrace enforcement (e.g. useful for
    audit/signaling purpose)
  * check inherited-only domains
  * check task pointer arithmetic
* fix flaky test for multi-core
* increase log size
* cosmetic renames
* update and improve the Makefile

Changes since v9:
* replace subtype with expected_attach_type and expected_attach_triggers
* rename inode_map_lookup() into inode_map_lookup_elem()
* check for inode map entry without value (which is now possible thanks
  to the pointer null check)
* use read-only inode map for Landlock programs

Changes since v8:
* update eBPF include path for macros
* use TEST_GEN_PROGS and use the generic "clean" target
* add more verbose errors
* update the bpf/verifier files
* remove chain tests (from landlock and bpf/verifier)
* replace the whitelist tests with blacklist tests (because of stateless
  Landlock programs): remove "dotdot" tests and other depth tests
* sync the landlock Makefile with its bpf sibling directory and use
  bpf_load_program_xattr()

Changes since v7:
* update tests and add new ones for filesystem hierarchy and Landlock
  chains.

Changes since v6:
* use the new kselftest_harness.h
* use const variables
* replace ASSERT_STEP with ASSERT_*
* rename BPF_PROG_TYPE_LANDLOCK to BPF_PROG_TYPE_LANDLOCK_RULE
* force sample library rebuild
* fix install target

Changes since v5:
* add subtype test
* add ptrace tests
* split and rename files
* cleanup and rebase
---
 scripts/bpf_helpers_doc.py                    |   1 +
 tools/include/uapi/linux/bpf.h                |  23 +-
 tools/include/uapi/linux/landlock.h           |  22 ++
 tools/lib/bpf/libbpf_probes.c                 |   3 +
 tools/testing/selftests/bpf/config            |   3 +
 tools/testing/selftests/bpf/test_verifier.c   |   1 +
 .../testing/selftests/bpf/verifier/landlock.c |  56 +++++
 tools/testing/selftests/landlock/.gitignore   |   5 +
 tools/testing/selftests/landlock/Makefile     |  27 +++
 tools/testing/selftests/landlock/config       |   5 +
 tools/testing/selftests/landlock/test.h       |  48 ++++
 tools/testing/selftests/landlock/test_base.c  |  24 ++
 .../testing/selftests/landlock/test_ptrace.c  | 210 ++++++++++++++++++
 13 files changed, 427 insertions(+), 1 deletion(-)
 create mode 100644 tools/include/uapi/linux/landlock.h
 create mode 100644 tools/testing/selftests/bpf/verifier/landlock.c
 create mode 100644 tools/testing/selftests/landlock/.gitignore
 create mode 100644 tools/testing/selftests/landlock/Makefile
 create mode 100644 tools/testing/selftests/landlock/config
 create mode 100644 tools/testing/selftests/landlock/test.h
 create mode 100644 tools/testing/selftests/landlock/test_base.c
 create mode 100644 tools/testing/selftests/landlock/test_ptrace.c

diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 7548569e8076..8e4c0fe75663 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -466,6 +466,7 @@ class PrinterHelpers(Printer):
             'const struct sk_buff': 'const struct __sk_buff',
             'struct sk_msg_buff': 'struct sk_msg_md',
             'struct xdp_buff': 'struct xdp_md',
+            'struct task_struct': 'void',
     }
 
     def print_header(self):
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4af8b0819a32..c88436b97163 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -173,6 +173,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 	BPF_PROG_TYPE_CGROUP_SOCKOPT,
+	BPF_PROG_TYPE_LANDLOCK_HOOK,
 };
 
 enum bpf_attach_type {
@@ -199,6 +200,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP6_RECVMSG,
 	BPF_CGROUP_GETSOCKOPT,
 	BPF_CGROUP_SETSOCKOPT,
+	BPF_LANDLOCK_PTRACE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2775,6 +2777,24 @@ union bpf_attr {
  * 		restricted to raw_tracepoint bpf programs.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_task_landlock_ptrace_ancestor(struct task_struct *parent, struct task_struct *child)
+ *	Description
+ *		Check the relation of a potentially parent task with a child
+ *		one, according to their Landlock ptrace hook programs.
+ *	Return
+ *		**-EINVAL** if the child's ptrace programs are not comparable
+ *		to the parent ones, i.e. one of them is an empty set.
+ *
+ *		**-ENOENT** if the parent's ptrace programs are either in a
+ *		separate hierarchy of the child ones, or if the parent's ptrace
+ *		programs are a superset of the child ones.
+ *
+ *		0 if the parent's ptrace programs are the same as the child
+ *		ones.
+ *
+ *		1 if the parent's ptrace programs are indeed a subset of the
+ *		child ones.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2888,7 +2908,8 @@ union bpf_attr {
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
 	FN(tcp_gen_syncookie),		\
-	FN(skb_output),
+	FN(skb_output),			\
+	FN(task_landlock_ptrace_ancestor),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/include/uapi/linux/landlock.h b/tools/include/uapi/linux/landlock.h
new file mode 100644
index 000000000000..3db2d190c4e7
--- /dev/null
+++ b/tools/include/uapi/linux/landlock.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Landlock - UAPI headers
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifndef _UAPI__LINUX_LANDLOCK_H__
+#define _UAPI__LINUX_LANDLOCK_H__
+
+#include <linux/types.h>
+
+#define LANDLOCK_RET_ALLOW	0
+#define LANDLOCK_RET_DENY	1
+
+struct landlock_context_ptrace {
+	__u64 tracer;
+	__u64 tracee;
+};
+
+#endif /* _UAPI__LINUX_LANDLOCK_H__ */
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 4b0b0364f5fc..1e0d6346a7c7 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -78,6 +78,9 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 	case BPF_PROG_TYPE_KPROBE:
 		xattr.kern_version = get_kernel_version();
 		break;
+	case BPF_PROG_TYPE_LANDLOCK_HOOK:
+		xattr.expected_attach_type = BPF_LANDLOCK_PTRACE;
+		break;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 5dc109f4c097..3161a88a6059 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -35,3 +35,6 @@ CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_IPV6_SIT=m
 CONFIG_BPF_JIT=y
+CONFIG_SECCOMP_FILTER=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_LANDLOCK=y
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index d27fd929abb9..74f249dafc0b 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -30,6 +30,7 @@
 #include <linux/bpf.h>
 #include <linux/if_ether.h>
 #include <linux/btf.h>
+#include <linux/landlock.h>
 
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
diff --git a/tools/testing/selftests/bpf/verifier/landlock.c b/tools/testing/selftests/bpf/verifier/landlock.c
new file mode 100644
index 000000000000..59cd333745dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/landlock.c
@@ -0,0 +1,56 @@
+{
+	"landlock/ptrace: always accept",
+	.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK,
+	.expected_attach_type = BPF_LANDLOCK_PTRACE,
+	.insns = {
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+},
+{
+	"landlock/ptrace: forbid arbitrary return value",
+	.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK,
+	.expected_attach_type = BPF_LANDLOCK_PTRACE,
+	.insns = {
+		BPF_MOV32_IMM(BPF_REG_0, 2),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "At program exit the register R0 has value (0x2; 0x0) should have been in (0x0; 0x1)",
+},
+{
+	"landlock/ptrace: read context and call dedicated helper",
+	.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK,
+	.expected_attach_type = BPF_LANDLOCK_PTRACE,
+	.insns = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracer)),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracer)),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_task_landlock_ptrace_ancestor),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+},
+{
+	"landlock/ptrace: forbid pointer arithmetic",
+	.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK,
+	.expected_attach_type = BPF_LANDLOCK_PTRACE,
+	.insns = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracer)),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracee)),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "R1 pointer arithmetic on task prohibited",
+},
diff --git a/tools/testing/selftests/landlock/.gitignore b/tools/testing/selftests/landlock/.gitignore
new file mode 100644
index 000000000000..4c5c01d23fe0
--- /dev/null
+++ b/tools/testing/selftests/landlock/.gitignore
@@ -0,0 +1,5 @@
+/feature
+/fixdep
+/*libbpf*
+/test_base
+/test_ptrace
diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile
new file mode 100644
index 000000000000..2da77c30e77f
--- /dev/null
+++ b/tools/testing/selftests/landlock/Makefile
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0
+
+LIBDIR := $(abspath ../../../lib)
+BPFDIR := $(LIBDIR)/bpf
+TOOLSDIR := $(abspath ../../../include)
+APIDIR := $(TOOLSDIR)/uapi
+
+CFLAGS += -g -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(TOOLSDIR)
+LDLIBS += -lelf
+
+test_src = $(wildcard test_*.c)
+
+TEST_GEN_PROGS := $(test_src:.c=)
+
+include ../lib.mk
+
+BPFOBJ := $(OUTPUT)/libbpf.a
+
+$(TEST_GEN_PROGS): $(BPFOBJ) ../kselftest_harness.h
+
+.PHONY: force
+
+# force a rebuild of BPFOBJ when its dependencies are updated
+force:
+
+$(BPFOBJ): force
+	$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config
new file mode 100644
index 000000000000..fa5081b840ad
--- /dev/null
+++ b/tools/testing/selftests/landlock/config
@@ -0,0 +1,5 @@
+CONFIG_BPF=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_SECCOMP_FILTER=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_LANDLOCK=y
diff --git a/tools/testing/selftests/landlock/test.h b/tools/testing/selftests/landlock/test.h
new file mode 100644
index 000000000000..836df68b6bb8
--- /dev/null
+++ b/tools/testing/selftests/landlock/test.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Landlock helpers
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <linux/filter.h>
+#include <linux/landlock.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+
+#include "../kselftest_harness.h"
+#include "../../../../samples/bpf/bpf_load.h"
+
+#ifndef SECCOMP_PREPEND_LANDLOCK_PROG
+#define SECCOMP_PREPEND_LANDLOCK_PROG	4
+#endif
+
+#ifndef seccomp
+static int __attribute__((unused)) seccomp(unsigned int op, unsigned int flags,
+		void *args)
+{
+	errno = 0;
+	return syscall(__NR_seccomp, op, flags, args);
+}
+#endif
+
+static int __attribute__((unused)) ll_bpf_load_program(
+		const struct bpf_insn *bpf_insns, size_t insns_len,
+		char *log_buf, size_t log_buf_sz,
+		const enum bpf_attach_type attach_type)
+{
+	struct bpf_load_program_attr load_attr;
+
+	memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
+	load_attr.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK;
+	load_attr.expected_attach_type = attach_type;
+	load_attr.insns = bpf_insns;
+	load_attr.insns_cnt = insns_len / sizeof(struct bpf_insn);
+	load_attr.license = "GPL";
+
+	return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz);
+}
diff --git a/tools/testing/selftests/landlock/test_base.c b/tools/testing/selftests/landlock/test_base.c
new file mode 100644
index 000000000000..db46f39048cb
--- /dev/null
+++ b/tools/testing/selftests/landlock/test_base.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - base
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+
+#include "test.h"
+
+TEST(seccomp_landlock)
+{
+	int ret;
+
+	ret = seccomp(SECCOMP_PREPEND_LANDLOCK_PROG, 0, NULL);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EFAULT, errno) {
+		TH_LOG("Kernel does not support CONFIG_SECURITY_LANDLOCK");
+	}
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/landlock/test_ptrace.c b/tools/testing/selftests/landlock/test_ptrace.c
new file mode 100644
index 000000000000..f4ee67126394
--- /dev/null
+++ b/tools/testing/selftests/landlock/test_ptrace.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - ptrace
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#define _GNU_SOURCE
+#include <signal.h> /* raise */
+#include <sys/ptrace.h>
+#include <sys/types.h> /* waitpid */
+#include <sys/wait.h> /* waitpid */
+#include <unistd.h> /* fork, pipe */
+
+#include "test.h"
+
+#define LOG_SIZE 512
+
+static void create_domain(struct __test_metadata *_metadata,
+		bool scoped_ptrace, bool inherited_only)
+{
+	const struct bpf_insn prog_void[] = {
+		BPF_MOV32_IMM(BPF_REG_0, LANDLOCK_RET_ALLOW),
+		BPF_EXIT_INSN(),
+	};
+	const struct bpf_insn prog_check[] = {
+		BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracer)),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracee)),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_task_landlock_ptrace_ancestor),
+		/* if @tracee is an ancestor or at the same level of @tracer,
+		 * then allow ptrace (warning: do not use BPF_JGE 0) */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, inherited_only ? 0 : 1, 2),
+		BPF_MOV32_IMM(BPF_REG_0, LANDLOCK_RET_DENY),
+		BPF_EXIT_INSN(),
+		BPF_MOV32_IMM(BPF_REG_0, LANDLOCK_RET_ALLOW),
+		BPF_EXIT_INSN(),
+	};
+	int prog;
+	char log[LOG_SIZE] = "";
+
+	if (scoped_ptrace)
+		prog = ll_bpf_load_program(prog_check, sizeof(prog_check),
+				log, sizeof(log), BPF_LANDLOCK_PTRACE);
+	else
+		prog = ll_bpf_load_program(prog_void, sizeof(prog_void),
+				log, sizeof(log), BPF_LANDLOCK_PTRACE);
+	ASSERT_NE(-1, prog) {
+		TH_LOG("Failed to load the %s program: %s\n%s",
+				scoped_ptrace ? "check" : "void",
+				strerror(errno), log);
+	}
+	ASSERT_EQ(0, seccomp(SECCOMP_PREPEND_LANDLOCK_PROG, 0, &prog)) {
+		TH_LOG("Failed to create a Landlock domain: %s", strerror(errno));
+	}
+	EXPECT_EQ(0, close(prog));
+}
+
+/* test PTRACE_TRACEME and PTRACE_ATTACH for parent and child */
+static void _check_ptrace(struct __test_metadata *_metadata,
+		bool scoped_ptrace, bool domain_both,
+		bool domain_parent, bool domain_child)
+{
+	pid_t child, parent;
+	int status;
+	int pipe_child[2], pipe_parent[2];
+	char buf_parent;
+	const bool inherited_only = domain_both && !domain_parent && !domain_child;
+
+	parent = getpid();
+
+	ASSERT_EQ(0, pipe(pipe_child));
+	ASSERT_EQ(0, pipe(pipe_parent));
+	if (domain_both)
+		create_domain(_metadata, scoped_ptrace, inherited_only);
+
+	child = fork();
+	ASSERT_LE(0, child);
+	if (child == 0) {
+		char buf_child;
+
+		EXPECT_EQ(0, close(pipe_parent[1]));
+		EXPECT_EQ(0, close(pipe_child[0]));
+		if (domain_child)
+			create_domain(_metadata, scoped_ptrace, inherited_only);
+
+		/* sync #1 */
+		ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1)) {
+			TH_LOG("Failed to read() sync #1 from parent");
+		}
+		ASSERT_EQ('.', buf_child);
+
+		/* test the parent protection */
+		ASSERT_EQ((domain_child && scoped_ptrace) ? -1 : 0,
+				ptrace(PTRACE_ATTACH, parent, NULL, 0));
+		if (domain_child && scoped_ptrace) {
+			ASSERT_EQ(EPERM, errno);
+		} else {
+			ASSERT_EQ(parent, waitpid(parent, &status, 0));
+			ASSERT_EQ(1, WIFSTOPPED(status));
+			ASSERT_EQ(0, ptrace(PTRACE_DETACH, parent, NULL, 0));
+		}
+
+		/* sync #2 */
+		ASSERT_EQ(1, write(pipe_child[1], ".", 1)) {
+			TH_LOG("Failed to write() sync #2 to parent");
+		}
+
+		/* test traceme */
+		ASSERT_EQ((domain_parent && scoped_ptrace) ? -1 : 0,
+				ptrace(PTRACE_TRACEME));
+		if (domain_parent && scoped_ptrace) {
+			ASSERT_EQ(EPERM, errno);
+		} else {
+			ASSERT_EQ(0, raise(SIGSTOP));
+		}
+
+		/* sync #3 */
+		ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1)) {
+			TH_LOG("Failed to read() sync #3 from parent");
+		}
+		ASSERT_EQ('.', buf_child);
+		_exit(_metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
+	}
+
+	EXPECT_EQ(0, close(pipe_child[1]));
+	EXPECT_EQ(0, close(pipe_parent[0]));
+	if (domain_parent)
+		create_domain(_metadata, scoped_ptrace, inherited_only);
+
+	/* sync #1 */
+	ASSERT_EQ(1, write(pipe_parent[1], ".", 1)) {
+		TH_LOG("Failed to write() sync #1 to child");
+	}
+
+	/* test the parent protection */
+	/* sync #2 */
+	ASSERT_EQ(1, read(pipe_child[0], &buf_parent, 1)) {
+		TH_LOG("Failed to read() sync #2 from child");
+	}
+	ASSERT_EQ('.', buf_parent);
+
+	/* test traceme */
+	if (!(domain_parent && scoped_ptrace)) {
+		ASSERT_EQ(child, waitpid(child, &status, 0));
+		ASSERT_EQ(1, WIFSTOPPED(status));
+		ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0));
+	}
+	/* test attach */
+	ASSERT_EQ((domain_parent && scoped_ptrace) ? -1 : 0,
+			ptrace(PTRACE_ATTACH, child, NULL, 0));
+	if (domain_parent && scoped_ptrace) {
+		ASSERT_EQ(EPERM, errno);
+	} else {
+		ASSERT_EQ(child, waitpid(child, &status, 0));
+		ASSERT_EQ(1, WIFSTOPPED(status));
+		ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0));
+	}
+
+	/* sync #3 */
+	ASSERT_EQ(1, write(pipe_parent[1], ".", 1)) {
+		TH_LOG("Failed to write() sync #3 to child");
+	}
+	ASSERT_EQ(child, waitpid(child, &status, 0));
+	if (WIFSIGNALED(status) || WEXITSTATUS(status))
+		_metadata->passed = 0;
+}
+
+/* keep the *_scoped order to check program inheritance */
+#define CHECK_PTRACE(name, domain_both, domain_parent, domain_child) \
+	TEST(name ## _unscoped) { \
+		_check_ptrace(_metadata, false, domain_both, domain_parent, \
+				domain_child); \
+	} \
+	TEST(name ## _scoped) { \
+		_check_ptrace(_metadata, false, domain_both, domain_parent, \
+				domain_child); \
+		_check_ptrace(_metadata, true, domain_both, domain_parent, \
+				domain_child); \
+	}
+
+/* no domain */
+CHECK_PTRACE(allow_without_domain, false, false, false);
+
+/* child domain */
+CHECK_PTRACE(allow_with_one_domain, false, false, true);
+
+/* parent domain */
+CHECK_PTRACE(deny_with_parent_domain, false, true, false);
+
+/* parent and child domain */
+CHECK_PTRACE(deny_with_sibling_domain, false, true, true);
+
+/* inherited domain */
+CHECK_PTRACE(allow_sibling_domain, true, false, false);
+
+/* inherited and child domain */
+CHECK_PTRACE(allow_with_nested_domain, true, false, true);
+
+/* inherited and parent domain */
+CHECK_PTRACE(deny_with_nested_and_parent_domain, true, true, false);
+
+/* inherited, parent and child domain */
+CHECK_PTRACE(deny_with_forked_domain, true, true, true);
+
+TEST_HARNESS_MAIN
-- 
2.23.0


^ permalink raw reply related

* [PATCH bpf-next v11 2/7] landlock: Add the management of domains
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

A Landlock domain is a set of eBPF programs.  There is a list for each
different program types that can be run on a specific Landlock hook
(e.g. ptrace).  A domain is tied to a set of subjects (i.e. tasks).  A
Landlock program should not try (nor be able) to infer which subject is
currently enforced, but to have a unique security policy for all
subjects tied to the same domain.  This make the reasoning much easier
and help avoid pitfalls.

The next commits tie a domain to a task's credentials thanks to
seccomp(2), but we could use cgroups or a security file-system to
enforce a sysadmin-defined policy .

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* rename files and names to clearly define a domain
* create a standalone patch to ease review
---
 security/landlock/Makefile        |   3 +-
 security/landlock/common.h        |  38 +++++
 security/landlock/domain_manage.c | 265 ++++++++++++++++++++++++++++++
 security/landlock/domain_manage.h |  23 +++
 4 files changed, 328 insertions(+), 1 deletion(-)
 create mode 100644 security/landlock/domain_manage.c
 create mode 100644 security/landlock/domain_manage.h

diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 682b798c6b76..dd5f70185778 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
 landlock-y := \
-	bpf_verify.o bpf_ptrace.o
+	bpf_verify.o bpf_ptrace.o \
+	domain_manage.o
diff --git a/security/landlock/common.h b/security/landlock/common.h
index 0234c4bc4acd..fb2990eb5fb4 100644
--- a/security/landlock/common.h
+++ b/security/landlock/common.h
@@ -11,11 +11,49 @@
 
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/refcount.h>
 
 enum landlock_hook_type {
 	LANDLOCK_HOOK_PTRACE = 1,
 };
 
+#define _LANDLOCK_HOOK_LAST LANDLOCK_HOOK_PTRACE
+
+struct landlock_prog_list {
+	struct landlock_prog_list *prev;
+	struct bpf_prog *prog;
+	refcount_t usage;
+};
+
+/**
+ * struct landlock_domain - Landlock programs enforced on a set of tasks
+ *
+ * When prepending a new program, if &struct landlock_domain is shared with
+ * other tasks, then duplicate it and prepend the program to this new &struct
+ * landlock_domain.
+ *
+ * @usage: reference count to manage the object lifetime. When a task needs to
+ *	   add Landlock programs and if @usage is greater than 1, then the
+ *	   task must duplicate &struct landlock_domain to not change the
+ *	   children's programs as well.
+ * @programs: array of non-NULL &struct landlock_prog_list pointers
+ */
+struct landlock_domain {
+	struct landlock_prog_list *programs[_LANDLOCK_HOOK_LAST];
+	refcount_t usage;
+};
+
+/**
+ * get_hook_index - get an index for the programs of struct landlock_prog_set
+ *
+ * @type: a Landlock hook type
+ */
+static inline size_t get_hook_index(enum landlock_hook_type type)
+{
+	/* type ID > 0 for loaded programs */
+	return type - 1;
+}
+
 static inline enum landlock_hook_type get_hook_type(const struct bpf_prog *prog)
 {
 	switch (prog->expected_attach_type) {
diff --git a/security/landlock/domain_manage.c b/security/landlock/domain_manage.c
new file mode 100644
index 000000000000..c955b9c95c84
--- /dev/null
+++ b/security/landlock/domain_manage.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - domain management
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/refcount.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "common.h"
+#include "domain_manage.h"
+
+void landlock_get_domain(struct landlock_domain *dom)
+{
+	if (!dom)
+		return;
+	refcount_inc(&dom->usage);
+}
+
+static void put_landlock_prog_list(struct landlock_prog_list *prog_list)
+{
+	struct landlock_prog_list *orig = prog_list;
+
+	/* clean up single-reference branches iteratively */
+	while (orig && refcount_dec_and_test(&orig->usage)) {
+		struct landlock_prog_list *freeme = orig;
+
+		if (orig->prog)
+			bpf_prog_put(orig->prog);
+		orig = orig->prev;
+		kfree(freeme);
+	}
+}
+
+void landlock_put_domain(struct landlock_domain *domain)
+{
+	if (domain && refcount_dec_and_test(&domain->usage)) {
+		size_t i;
+
+		for (i = 0; i < ARRAY_SIZE(domain->programs); i++)
+			put_landlock_prog_list(domain->programs[i]);
+		kfree(domain);
+	}
+}
+
+static struct landlock_domain *new_landlock_domain(void)
+{
+	struct landlock_domain *domain;
+
+	/* array filled with NULL values */
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+	refcount_set(&domain->usage, 1);
+	return domain;
+}
+
+/**
+ * store_landlock_prog - prepend and deduplicate a Landlock prog_list
+ *
+ * Prepend @prog to @init_domain while ignoring @prog if they are already in
+ * @ref_domain.  Whatever is the result of this function call, you can call
+ * bpf_prog_put(@prog) after.
+ *
+ * @init_domain: empty domain to prepend to
+ * @ref_domain: domain to check for duplicate programs
+ * @prog: program to prepend
+ *
+ * Return -errno on error or 0 if @prog was successfully stored.
+ */
+static int store_landlock_prog(struct landlock_domain *init_domain,
+		const struct landlock_domain *ref_domain,
+		struct bpf_prog *prog)
+{
+	struct landlock_prog_list *tmp_list = NULL;
+	int err;
+	size_t hook;
+	enum landlock_hook_type last_type;
+	struct bpf_prog *new = prog;
+
+	/* allocate all the memory we need */
+	struct landlock_prog_list *new_list;
+
+	last_type = get_hook_type(new);
+
+	/* ignore duplicate programs */
+	if (ref_domain) {
+		struct landlock_prog_list *ref;
+
+		hook = get_hook_index(get_hook_type(new));
+		for (ref = ref_domain->programs[hook]; ref;
+				ref = ref->prev) {
+			if (ref->prog == new)
+				return -EINVAL;
+		}
+	}
+
+	new = bpf_prog_inc(new);
+	if (IS_ERR(new)) {
+		err = PTR_ERR(new);
+		goto put_tmp_list;
+	}
+	new_list = kzalloc(sizeof(*new_list), GFP_KERNEL);
+	if (!new_list) {
+		bpf_prog_put(new);
+		err = -ENOMEM;
+		goto put_tmp_list;
+	}
+	/* ignore Landlock types in this tmp_list */
+	new_list->prog = new;
+	new_list->prev = tmp_list;
+	refcount_set(&new_list->usage, 1);
+	tmp_list = new_list;
+
+	if (!tmp_list)
+		/* inform user space that this program was already added */
+		return -EEXIST;
+
+	/* properly store the list (without error cases) */
+	while (tmp_list) {
+		struct landlock_prog_list *new_list;
+
+		new_list = tmp_list;
+		tmp_list = tmp_list->prev;
+		/* do not increment the previous prog list usage */
+		hook = get_hook_index(get_hook_type(new_list->prog));
+		new_list->prev = init_domain->programs[hook];
+		/* no need to add from the last program to the first because
+		 * each of them are a different Landlock type */
+		smp_store_release(&init_domain->programs[hook], new_list);
+	}
+	return 0;
+
+put_tmp_list:
+	put_landlock_prog_list(tmp_list);
+	return err;
+}
+
+/* limit Landlock programs set to 256KB */
+#define LANDLOCK_PROGRAMS_MAX_PAGES (1 << 6)
+
+/**
+ * landlock_prepend_prog - attach a Landlock prog_list to @current_domain
+ *
+ * Whatever is the result of this function call, you can call
+ * bpf_prog_put(@prog) after.
+ *
+ * @current_domain: landlock_domain pointer, must be (RCU-)locked (if needed)
+ *                  to prevent a concurrent put/free. This pointer must not be
+ *                  freed after the call.
+ * @prog: non-NULL Landlock prog_list to prepend to @current_domain. @prog will
+ *        be owned by landlock_prepend_prog() and freed if an error happened.
+ *
+ * Return @current_domain or a new pointer when OK. Return a pointer error
+ * otherwise.
+ */
+struct landlock_domain *landlock_prepend_prog(
+		struct landlock_domain *current_domain,
+		struct bpf_prog *prog)
+{
+	struct landlock_domain *new_domain = current_domain;
+	unsigned long pages;
+	int err;
+	size_t i;
+	struct landlock_domain tmp_domain = {};
+
+	if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
+		return ERR_PTR(-EINVAL);
+
+	/* validate memory size allocation */
+	pages = prog->pages;
+	if (current_domain) {
+		size_t i;
+
+		for (i = 0; i < ARRAY_SIZE(current_domain->programs); i++) {
+			struct landlock_prog_list *walker_p;
+
+			for (walker_p = current_domain->programs[i];
+					walker_p; walker_p = walker_p->prev)
+				pages += walker_p->prog->pages;
+		}
+		/* count a struct landlock_domain if we need to allocate one */
+		if (refcount_read(&current_domain->usage) != 1)
+			pages += round_up(sizeof(*current_domain), PAGE_SIZE)
+				/ PAGE_SIZE;
+	}
+	if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
+		return ERR_PTR(-E2BIG);
+
+	/* ensure early that we can allocate enough memory for the new
+	 * prog_lists */
+	err = store_landlock_prog(&tmp_domain, current_domain, prog);
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * Each task_struct points to an array of prog list pointers.  These
+	 * tables are duplicated when additions are made (which means each
+	 * table needs to be refcounted for the processes using it). When a new
+	 * table is created, all the refcounters on the prog_list are bumped
+	 * (to track each table that references the prog). When a new prog is
+	 * added, it's just prepended to the list for the new table to point
+	 * at.
+	 *
+	 * Manage all the possible errors before this step to not uselessly
+	 * duplicate current_domain and avoid a rollback.
+	 */
+	if (!new_domain) {
+		/*
+		 * If there is no Landlock domain used by the current task,
+		 * then create a new one.
+		 */
+		new_domain = new_landlock_domain();
+		if (IS_ERR(new_domain))
+			goto put_tmp_lists;
+	} else if (refcount_read(&current_domain->usage) > 1) {
+		/*
+		 * If the current task is not the sole user of its Landlock
+		 * domain, then duplicate it.
+		 */
+		new_domain = new_landlock_domain();
+		if (IS_ERR(new_domain))
+			goto put_tmp_lists;
+		for (i = 0; i < ARRAY_SIZE(new_domain->programs); i++) {
+			new_domain->programs[i] =
+				READ_ONCE(current_domain->programs[i]);
+			if (new_domain->programs[i])
+				refcount_inc(&new_domain->programs[i]->usage);
+		}
+
+		/*
+		 * Landlock domain from the current task will not be freed here
+		 * because the usage is strictly greater than 1. It is only
+		 * prevented to be freed by another task thanks to the caller
+		 * of landlock_prepend_prog() which should be locked if needed.
+		 */
+		landlock_put_domain(current_domain);
+	}
+
+	/* prepend tmp_domain to new_domain */
+	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++) {
+		/* get the last new list */
+		struct landlock_prog_list *last_list =
+			tmp_domain.programs[i];
+
+		if (last_list) {
+			while (last_list->prev)
+				last_list = last_list->prev;
+			/* no need to increment usage (pointer replacement) */
+			last_list->prev = new_domain->programs[i];
+			new_domain->programs[i] = tmp_domain.programs[i];
+		}
+	}
+	return new_domain;
+
+put_tmp_lists:
+	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++)
+		put_landlock_prog_list(tmp_domain.programs[i]);
+	return new_domain;
+}
diff --git a/security/landlock/domain_manage.h b/security/landlock/domain_manage.h
new file mode 100644
index 000000000000..5b5b49f6e3e8
--- /dev/null
+++ b/security/landlock/domain_manage.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - domain management headers
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
+#define _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
+
+#include <linux/filter.h>
+
+#include "common.h"
+
+void landlock_get_domain(struct landlock_domain *dom);
+void landlock_put_domain(struct landlock_domain *dom);
+
+struct landlock_domain *landlock_prepend_prog(
+		struct landlock_domain *current_domain,
+		struct bpf_prog *prog);
+
+#endif /* _SECURITY_LANDLOCK_DOMAIN_MANAGE_H */
-- 
2.23.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox