Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v2 7/7] random: Remove kernel.random.read_wakeup_threshold
From: Andy Lutomirski @ 2019-09-20 14:36 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Ahmed S. Darwish,
	Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
	Michael Kerrisk, Willy Tarreau, Matthew Garrett,
	Ext4 Developers List, linux-man, Andy Lutomirski
In-Reply-To: <cover.1568990048.git.luto@kernel.org>

It has no effect any more, so remove it.  We can revert this if
there is some user code that expects to be able to set this sysctl.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 99fea5cc29a8..2a284f30cac4 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -369,12 +369,6 @@
 #define ENTROPY_SHIFT 3
 #define ENTROPY_BITS(r) ((r)->entropy_count >> ENTROPY_SHIFT)
 
-/*
- * The minimum number of bits of entropy before we wake up a read on
- * /dev/random.  Should be enough to do a significant reseed.
- */
-static int random_read_wakeup_bits = 64;
-
 /*
  * If the entropy count falls under this number of bits, then we
  * should wake up processes which are selecting or polling on write
@@ -1982,8 +1976,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 
 #include <linux/sysctl.h>
 
-static int min_read_thresh = 8, min_write_thresh;
-static int max_read_thresh = OUTPUT_POOL_WORDS * 32;
+static int min_write_thresh;
 static int max_write_thresh = INPUT_POOL_WORDS * 32;
 static int random_min_urandom_seed = 60;
 static char sysctl_bootid[16];
@@ -2058,15 +2051,6 @@ struct ctl_table random_table[] = {
 		.proc_handler	= proc_do_entropy,
 		.data		= &input_pool.entropy_count,
 	},
-	{
-		.procname	= "read_wakeup_threshold",
-		.data		= &random_read_wakeup_bits,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &min_read_thresh,
-		.extra2		= &max_read_thresh,
-	},
 	{
 		.procname	= "write_wakeup_threshold",
 		.data		= &random_write_wakeup_bits,
-- 
2.21.0

^ permalink raw reply related

* [PATCH v2 6/7] random: Delete code to pull data into pools
From: Andy Lutomirski @ 2019-09-20 14:36 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Ahmed S. Darwish,
	Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
	Michael Kerrisk, Willy Tarreau, Matthew Garrett,
	Ext4 Developers List, linux-man, Andy Lutomirski
In-Reply-To: <cover.1568990048.git.luto@kernel.org>

There is no pool that pulls, so it was just dead code.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 40 ----------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 4521138231ed..99fea5cc29a8 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -528,10 +528,8 @@ struct entropy_store {
 	const struct poolinfo *poolinfo;
 	__u32 *pool;
 	const char *name;
-	struct entropy_store *pull;
 
 	/* read-write data: */
-	unsigned long last_pulled;
 	spinlock_t lock;
 	unsigned short add_ptr;
 	unsigned short input_rotate;
@@ -1347,41 +1345,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness);
  *
  *********************************************************************/
 
-/*
- * This utility inline function is responsible for transferring entropy
- * from the primary pool to the secondary extraction pool. We make
- * sure we pull enough for a 'catastrophic reseed'.
- */
-static void _xfer_secondary_pool(struct entropy_store *r, size_t nbytes);
-static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes)
-{
-	if (!r->pull ||
-	    r->entropy_count >= (nbytes << (ENTROPY_SHIFT + 3)) ||
-	    r->entropy_count > r->poolinfo->poolfracbits)
-		return;
-
-	_xfer_secondary_pool(r, nbytes);
-}
-
-static void _xfer_secondary_pool(struct entropy_store *r, size_t nbytes)
-{
-	__u32	tmp[OUTPUT_POOL_WORDS];
-
-	int bytes = nbytes;
-
-	/* pull at least as much as a wakeup */
-	bytes = max_t(int, bytes, random_read_wakeup_bits / 8);
-	/* but never more than the buffer size */
-	bytes = min_t(int, bytes, sizeof(tmp));
-
-	trace_xfer_secondary_pool(r->name, bytes * 8, nbytes * 8,
-				  ENTROPY_BITS(r), ENTROPY_BITS(r->pull));
-	bytes = extract_entropy(r->pull, tmp, bytes,
-				random_read_wakeup_bits / 8, 0);
-	mix_pool_bytes(r, tmp, bytes);
-	credit_entropy_bits(r, bytes*8);
-}
-
 /*
  * This function decides how many bytes to actually take from the
  * given pool, and also debits the entropy count accordingly.
@@ -1545,7 +1508,6 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
 			spin_unlock_irqrestore(&r->lock, flags);
 			trace_extract_entropy(r->name, EXTRACT_SIZE,
 					      ENTROPY_BITS(r), _RET_IP_);
-			xfer_secondary_pool(r, EXTRACT_SIZE);
 			extract_buf(r, tmp);
 			spin_lock_irqsave(&r->lock, flags);
 			memcpy(r->last_data, tmp, EXTRACT_SIZE);
@@ -1554,7 +1516,6 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
 	}
 
 	trace_extract_entropy(r->name, nbytes, ENTROPY_BITS(r), _RET_IP_);
-	xfer_secondary_pool(r, nbytes);
 	nbytes = account(r, nbytes, min, reserved);
 
 	return _extract_entropy(r, buf, nbytes, fips_enabled);
@@ -1765,7 +1726,6 @@ static void __init init_std_data(struct entropy_store *r)
 	ktime_t now = ktime_get_real();
 	unsigned long rv;
 
-	r->last_pulled = jiffies;
 	mix_pool_bytes(r, &now, sizeof(now));
 	for (i = r->poolinfo->poolbytes; i > 0; i -= sizeof(rv)) {
 		if (!arch_get_random_seed_long(&rv) &&
-- 
2.21.0

^ permalink raw reply related

* [PATCH v2 5/7] random: Remove the blocking pool
From: Andy Lutomirski @ 2019-09-20 14:36 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Ahmed S. Darwish,
	Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
	Michael Kerrisk, Willy Tarreau, Matthew Garrett,
	Ext4 Developers List, linux-man, Andy Lutomirski
In-Reply-To: <cover.1568990048.git.luto@kernel.org>

There is no longer any interface to read data from the blocking
pool, so remove it.

This enables quite a bit of code deletion, much of which will be
done in subsequent patches.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 106 ------------------------------------------
 1 file changed, 106 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 29a158d9353c..4521138231ed 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -470,7 +470,6 @@ static const struct poolinfo {
 /*
  * Static global variables
  */
-static DECLARE_WAIT_QUEUE_HEAD(random_read_wait);
 static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
 static struct fasync_struct *fasync;
 
@@ -530,7 +529,6 @@ struct entropy_store {
 	__u32 *pool;
 	const char *name;
 	struct entropy_store *pull;
-	struct work_struct push_work;
 
 	/* read-write data: */
 	unsigned long last_pulled;
@@ -549,9 +547,7 @@ static ssize_t _extract_entropy(struct entropy_store *r, void *buf,
 				size_t nbytes, int fips);
 
 static void crng_reseed(struct crng_state *crng, struct entropy_store *r);
-static void push_to_pool(struct work_struct *work);
 static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy;
-static __u32 blocking_pool_data[OUTPUT_POOL_WORDS] __latent_entropy;
 
 static struct entropy_store input_pool = {
 	.poolinfo = &poolinfo_table[0],
@@ -560,16 +556,6 @@ static struct entropy_store input_pool = {
 	.pool = input_pool_data
 };
 
-static struct entropy_store blocking_pool = {
-	.poolinfo = &poolinfo_table[1],
-	.name = "blocking",
-	.pull = &input_pool,
-	.lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock),
-	.pool = blocking_pool_data,
-	.push_work = __WORK_INITIALIZER(blocking_pool.push_work,
-					push_to_pool),
-};
-
 static __u32 const twist_table[8] = {
 	0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 	0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
@@ -765,15 +751,11 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 		entropy_count = 0;
 	} else if (entropy_count > pool_size)
 		entropy_count = pool_size;
-	if ((r == &blocking_pool) && !r->initialized &&
-	    (entropy_count >> ENTROPY_SHIFT) > 128)
-		has_initialized = 1;
 	if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig)
 		goto retry;
 
 	if (has_initialized) {
 		r->initialized = 1;
-		wake_up_interruptible(&random_read_wait);
 		kill_fasync(&fasync, SIGIO, POLL_IN);
 	}
 
@@ -782,7 +764,6 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 
 	if (r == &input_pool) {
 		int entropy_bits = entropy_count >> ENTROPY_SHIFT;
-		struct entropy_store *other = &blocking_pool;
 
 		if (crng_init < 2) {
 			if (entropy_bits < 128)
@@ -790,27 +771,6 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 			crng_reseed(&primary_crng, r);
 			entropy_bits = r->entropy_count >> ENTROPY_SHIFT;
 		}
-
-		/* initialize the blocking pool if necessary */
-		if (entropy_bits >= random_read_wakeup_bits &&
-		    !other->initialized) {
-			schedule_work(&other->push_work);
-			return;
-		}
-
-		/* should we wake readers? */
-		if (entropy_bits >= random_read_wakeup_bits &&
-		    wq_has_sleeper(&random_read_wait)) {
-			wake_up_interruptible(&random_read_wait);
-		}
-		/* If the input pool is getting full, and the blocking
-		 * pool has room, send some entropy to the blocking
-		 * pool.
-		 */
-		if (!work_pending(&other->push_work) &&
-		    (ENTROPY_BITS(r) > 6 * r->poolinfo->poolbytes) &&
-		    (ENTROPY_BITS(other) <= 6 * other->poolinfo->poolbytes))
-			schedule_work(&other->push_work);
 	}
 }
 
@@ -1422,22 +1382,6 @@ static void _xfer_secondary_pool(struct entropy_store *r, size_t nbytes)
 	credit_entropy_bits(r, bytes*8);
 }
 
-/*
- * Used as a workqueue function so that when the input pool is getting
- * full, we can "spill over" some entropy to the output pools.  That
- * way the output pools can store some of the excess entropy instead
- * of letting it go to waste.
- */
-static void push_to_pool(struct work_struct *work)
-{
-	struct entropy_store *r = container_of(work, struct entropy_store,
-					      push_work);
-	BUG_ON(!r);
-	_xfer_secondary_pool(r, random_read_wakeup_bits/8);
-	trace_push_to_pool(r->name, r->entropy_count >> ENTROPY_SHIFT,
-			   r->pull->entropy_count >> ENTROPY_SHIFT);
-}
-
 /*
  * This function decides how many bytes to actually take from the
  * given pool, and also debits the entropy count accordingly.
@@ -1616,54 +1560,6 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
 	return _extract_entropy(r, buf, nbytes, fips_enabled);
 }
 
-/*
- * This function extracts randomness from the "entropy pool", and
- * returns it in a userspace buffer.
- */
-static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf,
-				    size_t nbytes)
-{
-	ssize_t ret = 0, i;
-	__u8 tmp[EXTRACT_SIZE];
-	int large_request = (nbytes > 256);
-
-	trace_extract_entropy_user(r->name, nbytes, ENTROPY_BITS(r), _RET_IP_);
-	if (!r->initialized && r->pull) {
-		xfer_secondary_pool(r, ENTROPY_BITS(r->pull)/8);
-		if (!r->initialized)
-			return 0;
-	}
-	xfer_secondary_pool(r, nbytes);
-	nbytes = account(r, nbytes, 0, 0);
-
-	while (nbytes) {
-		if (large_request && need_resched()) {
-			if (signal_pending(current)) {
-				if (ret == 0)
-					ret = -ERESTARTSYS;
-				break;
-			}
-			schedule();
-		}
-
-		extract_buf(r, tmp);
-		i = min_t(int, nbytes, EXTRACT_SIZE);
-		if (copy_to_user(buf, tmp, i)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		nbytes -= i;
-		buf += i;
-		ret += i;
-	}
-
-	/* Wipe data just returned from memory */
-	memzero_explicit(tmp, sizeof(tmp));
-
-	return ret;
-}
-
 #define warn_unseeded_randomness(previous) \
 	_warn_unseeded_randomness(__func__, (void *) _RET_IP_, (previous))
 
@@ -1893,7 +1789,6 @@ static void __init init_std_data(struct entropy_store *r)
 int __init rand_initialize(void)
 {
 	init_std_data(&input_pool);
-	init_std_data(&blocking_pool);
 	crng_initialize(&primary_crng);
 	crng_global_init_time = jiffies;
 	if (ratelimit_disable) {
@@ -2053,7 +1948,6 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		input_pool.entropy_count = 0;
-		blocking_pool.entropy_count = 0;
 		return 0;
 	case RNDRESEEDCRNG:
 		if (!capable(CAP_SYS_ADMIN))
-- 
2.21.0

^ permalink raw reply related

* [PATCH v2 4/7] random: Make /dev/random be almost like /dev/urandom
From: Andy Lutomirski @ 2019-09-20 14:36 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Ahmed S. Darwish,
	Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
	Michael Kerrisk, Willy Tarreau, Matthew Garrett,
	Ext4 Developers List, linux-man, Andy Lutomirski
In-Reply-To: <cover.1568990048.git.luto@kernel.org>

This patch changes the read semantics of /dev/random to be the same
as /dev/urandom except that reads will block until the CRNG is
ready.

None of the cleanups that this enables have been done yet.  As a
result, this gives a warning about an unused function.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 55 +++++++++++--------------------------------
 1 file changed, 14 insertions(+), 41 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 1ad2c7eaf675..29a158d9353c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -354,7 +354,6 @@
 #define INPUT_POOL_WORDS	(1 << (INPUT_POOL_SHIFT-5))
 #define OUTPUT_POOL_SHIFT	10
 #define OUTPUT_POOL_WORDS	(1 << (OUTPUT_POOL_SHIFT-5))
-#define SEC_XFER_SIZE		512
 #define EXTRACT_SIZE		10
 
 
@@ -803,7 +802,6 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 		if (entropy_bits >= random_read_wakeup_bits &&
 		    wq_has_sleeper(&random_read_wait)) {
 			wake_up_interruptible(&random_read_wait);
-			kill_fasync(&fasync, SIGIO, POLL_IN);
 		}
 		/* If the input pool is getting full, and the blocking
 		 * pool has room, send some entropy to the blocking
@@ -1031,6 +1029,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 		crng_init = 2;
 		process_random_ready_list();
 		wake_up_interruptible(&crng_init_wait);
+		kill_fasync(&fasync, SIGIO, POLL_IN);
 		pr_notice("random: crng init done\n");
 		if (unseeded_warning.missed) {
 			pr_notice("random: %d get_random_xx warning(s) missed "
@@ -1921,43 +1920,6 @@ void rand_initialize_disk(struct gendisk *disk)
 }
 #endif
 
-static ssize_t
-_random_read(int nonblock, char __user *buf, size_t nbytes)
-{
-	ssize_t n;
-
-	if (nbytes == 0)
-		return 0;
-
-	nbytes = min_t(size_t, nbytes, SEC_XFER_SIZE);
-	while (1) {
-		n = extract_entropy_user(&blocking_pool, buf, nbytes);
-		if (n < 0)
-			return n;
-		trace_random_read(n*8, (nbytes-n)*8,
-				  ENTROPY_BITS(&blocking_pool),
-				  ENTROPY_BITS(&input_pool));
-		if (n > 0)
-			return n;
-
-		/* Pool is (near) empty.  Maybe wait and retry. */
-		if (nonblock)
-			return -EAGAIN;
-
-		wait_event_interruptible(random_read_wait,
-		    blocking_pool.initialized &&
-		    (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits));
-		if (signal_pending(current))
-			return -ERESTARTSYS;
-	}
-}
-
-static ssize_t
-random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
-{
-	return _random_read(file->f_flags & O_NONBLOCK, buf, nbytes);
-}
-
 static ssize_t
 urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
@@ -1981,15 +1943,26 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 	return ret;
 }
 
+static ssize_t
+random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
+{
+	int ret;
+
+	ret = wait_for_random_bytes();
+	if (ret != 0)
+		return ret;
+	return urandom_read(file, buf, nbytes, ppos);
+}
+
 static __poll_t
 random_poll(struct file *file, poll_table * wait)
 {
 	__poll_t mask;
 
-	poll_wait(file, &random_read_wait, wait);
+	poll_wait(file, &crng_init_wait, wait);
 	poll_wait(file, &random_write_wait, wait);
 	mask = 0;
-	if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)
+	if (crng_ready())
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits)
 		mask |= EPOLLOUT | EPOLLWRNORM;
-- 
2.21.0

^ permalink raw reply related

* [PATCH v2 3/7] random: Ignore GRND_RANDOM in getentropy(2)
From: Andy Lutomirski @ 2019-09-20 14:36 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Ahmed S. Darwish,
	Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
	Michael Kerrisk, Willy Tarreau, Matthew Garrett,
	Ext4 Developers List, linux-man, Andy Lutomirski
In-Reply-To: <cover.1568990048.git.luto@kernel.org>

The separate blocking pool is going away.  Start by ignoring
GRND_RANDOM in getentropy(2).

This should not materially break any API.  Any code that worked
without this change should work at least as well with this change.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c       | 3 ---
 include/uapi/linux/random.h | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index acabb870f222..1ad2c7eaf675 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -2135,9 +2135,6 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 	if (count > INT_MAX)
 		count = INT_MAX;
 
-	if (flags & GRND_RANDOM)
-		return _random_read(flags & GRND_NONBLOCK, buf, count);
-
 	if (!(flags & GRND_INSECURE) && !crng_ready()) {
 		if (flags & GRND_NONBLOCK)
 			return -EAGAIN;
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index c092d20088d3..dcc1b3e6106f 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -48,7 +48,7 @@ struct rand_pool_info {
  * Flags for getrandom(2)
  *
  * GRND_NONBLOCK	Don't block and return EAGAIN instead
- * GRND_RANDOM		Use the /dev/random pool instead of /dev/urandom
+ * GRND_RANDOM		No effect
  * GRND_INSECURE	Return non-cryptographic random bytes
  */
 #define GRND_NONBLOCK	0x0001
-- 
2.21.0

^ permalink raw reply related

* [PATCH v2 2/7] random: Add GRND_INSECURE to return best-effort non-cryptographic bytes
From: Andy Lutomirski @ 2019-09-20 14:36 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Ahmed S. Darwish,
	Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
	Michael Kerrisk, Willy Tarreau, Matthew Garrett,
	Ext4 Developers List, linux-man, Andy Lutomirski
In-Reply-To: <cover.1568990048.git.luto@kernel.org>

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c       | 11 +++++++++--
 include/uapi/linux/random.h |  2 ++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index d152612e08fc..acabb870f222 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -2122,7 +2122,14 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 {
 	int ret;
 
-	if (flags & ~(GRND_NONBLOCK|GRND_RANDOM))
+	if (flags & ~(GRND_NONBLOCK|GRND_RANDOM|GRND_INSECURE))
+		return -EINVAL;
+
+	/*
+	 * Requesting insecure and blocking randomness at the same time makes
+	 * no sense.
+	 */
+	if ((flags & (GRND_INSECURE|GRND_RANDOM)) == (GRND_INSECURE|GRND_RANDOM))
 		return -EINVAL;
 
 	if (count > INT_MAX)
@@ -2131,7 +2138,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 	if (flags & GRND_RANDOM)
 		return _random_read(flags & GRND_NONBLOCK, buf, count);
 
-	if (!crng_ready()) {
+	if (!(flags & GRND_INSECURE) && !crng_ready()) {
 		if (flags & GRND_NONBLOCK)
 			return -EAGAIN;
 		ret = wait_for_random_bytes();
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index 26ee91300e3e..c092d20088d3 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -49,8 +49,10 @@ struct rand_pool_info {
  *
  * GRND_NONBLOCK	Don't block and return EAGAIN instead
  * GRND_RANDOM		Use the /dev/random pool instead of /dev/urandom
+ * GRND_INSECURE	Return non-cryptographic random bytes
  */
 #define GRND_NONBLOCK	0x0001
 #define GRND_RANDOM	0x0002
+#define GRND_INSECURE	0x0004
 
 #endif /* _UAPI_LINUX_RANDOM_H */
-- 
2.21.0

^ permalink raw reply related

* [PATCH v2 1/7] random: Don't wake crng_init_wait when crng_init == 1
From: Andy Lutomirski @ 2019-09-20 14:36 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Ahmed S. Darwish,
	Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
	Michael Kerrisk, Willy Tarreau, Matthew Garrett,
	Ext4 Developers List, linux-man, Andy Lutomirski
In-Reply-To: <cover.1568990048.git.luto@kernel.org>

crng_init_wait is only used to wayt for crng_init to be set to 2, so
there's no point to waking it when crng_init is set to 1.  Remove the
unnecessary wake_up_interruptible() call.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 5d5ea4ce1442..d152612e08fc 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -946,7 +946,6 @@ static int crng_fast_load(const char *cp, size_t len)
 	if (crng_init_cnt >= CRNG_INIT_CNT_THRESH) {
 		invalidate_batched_entropy();
 		crng_init = 1;
-		wake_up_interruptible(&crng_init_wait);
 		pr_notice("random: fast init done\n");
 	}
 	return 1;
-- 
2.21.0

^ permalink raw reply related

* [PATCH v2 0/7] Rework random blocking
From: Andy Lutomirski @ 2019-09-20 14:36 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Ahmed S. Darwish,
	Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
	Michael Kerrisk, Willy Tarreau, Matthew Garrett,
	Ext4 Developers List, linux-man, Andy Lutomirski

This makes two major semantic changes to Linux's random APIs:

It adds getentropy(..., GRND_INSECURE).  This causes getentropy to
always return *something*.  There is no guarantee whatsoever that
the result will be cryptographically random or even unique, but the
kernel will give the best quality random output it can.  The name is
a big hint: the resulting output is INSECURE.

The purpose of this is to allow programs that genuinely want
best-effort entropy to get it without resorting to /dev/urandom.
Plenty of programs do this because they need to do *something*
during boot and they can't afford to wait.  Calling it "INSECURE" is
probably the best we can do to discourage using this API for things
that need security.

This series also removes the blocking pool and makes /dev/random
work just like getentropy(..., 0) and makes GRND_RANDOM a no-op.  I
believe that Linux's blocking pool has outlived its usefulness.
Linux's CRNG generates output that is good enough to use even for
key generation.  The blocking pool is not stronger in any material
way, and keeping it around requires a lot of infrastructure of
dubious value.

This series should not break any existing programs.  /dev/urandom is
unchanged.  /dev/random will still block just after booting, but it
will block less than it used to.  getentropy() with existing flags
will return output that is, for practical purposes, just as strong
as before.

Changes from v1:
 - Rebased to v5.3.  No other changes.

Andy Lutomirski (7):
  random: Don't wake crng_init_wait when crng_init == 1
  random: Add GRND_INSECURE to return best-effort non-cryptographic
    bytes
  random: Ignore GRND_RANDOM in getentropy(2)
  random: Make /dev/random be almost like /dev/urandom
  random: Remove the blocking pool
  random: Delete code to pull data into pools
  random: Remove kernel.random.read_wakeup_threshold

 drivers/char/random.c       | 234 ++++--------------------------------
 include/uapi/linux/random.h |   4 +-
 2 files changed, 27 insertions(+), 211 deletions(-)

-- 
2.21.0

^ permalink raw reply

* Re: [PATCH RFC v4 1/1] random: WARN on large getrandom() waits and introduce getrandom2()
From: Andy Lutomirski @ 2019-09-20 14:33 UTC (permalink / raw)
  To: Ahmed S. Darwish
  Cc: Linus Torvalds, Lennart Poettering, Theodore Y. Ts'o,
	Eric W. Biederman, Alexander E. Patrakov, Michael Kerrisk,
	Willy Tarreau, Matthew Garrett, lkml, Ext4 Developers List,
	Linux API, linux-man
In-Reply-To: <20190920134609.GA2113@pc>

On Fri, Sep 20, 2019 at 6:46 AM Ahmed S. Darwish <darwish.07@gmail.com> wrote:
>
> Hi,
>
> On Wed, Sep 18, 2019 at 04:57:58PM -0700, Linus Torvalds wrote:
> > On Wed, Sep 18, 2019 at 2:17 PM Ahmed S. Darwish <darwish.07@gmail.com> wrote:
> > >
> > > Since Linux v3.17, getrandom(2) has been created as a new and more
> > > secure interface for pseudorandom data requests.  It attempted to
> > > solve three problems, as compared to /dev/urandom:
>   >
> > I don't think your patch is really _wrong_, but I think it's silly to
> > introduce a new system call, when we have 30 bits left in the flags of
> > the old one, and the old system call checked them.
> >
> > So it's much simpler and more straightforward to  just introduce a
> > single new bit #2 that says "I actually know what I'm doing, and I'm
> > explicitly asking for secure/insecure random data".
> >
> > And then say that the existing bit #1 just means "I want to wait for entropy".
> >
> > So then you end up with this:
> >
> >     /*
> >      * Flags for getrandom(2)
> >      *
> >      * GRND_NONBLOCK    Don't block and return EAGAIN instead
> >      * GRND_WAIT_ENTROPY        Explicitly wait for entropy
> >      * GRND_EXPLICIT    Make it clear you know what you are doing
> >      */
> >     #define GRND_NONBLOCK               0x0001
> >     #define GRND_WAIT_ENTROPY   0x0002
> >     #define GRND_EXPLICIT               0x0004

What is this GRND_EXPLICIT thing?

A few weeks ago, I sent a whole series to address this, and I
obviously didn't cc enough people.  I'll resend a rebased version
today.  Meanwhile, some comments on this whole mess:

As I think everyone mostly agrees in this whole thread, getrandom()
can't just magically start returning non-random results.  That would
be a big problem.

Linus, I disagree that blocking while waiting for randomness is an
error.  Sometimes you want to generate a key, you want to finish as
quickly as possible, and you don't want to be in the business of
fiddling with the setup of the kernel RNG.  I would argue that *most*
crypto applications are in this category.  I think that the kernel
should, instead, handle this mess itself.  As a first pass, it could
be as simple as noticing that someone is blocking on randomness and
kicking off a thread that does some randomish reads to the rootfs.
This would roughly simulate the old behavior in which an ext4 rootfs
did more IO than necessary.  A fancier version would, as discussed in
this thread, do more clever things.

(As an aside, I am not a fan of xoring or adding stuff to the CRNG
state.  We should just use an actual crypto primitive for this.
Accumulate the state in a buffer and SHA-512 it.  Or use something
like the Keccak duplex sponge.  But this is a discussion for another
day.)

So I'm going to resend my series.  You can all fight over whether the
patch that actually goes in should be based on my series or based on
this patch.

--Andy

^ permalink raw reply

* Re: [PATCH -next] treewide: remove unused argument in lock_release()
From: Will Deacon @ 2019-09-20 13:49 UTC (permalink / raw)
  To: Qian Cai
  Cc: torvalds, ast, akpm, mingo, peterz, linux-kernel, linux-api,
	maarten.lankhorst, mripard, sean, airlied, daniel, dri-devel,
	gregkh, jslaby, viro, linux-fsdevel, joonas.lahtinen,
	rodrigo.vivi, intel-gfx, tytso, jack, linux-ext4, tj, mark, jlbec,
	joseph.qi, ocfs2-devel, davem, daniel, netdev, bpf, duyuyang,
	juri.lelli
In-Reply-To: <1568983836.5576.194.camel@lca.pw>

On Fri, Sep 20, 2019 at 08:50:36AM -0400, Qian Cai wrote:
> On Fri, 2019-09-20 at 10:38 +0100, Will Deacon wrote:
> > On Thu, Sep 19, 2019 at 12:09:40PM -0400, Qian Cai wrote:
> > > Since the commit b4adfe8e05f1 ("locking/lockdep: Remove unused argument
> > > in __lock_release"), @nested is no longer used in lock_release(), so
> > > remove it from all lock_release() calls and friends.
> > > 
> > > Signed-off-by: Qian Cai <cai@lca.pw>
> > > ---
> > 
> > Although this looks fine to me at a first glance, it might be slightly
> > easier to manage if you hit {spin,rwlock,seqcount,mutex,rwsem}_release()
> > first with coccinelle scripts, and then hack lock_release() as a final
> > patch. That way it's easy to regenerate things if needed.
> 
> I am not sure if it worth the extra efforts where I have to retest it on all
> architectures, and the patch is really simple, but I can certainly do that if
> you insist.

I'm not insisting, just thought it might be easier to get it merged that
way. If you prefer to go with the big diff,

Acked-by: Will Deacon <will@kernel.org>

Will

^ permalink raw reply

* Re: [PATCH RFC v4 1/1] random: WARN on large getrandom() waits and introduce getrandom2()
From: Ahmed S. Darwish @ 2019-09-20 13:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Lennart Poettering, Theodore Y. Ts'o, Eric W. Biederman,
	Alexander E. Patrakov, Michael Kerrisk, Willy Tarreau,
	Matthew Garrett, lkml, linux-ext4, linux-api, linux-man
In-Reply-To: <CAHk-=wiCqDiU7SE3FLn2W26MS_voUAuqj5XFa1V_tiGTrrW-zQ@mail.gmail.com>

Hi,

On Wed, Sep 18, 2019 at 04:57:58PM -0700, Linus Torvalds wrote:
> On Wed, Sep 18, 2019 at 2:17 PM Ahmed S. Darwish <darwish.07@gmail.com> wrote:
> >
> > Since Linux v3.17, getrandom(2) has been created as a new and more
> > secure interface for pseudorandom data requests.  It attempted to
> > solve three problems, as compared to /dev/urandom:
  > 
> I don't think your patch is really _wrong_, but I think it's silly to
> introduce a new system call, when we have 30 bits left in the flags of
> the old one, and the old system call checked them.
> 
> So it's much simpler and more straightforward to  just introduce a
> single new bit #2 that says "I actually know what I'm doing, and I'm
> explicitly asking for secure/insecure random data".
> 
> And then say that the existing bit #1 just means "I want to wait for entropy".
> 
> So then you end up with this:
> 
>     /*
>      * Flags for getrandom(2)
>      *
>      * GRND_NONBLOCK    Don't block and return EAGAIN instead
>      * GRND_WAIT_ENTROPY        Explicitly wait for entropy
>      * GRND_EXPLICIT    Make it clear you know what you are doing
>      */
>     #define GRND_NONBLOCK               0x0001
>     #define GRND_WAIT_ENTROPY   0x0002
>     #define GRND_EXPLICIT               0x0004
> 
>     #define GRND_SECURE (GRND_EXPLICIT | GRND_WAIT_ENTROPY)
>     #define GRND_INSECURE       (GRND_EXPLICIT | GRND_NONBLOCK)
> 
>     /* Nobody wants /dev/random behavior, nobody should use it */
>     #define GRND_RANDOM 0x0002
> 
> which is actually fairly easy to understand. So now we have three
> bits, and the values are:
> 
>  000  - ambiguous "secure or just lazy/ignorant"
>  001 - -EAGAIN or secure
>  010 - blocking /dev/random DO NOT USE
>  011 - nonblocking /dev/random DO NOT USE
>  100 - nonsense, returns -EINVAL
>  101 - /dev/urandom without warnings
>  110 - blocking secure
>  111 - -EAGAIN or secure
>

Hmmm, the point of the new syscall was **exactly** to avoid the 2^3
combinations above, and to provide developers only two, sane and easy,
options:

  - GRND2_INSECURE
  - GRND2_SECURE_UNBOUNDED_INITIAL_WAIT

You *must* pick one of these, and that's it. (!)

Then the proposed getrandom_wait(7) manpage, also mentioned in the V4
patch WARN message, would provide a big rationale, and encourage
everyone to use the new getrandom2(2) syscall instead.

But yeah, maybe we should add the extra flags to the old getrandom()
instead, and let glibc implement a getrandom_safe(3) wrapper only
with the sane options available.

Problem is, glibc is still *really* slow in adopting linux syscall
wrappers, so I'm not optimistic about that...

I still see the new system call as the sanest path, even provided
the cost of a new syscall number..

@Linus, @Ted:  Final thoughts?

> and people would be encouraged to use one of these three:
> 
>  - GRND_INSECURE
>  - GRND_SECURE
>  - GRND_SECURE | GRND_NONBLOCK
> 
> all of which actually make sense, and none of which have any
> ambiguity. And while "GRND_INSECURE | GRND_NONBLOCK" works, it's
> exactly the same as just plain GRND_INSECURE - the point is that it
> doesn't block for entropy anyway, so non-blocking makes no different.
>

[...]

> 
> There is *one* other small semantic change: The old code did
> urandom_read() which added warnings, but each warning also _reset_ the
> crng_init_cnt. Until it decided not to warn any more, at which point
> it also stops that resetting of crng_init_cnt.
> 
> And that reset of crng_init_cnt, btw, is some cray cray.
> 
> It's basically a "we used up entropy" thing, which is very
> questionable to begin with as the whole discussion has shown, but
> since it stops doing it after 10 cases, it's not even good security
> assuming the "use up entropy" case makes sense in the first place.
> 
> So I didn't copy that insanity either. And I'm wondering if removing
> it from /dev/urandom might also end up helping Ahmed's case of getting
> entropy earlier, when we don't reset the counter.
>

Yeah, noticed that, but I've learned not to change crypto or
speculative-execution code even if the changes "just look the same" at
first glance ;-)

(out of curiosity, I'll do a quick test with this CRNG entropy reset
part removed. Maybe it was indeed part of the problem..)

> But other than those two details, none of the existing semantics
> changed, we just added the three actually _sane_ cases without any
> ambiguity.
> 
> In particular, this still leaves the semantics of that nasty
> "getrandom(0)" as the same "blocking urandom" that it currently is.
> But now it's a separate case, and we can make that perhaps do the
> timeout, or at least the warning.
>

Yeah, I would propose to keep the V4-submitted "timeout then WARN"
logic. This alone will give user-space / distributions time to adapt.

For example, it was interesting that even the 0day bot had limited
entropy on boot (virtio-rng / TRUST_CPU not enabled):

    https://lkml.kernel.org/r/20190920005120.GP15734@shao2-debian

If user-space didn't get its act together, then the other extreme
measures can be implemented later (the getrandom() length test, using
jitter as a credited kernel entropy source, etc., etc.)

> And the new cases are defined to *not* warn. In particular,
> GRND_INSECURE very much does *not* warn about early urandom access
> when crng isn't ready. Because the whole point of that new mode is
> that the user knows it isn't secure.
>
> So that should make getrandom(GRND_INSECURE) palatable to the systemd
> kind of use that wanted to avoid the pointless kernel warning.
>

Yup, that's what was in the submitted V4 patch too. The caller
explicitly asked for "insecure", so they know what they're doing.

getrandom2(2) never prints any kernel message.

> And we could mark this for stable and try to get it backported so that
> it will have better coverage, and encourage people to use the new sane
> _explicit_ waiting (or not) for entropy.
>

ACK. I'll wait for an answer to the "Final thoughts?" question above,
send a V5 with CC:stable, then disappear from this thread ;-)

Thanks a lot everyone!

--
Ahmed Darwish

^ permalink raw reply

* Re: [PATCH -next] treewide: remove unused argument in lock_release()
From: Qian Cai @ 2019-09-20 12:50 UTC (permalink / raw)
  To: Will Deacon, torvalds
  Cc: ast, akpm, mingo, peterz, linux-kernel, linux-api,
	maarten.lankhorst, mripard, sean, airlied, daniel, dri-devel,
	gregkh, jslaby, viro, linux-fsdevel, joonas.lahtinen,
	rodrigo.vivi, intel-gfx, tytso, jack, linux-ext4, tj, mark, jlbec,
	joseph.qi, ocfs2-devel, davem, daniel, netdev, bpf, duyuyang,
	juri.lelli, vincent.guittot
In-Reply-To: <20190920093700.7nfaghxdrmubp2do@willie-the-truck>

On Fri, 2019-09-20 at 10:38 +0100, Will Deacon wrote:
> On Thu, Sep 19, 2019 at 12:09:40PM -0400, Qian Cai wrote:
> > Since the commit b4adfe8e05f1 ("locking/lockdep: Remove unused argument
> > in __lock_release"), @nested is no longer used in lock_release(), so
> > remove it from all lock_release() calls and friends.
> > 
> > Signed-off-by: Qian Cai <cai@lca.pw>
> > ---
> 
> Although this looks fine to me at a first glance, it might be slightly
> easier to manage if you hit {spin,rwlock,seqcount,mutex,rwsem}_release()
> first with coccinelle scripts, and then hack lock_release() as a final
> patch. That way it's easy to regenerate things if needed.

I am not sure if it worth the extra efforts where I have to retest it on all
architectures, and the patch is really simple, but I can certainly do that if
you insist.

I have just confirmed the patch [1] also applied correctly to the latest
mainline, so it might be the best time just for Linus to merge it directly so it
does not introduce build errors later on?

[1]

https://lore.kernel.org/lkml/1568909380-32199-1-git-send-email-cai@lca.pw/

^ permalink raw reply

* Re: [PATCH -next] treewide: remove unused argument in lock_release()
From: Will Deacon @ 2019-09-20  9:38 UTC (permalink / raw)
  To: Qian Cai
  Cc: akpm, mingo, peterz, linux-kernel, linux-api, maarten.lankhorst,
	mripard, sean, airlied, daniel, dri-devel, gregkh, jslaby, viro,
	linux-fsdevel, joonas.lahtinen, rodrigo.vivi, intel-gfx, tytso,
	jack, linux-ext4, tj, mark, jlbec, joseph.qi, ocfs2-devel, davem,
	daniel, netdev, bpf, duyuyang, juri.lelli, vincent.guittot,
	hannes
In-Reply-To: <1568909380-32199-1-git-send-email-cai@lca.pw>

On Thu, Sep 19, 2019 at 12:09:40PM -0400, Qian Cai wrote:
> Since the commit b4adfe8e05f1 ("locking/lockdep: Remove unused argument
> in __lock_release"), @nested is no longer used in lock_release(), so
> remove it from all lock_release() calls and friends.
> 
> Signed-off-by: Qian Cai <cai@lca.pw>
> ---

Although this looks fine to me at a first glance, it might be slightly
easier to manage if you hit {spin,rwlock,seqcount,mutex,rwsem}_release()
first with coccinelle scripts, and then hack lock_release() as a final
patch. That way it's easy to regenerate things if needed.

Cheers,

Will

^ permalink raw reply

* [PATCH -next] treewide: remove unused argument in lock_release()
From: Qian Cai @ 2019-09-19 16:09 UTC (permalink / raw)
  To: akpm, mingo
  Cc: peterz, will, linux-kernel, linux-api, maarten.lankhorst, mripard,
	sean, airlied, daniel, dri-devel, gregkh, jslaby, viro,
	linux-fsdevel, joonas.lahtinen, rodrigo.vivi, intel-gfx, tytso,
	jack, linux-ext4, tj, mark, jlbec, joseph.qi, ocfs2-devel, davem,
	st, daniel, netdev, bpf, duyuyang, juri.lelli, vincent.guittot,
	hannes

Since the commit b4adfe8e05f1 ("locking/lockdep: Remove unused argument
in __lock_release"), @nested is no longer used in lock_release(), so
remove it from all lock_release() calls and friends.

Signed-off-by: Qian Cai <cai@lca.pw>
---
 drivers/gpu/drm/drm_connector.c               |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c  |  6 +++---
 drivers/gpu/drm/i915/gt/intel_engine_pm.c     |  2 +-
 drivers/gpu/drm/i915/i915_request.c           |  2 +-
 drivers/tty/tty_ldsem.c                       |  8 ++++----
 fs/dcache.c                                   |  2 +-
 fs/jbd2/transaction.c                         |  4 ++--
 fs/kernfs/dir.c                               |  4 ++--
 fs/ocfs2/dlmglue.c                            |  2 +-
 include/linux/jbd2.h                          |  2 +-
 include/linux/lockdep.h                       | 21 ++++++++++-----------
 include/linux/percpu-rwsem.h                  |  4 ++--
 include/linux/rcupdate.h                      |  2 +-
 include/linux/rwlock_api_smp.h                | 16 ++++++++--------
 include/linux/seqlock.h                       |  4 ++--
 include/linux/spinlock_api_smp.h              |  8 ++++----
 include/linux/ww_mutex.h                      |  2 +-
 include/net/sock.h                            |  2 +-
 kernel/bpf/stackmap.c                         |  2 +-
 kernel/cpu.c                                  |  2 +-
 kernel/locking/lockdep.c                      |  3 +--
 kernel/locking/mutex.c                        |  4 ++--
 kernel/locking/rtmutex.c                      |  6 +++---
 kernel/locking/rwsem.c                        | 10 +++++-----
 kernel/printk/printk.c                        | 10 +++++-----
 kernel/sched/core.c                           |  2 +-
 lib/locking-selftest.c                        | 24 ++++++++++++------------
 mm/memcontrol.c                               |  2 +-
 net/core/sock.c                               |  2 +-
 tools/lib/lockdep/include/liblockdep/common.h |  3 +--
 tools/lib/lockdep/include/liblockdep/mutex.h  |  2 +-
 tools/lib/lockdep/include/liblockdep/rwlock.h |  2 +-
 tools/lib/lockdep/preload.c                   | 16 ++++++++--------
 33 files changed, 90 insertions(+), 93 deletions(-)

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 4c766624b20d..4a8b2e5c2af6 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -719,7 +719,7 @@ void drm_connector_list_iter_end(struct drm_connector_list_iter *iter)
 		__drm_connector_put_safe(iter->conn);
 		spin_unlock_irqrestore(&config->connector_list_lock, flags);
 	}
-	lock_release(&connector_list_iter_dep_map, 0, _RET_IP_);
+	lock_release(&connector_list_iter_dep_map, _RET_IP_);
 }
 EXPORT_SYMBOL(drm_connector_list_iter_end);
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
index edd21d14e64f..1a51b3598d63 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
@@ -509,14 +509,14 @@ void i915_gem_shrinker_taints_mutex(struct drm_i915_private *i915,
 		      I915_MM_SHRINKER, 0, _RET_IP_);
 
 	mutex_acquire(&mutex->dep_map, 0, 0, _RET_IP_);
-	mutex_release(&mutex->dep_map, 0, _RET_IP_);
+	mutex_release(&mutex->dep_map, _RET_IP_);
 
-	mutex_release(&i915->drm.struct_mutex.dep_map, 0, _RET_IP_);
+	mutex_release(&i915->drm.struct_mutex.dep_map, _RET_IP_);
 
 	fs_reclaim_release(GFP_KERNEL);
 
 	if (unlock)
-		mutex_release(&i915->drm.struct_mutex.dep_map, 0, _RET_IP_);
+		mutex_release(&i915->drm.struct_mutex.dep_map, _RET_IP_);
 }
 
 #define obj_to_i915(obj__) to_i915((obj__)->base.dev)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 65b5ca74b394..7f647243b3b9 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -52,7 +52,7 @@ static inline unsigned long __timeline_mark_lock(struct intel_context *ce)
 static inline void __timeline_mark_unlock(struct intel_context *ce,
 					  unsigned long flags)
 {
-	mutex_release(&ce->timeline->mutex.dep_map, 0, _THIS_IP_);
+	mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_);
 	local_irq_restore(flags);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index a53777dd371c..e1f1be4d0531 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1456,7 +1456,7 @@ long i915_request_wait(struct i915_request *rq,
 	dma_fence_remove_callback(&rq->fence, &wait.cb);
 
 out:
-	mutex_release(&rq->engine->gt->reset.mutex.dep_map, 0, _THIS_IP_);
+	mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_);
 	trace_i915_request_wait_end(rq);
 	return timeout;
 }
diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c
index 60ff236a3d63..ce8291053af3 100644
--- a/drivers/tty/tty_ldsem.c
+++ b/drivers/tty/tty_ldsem.c
@@ -303,7 +303,7 @@ static int __ldsem_down_read_nested(struct ld_semaphore *sem,
 	if (count <= 0) {
 		lock_contended(&sem->dep_map, _RET_IP_);
 		if (!down_read_failed(sem, count, timeout)) {
-			rwsem_release(&sem->dep_map, 1, _RET_IP_);
+			rwsem_release(&sem->dep_map, _RET_IP_);
 			return 0;
 		}
 	}
@@ -322,7 +322,7 @@ static int __ldsem_down_write_nested(struct ld_semaphore *sem,
 	if ((count & LDSEM_ACTIVE_MASK) != LDSEM_ACTIVE_BIAS) {
 		lock_contended(&sem->dep_map, _RET_IP_);
 		if (!down_write_failed(sem, count, timeout)) {
-			rwsem_release(&sem->dep_map, 1, _RET_IP_);
+			rwsem_release(&sem->dep_map, _RET_IP_);
 			return 0;
 		}
 	}
@@ -390,7 +390,7 @@ void ldsem_up_read(struct ld_semaphore *sem)
 {
 	long count;
 
-	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+	rwsem_release(&sem->dep_map, _RET_IP_);
 
 	count = atomic_long_add_return(-LDSEM_READ_BIAS, &sem->count);
 	if (count < 0 && (count & LDSEM_ACTIVE_MASK) == 0)
@@ -404,7 +404,7 @@ void ldsem_up_write(struct ld_semaphore *sem)
 {
 	long count;
 
-	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+	rwsem_release(&sem->dep_map, _RET_IP_);
 
 	count = atomic_long_add_return(-LDSEM_WRITE_BIAS, &sem->count);
 	if (count < 0)
diff --git a/fs/dcache.c b/fs/dcache.c
index e88cf0554e65..f7931b682a0d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1319,7 +1319,7 @@ static void d_walk(struct dentry *parent, void *data,
 
 		if (!list_empty(&dentry->d_subdirs)) {
 			spin_unlock(&this_parent->d_lock);
-			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
+			spin_release(&dentry->d_lock.dep_map, _RET_IP_);
 			this_parent = dentry;
 			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
 			goto repeat;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bee8498d7792..b25ebdcabfa3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -713,7 +713,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
 	if (need_to_start)
 		jbd2_log_start_commit(journal, tid);
 
-	rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+	rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
 	handle->h_buffer_credits = nblocks;
 	/*
 	 * Restore the original nofs context because the journal restart
@@ -1848,7 +1848,7 @@ int jbd2_journal_stop(handle_t *handle)
 			wake_up(&journal->j_wait_transaction_locked);
 	}
 
-	rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+	rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
 
 	if (wait_for_commit)
 		err = jbd2_log_wait_commit(journal, tid);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 6ebae6bbe6a5..c45b82feac9a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -438,7 +438,7 @@ void kernfs_put_active(struct kernfs_node *kn)
 		return;
 
 	if (kernfs_lockdep(kn))
-		rwsem_release(&kn->dep_map, 1, _RET_IP_);
+		rwsem_release(&kn->dep_map, _RET_IP_);
 	v = atomic_dec_return(&kn->active);
 	if (likely(v != KN_DEACTIVATED_BIAS))
 		return;
@@ -476,7 +476,7 @@ static void kernfs_drain(struct kernfs_node *kn)
 
 	if (kernfs_lockdep(kn)) {
 		lock_acquired(&kn->dep_map, _RET_IP_);
-		rwsem_release(&kn->dep_map, 1, _RET_IP_);
+		rwsem_release(&kn->dep_map, _RET_IP_);
 	}
 
 	kernfs_drain_open_files(kn);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index ad594fef2ab0..71975b9b142c 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1687,7 +1687,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	if (lockres->l_lockdep_map.key != NULL)
-		rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+		rwsem_release(&lockres->l_lockdep_map, caller_ip);
 #endif
 }
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 603fbc4e2f70..564793c24d12 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1170,7 +1170,7 @@ struct journal_s
 #define jbd2_might_wait_for_commit(j) \
 	do { \
 		rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \
-		rwsem_release(&j->j_trans_commit_map, 1, _THIS_IP_); \
+		rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \
 	} while (0)
 
 /* journal feature predicate functions */
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b8a835fd611b..c50d01ef1414 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -349,8 +349,7 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			 int trylock, int read, int check,
 			 struct lockdep_map *nest_lock, unsigned long ip);
 
-extern void lock_release(struct lockdep_map *lock, int nested,
-			 unsigned long ip);
+extern void lock_release(struct lockdep_map *lock, unsigned long ip);
 
 /*
  * Same "read" as for lock_acquire(), except -1 means any.
@@ -428,7 +427,7 @@ static inline void lockdep_set_selftest_task(struct task_struct *task)
 }
 
 # define lock_acquire(l, s, t, r, c, n, i)	do { } while (0)
-# define lock_release(l, n, i)			do { } while (0)
+# define lock_release(l, i)			do { } while (0)
 # define lock_downgrade(l, i)			do { } while (0)
 # define lock_set_class(l, n, k, s, i)		do { } while (0)
 # define lock_set_subclass(l, s, i)		do { } while (0)
@@ -591,42 +590,42 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 
 #define spin_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
 #define spin_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
-#define spin_release(l, n, i)			lock_release(l, n, i)
+#define spin_release(l, i)			lock_release(l, i)
 
 #define rwlock_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
 #define rwlock_acquire_read(l, s, t, i)		lock_acquire_shared_recursive(l, s, t, NULL, i)
-#define rwlock_release(l, n, i)			lock_release(l, n, i)
+#define rwlock_release(l, i)			lock_release(l, i)
 
 #define seqcount_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
 #define seqcount_acquire_read(l, s, t, i)	lock_acquire_shared_recursive(l, s, t, NULL, i)
-#define seqcount_release(l, n, i)		lock_release(l, n, i)
+#define seqcount_release(l, i)			lock_release(l, i)
 
 #define mutex_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
 #define mutex_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
-#define mutex_release(l, n, i)			lock_release(l, n, i)
+#define mutex_release(l, i)			lock_release(l, i)
 
 #define rwsem_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
 #define rwsem_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
 #define rwsem_acquire_read(l, s, t, i)		lock_acquire_shared(l, s, t, NULL, i)
-#define rwsem_release(l, n, i)			lock_release(l, n, i)
+#define rwsem_release(l, i)			lock_release(l, i)
 
 #define lock_map_acquire(l)			lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
 #define lock_map_acquire_read(l)		lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
 #define lock_map_acquire_tryread(l)		lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
-#define lock_map_release(l)			lock_release(l, 1, _THIS_IP_)
+#define lock_map_release(l)			lock_release(l, _THIS_IP_)
 
 #ifdef CONFIG_PROVE_LOCKING
 # define might_lock(lock) 						\
 do {									\
 	typecheck(struct lockdep_map *, &(lock)->dep_map);		\
 	lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_);	\
-	lock_release(&(lock)->dep_map, 0, _THIS_IP_);			\
+	lock_release(&(lock)->dep_map, _THIS_IP_);			\
 } while (0)
 # define might_lock_read(lock) 						\
 do {									\
 	typecheck(struct lockdep_map *, &(lock)->dep_map);		\
 	lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_);	\
-	lock_release(&(lock)->dep_map, 0, _THIS_IP_);			\
+	lock_release(&(lock)->dep_map, _THIS_IP_);			\
 } while (0)
 
 #define lockdep_assert_irqs_enabled()	do {				\
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 3998cdf9cd14..ad2ca2a89d5b 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -93,7 +93,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 		__percpu_up_read(sem); /* Unconditional memory barrier */
 	preempt_enable();
 
-	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+	rwsem_release(&sem->rw_sem.dep_map, _RET_IP_);
 }
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -118,7 +118,7 @@ extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_release(&sem->rw_sem.dep_map, 1, ip);
+	lock_release(&sem->rw_sem.dep_map, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
 		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 75a2eded7aa2..269b31eab3d6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -210,7 +210,7 @@ static inline void rcu_lock_acquire(struct lockdep_map *map)
 
 static inline void rcu_lock_release(struct lockdep_map *map)
 {
-	lock_release(map, 1, _THIS_IP_);
+	lock_release(map, _THIS_IP_);
 }
 
 extern struct lockdep_map rcu_lock_map;
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index 86ebb4bf9c6e..abfb53ab11be 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -215,14 +215,14 @@ static inline void __raw_write_lock(rwlock_t *lock)
 
 static inline void __raw_write_unlock(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
+	rwlock_release(&lock->dep_map, _RET_IP_);
 	do_raw_write_unlock(lock);
 	preempt_enable();
 }
 
 static inline void __raw_read_unlock(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
+	rwlock_release(&lock->dep_map, _RET_IP_);
 	do_raw_read_unlock(lock);
 	preempt_enable();
 }
@@ -230,7 +230,7 @@ static inline void __raw_read_unlock(rwlock_t *lock)
 static inline void
 __raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
+	rwlock_release(&lock->dep_map, _RET_IP_);
 	do_raw_read_unlock(lock);
 	local_irq_restore(flags);
 	preempt_enable();
@@ -238,7 +238,7 @@ static inline void __raw_read_unlock(rwlock_t *lock)
 
 static inline void __raw_read_unlock_irq(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
+	rwlock_release(&lock->dep_map, _RET_IP_);
 	do_raw_read_unlock(lock);
 	local_irq_enable();
 	preempt_enable();
@@ -246,7 +246,7 @@ static inline void __raw_read_unlock_irq(rwlock_t *lock)
 
 static inline void __raw_read_unlock_bh(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
+	rwlock_release(&lock->dep_map, _RET_IP_);
 	do_raw_read_unlock(lock);
 	__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
 }
@@ -254,7 +254,7 @@ static inline void __raw_read_unlock_bh(rwlock_t *lock)
 static inline void __raw_write_unlock_irqrestore(rwlock_t *lock,
 					     unsigned long flags)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
+	rwlock_release(&lock->dep_map, _RET_IP_);
 	do_raw_write_unlock(lock);
 	local_irq_restore(flags);
 	preempt_enable();
@@ -262,7 +262,7 @@ static inline void __raw_write_unlock_irqrestore(rwlock_t *lock,
 
 static inline void __raw_write_unlock_irq(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
+	rwlock_release(&lock->dep_map, _RET_IP_);
 	do_raw_write_unlock(lock);
 	local_irq_enable();
 	preempt_enable();
@@ -270,7 +270,7 @@ static inline void __raw_write_unlock_irq(rwlock_t *lock)
 
 static inline void __raw_write_unlock_bh(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
+	rwlock_release(&lock->dep_map, _RET_IP_);
 	do_raw_write_unlock(lock);
 	__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
 }
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index bcf4cf26b8c8..0491d963d47e 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -79,7 +79,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
 
 	local_irq_save(flags);
 	seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_);
-	seqcount_release(&l->dep_map, 1, _RET_IP_);
+	seqcount_release(&l->dep_map, _RET_IP_);
 	local_irq_restore(flags);
 }
 
@@ -384,7 +384,7 @@ static inline void write_seqcount_begin(seqcount_t *s)
 
 static inline void write_seqcount_end(seqcount_t *s)
 {
-	seqcount_release(&s->dep_map, 1, _RET_IP_);
+	seqcount_release(&s->dep_map, _RET_IP_);
 	raw_write_seqcount_end(s);
 }
 
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index b762eaba4cdf..19a9be9d97ee 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -147,7 +147,7 @@ static inline void __raw_spin_lock(raw_spinlock_t *lock)
 
 static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-	spin_release(&lock->dep_map, 1, _RET_IP_);
+	spin_release(&lock->dep_map, _RET_IP_);
 	do_raw_spin_unlock(lock);
 	preempt_enable();
 }
@@ -155,7 +155,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock,
 					    unsigned long flags)
 {
-	spin_release(&lock->dep_map, 1, _RET_IP_);
+	spin_release(&lock->dep_map, _RET_IP_);
 	do_raw_spin_unlock(lock);
 	local_irq_restore(flags);
 	preempt_enable();
@@ -163,7 +163,7 @@ static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock,
 
 static inline void __raw_spin_unlock_irq(raw_spinlock_t *lock)
 {
-	spin_release(&lock->dep_map, 1, _RET_IP_);
+	spin_release(&lock->dep_map, _RET_IP_);
 	do_raw_spin_unlock(lock);
 	local_irq_enable();
 	preempt_enable();
@@ -171,7 +171,7 @@ static inline void __raw_spin_unlock_irq(raw_spinlock_t *lock)
 
 static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock)
 {
-	spin_release(&lock->dep_map, 1, _RET_IP_);
+	spin_release(&lock->dep_map, _RET_IP_);
 	do_raw_spin_unlock(lock);
 	__local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
 }
diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 3af7c0e03be5..d7554252404c 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -182,7 +182,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
 static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 {
 #ifdef CONFIG_DEBUG_MUTEXES
-	mutex_release(&ctx->dep_map, 0, _THIS_IP_);
+	mutex_release(&ctx->dep_map, _THIS_IP_);
 
 	DEBUG_LOCKS_WARN_ON(ctx->acquired);
 	if (!IS_ENABLED(CONFIG_PROVE_LOCKING))
diff --git a/include/net/sock.h b/include/net/sock.h
index 2c53f1a1d905..e46db0c846d2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1484,7 +1484,7 @@ static inline void sock_release_ownership(struct sock *sk)
 		sk->sk_lock.owned = 0;
 
 		/* The sk_lock has mutex_unlock() semantics: */
-		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 	}
 }
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..dcfe2d37ad15 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -338,7 +338,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 		 * up_read_non_owner(). The rwsem_release() is called
 		 * here to release the lock from lockdep's perspective.
 		 */
-		rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);
+		rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_);
 	}
 }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e1967e9eddc2..97ed88e0cf72 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -336,7 +336,7 @@ static void lockdep_acquire_cpus_lock(void)
 
 static void lockdep_release_cpus_lock(void)
 {
-	rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
+	rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_);
 }
 
 /*
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 233459c03b5a..8123518f9045 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4491,8 +4491,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 }
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-void lock_release(struct lockdep_map *lock, int nested,
-			  unsigned long ip)
+void lock_release(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 468a9b8422e3..5352ce50a97e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1091,7 +1091,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
 err_early_kill:
 	spin_unlock(&lock->wait_lock);
 	debug_mutex_free_waiter(&waiter);
-	mutex_release(&lock->dep_map, 1, ip);
+	mutex_release(&lock->dep_map, ip);
 	preempt_enable();
 	return ret;
 }
@@ -1225,7 +1225,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 	DEFINE_WAKE_Q(wake_q);
 	unsigned long owner;
 
-	mutex_release(&lock->dep_map, 1, ip);
+	mutex_release(&lock->dep_map, ip);
 
 	/*
 	 * Release the lock before (potentially) taking the spinlock such that
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2874bf556162..851bbb10819d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1517,7 +1517,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
 	mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
 	if (ret)
-		mutex_release(&lock->dep_map, 1, _RET_IP_);
+		mutex_release(&lock->dep_map, _RET_IP_);
 
 	return ret;
 }
@@ -1561,7 +1561,7 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
 				       RT_MUTEX_MIN_CHAINWALK,
 				       rt_mutex_slowlock);
 	if (ret)
-		mutex_release(&lock->dep_map, 1, _RET_IP_);
+		mutex_release(&lock->dep_map, _RET_IP_);
 
 	return ret;
 }
@@ -1600,7 +1600,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
  */
 void __sched rt_mutex_unlock(struct rt_mutex *lock)
 {
-	mutex_release(&lock->dep_map, 1, _RET_IP_);
+	mutex_release(&lock->dep_map, _RET_IP_);
 	rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index eef04551eae7..44e68761f432 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1504,7 +1504,7 @@ int __sched down_read_killable(struct rw_semaphore *sem)
 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 
 	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
-		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		rwsem_release(&sem->dep_map, _RET_IP_);
 		return -EINTR;
 	}
 
@@ -1546,7 +1546,7 @@ int __sched down_write_killable(struct rw_semaphore *sem)
 
 	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
 				  __down_write_killable)) {
-		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		rwsem_release(&sem->dep_map, _RET_IP_);
 		return -EINTR;
 	}
 
@@ -1573,7 +1573,7 @@ int down_write_trylock(struct rw_semaphore *sem)
  */
 void up_read(struct rw_semaphore *sem)
 {
-	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+	rwsem_release(&sem->dep_map, _RET_IP_);
 	__up_read(sem);
 }
 EXPORT_SYMBOL(up_read);
@@ -1583,7 +1583,7 @@ void up_read(struct rw_semaphore *sem)
  */
 void up_write(struct rw_semaphore *sem)
 {
-	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+	rwsem_release(&sem->dep_map, _RET_IP_);
 	__up_write(sem);
 }
 EXPORT_SYMBOL(up_write);
@@ -1639,7 +1639,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
 
 	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
 				  __down_write_killable)) {
-		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		rwsem_release(&sem->dep_map, _RET_IP_);
 		return -EINTR;
 	}
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ca65327a6de8..c8be5a0f5259 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -248,7 +248,7 @@ static void __up_console_sem(unsigned long ip)
 {
 	unsigned long flags;
 
-	mutex_release(&console_lock_dep_map, 1, ip);
+	mutex_release(&console_lock_dep_map, ip);
 
 	printk_safe_enter_irqsave(flags);
 	up(&console_sem);
@@ -1679,20 +1679,20 @@ static int console_lock_spinning_disable_and_check(void)
 	raw_spin_unlock(&console_owner_lock);
 
 	if (!waiter) {
-		spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+		spin_release(&console_owner_dep_map, _THIS_IP_);
 		return 0;
 	}
 
 	/* The waiter is now free to continue */
 	WRITE_ONCE(console_waiter, false);
 
-	spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+	spin_release(&console_owner_dep_map, _THIS_IP_);
 
 	/*
 	 * Hand off console_lock to waiter. The waiter will perform
 	 * the up(). After this, the waiter is the console_lock owner.
 	 */
-	mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
+	mutex_release(&console_lock_dep_map, _THIS_IP_);
 	return 1;
 }
 
@@ -1746,7 +1746,7 @@ static int console_trylock_spinning(void)
 	/* Owner will clear console_waiter on hand off */
 	while (READ_ONCE(console_waiter))
 		cpu_relax();
-	spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+	spin_release(&console_owner_dep_map, _THIS_IP_);
 
 	printk_safe_exit_irqrestore(flags);
 	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9a1346a5fa9..f845693e8e75 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3105,7 +3105,7 @@ static inline void finish_task(struct task_struct *prev)
 	 * do an early lockdep release here:
 	 */
 	rq_unpin_lock(rq, rf);
-	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+	spin_release(&rq->lock.dep_map, _THIS_IP_);
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
 	rq->lock.owner = next;
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index a1705545e6ac..14f44f59e733 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -1475,7 +1475,7 @@ static void ww_test_edeadlk_normal(void)
 
 	mutex_lock(&o2.base);
 	o2.ctx = &t2;
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 
 	WWAI(&t);
 	t2 = t;
@@ -1500,7 +1500,7 @@ static void ww_test_edeadlk_normal_slow(void)
 	int ret;
 
 	mutex_lock(&o2.base);
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 	o2.ctx = &t2;
 
 	WWAI(&t);
@@ -1527,7 +1527,7 @@ static void ww_test_edeadlk_no_unlock(void)
 
 	mutex_lock(&o2.base);
 	o2.ctx = &t2;
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 
 	WWAI(&t);
 	t2 = t;
@@ -1551,7 +1551,7 @@ static void ww_test_edeadlk_no_unlock_slow(void)
 	int ret;
 
 	mutex_lock(&o2.base);
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 	o2.ctx = &t2;
 
 	WWAI(&t);
@@ -1576,7 +1576,7 @@ static void ww_test_edeadlk_acquire_more(void)
 	int ret;
 
 	mutex_lock(&o2.base);
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 	o2.ctx = &t2;
 
 	WWAI(&t);
@@ -1597,7 +1597,7 @@ static void ww_test_edeadlk_acquire_more_slow(void)
 	int ret;
 
 	mutex_lock(&o2.base);
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 	o2.ctx = &t2;
 
 	WWAI(&t);
@@ -1618,11 +1618,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk(void)
 	int ret;
 
 	mutex_lock(&o2.base);
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 	o2.ctx = &t2;
 
 	mutex_lock(&o3.base);
-	mutex_release(&o3.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o3.base.dep_map, _THIS_IP_);
 	o3.ctx = &t2;
 
 	WWAI(&t);
@@ -1644,11 +1644,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk_slow(void)
 	int ret;
 
 	mutex_lock(&o2.base);
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 	o2.ctx = &t2;
 
 	mutex_lock(&o3.base);
-	mutex_release(&o3.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o3.base.dep_map, _THIS_IP_);
 	o3.ctx = &t2;
 
 	WWAI(&t);
@@ -1669,7 +1669,7 @@ static void ww_test_edeadlk_acquire_wrong(void)
 	int ret;
 
 	mutex_lock(&o2.base);
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 	o2.ctx = &t2;
 
 	WWAI(&t);
@@ -1694,7 +1694,7 @@ static void ww_test_edeadlk_acquire_wrong_slow(void)
 	int ret;
 
 	mutex_lock(&o2.base);
-	mutex_release(&o2.base.dep_map, 1, _THIS_IP_);
+	mutex_release(&o2.base.dep_map, _THIS_IP_);
 	o2.ctx = &t2;
 
 	WWAI(&t);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c4c08b45e44..3956ab6dba14 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1800,7 +1800,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 	struct mem_cgroup *iter;
 
 	spin_lock(&memcg_oom_lock);
-	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
+	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
 	for_each_mem_cgroup_tree(iter, memcg)
 		iter->oom_lock = false;
 	spin_unlock(&memcg_oom_lock);
diff --git a/net/core/sock.c b/net/core/sock.c
index 07863edbe6fc..a988e70cdac5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -521,7 +521,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 
 		rc = sk_backlog_rcv(sk, skb);
 
-		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 		bh_unlock_sock(sk);
 		atomic_inc(&sk->sk_drops);
diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h
index a81d91d4fc78..a6d7ee5f18ba 100644
--- a/tools/lib/lockdep/include/liblockdep/common.h
+++ b/tools/lib/lockdep/include/liblockdep/common.h
@@ -42,8 +42,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			int trylock, int read, int check,
 			struct lockdep_map *nest_lock, unsigned long ip);
-void lock_release(struct lockdep_map *lock, int nested,
-			unsigned long ip);
+void lock_release(struct lockdep_map *lock, unsigned long ip);
 void lockdep_reset_lock(struct lockdep_map *lock);
 void lockdep_register_key(struct lock_class_key *key);
 void lockdep_unregister_key(struct lock_class_key *key);
diff --git a/tools/lib/lockdep/include/liblockdep/mutex.h b/tools/lib/lockdep/include/liblockdep/mutex.h
index 783dd0df06f9..bd106b82759b 100644
--- a/tools/lib/lockdep/include/liblockdep/mutex.h
+++ b/tools/lib/lockdep/include/liblockdep/mutex.h
@@ -42,7 +42,7 @@ static inline int liblockdep_pthread_mutex_lock(liblockdep_pthread_mutex_t *lock
 
 static inline int liblockdep_pthread_mutex_unlock(liblockdep_pthread_mutex_t *lock)
 {
-	lock_release(&lock->dep_map, 0, (unsigned long)_RET_IP_);
+	lock_release(&lock->dep_map, (unsigned long)_RET_IP_);
 	return pthread_mutex_unlock(&lock->mutex);
 }
 
diff --git a/tools/lib/lockdep/include/liblockdep/rwlock.h b/tools/lib/lockdep/include/liblockdep/rwlock.h
index 365762e3a1ea..6d5d2932bf4d 100644
--- a/tools/lib/lockdep/include/liblockdep/rwlock.h
+++ b/tools/lib/lockdep/include/liblockdep/rwlock.h
@@ -44,7 +44,7 @@ static inline int liblockdep_pthread_rwlock_rdlock(liblockdep_pthread_rwlock_t *
 
 static inline int liblockdep_pthread_rwlock_unlock(liblockdep_pthread_rwlock_t *lock)
 {
-	lock_release(&lock->dep_map, 0, (unsigned long)_RET_IP_);
+	lock_release(&lock->dep_map, (unsigned long)_RET_IP_);
 	return pthread_rwlock_unlock(&lock->rwlock);
 }
 
diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c
index 76245d16196d..8f1adbe887b2 100644
--- a/tools/lib/lockdep/preload.c
+++ b/tools/lib/lockdep/preload.c
@@ -270,7 +270,7 @@ int pthread_mutex_lock(pthread_mutex_t *mutex)
 	 */
 	r = ll_pthread_mutex_lock(mutex);
 	if (r)
-		lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_);
+		lock_release(&__get_lock(mutex)->dep_map, (unsigned long)_RET_IP_);
 
 	return r;
 }
@@ -284,7 +284,7 @@ int pthread_mutex_trylock(pthread_mutex_t *mutex)
 	lock_acquire(&__get_lock(mutex)->dep_map, 0, 1, 0, 1, NULL, (unsigned long)_RET_IP_);
 	r = ll_pthread_mutex_trylock(mutex);
 	if (r)
-		lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_);
+		lock_release(&__get_lock(mutex)->dep_map, (unsigned long)_RET_IP_);
 
 	return r;
 }
@@ -295,7 +295,7 @@ int pthread_mutex_unlock(pthread_mutex_t *mutex)
 
 	try_init_preload();
 
-	lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_);
+	lock_release(&__get_lock(mutex)->dep_map, (unsigned long)_RET_IP_);
 	/*
 	 * Just like taking a lock, only in reverse!
 	 *
@@ -355,7 +355,7 @@ int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
 	lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 2, 1, NULL, (unsigned long)_RET_IP_);
 	r = ll_pthread_rwlock_rdlock(rwlock);
 	if (r)
-		lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+		lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_);
 
 	return r;
 }
@@ -369,7 +369,7 @@ int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock)
 	lock_acquire(&__get_lock(rwlock)->dep_map, 0, 1, 2, 1, NULL, (unsigned long)_RET_IP_);
 	r = ll_pthread_rwlock_tryrdlock(rwlock);
 	if (r)
-		lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+		lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_);
 
 	return r;
 }
@@ -383,7 +383,7 @@ int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock)
 	lock_acquire(&__get_lock(rwlock)->dep_map, 0, 1, 0, 1, NULL, (unsigned long)_RET_IP_);
 	r = ll_pthread_rwlock_trywrlock(rwlock);
 	if (r)
-                lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+		lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_);
 
 	return r;
 }
@@ -397,7 +397,7 @@ int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock)
 	lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_);
 	r = ll_pthread_rwlock_wrlock(rwlock);
 	if (r)
-		lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+		lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_);
 
 	return r;
 }
@@ -408,7 +408,7 @@ int pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
 
         init_preload();
 
-	lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+	lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_);
 	r = ll_pthread_rwlock_unlock(rwlock);
 	if (r)
 		lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_);
-- 
1.8.3.1

^ permalink raw reply related

* Re: [RFC PATCH 2/3] fs: add RWF_ENCODED for writing compressed data
From: Jann Horn @ 2019-09-19 15:44 UTC (permalink / raw)
  To: Omar Sandoval, Jens Axboe
  Cc: linux-fsdevel, linux-btrfs, Dave Chinner, Linux API, Kernel Team,
	Andy Lutomirski
In-Reply-To: <230a76e65372a8fb3ec62ce167d9322e5e342810.1568875700.git.osandov@fb.com>

On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> Btrfs can transparently compress data written by the user. However, we'd
> like to add an interface to write pre-compressed data directly to the
> filesystem. This adds support for so-called "encoded writes" via
> pwritev2().
>
> A new RWF_ENCODED flags indicates that a write is "encoded". If this
> flag is set, iov[0].iov_base points to a struct encoded_iov which
> contains metadata about the write: namely, the compression algorithm and
> the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> must be set to sizeof(struct encoded_iov), which can be used to extend
> the interface in the future. The remaining iovecs contain the encoded
> extent.
>
> A similar interface for reading encoded data can be added to preadv2()
> in the future.
>
> Filesystems must indicate that they support encoded writes by setting
> FMODE_ENCODED_IO in ->file_open().
[...]
> +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> +                        struct iov_iter *from)
> +{
> +       if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> +               return -EINVAL;
> +       if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> +               return -EFAULT;
> +       if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> +           encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> +               iocb->ki_flags &= ~IOCB_ENCODED;
> +               return 0;
> +       }
> +       if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> +           encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> +               return -EINVAL;
> +       if (!capable(CAP_SYS_ADMIN))
> +               return -EPERM;

How does this capable() check interact with io_uring? Without having
looked at this in detail, I suspect that when an encoded write is
requested through io_uring, the capable() check might be executed on
something like a workqueue worker thread, which is probably running
with a full capability set.

^ permalink raw reply

* [RFC PATCH 3/3] btrfs: implement encoded (compressed) writes
From: Omar Sandoval @ 2019-09-19  6:53 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs; +Cc: Dave Chinner, linux-api, kernel-team
In-Reply-To: <cover.1568875700.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

This adds support to Btrfs for the RWF_ENCODED flag to pwritev2(). The
implementation is similar to direct I/O: we have to flush any ordered
extents, invalidate the page cache, and do the io tree/delalloc/extent
map/ordered extent dance. From there, we can reuse the compression code
with a minor modification to distinguish the write from writeback.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/btrfs/compression.c |   6 +-
 fs/btrfs/compression.h |   5 +-
 fs/btrfs/ctree.h       |   4 +
 fs/btrfs/file.c        |  40 +++++++--
 fs/btrfs/inode.c       | 190 ++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 232 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b05b361e2062..6632dd8d2e4d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -276,7 +276,8 @@ static void end_compressed_bio_write(struct bio *bio)
 			bio->bi_status == BLK_STS_OK);
 	cb->compressed_pages[0]->mapping = NULL;
 
-	end_compressed_writeback(inode, cb);
+	if (cb->writeback)
+		end_compressed_writeback(inode, cb);
 	/* note, our inode could be gone now */
 
 	/*
@@ -311,7 +312,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				 unsigned long compressed_len,
 				 struct page **compressed_pages,
 				 unsigned long nr_pages,
-				 unsigned int write_flags)
+				 unsigned int write_flags, bool writeback)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct bio *bio = NULL;
@@ -336,6 +337,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	cb->mirror_num = 0;
 	cb->compressed_pages = compressed_pages;
 	cb->compressed_len = compressed_len;
+	cb->writeback = writeback;
 	cb->orig_bio = NULL;
 	cb->nr_pages = nr_pages;
 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 4cb8be9ff88b..d4176384ec15 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -47,6 +47,9 @@ struct compressed_bio {
 	/* the compression algorithm for this bio */
 	int compress_type;
 
+	/* Whether this is a write for writeback. */
+	bool writeback;
+
 	/* number of compressed pages in the array */
 	unsigned long nr_pages;
 
@@ -93,7 +96,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long compressed_len,
 				  struct page **compressed_pages,
 				  unsigned long nr_pages,
-				  unsigned int write_flags);
+				  unsigned int write_flags, bool writeback);
 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags);
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 19d669d12ca1..76962d319316 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2905,6 +2905,10 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
 int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
 void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
 					  u64 end, int uptodate);
+
+ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+			    struct encoded_iov *encoded);
+
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8fe4eb7e5045..068b7f2cc243 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1872,8 +1872,7 @@ static void update_time_for_write(struct inode *inode)
 		inode_inc_iversion(inode);
 }
 
-static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
-				    struct iov_iter *from)
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
@@ -1883,14 +1882,22 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	u64 end_pos;
 	ssize_t num_written = 0;
 	const bool sync = iocb->ki_flags & IOCB_DSYNC;
+	struct encoded_iov encoded;
 	ssize_t err;
 	loff_t pos;
 	size_t count;
 	loff_t oldsize;
 	int clean_page = 0;
 
-	if (!(iocb->ki_flags & IOCB_DIRECT) &&
-	    (iocb->ki_flags & IOCB_NOWAIT))
+	if (iocb->ki_flags & IOCB_ENCODED) {
+		err = import_encoded_write(iocb, &encoded, from);
+		if (err)
+			return err;
+	}
+
+	if ((iocb->ki_flags & IOCB_NOWAIT) &&
+	    (!(iocb->ki_flags & IOCB_DIRECT) ||
+	     (iocb->ki_flags & IOCB_ENCODED)))
 		return -EOPNOTSUPP;
 
 	if (!inode_trylock(inode)) {
@@ -1899,14 +1906,27 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 		inode_lock(inode);
 	}
 
-	err = generic_write_checks(iocb, from);
-	if (err <= 0) {
+	if (iocb->ki_flags & IOCB_ENCODED) {
+		err = generic_encoded_write_checks(iocb, &encoded);
+		if (err) {
+			inode_unlock(inode);
+			return err;
+		}
+		count = encoded.unencoded_len;
+	} else {
+		err = generic_write_checks(iocb, from);
+		if (err < 0) {
+			inode_unlock(inode);
+			return err;
+		}
+		count = iov_iter_count(from);
+	}
+	if (count == 0) {
 		inode_unlock(inode);
 		return err;
 	}
 
 	pos = iocb->ki_pos;
-	count = iov_iter_count(from);
 	if (iocb->ki_flags & IOCB_NOWAIT) {
 		/*
 		 * We will allocate space in case nodatacow is not set,
@@ -1965,7 +1985,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	if (sync)
 		atomic_inc(&BTRFS_I(inode)->sync_writers);
 
-	if (iocb->ki_flags & IOCB_DIRECT) {
+	if (iocb->ki_flags & IOCB_ENCODED) {
+		num_written = btrfs_encoded_write(iocb, from, &encoded);
+	} else if (iocb->ki_flags & IOCB_DIRECT) {
 		num_written = __btrfs_direct_write(iocb, from);
 	} else {
 		num_written = btrfs_buffered_write(iocb, from);
@@ -3440,7 +3462,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 
 static int btrfs_file_open(struct inode *inode, struct file *filp)
 {
-	filp->f_mode |= FMODE_NOWAIT;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_ENCODED_IO;
 	return generic_file_open(inode, filp);
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0546401bc0a..90ca8537df8e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -865,7 +865,7 @@ static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
 				    ins.objectid,
 				    ins.offset, async_extent->pages,
 				    async_extent->nr_pages,
-				    async_chunk->write_flags)) {
+				    async_chunk->write_flags, true)) {
 			struct page *p = async_extent->pages[0];
 			const u64 start = async_extent->start;
 			const u64 end = start + async_extent->ram_size - 1;
@@ -10590,6 +10590,194 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 	}
 }
 
+ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+			    struct encoded_iov *encoded)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_changeset *data_reserved = NULL;
+	struct extent_state *cached_state = NULL;
+	int compression;
+	u64 disk_num_bytes, num_bytes;
+	u64 start, end;
+	unsigned long nr_pages, i;
+	struct page **pages;
+	struct btrfs_key ins;
+	struct extent_map *em;
+	ssize_t ret;
+
+	switch (encoded->compression) {
+	case ENCODED_IOV_COMPRESSION_ZLIB:
+		compression = BTRFS_COMPRESS_ZLIB;
+		break;
+	case ENCODED_IOV_COMPRESSION_LZO:
+		compression = BTRFS_COMPRESS_LZO;
+		break;
+	case ENCODED_IOV_COMPRESSION_ZSTD:
+		compression = BTRFS_COMPRESS_ZSTD;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	disk_num_bytes = iov_iter_count(from);
+
+	/* The extent size must be sane. */
+	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
+	    disk_num_bytes > BTRFS_MAX_COMPRESSED ||
+	    disk_num_bytes == 0)
+		return -EINVAL;
+
+	/*
+	 * The compressed data on disk must be sector-aligned. For convenience,
+	 * we extend the compressed data with zeroes if it isn't.
+	 */
+	disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
+	/*
+	 * The extent in the file must also be sector-aligned. However, we allow
+	 * a write which ends at or extends i_size to have an unaligned length;
+	 * we round up the extent size and set i_size to the given length.
+	 */
+	start = iocb->ki_pos;
+	if (!IS_ALIGNED(start, fs_info->sectorsize))
+		return -EINVAL;
+	if (start + encoded->unencoded_len >= inode->i_size) {
+		num_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
+	} else {
+		num_bytes = encoded->unencoded_len;
+		if (!IS_ALIGNED(num_bytes, fs_info->sectorsize))
+			return -EINVAL;
+	}
+	end = start + num_bytes - 1;
+
+	/*
+	 * It's valid for compressed data to be larger than or the same size as
+	 * the decompressed data. However, for buffered I/O, we never write out
+	 * a compressed extent unless it's smaller than the decompressed data,
+	 * so for now, let's not allow creating such extents explicity, either.
+	 */
+	if (disk_num_bytes >= num_bytes)
+		return -EINVAL;
+
+	nr_pages = (disk_num_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_USER);
+	if (!pages)
+		return -ENOMEM;
+	for (i = 0; i < nr_pages; i++) {
+		size_t bytes;
+		char *kaddr;
+
+		pages[i] = alloc_page(GFP_USER);
+		if (!pages[i]) {
+			ret = -ENOMEM;
+			goto out_pages;
+		}
+		kaddr = kmap(pages[i]);
+		bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
+		if (copy_from_iter(kaddr, bytes, from) != bytes) {
+			kunmap(pages[i]);
+			ret = -EFAULT;
+			goto out_pages;
+		}
+		if (bytes < PAGE_SIZE)
+			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
+		kunmap(pages[i]);
+	}
+
+	for (;;) {
+		struct btrfs_ordered_extent *ordered;
+
+		lock_extent_bits(io_tree, start, end, &cached_state);
+		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
+						     end - start + 1);
+		if (!ordered &&
+		    !filemap_range_has_page(inode->i_mapping, start, end))
+			break;
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end,
+				     &cached_state);
+		cond_resched();
+		ret = btrfs_wait_ordered_range(inode, start, end);
+		if (ret)
+			goto out_pages;
+		ret = invalidate_inode_pages2_range(inode->i_mapping,
+						    start >> PAGE_SHIFT,
+						    end >> PAGE_SHIFT);
+		if (ret)
+			goto out_pages;
+	}
+
+	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start,
+					   num_bytes);
+	if (ret)
+		goto out_unlock;
+
+	ret = btrfs_reserve_extent(root, num_bytes, disk_num_bytes,
+				   disk_num_bytes, 0, 0, &ins, 1, 1);
+	if (ret)
+		goto out_delalloc_release;
+
+	em = create_io_em(inode, start, num_bytes, start, ins.objectid,
+			  ins.offset, ins.offset, num_bytes, compression,
+			  BTRFS_ORDERED_COMPRESSED);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_free_reserve;
+	}
+	free_extent_map(em);
+
+	ret = btrfs_add_ordered_extent_compress(inode, start, ins.objectid,
+						num_bytes, ins.offset,
+						BTRFS_ORDERED_COMPRESSED,
+						compression);
+	if (ret) {
+		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
+		goto out_free_reserve;
+	}
+	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+	if (start + encoded->unencoded_len > inode->i_size)
+		i_size_write(inode, start + encoded->unencoded_len);
+
+	unlock_extent_cached(io_tree, start, end, &cached_state);
+
+	btrfs_delalloc_release_extents(BTRFS_I(inode), num_bytes, false);
+
+	if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+					  ins.offset, pages, nr_pages, 0,
+					  false)) {
+		struct page *page = pages[0];
+
+		page->mapping = inode->i_mapping;
+		btrfs_writepage_endio_finish_ordered(page, start, end, 0);
+		page->mapping = NULL;
+		ret = -EIO;
+		goto out_pages;
+	}
+	iocb->ki_pos += encoded->unencoded_len;
+	return encoded->unencoded_len;
+
+out_free_reserve:
+	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_delalloc_release:
+	btrfs_delalloc_release_space(inode, data_reserved, start, num_bytes,
+				     true);
+out_unlock:
+	unlock_extent_cached(io_tree, start, end, &cached_state);
+out_pages:
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i])
+			put_page(pages[i]);
+	}
+	kvfree(pages);
+	return ret;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * Add an entry indicating a block group or device which is pinned by a
-- 
2.23.0

^ permalink raw reply related

* [RFC PATCH 2/3] fs: add RWF_ENCODED for writing compressed data
From: Omar Sandoval @ 2019-09-19  6:53 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs; +Cc: Dave Chinner, linux-api, kernel-team
In-Reply-To: <cover.1568875700.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

Btrfs can transparently compress data written by the user. However, we'd
like to add an interface to write pre-compressed data directly to the
filesystem. This adds support for so-called "encoded writes" via
pwritev2().

A new RWF_ENCODED flags indicates that a write is "encoded". If this
flag is set, iov[0].iov_base points to a struct encoded_iov which
contains metadata about the write: namely, the compression algorithm and
the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
must be set to sizeof(struct encoded_iov), which can be used to extend
the interface in the future. The remaining iovecs contain the encoded
extent.

A similar interface for reading encoded data can be added to preadv2()
in the future.

Filesystems must indicate that they support encoded writes by setting
FMODE_ENCODED_IO in ->file_open().

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 include/linux/fs.h      | 13 +++++++
 include/uapi/linux/fs.h | 24 ++++++++++++-
 mm/filemap.c            | 75 ++++++++++++++++++++++++++++++++++-------
 3 files changed, 99 insertions(+), 13 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75c4b7680385..ae3ac0312674 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File does not contribute to nr_files count */
 #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
 
+/* File supports encoded IO */
+#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
+
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
  * that indicates that they should check the contents of the iovec are
@@ -314,6 +317,7 @@ enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_ENCODED		(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -3046,6 +3050,10 @@ extern int sb_min_blocksize(struct super_block *, int);
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
+struct encoded_iov;
+extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
+extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
+				struct iov_iter *);
 extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				loff_t *count, unsigned int remap_flags);
@@ -3364,6 +3372,11 @@ static inline int kiocb_set_rw_flags(int rw, struct kiocb *ki, rwf_t flags)
 			return -EOPNOTSUPP;
 		ki->ki_flags |= IOCB_NOWAIT;
 	}
+	if (flags & RWF_ENCODED) {
+		if (rw != WRITE || !(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
+			return -EOPNOTSUPP;
+		ki->ki_flags |= IOCB_ENCODED;
+	}
 	if (flags & RWF_HIPRI)
 		ki->ki_flags |= IOCB_HIPRI;
 	if (flags & RWF_DSYNC)
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index aad225b05be7..b775d9aea978 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -283,6 +283,25 @@ struct fsxattr {
 
 typedef int __bitwise __kernel_rwf_t;
 
+enum {
+	ENCODED_IOV_COMPRESSION_NONE,
+	ENCODED_IOV_COMPRESSION_ZLIB,
+	ENCODED_IOV_COMPRESSION_LZO,
+	ENCODED_IOV_COMPRESSION_ZSTD,
+	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
+};
+
+enum {
+	ENCODED_IOV_ENCRYPTION_NONE,
+	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
+};
+
+struct encoded_iov {
+	__u64 unencoded_len;
+	__u32 compression;
+	__u32 encryption;
+};
+
 /* high priority request, poll if possible */
 #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
 
@@ -298,8 +317,11 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* encoded (e.g., compressed or encrypted) IO */
+#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_ENCODED)
 
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 40667c2f3383..3d2555364432 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2974,24 +2974,16 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
 	return 0;
 }
 
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	loff_t count;
 	int ret;
 
 	if (IS_SWAPFILE(inode))
 		return -ETXTBSY;
 
-	if (!iov_iter_count(from))
+	if (!*count)
 		return 0;
 
 	/* FIXME: this is for backwards compatibility with 2.4 */
@@ -3001,8 +2993,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 		return -EINVAL;
 
-	count = iov_iter_count(from);
-	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+}
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns a negative errno or the new number of bytes to write.
+ */
+inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+	loff_t count = iov_iter_count(from);
+	int ret;
+
+	ret = generic_write_checks_common(iocb, &count);
 	if (ret)
 		return ret;
 
@@ -3011,6 +3016,52 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int generic_encoded_write_checks(struct kiocb *iocb,
+				 struct encoded_iov *encoded)
+{
+	loff_t count = encoded->unencoded_len;
+	int ret;
+
+	ret = generic_write_checks_common(iocb, &count);
+	if (ret)
+		return ret;
+
+	if (count != encoded->unencoded_len) {
+		/*
+		 * The write got truncated by generic_write_checks(). We can't
+		 * do a partial encoded write.
+		 */
+		return -EFBIG;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_encoded_write_checks);
+
+/*
+ * If no encoding is set, this clears IOCB_ENCODED and the write should be
+ * treated as a normal write.
+ */
+int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
+			 struct iov_iter *from)
+{
+	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
+		return -EINVAL;
+	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
+		return -EFAULT;
+	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
+	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
+		iocb->ki_flags &= ~IOCB_ENCODED;
+		return 0;
+	}
+	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
+	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
+		return -EINVAL;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	return 0;
+}
+EXPORT_SYMBOL(import_encoded_write);
+
 /*
  * Performs necessary checks before doing a clone.
  *
-- 
2.23.0

^ permalink raw reply related

* [PATCH] readv.2: Document new RWF_ENCODED flag to pwritev2()
From: Omar Sandoval @ 2019-09-19  6:53 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs; +Cc: Dave Chinner, linux-api, kernel-team
In-Reply-To: <cover.1568875700.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 man2/readv.2 | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)

diff --git a/man2/readv.2 b/man2/readv.2
index af27aa63e..7d32b3bd2 100644
--- a/man2/readv.2
+++ b/man2/readv.2
@@ -265,6 +265,135 @@ the data is always appended to the end of the file.
 However, if the
 .I offset
 argument is \-1, the current file offset is updated.
+.TP
+.BR RWF_ENCODED " (since Linux 5.5)"
+Read or write encoded (e.g., compressed) data.
+.I iov[0].iov_base
+points to an
+.I
+encoded_iov
+structure, defined in
+.I <linux/fs.h>
+as:
+.IP
+.in +4n
+.EX
+struct encoded_iov {
+    __u64 unencoded_len;
+    __u32 compression;
+    __u32 encryption;
+};
+.EE
+.in
+.IP
+.I iov[0].iov_len
+must be set to
+.IR "sizeof(struct\ encoded_iov)" .
+.IP
+.I compression
+is one of
+.B ENCODED_IOV_COMPRESSION_NONE
+(zero),
+.BR ENCODED_IOV_COMPRESSION_ZLIB ,
+.BR ENCODED_IOV_COMPRESSION_LZO ,
+or
+.BR ENCODED_IOV_COMPRESSION_ZSTD .
+.I encryption
+is currently always
+.B ENCODED_IOV_ENCRYPTION_NONE
+(zero).
+.I unencoded_len
+is the length of the unencoded (i.e., decrypted and decompressed) data.
+If
+.I offset
+is -1, then the current file offset is incremented by the unencoded length.
+.IP
+For
+.BR pwritev2 (),
+the remaining buffers in the
+.I iov
+array should point to the encoded data;
+.I unencoded_len
+should be set to the logical length of the data in the file; and
+.I compression
+and
+.I encryption
+should be set to indicate the encoding.
+The calling process must have the
+.B CAP_SYS_ADMIN
+capability.
+If
+.I compression
+is
+.B ENCODED_IOV_COMPRESSION_NONE
+and
+.I encryption
+is
+.BR ENCODED_IOV_ENCRYPTION_NONE ,
+then
+.I unencoded_len
+is ignored and the write behaves as if the
+.B RWF_ENCODED
+flag was not specified.
+.IP
+As of Linux 5.5, this flag is only implemented for
+.BR pwritev2 (),
+and only on Btrfs with the following requirements:
+.RS
+.IP \(bu 3
+.I offset
+(or the current file offset if
+.I offset
+is -1) must be aligned to the sector size of the filesystem.
+.IP \(bu
+.I unencoded_len
+and the length of the encoded data must each be no more than 128 KiB.
+This limit may increase in the future.
+.IP \(bu
+.I unencoded_len
+must be non-zero.
+.IP \(bu
+.I unencoded_len
+must be aligned to the sector size of the filesystem,
+unless the data ends at or beyond the current end of the file.
+.IP \(bu
+If
+.I compression
+is not
+.BR ENCODED_IOV_COMPRESSION_NONE ,
+then the length of the encoded data rounded up to the nearest sector must be less than
+.IR unencoded_len .
+.IP \(bu
+If
+.I compression
+is
+.BR ENCODED_IOV_COMPRESSION_ZLIB ,
+then the encoded data must be a single zlib stream.
+.IP \(bu
+If
+.I compression
+is
+.BR ENCODED_IOV_COMPRESSION_LZO ,
+then the encoded data must be be compressed page by page with LZO1X
+and wrapped in the format described in the Linux kernel source file
+.IR fs/btrfs/lzo.c .
+.IP \(bu
+If
+.I compression
+is
+.BR ENCODED_IOV_COMPRESSION_ZSTD ,
+then the encoded data must be a single zstd stream,
+and the
+.I windowLog
+compression parameter must be no more than 17.
+.RE
+.IP
+Note that the encoded data is not validated when it is written.
+If it is not valid (e.g., it cannot be decompressed,
+or its decompressed length does not match
+.IR unencoded_len ),
+then a subsequent read may result in an error or return truncated data.
+.\" TODO: how should this interact with O_DIRECT?
 .SH RETURN VALUE
 On success,
 .BR readv (),
@@ -284,6 +413,13 @@ than requested (see
 and
 .BR write (2)).
 .PP
+If
+.B
+RWF_ENCODED
+was specified in
+.IR flags ,
+then the return value is the unencoded number of bytes.
+.PP
 On error, \-1 is returned, and \fIerrno\fP is set appropriately.
 .SH ERRORS
 The errors are as given for
@@ -312,8 +448,28 @@ The vector count,
 .IR iovcnt ,
 is less than zero or greater than the permitted maximum.
 .TP
+.B EINVAL
+.B RWF_ENCODED
+is specified in
+.I flags
+and the alignment and/or size requirements are not met.
+.TP
 .B EOPNOTSUPP
 An unknown flag is specified in \fIflags\fP.
+.TP
+.B EOPNOTSUPP
+.B RWF_ENCODED
+is specified in
+.I flags
+and the filesystem does not implement encoded I/O.
+.TP
+.B EPERM
+.B RWF_ENCODED
+is specified in
+.I flags
+and the calling process does not have the
+.B CAP_SYS_ADMIN
+capability.
 .SH VERSIONS
 .BR preadv ()
 and
-- 
2.23.0

^ permalink raw reply related

* [RFC PATCH 1/3] fs: pass READ/WRITE to kiocb_set_rw_flags()
From: Omar Sandoval @ 2019-09-19  6:53 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs
  Cc: Dave Chinner, linux-api, kernel-team, Jan Kara, Jens Axboe
In-Reply-To: <cover.1568875700.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

A following change will want to check whether an IO is a read or write
in kiocb_set_rw_flags(). Additionally, aio and io_uring currently set
the IOCB_WRITE flag on a kiocb right before calling call_write_iter(),
but we can move that into the common code.

Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/aio.c           | 9 ++++-----
 fs/io_uring.c      | 9 ++++-----
 fs/read_write.c    | 2 +-
 include/linux/fs.h | 5 ++++-
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 01e0fb9ae45a..72195e182db2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1442,7 +1442,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
 	iocb_put(iocb);
 }
 
-static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
+static int aio_prep_rw(int rw, struct kiocb *req, const struct iocb *iocb)
 {
 	int ret;
 
@@ -1469,7 +1469,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	} else
 		req->ki_ioprio = get_current_ioprio();
 
-	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+	ret = kiocb_set_rw_flags(rw, req, iocb->aio_rw_flags);
 	if (unlikely(ret))
 		return ret;
 
@@ -1525,7 +1525,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
 	struct file *file;
 	int ret;
 
-	ret = aio_prep_rw(req, iocb);
+	ret = aio_prep_rw(READ, req, iocb);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
@@ -1553,7 +1553,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 	struct file *file;
 	int ret;
 
-	ret = aio_prep_rw(req, iocb);
+	ret = aio_prep_rw(WRITE, req, iocb);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
@@ -1579,7 +1579,6 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 			__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
 			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 		}
-		req->ki_flags |= IOCB_WRITE;
 		aio_rw_done(req, call_write_iter(file, req, &iter));
 	}
 	kfree(iovec);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0dadbdbead0f..548525cb1699 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1002,7 +1002,7 @@ static bool io_file_supports_async(struct file *file)
 	return false;
 }
 
-static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
+static int io_prep_rw(int rw, struct io_kiocb *req, const struct sqe_submit *s,
 		      bool force_nonblock)
 {
 	const struct io_uring_sqe *sqe = s->sqe;
@@ -1031,7 +1031,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	} else
 		kiocb->ki_ioprio = get_current_ioprio();
 
-	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+	ret = kiocb_set_rw_flags(rw, kiocb, READ_ONCE(sqe->rw_flags));
 	if (unlikely(ret))
 		return ret;
 
@@ -1258,7 +1258,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	size_t iov_count;
 	ssize_t read_size, ret;
 
-	ret = io_prep_rw(req, s, force_nonblock);
+	ret = io_prep_rw(READ, req, s, force_nonblock);
 	if (ret)
 		return ret;
 	file = kiocb->ki_filp;
@@ -1319,7 +1319,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	size_t iov_count;
 	ssize_t ret;
 
-	ret = io_prep_rw(req, s, force_nonblock);
+	ret = io_prep_rw(WRITE, req, s, force_nonblock);
 	if (ret)
 		return ret;
 
@@ -1363,7 +1363,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 			__sb_writers_release(file_inode(file)->i_sb,
 						SB_FREEZE_WRITE);
 		}
-		kiocb->ki_flags |= IOCB_WRITE;
 
 		ret2 = call_write_iter(file, kiocb, &iter);
 		if (!force_nonblock || ret2 != -EAGAIN) {
diff --git a/fs/read_write.c b/fs/read_write.c
index 5bbf587f5bc1..a6548a9d965d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -682,7 +682,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
-	ret = kiocb_set_rw_flags(&kiocb, flags);
+	ret = kiocb_set_rw_flags(type, &kiocb, flags);
 	if (ret)
 		return ret;
 	kiocb.ki_pos = (ppos ? *ppos : 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ffe35d97afcb..75c4b7680385 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3351,8 +3351,11 @@ static inline int iocb_flags(struct file *file)
 	return res;
 }
 
-static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
+static inline int kiocb_set_rw_flags(int rw, struct kiocb *ki, rwf_t flags)
 {
+	if (rw == WRITE)
+		ki->ki_flags |= IOCB_WRITE;
+
 	if (unlikely(flags & ~RWF_SUPPORTED))
 		return -EOPNOTSUPP;
 
-- 
2.23.0

^ permalink raw reply related

* [RFC PATCH 0/3] fs: interface for directly writing encoded (e.g., compressed) data
From: Omar Sandoval @ 2019-09-19  6:53 UTC (permalink / raw)
  To: linux-fsdevel, linux-btrfs; +Cc: Dave Chinner, linux-api, kernel-team

From: Omar Sandoval <osandov@fb.com>

Hello,

This patch series adds an API for writing compressed data directly to
the filesystem. It is based on my previous series which added a
Btrfs-specific ioctl [1], but this is now an extension to pwritev2() as
suggested by Dave Chinner [2]. I've included a man page patch describing
the API in detail. A test case and example program are available [3].

The use case that I have in mind is Btrfs send/receive: currently, when
sending data from one compressed filesystem to another, the sending side
decompresses the data and the receiving side recompresses it before
writing it out. This is wasteful and can be avoided if we can just send
and write compressed extents. The send part will be implemented in a
separate series, as this API can stand alone.

Patch 1 is a prep patch. Patch 2 adds the interface in the VFS. Patch 3
implements it in Btrfs.

This series is based on Linus' tree as of commit
b41dae061bbd722b9d7fa828f35d22035b218e18.

Please share any comments on the API or implementation. Thanks!

1: https://lore.kernel.org/linux-fsdevel/cover.1567623877.git.osandov@fb.com/
2: https://lore.kernel.org/linux-fsdevel/20190906212710.GI7452@vader/
3: https://github.com/osandov/xfstests/tree/btrfs-compressed-write

Omar Sandoval (3):
  fs: pass READ/WRITE to kiocb_set_rw_flags()
  fs: add RWF_ENCODED for writing compressed data
  btrfs: implement encoded (compressed) writes

 fs/aio.c                |   9 +-
 fs/btrfs/compression.c  |   6 +-
 fs/btrfs/compression.h  |   5 +-
 fs/btrfs/ctree.h        |   4 +
 fs/btrfs/file.c         |  40 +++++++--
 fs/btrfs/inode.c        | 190 +++++++++++++++++++++++++++++++++++++++-
 fs/io_uring.c           |   9 +-
 fs/read_write.c         |   2 +-
 include/linux/fs.h      |  18 +++-
 include/uapi/linux/fs.h |  24 ++++-
 mm/filemap.c            |  75 +++++++++++++---
 11 files changed, 344 insertions(+), 38 deletions(-)

-- 
2.23.0

^ permalink raw reply

* [PATCH ghak90 V7 21/21] audit: add proc interface for capcontid
From: Richard Guy Briggs @ 2019-09-19  1:22 UTC (permalink / raw)
  To: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel
  Cc: Paul Moore, sgrubb, omosnace, dhowells, simo, eparis, serge,
	ebiederm, nhorman, dwalsh, mpatel, Richard Guy Briggs
In-Reply-To: <cover.1568834524.git.rgb@redhat.com>

Add a /proc interface to capcontid for testing purposes.  This isn't
intended to be merged upstream.  Container orchestrators/engines are
expected to link to libaudit to use the functions audit_set_capcontid()
and audit_get_capcontid.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 fs/proc/base.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 26091800180c..283ef8e006e7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1360,6 +1360,59 @@ static ssize_t proc_contid_write(struct file *file, const char __user *buf,
 	.write		= proc_contid_write,
 	.llseek		= generic_file_llseek,
 };
+
+static ssize_t proc_capcontid_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *task = get_proc_task(inode);
+	ssize_t length;
+	char tmpbuf[TMPBUFLEN];
+
+	if (!task)
+		return -ESRCH;
+	/* if we don't have caps, reject */
+	if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
+		return -EPERM;
+	length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_capcontid(task));
+	put_task_struct(task);
+	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+
+static ssize_t proc_capcontid_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct inode *inode = file_inode(file);
+	u32 capcontid;
+	int rv;
+	struct task_struct *task = get_proc_task(inode);
+
+	if (!task)
+		return -ESRCH;
+	if (*ppos != 0) {
+		/* No partial writes. */
+		put_task_struct(task);
+		return -EINVAL;
+	}
+
+	rv = kstrtou32_from_user(buf, count, 10, &capcontid);
+	if (rv < 0) {
+		put_task_struct(task);
+		return rv;
+	}
+
+	rv = audit_set_capcontid(task, capcontid);
+	put_task_struct(task);
+	if (rv < 0)
+		return rv;
+	return count;
+}
+
+static const struct file_operations proc_capcontid_operations = {
+	.read		= proc_capcontid_read,
+	.write		= proc_capcontid_write,
+	.llseek		= generic_file_llseek,
+};
 #endif
 
 #ifdef CONFIG_FAULT_INJECTION
@@ -3121,6 +3174,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 	REG("audit_containerid", S_IWUSR|S_IRUSR, proc_contid_operations),
+	REG("audit_capcontainerid", S_IWUSR|S_IRUSR|S_IRUSR, proc_capcontid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
@@ -3522,6 +3576,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 	REG("audit_containerid", S_IWUSR|S_IRUSR, proc_contid_operations),
+	REG("audit_capcontainerid", S_IWUSR|S_IRUSR|S_IRUSR, proc_capcontid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH ghak90 V7 20/21] audit: add capcontid to set contid outside init_user_ns
From: Richard Guy Briggs @ 2019-09-19  1:22 UTC (permalink / raw)
  To: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel
  Cc: Paul Moore, sgrubb, omosnace, dhowells, simo, eparis, serge,
	ebiederm, nhorman, dwalsh, mpatel, Richard Guy Briggs
In-Reply-To: <cover.1568834524.git.rgb@redhat.com>

Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
process in a non-init user namespace the capability to set audit
container identifiers.

Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
AUDIT_SET_CAPCONTID 1028.  The message format includes the data
structure:
struct audit_capcontid_status {
        pid_t   pid;
        u32     enable;
};

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/linux/audit.h      | 14 +++++++
 include/uapi/linux/audit.h |  2 +
 kernel/audit.c             | 98 +++++++++++++++++++++++++++++++++++++++++++++-
 kernel/audit.h             |  5 +++
 4 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1ce27af686ea..dcc53e62e266 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -117,6 +117,7 @@ struct audit_task_info {
 	kuid_t			loginuid;
 	unsigned int		sessionid;
 	struct audit_cont	*cont;
+	u32			capcontid;
 #ifdef CONFIG_AUDITSYSCALL
 	struct audit_context	*ctx;
 #endif
@@ -224,6 +225,14 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return tsk->audit->sessionid;
 }
 
+static inline u32 audit_get_capcontid(struct task_struct *tsk)
+{
+	if (!tsk->audit)
+		return 0;
+	return tsk->audit->capcontid;
+}
+
+extern int audit_set_capcontid(struct task_struct *tsk, u32 enable);
 extern int audit_set_contid(struct task_struct *tsk, u64 contid);
 
 static inline u64 audit_get_contid(struct task_struct *tsk)
@@ -309,6 +318,11 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return AUDIT_SID_UNSET;
 }
 
+static inline u32 audit_get_capcontid(struct task_struct *tsk)
+{
+	return 0;
+}
+
 static inline u64 audit_get_contid(struct task_struct *tsk)
 {
 	return AUDIT_CID_UNSET;
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index eef42c8eea77..011b0a8ee9b2 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -78,6 +78,8 @@
 #define AUDIT_GET_LOGINUID	1024	/* Get loginuid of a task */
 #define AUDIT_SET_LOGINUID	1025	/* Set loginuid of a task */
 #define AUDIT_GET_SESSIONID	1026	/* Set sessionid of a task */
+#define AUDIT_GET_CAPCONTID	1027	/* Get cap_contid of a task */
+#define AUDIT_SET_CAPCONTID	1028	/* Set cap_contid of a task */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
diff --git a/kernel/audit.c b/kernel/audit.c
index a70c9184e5d9..7160da464849 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1192,6 +1192,14 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	case AUDIT_GET_SESSIONID:
 		return 0;
 		break;
+	case AUDIT_GET_CAPCONTID:
+	case AUDIT_SET_CAPCONTID:
+	case AUDIT_GET_CONTID:
+	case AUDIT_SET_CONTID:
+		if (!netlink_capable(skb, CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
+			return -EPERM;
+		return 0;
+		break;
 	default:  /* do more checks below */
 		break;
 	}
@@ -1227,8 +1235,6 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	case AUDIT_TTY_SET:
 	case AUDIT_TRIM:
 	case AUDIT_MAKE_EQUIV:
-	case AUDIT_GET_CONTID:
-	case AUDIT_SET_CONTID:
 	case AUDIT_SET_LOGINUID:
 		/* Only support auditd and auditctl in initial pid namespace
 		 * for now. */
@@ -1304,6 +1310,23 @@ static int audit_get_contid_status(struct sk_buff *skb)
 	return 0;
 }
 
+static int audit_get_capcontid_status(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	u32 seq = nlh->nlmsg_seq;
+	void *data = nlmsg_data(nlh);
+	struct audit_capcontid_status cs;
+
+	cs.pid = ((struct audit_capcontid_status *)data)->pid;
+	if (!cs.pid)
+		cs.pid = task_tgid_nr(current);
+	rcu_read_lock();
+	cs.enable = audit_get_capcontid(find_task_by_vpid(cs.pid));
+	rcu_read_unlock();
+	audit_send_reply(skb, seq, AUDIT_GET_CAPCONTID, 0, 0, &cs, sizeof(cs));
+	return 0;
+}
+
 struct audit_loginuid_status { uid_t loginuid; };
 
 static int audit_get_loginuid_status(struct sk_buff *skb)
@@ -1779,6 +1802,27 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (err)
 			return err;
 		break;
+	case AUDIT_SET_CAPCONTID: {
+		struct audit_capcontid_status *s = data;
+		struct task_struct *tsk;
+
+		/* check if new data is valid */
+		if (nlmsg_len(nlh) < sizeof(*s))
+			return -EINVAL;
+		tsk = find_get_task_by_vpid(s->pid);
+		if (!tsk)
+			return -EINVAL;
+
+		err = audit_set_capcontid(tsk, s->enable);
+		put_task_struct(tsk);
+		return err;
+		break;
+	}
+	case AUDIT_GET_CAPCONTID:
+		err = audit_get_capcontid_status(skb);
+		if (err)
+			return err;
+		break;
 	case AUDIT_SET_LOGINUID: {
 		uid_t *loginuid = data;
 		kuid_t kloginuid;
@@ -2711,6 +2755,56 @@ static struct task_struct *audit_cont_owner(struct task_struct *tsk)
 	return NULL;
 }
 
+int audit_set_capcontid(struct task_struct *task, u32 enable)
+{
+	u32 oldcapcontid;
+	int rc = 0;
+	struct audit_buffer *ab;
+	uid_t uid;
+	struct tty_struct *tty;
+	char comm[sizeof(current->comm)];
+
+	if (!task->audit)
+		return -ENOPROTOOPT;
+	oldcapcontid = audit_get_capcontid(task);
+	/* if task is not descendant, block */
+	if (task == current)
+		rc = -EBADSLT;
+	else if (!task_is_descendant(current, task))
+		rc = -EXDEV;
+	else if (current_user_ns() == &init_user_ns) {
+		if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
+			rc = -EPERM;
+	}
+	if (!rc)
+		task->audit->capcontid = enable;
+
+	if (!audit_enabled)
+		return rc;
+
+	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SET_CAPCONTID);
+	if (!ab)
+		return rc;
+
+	uid = from_kuid(&init_user_ns, task_uid(current));
+	tty = audit_get_tty();
+	audit_log_format(ab,
+			 "opid=%d capcontid=%u old-capcontid=%u pid=%d uid=%u auid=%u tty=%s ses=%u",
+			 task_tgid_nr(task), enable, oldcapcontid,
+			 task_tgid_nr(current), uid,
+			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
+			 tty ? tty_name(tty) : "(none)",
+			 audit_get_sessionid(current));
+	audit_put_tty(tty);
+	audit_log_task_context(ab);
+	audit_log_format(ab, " comm=");
+	audit_log_untrustedstring(ab, get_task_comm(comm, current));
+	audit_log_d_path_exe(ab, current->mm);
+	audit_log_format(ab, " res=%d", !rc);
+	audit_log_end(ab);
+	return rc;
+}
+
 /*
  * audit_set_contid - set current task's audit contid
  * @task: target task
diff --git a/kernel/audit.h b/kernel/audit.h
index cb25341c1a0f..ac4694e88485 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -231,6 +231,11 @@ struct audit_contid_status {
 	u64	id;
 };
 
+struct audit_capcontid_status {
+	pid_t	pid;
+	u32	enable;
+};
+
 #define AUDIT_CONTID_DEPTH	5
 
 /* Indicates that audit should log the full pathname. */
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH ghak90 V7 19/21] audit: check cont depth
From: Richard Guy Briggs @ 2019-09-19  1:22 UTC (permalink / raw)
  To: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel
  Cc: Paul Moore, sgrubb, omosnace, dhowells, simo, eparis, serge,
	ebiederm, nhorman, dwalsh, mpatel, Richard Guy Briggs
In-Reply-To: <cover.1568834524.git.rgb@redhat.com>

Set an arbitrary limit on the depth of audit container identifier
nesting to limit abuse.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 kernel/audit.c | 21 +++++++++++++++++++++
 kernel/audit.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/kernel/audit.c b/kernel/audit.c
index 848fd1c8c579..a70c9184e5d9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2667,6 +2667,22 @@ int audit_signal_info(int sig, struct task_struct *t)
 	return audit_signal_info_syscall(t);
 }
 
+static int audit_contid_depth(struct audit_cont *cont)
+{
+	struct audit_cont *parent;
+	int depth = 1;
+
+	if (!cont)
+		return 0;
+
+	parent = cont->parent;
+	while (parent) {
+		depth++;
+		parent = parent->parent;
+	}
+	return depth;
+}
+
 struct audit_cont *audit_cont(struct task_struct *tsk)
 {
 	if (!tsk->audit || !tsk->audit->cont)
@@ -2785,6 +2801,11 @@ int audit_set_contid(struct task_struct *task, u64 contid)
 			rc = -ENOSPC;
 			goto conterror;
 		}
+		/* Set max contid depth */
+		if (audit_contid_depth(audit_cont(current->real_parent)) >= AUDIT_CONTID_DEPTH) {
+			rc = -EMLINK;
+			goto conterror;
+		}
 		if (!newcont) {
 			newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
 			if (newcont) {
diff --git a/kernel/audit.h b/kernel/audit.h
index 89b7de323c13..cb25341c1a0f 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -231,6 +231,8 @@ struct audit_contid_status {
 	u64	id;
 };
 
+#define AUDIT_CONTID_DEPTH	5
+
 /* Indicates that audit should log the full pathname. */
 #define AUDIT_NAME_FULL -1
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH ghak90 V7 18/21] audit: track container nesting
From: Richard Guy Briggs @ 2019-09-19  1:22 UTC (permalink / raw)
  To: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel
  Cc: Paul Moore, sgrubb, omosnace, dhowells, simo, eparis, serge,
	ebiederm, nhorman, dwalsh, mpatel, Richard Guy Briggs
In-Reply-To: <cover.1568834524.git.rgb@redhat.com>

Track the parent container of a container to be able to filter and
report nesting.

Now that we have a way to track and check the parent container of a
container, fixup other patches, or squash all nesting fixes together.

fixup! audit: add container id
fixup! audit: log drop of contid on exit of last task
fixup! audit: log container info of syscalls
fixup! audit: add containerid filtering
fixup! audit: NETFILTER_PKT: record each container ID associated with a netNS
fixup! audit: convert to contid list to check for orch/engine ownership softirq (for netfilter) audit: protect contid list lock from softirq

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/linux/audit.h |  1 +
 kernel/audit.c        | 67 ++++++++++++++++++++++++++++++++++++++++++---------
 kernel/audit.h        |  3 +++
 kernel/auditfilter.c  | 20 ++++++++++++++-
 kernel/auditsc.c      |  2 +-
 5 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/include/linux/audit.h b/include/linux/audit.h
index dcd92f964120..1ce27af686ea 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -110,6 +110,7 @@ struct audit_cont {
 	struct task_struct	*owner;
 	refcount_t              refcount;
 	struct rcu_head         rcu;
+	struct audit_cont	*parent;
 };
 
 struct audit_task_info {
diff --git a/kernel/audit.c b/kernel/audit.c
index 9e82de13d2eb..848fd1c8c579 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -213,7 +213,7 @@ struct audit_reply {
 
 static struct kmem_cache *audit_task_cache;
 
-static DEFINE_SPINLOCK(audit_contid_list_lock);
+DEFINE_SPINLOCK(audit_contid_list_lock);
 
 void __init audit_task_init(void)
 {
@@ -275,6 +275,7 @@ void audit_free(struct task_struct *tsk)
 {
 	struct audit_task_info *info = tsk->audit;
 	struct nsproxy *ns = tsk->nsproxy;
+	unsigned long flags;
 
 	audit_free_syscall(tsk);
 	if (ns)
@@ -282,9 +283,9 @@ void audit_free(struct task_struct *tsk)
 	/* Freeing the audit_task_info struct must be performed after
 	 * audit_log_exit() due to need for loginuid and sessionid.
 	 */
-	spin_lock(&audit_contid_list_lock); 
+	spin_lock_irqsave(&audit_contid_list_lock, flags); 
 	audit_cont_put(tsk->audit->cont);
-	spin_unlock(&audit_contid_list_lock); 
+	spin_unlock_irqrestore(&audit_contid_list_lock, flags); 
 	info = tsk->audit;
 	tsk->audit = NULL;
 	kmem_cache_free(audit_task_cache, info);
@@ -450,6 +451,7 @@ void audit_switch_task_namespaces(struct nsproxy *ns, struct task_struct *p)
 		audit_netns_contid_add(new->net_ns, contid);
 }
 
+void audit_log_contid(struct audit_buffer *ab, u64 contid);
 /**
  * audit_log_netns_contid_list - List contids for the given network namespace
  * @net: the network namespace of interest
@@ -481,7 +483,7 @@ void audit_log_netns_contid_list(struct net *net, struct audit_context *context)
 			audit_log_format(ab, "contid=");
 		} else
 			audit_log_format(ab, ",");
-		audit_log_format(ab, "%llu", cont->id);
+		audit_log_contid(ab, cont->id);
 	}
 	audit_log_end(ab);
 out:
@@ -2371,6 +2373,36 @@ void audit_log_session_info(struct audit_buffer *ab)
 	audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
 }
 
+void audit_log_contid(struct audit_buffer *ab, u64 contid)
+{
+	struct audit_cont *cont = NULL;
+	struct audit_cont *prcont = NULL;
+	int h;
+	unsigned long flags;
+
+	if (!audit_contid_valid(contid)) {
+		audit_log_format(ab, "%llu", contid);
+		return;
+	}
+	h = audit_hash_contid(contid);
+	spin_lock_irqsave(&audit_contid_list_lock, flags);
+	list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
+		if (cont->id == contid)
+			prcont = cont;
+	if (!prcont) {
+		audit_log_format(ab, "%llu", contid);
+		goto out;
+	}
+	while (prcont) {
+		audit_log_format(ab, "%llu", prcont->id);
+		prcont = prcont->parent;
+		if (prcont)
+			audit_log_format(ab, "^");
+	}
+out:
+	spin_unlock_irqrestore(&audit_contid_list_lock, flags);
+}
+
 /*
  * audit_log_container_id - report container info
  * @context: task or local context for record
@@ -2386,7 +2418,8 @@ void audit_log_container_id(struct audit_context *context, u64 contid)
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER_ID);
 	if (!ab)
 		return;
-	audit_log_format(ab, "contid=%llu", contid);
+	audit_log_format(ab, "contid=");
+	audit_log_contid(ab, contid);
 	audit_log_end(ab);
 }
 EXPORT_SYMBOL(audit_log_container_id);
@@ -2648,6 +2681,7 @@ void audit_cont_put(struct audit_cont *cont)
 		return;
 	if (refcount_dec_and_test(&cont->refcount)) {
 		put_task_struct(cont->owner);
+		audit_cont_put(cont->parent);
 		list_del_rcu(&cont->list);
 		kfree_rcu(cont, rcu);
 		audit_contid_count--;
@@ -2732,8 +2766,9 @@ int audit_set_contid(struct task_struct *task, u64 contid)
 		struct audit_cont *cont = NULL;
 		struct audit_cont *newcont = NULL;
 		int h = audit_hash_contid(contid);
+		unsigned long flags;
 
-		spin_lock(&audit_contid_list_lock);
+		spin_lock_irqsave(&audit_contid_list_lock, flags);
 		list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
 			if (cont->id == contid) {
 				/* task injection to existing container */
@@ -2757,6 +2792,9 @@ int audit_set_contid(struct task_struct *task, u64 contid)
 				newcont->id = contid;
 				get_task_struct(current);
 				newcont->owner = current;
+				newcont->parent = audit_cont(newcont->owner);
+				if (newcont->parent)
+					refcount_inc(&newcont->parent->refcount);
 				refcount_set(&newcont->refcount, 1);
 				list_add_rcu(&newcont->list, &audit_contid_hash[h]);
 				audit_contid_count++;
@@ -2768,7 +2806,7 @@ int audit_set_contid(struct task_struct *task, u64 contid)
 		task->audit->cont = newcont;
 		audit_cont_put(oldcont);
 conterror:
-		spin_unlock(&audit_contid_list_lock);
+		spin_unlock_irqrestore(&audit_contid_list_lock, flags);
 	}
 	if (!rc) {
 		if (audit_contid_valid(oldcontid))
@@ -2786,9 +2824,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
 
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	tty = audit_get_tty();
+	audit_log_format(ab, "op=set opid=%d contid=", task_tgid_nr(task));
+	audit_log_contid(ab, contid);
+	audit_log_format(ab, " old-contid=");
+	audit_log_contid(ab, oldcontid);
 	audit_log_format(ab,
-			 "op=set opid=%d contid=%llu old-contid=%llu pid=%d uid=%u auid=%u tty=%s ses=%u",
-			 task_tgid_nr(task), contid, oldcontid,
+			 " pid=%d uid=%u auid=%u tty=%s ses=%u",
 			 task_tgid_nr(current), uid,
 			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
 			 tty ? tty_name(tty) : "(none)",
@@ -2819,10 +2860,12 @@ void audit_log_container_drop(void)
 
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	tty = audit_get_tty();
+	audit_log_format(ab, "op=drop opid=%d contid=%llu old-contid=",
+			 task_tgid_nr(current), AUDIT_CID_UNSET);
+	audit_log_contid(ab, audit_get_contid(current));
 	audit_log_format(ab,
-			 "op=drop opid=%d contid=%llu old-contid=%llu pid=%d uid=%u auid=%u tty=%s ses=%u",
-			 task_tgid_nr(current), audit_get_contid(current),
-			 audit_get_contid(current), task_tgid_nr(current), uid,
+			 " pid=%d uid=%u auid=%u tty=%s ses=%u",
+			 task_tgid_nr(current), uid,
 			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
 			 tty ? tty_name(tty) : "(none)",
        			 audit_get_sessionid(current));
diff --git a/kernel/audit.h b/kernel/audit.h
index 25732fbc47a4..89b7de323c13 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -220,6 +220,8 @@ static inline int audit_hash_contid(u64 contid)
 	return (contid & (AUDIT_CONTID_BUCKETS-1));
 }
 
+extern spinlock_t audit_contid_list_lock;
+
 extern int audit_contid_count;
 
 #define AUDIT_CONTID_COUNT	1 << 16
@@ -235,6 +237,7 @@ struct audit_contid_status {
 extern int audit_match_class(int class, unsigned syscall);
 extern int audit_comparator(const u32 left, const u32 op, const u32 right);
 extern int audit_comparator64(const u64 left, const u32 op, const u64 right);
+extern int audit_contid_comparator(const u64 left, const u32 op, const u64 right);
 extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
 extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
 extern int parent_len(const char *path);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9606f973fe33..513d57d03637 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1297,6 +1297,24 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
 	}
 }
 
+int audit_contid_comparator(u64 left, u32 op, u64 right)
+{
+	struct audit_cont *cont = NULL;
+	int h;
+	int result = 0;
+	unsigned long flags;
+
+	h = audit_hash_contid(left);
+	spin_lock_irqsave(&audit_contid_list_lock, flags);
+	list_for_each_entry_rcu(cont, &audit_contid_hash[h], list) {
+		result = audit_comparator64(cont->id, op, right);
+		if (result)
+			break;
+	}
+	spin_unlock_irqrestore(&audit_contid_list_lock, flags);
+	return result;
+}
+
 /**
  * parent_len - find the length of the parent portion of a pathname
  * @path: pathname of which to determine length
@@ -1388,7 +1406,7 @@ int audit_filter(int msgtype, unsigned int listtype)
 							  f->op, f->val);
 				break;
 			case AUDIT_CONTID:
-				result = audit_comparator64(audit_get_contid(current),
+				result = audit_contid_comparator(audit_get_contid(current),
 							    f->op, f->val64);
 				break;
 			case AUDIT_MSGTYPE:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a658fe775b86..6bf6d8b9dfd1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -630,7 +630,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 							  f->op, f->val);
 			break;
 		case AUDIT_CONTID:
-			result = audit_comparator64(audit_get_contid(tsk),
+			result = audit_contid_comparator(audit_get_contid(tsk),
 						    f->op, f->val64);
 			break;
 		case AUDIT_SUBJ_USER:
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH ghak90 V7 17/21] audit: add support for loginuid/sessionid set/get by netlink
From: Richard Guy Briggs @ 2019-09-19  1:22 UTC (permalink / raw)
  To: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel
  Cc: nhorman, Richard Guy Briggs, dhowells, ebiederm, simo, eparis,
	mpatel, serge
In-Reply-To: <cover.1568834524.git.rgb@redhat.com>

Add the ability to get and set the login uid and to get the session id
using an audit netlink message using message types AUDIT_GET_LOGINUID
1024, AUDIT_SET_LOGINUID 1025 and AUDIT_GET_SESSIONID 1026 in addition
to using the proc filesystem.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/uapi/linux/audit.h |  3 +++
 kernel/audit.c             | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index e26729fc9943..eef42c8eea77 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -75,6 +75,9 @@
 #define AUDIT_SIGNAL_INFO2	1021	/* Get info auditd signal sender */
 #define AUDIT_GET_CONTID	1022	/* Get contid of a task */
 #define AUDIT_SET_CONTID	1023	/* Set contid of a task */
+#define AUDIT_GET_LOGINUID	1024	/* Get loginuid of a task */
+#define AUDIT_SET_LOGINUID	1025	/* Set loginuid of a task */
+#define AUDIT_GET_SESSIONID	1026	/* Set sessionid of a task */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
diff --git a/kernel/audit.c b/kernel/audit.c
index df92de20ed73..9e82de13d2eb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1184,6 +1184,15 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 {
 	int err = 0;
 
+	/* These messages can work outside the initial namespaces */
+	switch (msg_type) {
+	case AUDIT_GET_LOGINUID:
+	case AUDIT_GET_SESSIONID:
+		return 0;
+		break;
+	default:  /* do more checks below */
+		break;
+	}
 	/* Only support initial user namespace for now. */
 	/*
 	 * We return ECONNREFUSED because it tricks userspace into thinking
@@ -1218,6 +1227,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	case AUDIT_MAKE_EQUIV:
 	case AUDIT_GET_CONTID:
 	case AUDIT_SET_CONTID:
+	case AUDIT_SET_LOGINUID:
 		/* Only support auditd and auditctl in initial pid namespace
 		 * for now. */
 		if (task_active_pid_ns(current) != &init_pid_ns)
@@ -1292,6 +1302,33 @@ static int audit_get_contid_status(struct sk_buff *skb)
 	return 0;
 }
 
+struct audit_loginuid_status { uid_t loginuid; };
+
+static int audit_get_loginuid_status(struct sk_buff *skb)
+{
+	u32 seq;
+	uid_t loginuid;
+	struct audit_loginuid_status ls;
+
+	loginuid = from_kuid(current_user_ns(), audit_get_loginuid(current));
+	ls.loginuid = loginuid;
+
+	seq = nlmsg_hdr(skb)->nlmsg_seq;
+	audit_send_reply(skb, seq, AUDIT_GET_LOGINUID, 0, 0, &ls, sizeof(ls));
+	return loginuid;
+}
+
+static int audit_get_sessionid_status(struct sk_buff *skb)
+{
+	u32 seq;
+	struct audit_sessionid_status { u32 sessionid; };
+	struct audit_sessionid_status ss = { audit_get_sessionid(current) };
+
+	seq = nlmsg_hdr(skb)->nlmsg_seq;
+	audit_send_reply(skb, seq, AUDIT_GET_SESSIONID, 0, 0, &ss, sizeof(ss));
+	return audit_get_sessionid(current);
+}
+
 static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
 				     u32 old_lock, u32 new_lock, int res)
 {
@@ -1740,6 +1777,31 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (err)
 			return err;
 		break;
+	case AUDIT_SET_LOGINUID: {
+		uid_t *loginuid = data;
+		kuid_t kloginuid;
+
+		/* check if new data is valid */
+		if (nlmsg_len(nlh) < sizeof(u32))
+			return -EINVAL;
+
+		kloginuid = make_kuid(current_user_ns(), *loginuid);
+                if (!uid_valid(kloginuid))
+                        return -EINVAL;
+
+		return audit_set_loginuid(kloginuid);
+		break;
+	}
+	case AUDIT_GET_LOGINUID:
+		err = audit_get_loginuid_status(skb);
+		if (err)
+			return err;
+		break;
+	case AUDIT_GET_SESSIONID:
+		err = audit_get_sessionid_status(skb);
+		if (err)
+			return err;
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
1.8.3.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox