All of lore.kernel.org
 help / color / mirror / Atom feed
* [linux-next:master 1427/2449] ERROR: "__spi_register_driver" [sound/soc/codecs/snd-soc-wm0010.ko] undefined!
From: kbuild test robot @ 2020-02-14 19:37 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2472 bytes --]

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
head:   9f01828e9e1655836fea88d0c8225d648850b33a
commit: ea00d95200d02ece71f5814d41b14f2eb16d598b [1427/2449] ASoC: Use imply for SND_SOC_ALL_CODECS
config: x86_64-randconfig-f003-20200214 (attached as .config)
compiler: gcc-7 (Debian 7.5.0-4) 7.5.0
reproduce:
        git checkout ea00d95200d02ece71f5814d41b14f2eb16d598b
        # save the attached .config to linux build tree
        make ARCH=x86_64 

If you fix the issue, kindly add the following tag
Reported-by: kbuild test robot <lkp@intel.com>

Note: the linux-next/master HEAD 9f01828e9e1655836fea88d0c8225d648850b33a builds fine.
      It may have been fixed somewhere.

All errors (new ones prefixed by >>):

>> ERROR: "__spi_register_driver" [sound/soc/codecs/snd-soc-wm0010.ko] undefined!
>> ERROR: "spi_async" [sound/soc/codecs/snd-soc-wm0010.ko] undefined!
>> ERROR: "spi_sync" [sound/soc/codecs/snd-soc-wm0010.ko] undefined!
>> ERROR: "__spi_register_driver" [sound/soc/codecs/snd-soc-adav801.ko] undefined!
>> ERROR: "__spi_register_driver" [sound/soc/codecs/snd-soc-adau1977-spi.ko] undefined!
>> ERROR: "spi_write_then_read" [sound/soc/codecs/snd-soc-adau1977-spi.ko] undefined!
>> ERROR: "spi_get_device_id" [sound/soc/codecs/snd-soc-adau1977-spi.ko] undefined!
>> ERROR: "__spi_register_driver" [sound/soc/codecs/snd-soc-adau1781-spi.ko] undefined!
>> ERROR: "spi_write_then_read" [sound/soc/codecs/snd-soc-adau1781-spi.ko] undefined!
>> ERROR: "spi_get_device_id" [sound/soc/codecs/snd-soc-adau1781-spi.ko] undefined!
>> ERROR: "__spi_register_driver" [sound/soc/codecs/snd-soc-ad193x-spi.ko] undefined!
>> ERROR: "spi_get_device_id" [sound/soc/codecs/snd-soc-ad193x-spi.ko] undefined!
>> ERROR: "__spi_register_driver" [sound/soc/codecs/snd-soc-ad1836.ko] undefined!
>> ERROR: "spi_get_device_id" [sound/soc/codecs/snd-soc-ad1836.ko] undefined!
>> ERROR: "abx500_set_register_interruptible" [sound/soc/codecs/snd-soc-ab8500-codec.ko] undefined!
>> ERROR: "abx500_get_register_interruptible" [sound/soc/codecs/snd-soc-ab8500-codec.ko] undefined!
>> ERROR: "spi_sync" [drivers/base/regmap/regmap-spi.ko] undefined!
>> ERROR: "spi_async" [drivers/base/regmap/regmap-spi.ko] undefined!
>> ERROR: "spi_write_then_read" [drivers/base/regmap/regmap-spi.ko] undefined!

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 46967 bytes --]

^ permalink raw reply

* [PATCH AUTOSEL 4.9 072/141] net/wan/fsl_ucc_hdlc: remove set but not used variables 'ut_info' and 'ret'
From: Sasha Levin @ 2020-02-14 16:20 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Chen Zhou, netdev, Hulk Robot, linuxppc-dev,
	David S . Miller
In-Reply-To: <20200214162122.19794-1-sashal@kernel.org>

From: Chen Zhou <chenzhou10@huawei.com>

[ Upstream commit 270fe2ceda66b6964d4c6f261d7f562a02c1c786 ]

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/net/wan/fsl_ucc_hdlc.c: In function ucc_hdlc_irq_handler:
drivers/net/wan/fsl_ucc_hdlc.c:643:23:
	warning: variable ut_info set but not used [-Wunused-but-set-variable]
drivers/net/wan/fsl_ucc_hdlc.c: In function uhdlc_suspend:
drivers/net/wan/fsl_ucc_hdlc.c:880:23:
	warning: variable ut_info set but not used [-Wunused-but-set-variable]
drivers/net/wan/fsl_ucc_hdlc.c: In function uhdlc_resume:
drivers/net/wan/fsl_ucc_hdlc.c:925:6:
	warning: variable ret set but not used [-Wunused-but-set-variable]

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/wan/fsl_ucc_hdlc.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 87bf05a81db50..7c4a30391f746 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -591,11 +591,9 @@ static irqreturn_t ucc_hdlc_irq_handler(int irq, void *dev_id)
 	struct ucc_hdlc_private *priv = (struct ucc_hdlc_private *)dev_id;
 	struct net_device *dev = priv->ndev;
 	struct ucc_fast_private *uccf;
-	struct ucc_tdm_info *ut_info;
 	u32 ucce;
 	u32 uccm;
 
-	ut_info = priv->ut_info;
 	uccf = priv->uccf;
 
 	ucce = ioread32be(uccf->p_ucce);
@@ -826,7 +824,6 @@ static void resume_clk_config(struct ucc_hdlc_private *priv)
 static int uhdlc_suspend(struct device *dev)
 {
 	struct ucc_hdlc_private *priv = dev_get_drvdata(dev);
-	struct ucc_tdm_info *ut_info;
 	struct ucc_fast __iomem *uf_regs;
 
 	if (!priv)
@@ -838,7 +835,6 @@ static int uhdlc_suspend(struct device *dev)
 	netif_device_detach(priv->ndev);
 	napi_disable(&priv->napi);
 
-	ut_info = priv->ut_info;
 	uf_regs = priv->uf_regs;
 
 	/* backup gumr guemr*/
@@ -872,7 +868,7 @@ static int uhdlc_resume(struct device *dev)
 	struct ucc_fast __iomem *uf_regs;
 	struct ucc_fast_private *uccf;
 	struct ucc_fast_info *uf_info;
-	int ret, i;
+	int i;
 	u32 cecr_subblock;
 	u16 bd_status;
 
@@ -917,16 +913,16 @@ static int uhdlc_resume(struct device *dev)
 
 	/* Write to QE CECR, UCCx channel to Stop Transmission */
 	cecr_subblock = ucc_fast_get_qe_cr_subblock(uf_info->ucc_num);
-	ret = qe_issue_cmd(QE_STOP_TX, cecr_subblock,
-			   (u8)QE_CR_PROTOCOL_UNSPECIFIED, 0);
+	qe_issue_cmd(QE_STOP_TX, cecr_subblock,
+		     (u8)QE_CR_PROTOCOL_UNSPECIFIED, 0);
 
 	/* Set UPSMR normal mode */
 	iowrite32be(0, &uf_regs->upsmr);
 
 	/* init parameter base */
 	cecr_subblock = ucc_fast_get_qe_cr_subblock(uf_info->ucc_num);
-	ret = qe_issue_cmd(QE_ASSIGN_PAGE_TO_DEVICE, cecr_subblock,
-			   QE_CR_PROTOCOL_UNSPECIFIED, priv->ucc_pram_offset);
+	qe_issue_cmd(QE_ASSIGN_PAGE_TO_DEVICE, cecr_subblock,
+		     QE_CR_PROTOCOL_UNSPECIFIED, priv->ucc_pram_offset);
 
 	priv->ucc_pram = (struct ucc_hdlc_param __iomem *)
 				qe_muram_addr(priv->ucc_pram_offset);
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH 3/7] ext2fs: Update allocation info earlier in ext2fs_mkdir() and ext2fs_symlink()
From: Andreas Dilger @ 2020-02-14 19:37 UTC (permalink / raw)
  To: Jan Kara; +Cc: Ted Tso, linux-ext4
In-Reply-To: <20200213101602.29096-4-jack@suse.cz>

[-- Attachment #1: Type: text/plain, Size: 2987 bytes --]

On Feb 13, 2020, at 3:15 AM, Jan Kara <jack@suse.cz> wrote:
> 
> Currently, ext2fs_mkdir() and ext2fs_symlink() update allocation bitmaps
> and other information only close to the end of the function, in
> particular after calling to ext2fs_link(). When ext2fs_link() will
> support indexed directories, it will also need to allocate blocks and
> that would cause filesystem corruption in case allocation info isn't
> properly updated. So make sure ext2fs_mkdir() and ext2fs_symlink()
> update allocation info before calling into ext2fs_link().
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

I was wondering if this was done at the end of the function to avoid the
need to undo it if there was an error in the middle of the operation?
I suppose the worst that would happen in that case is an extra bit set
in the block bitmap until the next e2fsck, which is a relatively safe
side-effect...  I'm not sure whether e2fsck would abort anyway in
case either of these functions returns an error?

In any case, this is better than what is there currently.

Reviewed-by: Andreas Dilger <adilger@dilger.ca>


> ---
> lib/ext2fs/mkdir.c   | 14 +++++++-------
> lib/ext2fs/symlink.c | 14 +++++++-------
> 2 files changed, 14 insertions(+), 14 deletions(-)
> 
> diff --git a/lib/ext2fs/mkdir.c b/lib/ext2fs/mkdir.c
> index 2a63aad16715..947003ebf309 100644
> --- a/lib/ext2fs/mkdir.c
> +++ b/lib/ext2fs/mkdir.c
> @@ -143,6 +143,13 @@ errcode_t ext2fs_mkdir(ext2_filsys fs, ext2_ino_t parent, ext2_ino_t inum,
> 		}
> 	}
> 
> +	/*
> +	 * Update accounting....
> +	 */
> +	if (!inline_data)
> +		ext2fs_block_alloc_stats2(fs, blk, +1);
> +	ext2fs_inode_alloc_stats2(fs, ino, +1, 1);
> +
> 	/*
> 	 * Link the directory into the filesystem hierarchy
> 	 */
> @@ -175,13 +182,6 @@ errcode_t ext2fs_mkdir(ext2_filsys fs, ext2_ino_t parent, ext2_ino_t inum,
> 			goto cleanup;
> 	}
> 
> -	/*
> -	 * Update accounting....
> -	 */
> -	if (!inline_data)
> -		ext2fs_block_alloc_stats2(fs, blk, +1);
> -	ext2fs_inode_alloc_stats2(fs, ino, +1, 1);
> -
> cleanup:
> 	if (block)
> 		ext2fs_free_mem(&block);
> diff --git a/lib/ext2fs/symlink.c b/lib/ext2fs/symlink.c
> index 7f78c5f75549..3e07a539daf3 100644
> --- a/lib/ext2fs/symlink.c
> +++ b/lib/ext2fs/symlink.c
> @@ -162,6 +162,13 @@ need_block:
> 			goto cleanup;
> 	}
> 
> +	/*
> +	 * Update accounting....
> +	 */
> +	if (!fastlink && !inlinelink)
> +		ext2fs_block_alloc_stats2(fs, blk, +1);
> +	ext2fs_inode_alloc_stats2(fs, ino, +1, 0);
> +
> 	/*
> 	 * Link the symlink into the filesystem hierarchy
> 	 */
> @@ -179,13 +186,6 @@ need_block:
> 			goto cleanup;
> 	}
> 
> -	/*
> -	 * Update accounting....
> -	 */
> -	if (!fastlink && !inlinelink)
> -		ext2fs_block_alloc_stats2(fs, blk, +1);
> -	ext2fs_inode_alloc_stats2(fs, ino, +1, 0);
> -
> cleanup:
> 	if (block_buf)
> 		ext2fs_free_mem(&block_buf);
> --
> 2.16.4
> 


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply

* Re: [RFC patch 07/19] bpf: Provide BPF_PROG_RUN_PIN_ON_CPU() macro
From: Thomas Gleixner @ 2020-02-14 19:36 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: LKML, David Miller, bpf, netdev, Alexei Starovoitov,
	Daniel Borkmann, Sebastian Sewior, Peter Zijlstra, Clark Williams,
	Steven Rostedt, Juri Lelli, Ingo Molnar
In-Reply-To: <20200214185027.nx6enxvmghucai2d@localhost>

Mathieu Desnoyers <mathieu.desnoyers@efficios.com> writes:

> On 14-Feb-2020 02:39:24 PM, Thomas Gleixner wrote:
> [...]
>> +#define BPF_PROG_RUN_PIN_ON_CPU(prog, ctx) ({				\
>> +	u32 ret;							\
>> +	migrate_disable();						\
>> +	ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc);	\
>> +	migrate_enable();						\
>> +	ret; })
>
> Does it really have to be a statement expression with a local variable ?
>
> If so, we should consider renaming "ret" to "__ret" to minimize the
> chances of a caller issuing BPF_PROG_RUN_PIN_ON_CPU with "ret" as
> prog or ctx argument, which would lead to unexpected results.

Indeed. That really can be an inline.

Thanks,

        tglx

^ permalink raw reply

* Re: [Intel-gfx] [PATCH] drm/i915/selftests: Fix selftest_mocs for DGFX
From: Daniele Ceraolo Spurio @ 2020-02-14 19:36 UTC (permalink / raw)
  To: Chris Wilson, Brian Welty, intel-gfx
In-Reply-To: <158170497173.15393.11944816323451861470@skylake-alporthouse-com>



On 2/14/20 10:29 AM, Chris Wilson wrote:
> Quoting Daniele Ceraolo Spurio (2020-02-14 17:56:58)
>>
>>
>> On 2/12/20 4:49 PM, Brian Welty wrote:
>>>
>>> On 2/12/2020 4:34 PM, Chris Wilson wrote:
>>>> Quoting Brian Welty (2020-02-13 00:14:18)
>>>>> For DGFX devices, the MOCS control value is not initialized or used.
>>>>
>>>> Then why is the table populated?
>>>> -Chris
>>>>
>>>
>>> The format has changed (been reduced?) for DGFX.  drm_i915_mocs_entry.l3cc_value is what is still initialized/used.
>>> Probably first needed is the patch that defines the table entries for DGFX.
>>> Ugh, I didn't notice this wasn't applied yet.  Let me ask about this.
>>>
>>
>> We do have:
>>
>> commit e6e2ac07118b15f25683fcbd59ea1be73ec9465d
>> Author: Lucas De Marchi <lucas.demarchi@intel.com>
>> Date:   Thu Oct 24 12:51:21 2019 -0700
>>
>>       drm/i915: do not set MOCS control values on dgfx
>>
>> So I see no reason not to add this change to the test side to match
>> that. Maybe we can add an additional check in the test to validate that
>> all the control_entries are set to 0 in the table on DGFX?
> 
> My expectation was that as we were not setting mocs values, we would not
> have defined a table for it. However, the table is combined for mocs and
> l3cc. l3cc is still used, right?
> 

yes, l3cc is still used. The diff below looks ok to me to keep the 
table-driven approach.

Daniele

> My ideal would be that our tables did remain the truth value we could
> use directly -- that would require splitting the tables though.
> 
> If we did something like
> 
> diff --git a/drivers/gpu/drm/i915/gt/selftest_mocs.c b/drivers/gpu/drm/i915/gt/selftest_mocs.c
> index de1f83100fb6..2c636257f12c 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_mocs.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_mocs.c
> @@ -12,7 +12,8 @@
>   #include "selftests/igt_spinner.h"
> 
>   struct live_mocs {
> -	struct drm_i915_mocs_table table;
> +	struct drm_i915_mocs_table mocs;
> +	struct drm_i915_mocs_table l3cc;
>   	struct i915_vma *scratch;
>   	void *vaddr;
>   };
> @@ -68,13 +69,32 @@ static struct i915_vma *create_scratch(struct intel_gt *gt)
>   	return vma;
>   }
> 
> +static bool has_l3cc(struct drm_i915_private *i915)
> +{
> +	return true;
> +}
> +
> +static bool has_mocs(struct drm_i915_private *i915)
> +{
> +	return !IS_DGFX(i915);
> +}
> +
>   static int live_mocs_init(struct live_mocs *arg, struct intel_gt *gt)
>   {
> +	struct drm_i915_mocs_table table;
>   	int err;
> 
> -	if (!get_mocs_settings(gt->i915, &arg->table))
> +	memset(arg, 0, sizeof(*arg));
> +
> +	if (!get_mocs_settings(gt->i915, &table))
>   		return -EINVAL;
> 
> +	if (has_l3cc(gt->i915))
> +		arg->l3cc = table;
> +
> +	if (has_mocs(gt->i915))
> +		arg->mocs = table;
> +
>   	arg->scratch = create_scratch(gt);
>   	if (IS_ERR(arg->scratch))
>   		return PTR_ERR(arg->scratch);
> @@ -223,9 +243,9 @@ static int check_mocs_engine(struct live_mocs *arg,
>   	/* Read the mocs tables back using SRM */
>   	offset = i915_ggtt_offset(vma);
>   	if (!err)
> -		err = read_mocs_table(rq, &arg->table, &offset);
> +		err = read_mocs_table(rq, &arg->mocs, &offset);
>   	if (!err && ce->engine->class == RENDER_CLASS)
> -		err = read_l3cc_table(rq, &arg->table, &offset);
> +		err = read_l3cc_table(rq, &arg->l3cc, &offset);
>   	offset -= i915_ggtt_offset(vma);
>   	GEM_BUG_ON(offset > PAGE_SIZE);
> 
> @@ -236,9 +256,9 @@ static int check_mocs_engine(struct live_mocs *arg,
>   	/* Compare the results against the expected tables */
>   	vaddr = arg->vaddr;
>   	if (!err)
> -		err = check_mocs_table(ce->engine, &arg->table, &vaddr);
> +		err = check_mocs_table(ce->engine, &arg->mocs, &vaddr);
>   	if (!err && ce->engine->class == RENDER_CLASS)
> -		err = check_l3cc_table(ce->engine, &arg->table, &vaddr);
> +		err = check_l3cc_table(ce->engine, &arg->l3cc, &vaddr);
>   	if (err)
>   		return err;
> 
> 
> we could retain the table driven approach?
> -Chris
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply

* Re: [PATCH v2 14/19] target/riscv: progressively load the instruction during decode
From: Robert Foley @ 2020-02-14 19:34 UTC (permalink / raw)
  To: Alex Bennée
  Cc: fam, Alistair Francis, berrange, Sagar Karandikar, pbonzini,
	stefanb, Bastian Koppelmann, Richard Henderson, qemu-devel,
	robhenry, f4bug, marcandre.lureau, aaron, cota, Palmer Dabbelt,
	stefanha, kuhn.chenqun, Peter Puhov, open list:RISC-V TCG CPUs,
	aurelien
In-Reply-To: <20200213225109.13120-15-alex.bennee@linaro.org>

On Thu, 13 Feb 2020 at 18:00, Alex Bennée <alex.bennee@linaro.org> wrote:
>
> The plugin system would throw up a harmless warning when it detected
> that a disassembly of an instruction didn't use all its bytes. Fix
> the riscv decoder to only load the instruction bytes it needs as it
> needs them.
>
> This drops opcode from the ctx in favour of passing the appropriately
> sized opcode down a few levels of the decode.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Robert Foley <robert.foley@linaro.org>


^ permalink raw reply

* Re: [PATCH] kcsan, trace: Make KCSAN compatible with tracing
From: Qian Cai @ 2020-02-14 19:35 UTC (permalink / raw)
  To: Marco Elver
  Cc: paulmck, andreyknvl, glider, dvyukov, kasan-dev, linux-kernel,
	rostedt, mingo
In-Reply-To: <20200214190500.126066-1-elver@google.com>

On Fri, 2020-02-14 at 20:05 +0100, Marco Elver wrote:
> Previously the system would lock up if ftrace was enabled together with
> KCSAN. This is due to recursion on reporting if the tracer code is
> instrumented with KCSAN.
> 
> To avoid this for all types of tracing, disable KCSAN instrumentation
> for all of kernel/trace.

I thought KCSAN + ftrace was working last week, but I probably misremembered.
Anyway, this patch works fine. Feel free to add:

Tested-by: Qian Cai <cai@lca.pw>

> 
> Signed-off-by: Marco Elver <elver@google.com>
> Reported-by: Qian Cai <cai@lca.pw>
> Cc: Paul E. McKenney <paulmck@kernel.org>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> ---
>  kernel/kcsan/Makefile | 2 ++
>  kernel/trace/Makefile | 3 +++
>  2 files changed, 5 insertions(+)
> 
> diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
> index df6b7799e4927..d4999b38d1be5 100644
> --- a/kernel/kcsan/Makefile
> +++ b/kernel/kcsan/Makefile
> @@ -4,6 +4,8 @@ KCOV_INSTRUMENT := n
>  UBSAN_SANITIZE := n
>  
>  CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
> +CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
> +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
>  
>  CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
>  	$(call cc-option,-fno-stack-protector,)
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index f9dcd19165fa2..6b601d88bf71e 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -6,6 +6,9 @@ ifdef CONFIG_FUNCTION_TRACER
>  ORIG_CFLAGS := $(KBUILD_CFLAGS)
>  KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
>  
> +# Avoid recursion due to instrumentation.
> +KCSAN_SANITIZE := n
> +
>  ifdef CONFIG_FTRACE_SELFTEST
>  # selftest needs instrumentation
>  CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)

^ permalink raw reply

* Re: [PATCH v2 14/19] target/riscv: progressively load the instruction during decode
From: Robert Foley @ 2020-02-14 19:34 UTC (permalink / raw)
  To: Alex Bennée
  Cc: qemu-devel, cota, aaron, Peter Puhov, kuhn.chenqun, robhenry, fam,
	berrange, f4bug, Richard Henderson, balrogg, aurelien, pbonzini,
	stefanha, stefanb, marcandre.lureau, Palmer Dabbelt,
	Alistair Francis, Sagar Karandikar, Bastian Koppelmann,
	open list:RISC-V TCG CPUs
In-Reply-To: <20200213225109.13120-15-alex.bennee@linaro.org>

On Thu, 13 Feb 2020 at 18:00, Alex Bennée <alex.bennee@linaro.org> wrote:
>
> The plugin system would throw up a harmless warning when it detected
> that a disassembly of an instruction didn't use all its bytes. Fix
> the riscv decoder to only load the instruction bytes it needs as it
> needs them.
>
> This drops opcode from the ctx in favour of passing the appropriately
> sized opcode down a few levels of the decode.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Robert Foley <robert.foley@linaro.org>


^ permalink raw reply

* Re: [RFC PATCH 1/1] selinux-testsuite: Use native filesystem for fs tests
From: Stephen Smalley @ 2020-02-14 19:35 UTC (permalink / raw)
  To: Richard Haines, selinux, paul
In-Reply-To: <20200214085643.3119-1-richard_c_haines@btinternet.com>

On 2/14/20 3:56 AM, Richard Haines wrote:
> Use the filesystem type that the selinux-testsuite is running from to be
> used for tests/filesystem and tests/fs_filesystem.
> 
> If testing locally the -f <fs_type> can be used to test a specific type.
> 
> These are the tested and supported filesystem types: ext2, ext3, ext4, xfs,
> btrfs, hfsplus, reiserfs, nfs4. If not in this list, tests are skipped.

Same comment as for the cover letter: ext4, xfs, nfs4 are the main ones 
of interest.  If you want to also allow for running the tests on ext[23] 
and btrfs that is fine but I wouldn't bother with hfsplus or reiserfs. 
I don't think you actually need a whitelist at all though. If someone 
runs the test on an unsupported filesystem and it fails, that's ok - 
that is correctly informing them that their filesystem doesn't support 
that aspect of SELinux functionality.  Why bother whitelisting and 
skipping tests in that situation?

I don't know if we might want to also include a test of context= mount 
functionality for a filesystem type that doesn't support file security 
labeling natively, e.g. vfat?  That's a common use case for context= 
mounts for removable media.

> Signed-off-by: Richard Haines <richard_c_haines@btinternet.com>
> ---

> diff --git a/policy/test_filesystem.te b/policy/test_filesystem.te
> index 09f9d4a..fd928de 100644
> --- a/policy/test_filesystem.te
> +++ b/policy/test_filesystem.te
>   allow test_filesystem_t test_filesystem_filetranscon_t:file { create getattr open write relabelfrom };
>   dontaudit unconfined_t test_filesystem_filetranscon_t:file { getattr read };
>   
> +#
> +############## Additional reiserfs rules ########################

Comment seems suspect (reiserfs above versus nfs below).

> +#
> +gen_require(`
> +	type nfs_t;
> +')
> +allow test_filesystem_no_getattr_t unlabeled_t:dir { search };

Why unlabeled_t? That seems like a bug.  Don't hide bugs in the test 
policy or code; we want them exposed as failures.

> +allow test_filesystem_no_getattr_t nfs_t:filesystem { associate };

Could allow for all of your test domains via a single rule on the 
"filesystemdomain" attribute?  Kind of weird using a domain type in a 
rootcontext= mount option but whatever.
> +#
> +############## Additional hfsplus rules ########################

Drop hfsplus, maybe switch to vfat testing of just context= mounts?

> +#
> +############### Additional NFS rules ###############
> +#
> +##### NFS mount option: rootcontext=system_u:object_r:test_filesystem_file_t:s0
> +# Note that defcontext is not supported by nfs
> +allow_map(test_filesystem_t, test_filesystem_file_t, file)
> +allow test_filesystem_t test_filesystem_file_t:dir { mounton };
> +allow test_filesystem_t test_filesystem_file_t:file { entrypoint execute read };

Why are you executing from the mount?

> +allow test_filesystem_t test_filesystem_file_t:filesystem { mount getattr remount relabelfrom relabelto unmount };
> +
> +# Test file:
> +allow test_filesystem_t test_file_t:file { create relabelfrom write };
> +
> +gen_require(`
> +	type user_home_t;
> +')
> +allow test_no_setfscreatecon_t user_home_t:dir { search };
> +allow test_setfscreatecon_t user_home_t:dir { search };

For these and all subsequent references to user_home_t, use a single 
rule on an attribute and try to use an interface to avoid assuming it 
must be user_home_t (versus admin_home_t or whatever). Maybe 
files_list_home(filesystemdomain) or 
userdom_search_user_home_content(filesystemdomain)?

Throughout, try to rewrite to use attributes to reduce identical rules.

> diff --git a/tests/filesystem/Filesystem.pm b/tests/filesystem/Filesystem.pm
> index a08570a..20b01af 100644
> --- a/tests/filesystem/Filesystem.pm
> +++ b/tests/filesystem/Filesystem.pm
> @@ -111,17 +111,30 @@ sub attach_dev {
>   
>   sub make_fs {
>       my ( $mk_type, $mk_dev, $mk_dir ) = @_;
> +
> +    if ( $mk_type eq "btrfs" or $mk_type eq "reiserfs" ) {
> +        $count = "count=27904";
> +    }

Why does btrfs or reiserfs  need a weird count value?  Why that 
particular value?  In any event, I wouldn't go out of your way to 
support either one.  If there is some sane value that we can use for all 
filesystem types, let's use that; otherwise just let it break on those 
filesystems.

> +    $opt = " ";
> +    if ( $mk_type eq "reiserfs" ) {
> +        $opt = "-q";    # Otherwise asks to proceed
> +    }

No need to support reiserfs specially IMHO.

> diff --git a/tests/filesystem/test b/tests/filesystem/test
> index 78faf72..b8d14ed 100755
> --- a/tests/filesystem/test
> +++ b/tests/filesystem/test
> @@ -12,19 +12,82 @@ BEGIN {
<snip>
> +    # If NFS specified, exit as cannot run locally CHECK IF NFS RUNNING ???
> +    if ( $fs_type eq "nfs4" or $fs_type eq "nfs" ) {
> +        plan skip_all => "Skip tests as running $fs_type locally not supported";
> +    }

Super-confused here.  NFS supports running locally as evidenced by my 
tools/nfs.sh script and README.md instructions. And you said you were 
going to support it on the cover and patch description.  Why skip out 
here?  And why do you have to test for both nfs4 and nfs?  My logic for 
skipping certain tests on nfs only needed to check for nfs based on 
output of stat -f --print %T $basedir.

> +    if (    $fs_type ne "ext2"
> +        and $fs_type ne "ext3"
> +        and $fs_type ne "ext4"
> +        and $fs_type ne "xfs"
> +        and $fs_type ne "btrfs"
> +        and $fs_type ne "hfsplus"
> +        and $fs_type ne "reiserfs"
> +        and $fs_type ne "nfs4"
> +        and $fs_type ne "nfs" )
> +    {
> +        plan skip_all => "Skip tests as $fs_type is not supported";
> +    }

IMHO no need for this whitelist or logic at all.  Just run the tests on 
whatever filesystem we have and if it fails, it fails.  That's ok.

> +    print "Testing filesystem type: $fs_type\n";
> +
> +    if ( $fs_type eq "nfs4" or $fs_type eq "nfs" ) {
> +        system("$basedir/test-nfs.pl $v -f $fs_type");
> +        exit 0;

Hmmm...if test-nfs.pl fails the error won't get propagated up?  And do 
we really need a separate test script for it?

> +    }
> +
> +    # Note: ext2, ext3, ext4, f2fs, reiserfs and jfs call dquot_quota_on();
> +    # therefore could check qouta permissions
> +    if (
> +        $fs_type eq "xfs"            # Requires xfs_quota(8)
> +        or $fs_type eq "btrfs"       # Requires btrfs_quota(8)
> +        or $fs_type eq "hfsplus"     # Does not support quotas
> +        or $fs_type eq "reiserfs"    # Has internal quota.
> +        or $fs_type eq "nfs4"        # Does not support quotas
> +        or $fs_type eq "nfs"
> +      )
> +    {
> +        $test_count   = 54;
> +        $quota_checks = 0;
> +    }
> +    else {
> +        $test_count = 68;
> +    }

Do the quota tests pass if you install xfs_quota?  If so, just note it 
as a dependency when running on a xfs filesystem in the README.md and 
run the tests.  On nfs4/nfs, skip the tests like we do for other 
functionality not supported on nfs.  The rest I'd leave out and just 
allow to fail until such a time as someone cares.

> +
> +    # These do not support defcontext tests
> +    if (   $fs_type eq "reiserfs"
> +        or $fs_type eq "hfsplus"
> +        or $fs_type eq "nfs4"
> +        or $fs_type eq "nfs" )
> +    {
> +        $test_count -= 6;

Skip on nfs/nfs4; otherwise we don't care.

> @@ -129,7 +197,12 @@ $result =
>     system(
>   "runcon -t test_filesystem_t $basedir/create_file_change_context -t test_filesystem_filecon_t -f $private_path/mp1/test_file $v"
>     );
> -ok( $result eq 0 );
> +if ( $fs_type eq "reiserfs" or $fs_type eq "hfsplus" ) {
> +    ok( $result >> 8 eq 95 );    # EOPNOTSUPP
> +}
> +else {
> +    ok( $result eq 0 );
> +}

Drop - we don't care about reiserfs or hfsplus SELinux support; let it 
fail and if someone cares they should implement the actual support not 
just skip it.

> @@ -221,7 +293,7 @@ mk_mntpoint_1($private_path);
>   ( $dev, $device_count ) = get_loop_dev( \@device_list, $device_count );
>   make_fs( $fs_type, $dev, $basedir );
>   $opts_no_relabelfrom =
> -  "defcontext=system_u:object_r:test_filesystem_sb_relabel_no_relabelfrom_t:s0";
> +"rootcontext=system_u:object_r:test_filesystem_sb_relabel_no_relabelfrom_t:s0";

Does this mean we won't exercise the check even on filesystem types that 
support it?  That doesn't seem desirable.  Optimally we'd test it for both.

> @@ -312,7 +384,12 @@ $result =
>     system(
>   "runcon -t test_filesystem_may_create_no_associate_t $basedir/create_file_change_context -t unconfined_t -f $basedir/mntpoint/mp1/test_file $v 2>&1"
>     );
> -ok( $result >> 8 eq 13 );
> +if ( $fs_type eq "reiserfs" or $fs_type eq "hfsplus" ) {
> +    ok( $result >> 8 eq 95 );    # EOPNOTSUPP
> +}

Don't care about testing reiserfs or hfsplus.

> diff --git a/tests/filesystem/test-nfs.pl b/tests/filesystem/test-nfs.pl
> new file mode 100755
> index 0000000..d6a931d
> --- /dev/null
> +++ b/tests/filesystem/test-nfs.pl
<snip>
> +############### Test setfscreatecon(3) ##########################
> +system("mkdir -p $basedir/mntpoint 2>/dev/null");
> +
> +print "Test setfscreatecon(3)\n";
> +$result = system
> +"runcon -t test_setfscreatecon_t $basedir/fs_relabel $v -b $basedir/mntpoint -t test_setfscreatecon_newcon_t";
> +ok( $result eq 0 );
> +
> +$result = system
> +"runcon -t test_no_setfscreatecon_t $basedir/fs_relabel $v -b $basedir/mntpoint -t test_setfscreatecon_newcon_t 2>&1";
> +ok( $result >> 8 eq 13 );

No, we don't want to replicate the tests in another script that has to 
be maintained separately.  The goal is to exercise the same test code on 
whatever filesystem we have.  So you could take parts of this new script 
back into test to set up the mount, but then we should proceed and run 
as many of the tests in the test script as are feasible on NFS.

> diff --git a/tools/nfs.sh b/tools/nfs.sh
> index 314f898..fb235dc 100755
> --- a/tools/nfs.sh
> +++ b/tools/nfs.sh
> @@ -1,4 +1,16 @@
>   #!/bin/sh -e
> +
> +# If 'make test' fails, close cleanly
> +function err_exit() {
> +	popd
> +	umount /mnt/selinux-testsuite
> +	exportfs -u localhost:$MOUNT
> +	rmdir /mnt/selinux-testsuite
> +	systemctl stop nfs-server
> +}
> +
> +trap 'err_exit' EXIT
> +

That's a nice cleanup regardless of the rest of this patch; feel free to 
separate it out and submit it.

>   MOUNT=`stat --print %m .`
>   TESTDIR=`pwd`
>   systemctl start nfs-server
> @@ -7,6 +19,10 @@ systemctl start nfs-server
>   exportfs -orw,no_root_squash,security_label localhost:$MOUNT
>   mkdir -p /mnt/selinux-testsuite
>   mount -t nfs -o vers=4.2 localhost:$TESTDIR /mnt/selinux-testsuite
> +# These may be used for tests/filesystem only at present as there is
> +# a bug in fsconfig(2), therefore tests/fs_filesystem will fail:
> +# mount -t nfs -o vers=4.2,rootcontext=system_u:object_r:test_filesystem_file_t:s0 localhost:$TESTDIR /mnt/selinux-testsuite
> +# mount -t nfs -o vers=4.2,fscontext=system_u:object_r:test_filesystem_file_t:s0 localhost:$TESTDIR /mnt/selinux-testsuite
>   pushd /mnt/selinux-testsuite
>   make test
>   popd

No, we should leave those tests failing until they are fixed - it is a 
real bug.


^ permalink raw reply

* Re: [linux-lvm] commit c527a0cbfc3 may have a bug
From: Gionatan Danti @ 2020-02-14 19:34 UTC (permalink / raw)
  To: LVM general discussion and development; +Cc: heming.zhao
In-Reply-To: <20200214191115.GA20792@redhat.com>

Il 2020-02-14 20:11 David Teigland ha scritto:
> Hi, it looks like a bug led to an incorrect filter configuration 
> actually
> working for a period of time.  When the bug was later fixed, the 
> incorrect
> filter became apparent.  In summary, the correct way to exclude devs 
> from
> lvmetad (and to handle duplicate PVs) is to set global_filter; filter 
> is
> not meant to work for that.

Hi David, since filters are one of the most frequently asked-about topics,
can I ask why we have so many different filters, leading to such complex
interactions and behaviors?

Don't get me wrong: I am sure you (the lvm team) have very good reasons
to do that, and I am surely missing something. But what, precisely? How
should we (end users) consider filters? Should we only use
global_filter?

Thanks.

-- 
Danti Gionatan
Supporto Tecnico
Assyoma S.r.l. - www.assyoma.it [1]
email: g.danti@assyoma.it - info@assyoma.it
GPG public key ID: FF5F32A8

^ permalink raw reply

* Re: [PATCH] kvm/emulate: fix a -Werror=cast-function-type
From: Paolo Bonzini @ 2020-02-14 19:33 UTC (permalink / raw)
  To: Qian Cai, Jim Mattson
  Cc: Sean Christopherson, Vitaly Kuznetsov, Wanpeng Li, Joerg Roedel,
	kvm list, LKML
In-Reply-To: <1581707646.7365.72.camel@lca.pw>

On 14/02/20 20:14, Qian Cai wrote:
>> It seems misguided to define a local variable just to get an implicit
>> cast from (void *) to (fastop_t). Sean's first suggestion gives you
>> the same implicit cast without the local variable. The second
>> suggestion makes both casts explicit.
> 
> OK, I'll do a v2 using the first suggestion, which looks simpler, once it
> passes compilation.
> 

Another interesting possibility is to use an unnamed union of a
(*execute) function pointer and a (*fastop) function pointer.

Paolo


^ permalink raw reply

* [ANNOUNCE] v5.4.19-rt11
From: Sebastian Andrzej Siewior @ 2020-02-14 19:33 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, linux-rt-users, Steven Rostedt

Dear RT folks!

I'm pleased to announce the v5.4.19-rt11 patch set. 

Changes since v5.4.19-rt10:

  - Interrupts were disabled in the i915 driver with lockdep enabled, leading
    to warnings. Reported by Fernando Lopez-Lezcano, patch by Mike Galbraith.

  - BPF series by Thomas Gleixner. The series reworks the locking by and
    within BPF which enables its usage on RT.

Known issues
     - None

The delta patch against v5.4.19-rt10 is appended below and can be found here:
 
     https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.4/incr/patch-5.4.19-rt10-rt11.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.4.19-rt11

The RT patch against v5.4.19 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.4/older/patch-5.4.19-rt11.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.4/older/patches-5.4.19-rt11.tar.xz

Sebastian
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 65b5ca74b3947..0e48a3d8ea22c 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -38,12 +38,15 @@ static int __engine_unpark(struct intel_wakeref *wf)
 }
 
 #if IS_ENABLED(CONFIG_LOCKDEP)
+#include <linux/locallock.h>
+
+static DEFINE_LOCAL_IRQ_LOCK(timeline_lock);
 
 static inline unsigned long __timeline_mark_lock(struct intel_context *ce)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_lock_irqsave(timeline_lock, flags);
 	mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_);
 
 	return flags;
@@ -53,7 +56,7 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
 					  unsigned long flags)
 {
 	mutex_release(&ce->timeline->mutex.dep_map, 0, _THIS_IP_);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(timeline_lock, flags);
 }
 
 #else
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3bf3835d0e866..3e6744c7122d6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -541,7 +541,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		struct bpf_prog *_prog;			\
 		struct bpf_prog_array *_array;		\
 		u32 _ret = 1;				\
-		preempt_disable();			\
+		migrate_disable();			\
 		rcu_read_lock();			\
 		_array = rcu_dereference(array);	\
 		if (unlikely(check_non_null && !_array))\
@@ -554,7 +554,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		}					\
 _out:							\
 		rcu_read_unlock();			\
-		preempt_enable();			\
+		migrate_enable();			\
 		_ret;					\
 	 })
 
@@ -588,7 +588,7 @@ _out:							\
 		u32 ret;				\
 		u32 _ret = 1;				\
 		u32 _cn = 0;				\
-		preempt_disable();			\
+		migrate_disable();			\
 		rcu_read_lock();			\
 		_array = rcu_dereference(array);	\
 		_item = &_array->items[0];		\
@@ -600,7 +600,7 @@ _out:							\
 			_item++;			\
 		}					\
 		rcu_read_unlock();			\
-		preempt_enable();			\
+		migrate_enable();			\
 		if (_ret)				\
 			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
 		else					\
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0367a75f873b6..76ce2dcb52cba 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -555,7 +555,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 
 #define BPF_PROG_RUN(prog, ctx)	({				\
 	u32 ret;						\
-	cant_sleep();						\
+	cant_migrate();						\
 	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
 		struct bpf_prog_stats *stats;			\
 		u64 start = sched_clock();			\
@@ -570,6 +570,24 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 	}							\
 	ret; })
 
+/*
+ * Use in preemptible and therefore migratable context to make sure that
+ * the execution of the BPF program runs on one CPU.
+ *
+ * This uses migrate_disable/enable() explicitly to document that the
+ * invocation of a BPF program does not require reentrancy protection
+ * against a BPF program which is invoked from a preempting task.
+ *
+ * For non RT enabled kernels migrate_disable/enable() maps to
+ * preempt_disable/enable(), i.e. it disables also preemption.
+ */
+#define BPF_PROG_RUN_PIN_ON_CPU(prog, ctx) ({				\
+	u32 ret;							\
+	migrate_disable();						\
+	ret = BPF_PROG_RUN(prog, ctx);					\
+	migrate_enable();						\
+	ret; })
+
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
 struct bpf_skb_data_end {
@@ -647,6 +665,7 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 	return qdisc_skb_cb(skb)->data;
 }
 
+/* Must be invoked with migration disabled */
 static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
 					 struct sk_buff *skb)
 {
@@ -672,9 +691,9 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
 {
 	u32 res;
 
-	preempt_disable();
+	migrate_disable();
 	res = __bpf_prog_run_save_cb(prog, skb);
-	preempt_enable();
+	migrate_enable();
 	return res;
 }
 
@@ -687,9 +706,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 	if (unlikely(prog->cb_access))
 		memset(cb_data, 0, BPF_SKB_CB_LEN);
 
-	preempt_disable();
-	res = BPF_PROG_RUN(prog, skb);
-	preempt_enable();
+	res = BPF_PROG_RUN_PIN_ON_CPU(prog, skb);
 	return res;
 }
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2f9abc6aab0be..f5ec1ddbfe070 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -271,6 +271,13 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
 
 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
 
+#ifndef CONFIG_PREEMPT_RT
+# define cant_migrate()		cant_sleep()
+#else
+  /* Placeholder for now */
+# define cant_migrate()		do { } while (0)
+#endif
+
 /**
  * abs - return absolute value of an argument
  * @x: the value.  If it is unsigned type, it is converted to signed type first.
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index adb085fe31e43..e1aab77564eeb 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -238,8 +238,30 @@ static inline int __migrate_disabled(struct task_struct *p)
 }
 
 #else
-#define migrate_disable()		preempt_disable()
-#define migrate_enable()		preempt_enable()
+/**
+ * migrate_disable - Prevent migration of the current task
+ *
+ * Maps to preempt_disable() which also disables preemption. Use
+ * migrate_disable() to annotate that the intent is to prevent migration
+ * but not necessarily preemption.
+ *
+ * Can be invoked nested like preempt_disable() and needs the corresponding
+ * number of migrate_enable() invocations.
+ */
+#define migrate_disable()	preempt_disable()
+
+/**
+ * migrate_enable - Allow migration of the current task
+ *
+ * Counterpart to migrate_disable().
+ *
+ * As migrate_disable() can be invoked nested only the outermost invocation
+ * reenables migration.
+ *
+ * Currently mapped to preempt_enable().
+ */
+#define migrate_enable()	preempt_enable()
+
 static inline int __migrate_disabled(struct task_struct *p)
 {
 	return 0;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c97..c91ec298decad 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,9 +17,62 @@
 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
 	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
 
+/*
+ * The bucket lock has two protection scopes:
+ *
+ * 1) Serializing concurrent operations from BPF programs on different
+ *    CPUs
+ *
+ * 2) Serializing concurrent operations from BPF programs and sys_bpf()
+ *
+ * BPF programs can execute in any context including perf, kprobes and
+ * tracing. As there are almost no limits where perf, kprobes and tracing
+ * can be invoked from, the lock operations need to be protected against
+ * deadlocks. Deadlocks can be caused by recursion and by an invocation in
+ * the lock held section when functions which acquire this lock are invoked
+ * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
+ * variable bpf_prog_active, which prevents BPF programs attached to perf
+ * events, kprobes and tracing to be invoked before the prior invocation
+ * from one of these contexts completed. sys_bpf() uses the same mechanism
+ * by pinning the task to the current CPU and incrementing the recursion
+ * protection across the map operation.
+ *
+ * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
+ * operations like memory allocations (even with GFP_ATOMIC) from atomic
+ * contexts. This is required because even with GFP_ATOMIC the memory
+ * allocator calls into code paths which acquire locks with long held lock
+ * sections. To ensure the deterministic behaviour these locks are regular
+ * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
+ * true atomic contexts on an RT kernel are the low level hardware
+ * handling, scheduling, low level interrupt handling, NMIs etc. None of
+ * these contexts should ever do memory allocations.
+ *
+ * As regular device interrupt handlers and soft interrupts are forced into
+ * thread context, the existing code which does
+ *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
+ * just works.
+ *
+ * In theory the BPF locks could be converted to regular spinlocks as well,
+ * but the bucket locks and percpu_freelist locks can be taken from
+ * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
+ * atomic contexts even on RT. These mechanisms require preallocated maps,
+ * so there is no need to invoke memory allocations within the lock held
+ * sections.
+ *
+ * BPF maps which need dynamic allocation are only used from (forced)
+ * thread context on RT and can therefore use regular spinlocks which in
+ * turn allows to invoke memory allocations from the lock held section.
+ *
+ * On a non RT kernel this distinction is neither possible nor required.
+ * spinlock maps to raw_spinlock and the extra code is optimized out by the
+ * compiler.
+ */
 struct bucket {
 	struct hlist_nulls_head head;
-	raw_spinlock_t lock;
+	union {
+		raw_spinlock_t raw_lock;
+		spinlock_t     lock;
+	};
 };
 
 struct bpf_htab {
@@ -57,6 +110,51 @@ struct htab_elem {
 	char key[0] __aligned(8);
 };
 
+static inline bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
+{
+	return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
+}
+
+static void htab_init_buckets(struct bpf_htab *htab)
+{
+	unsigned i;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
+		if (htab_use_raw_lock(htab))
+			raw_spin_lock_init(&htab->buckets[i].raw_lock);
+		else
+			spin_lock_init(&htab->buckets[i].lock);
+	}
+}
+
+static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
+					     struct bucket *b)
+{
+	unsigned long flags;
+
+	if (htab_use_raw_lock(htab))
+		raw_spin_lock_irqsave(&b->raw_lock, flags);
+	else
+		spin_lock_irqsave(&b->lock, flags);
+	return flags;
+}
+
+static inline void htab_unlock_bucket(const struct bpf_htab *htab,
+				      struct bucket *b,
+				      unsigned long flags)
+{
+	if (htab_use_raw_lock(htab))
+		raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+	else
+		spin_unlock_irqrestore(&b->lock, flags);
+}
+
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
 
 static bool htab_is_lru(const struct bpf_htab *htab)
@@ -71,11 +169,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 }
 
-static bool htab_is_prealloc(const struct bpf_htab *htab)
-{
-	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-}
-
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
@@ -306,8 +399,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	struct bpf_htab *htab;
-	int err, i;
 	u64 cost;
+	int err;
 
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
@@ -369,10 +462,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	else
 		htab->hashrnd = get_random_int();
 
-	for (i = 0; i < htab->n_buckets; i++) {
-		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
-		raw_spin_lock_init(&htab->buckets[i].lock);
-	}
+	htab_init_buckets(htab);
 
 	if (prealloc) {
 		err = prealloc_init(htab);
@@ -580,7 +670,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	b = __select_bucket(htab, tgt_l->hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
@@ -588,7 +678,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 			break;
 		}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 
 	return l == tgt_l;
 }
@@ -668,11 +758,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 	 * we're calling kfree, otherwise deadlock is possible if kprobes
 	 * are placed somewhere inside of slub
 	 */
-	preempt_disable();
+	migrate_disable();
 	__this_cpu_inc(bpf_prog_active);
 	htab_elem_free(htab, l);
 	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
+	migrate_enable();
 }
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
@@ -862,8 +952,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		 */
 	}
 
-	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -904,7 +993,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	return ret;
 }
 
@@ -942,8 +1031,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 		return -ENOMEM;
 	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
 
-	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -962,7 +1050,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	ret = 0;
 
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 
 	if (ret)
 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
@@ -997,8 +1085,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1021,7 +1108,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	return ret;
 }
 
@@ -1061,8 +1148,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 			return -ENOMEM;
 	}
 
-	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1084,7 +1170,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	if (l_new)
 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
 	return ret;
@@ -1122,7 +1208,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1132,7 +1218,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	return ret;
 }
 
@@ -1154,7 +1240,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	flags = htab_lock_bucket(htab, b);
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1163,7 +1249,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	htab_unlock_bucket(htab, b, flags);
 	if (l)
 		bpf_lru_push_free(&htab->lru, &l->lru_node);
 	return ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 56e6c75d354d9..3b3c420bc8ed8 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -34,7 +34,7 @@ struct lpm_trie {
 	size_t				n_entries;
 	size_t				max_prefixlen;
 	size_t				data_size;
-	raw_spinlock_t			lock;
+	spinlock_t			lock;
 };
 
 /* This trie implements a longest prefix match algorithm that can be used to
@@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map,
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+	spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Allocate and fill a new node */
 
@@ -422,7 +422,7 @@ static int trie_update_elem(struct bpf_map *map,
 		kfree(im_node);
 	}
 
-	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+	spin_unlock_irqrestore(&trie->lock, irq_flags);
 
 	return ret;
 }
@@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+	spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Walk the tree looking for an exact key/length match and keeping
 	 * track of the path we traverse.  We will need to know the node
@@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 	kfree_rcu(node, rcu);
 
 out:
-	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+	spin_unlock_irqrestore(&trie->lock, irq_flags);
 
 	return ret;
 }
@@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	if (ret)
 		goto out_err;
 
-	raw_spin_lock_init(&trie->lock);
+	spin_lock_init(&trie->lock);
 
 	return &trie->map;
 out_err:
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 6e090140b9240..b367430e611c7 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
 	free_percpu(s->freelist);
 }
 
+static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
+					   struct pcpu_freelist_node *node)
+{
+	node->next = head->first;
+	head->first = node;
+}
+
 static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
 					 struct pcpu_freelist_node *node)
 {
 	raw_spin_lock(&head->lock);
-	node->next = head->first;
-	head->first = node;
+	pcpu_freelist_push_node(head, node);
 	raw_spin_unlock(&head->lock);
 }
 
@@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 			    u32 nr_elems)
 {
 	struct pcpu_freelist_head *head;
-	unsigned long flags;
 	int i, cpu, pcpu_entries;
 
 	pcpu_entries = nr_elems / num_possible_cpus() + 1;
 	i = 0;
 
-	/* disable irq to workaround lockdep false positive
-	 * in bpf usage pcpu_freelist_populate() will never race
-	 * with pcpu_freelist_push()
-	 */
-	local_irq_save(flags);
 	for_each_possible_cpu(cpu) {
 again:
 		head = per_cpu_ptr(s->freelist, cpu);
-		___pcpu_freelist_push(head, buf);
+		/* No locking required as this is not visible yet. */
+		pcpu_freelist_push_node(head, buf);
 		i++;
 		buf += elem_size;
 		if (i == nr_elems)
@@ -78,7 +79,6 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 		if (i % pcpu_entries)
 			goto again;
 	}
-	local_irq_restore(flags);
 }
 
 struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 173e983619d77..e753900ff137a 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -40,6 +40,9 @@ static void do_up_read(struct irq_work *entry)
 {
 	struct stack_map_irq_work *work;
 
+	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+		return;
+
 	work = container_of(entry, struct stack_map_irq_work, irq_work);
 	up_read_non_owner(work->sem);
 	work->sem = NULL;
@@ -288,10 +291,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	struct stack_map_irq_work *work = NULL;
 
 	if (irqs_disabled()) {
-		work = this_cpu_ptr(&up_read_work);
-		if (work->irq_work.flags & IRQ_WORK_BUSY)
-			/* cannot queue more up_read, fallback */
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			work = this_cpu_ptr(&up_read_work);
+			if (work->irq_work.flags & IRQ_WORK_BUSY)
+				/* cannot queue more up_read, fallback */
+				irq_work_busy = true;
+		} else {
+			/*
+			 * PREEMPT_RT does not allow to trylock mmap sem in
+			 * interrupt disabled context. Force the fallback code.
+			 */
 			irq_work_busy = true;
+		}
 	}
 
 	/*
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ace1cfaa24b6b..e773c23b10a4b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -794,7 +794,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto done;
 	}
 
-	preempt_disable();
+	migrate_disable();
 	this_cpu_inc(bpf_prog_active);
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
@@ -837,7 +837,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 		rcu_read_unlock();
 	}
 	this_cpu_dec(bpf_prog_active);
-	preempt_enable();
+	migrate_enable();
 
 done:
 	if (err)
@@ -937,7 +937,7 @@ static int map_update_elem(union bpf_attr *attr)
 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
 	 * inside bpf map update or delete otherwise deadlocks are possible
 	 */
-	preempt_disable();
+	migrate_disable();
 	__this_cpu_inc(bpf_prog_active);
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
@@ -970,7 +970,7 @@ static int map_update_elem(union bpf_attr *attr)
 		rcu_read_unlock();
 	}
 	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
+	migrate_enable();
 	maybe_wait_bpf_programs(map);
 out:
 free_value:
@@ -1016,13 +1016,13 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto out;
 	}
 
-	preempt_disable();
+	migrate_disable();
 	__this_cpu_inc(bpf_prog_active);
 	rcu_read_lock();
 	err = map->ops->map_delete_elem(map, key);
 	rcu_read_unlock();
 	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
+	migrate_enable();
 	maybe_wait_bpf_programs(map);
 out:
 	kfree(key);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a4ad23064f15e..253bdc106cb8c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8966,7 +8966,6 @@ static void bpf_overflow_handler(struct perf_event *event,
 	int ret = 0;
 
 	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
-	preempt_disable();
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
 		goto out;
 	rcu_read_lock();
@@ -8974,7 +8973,6 @@ static void bpf_overflow_handler(struct perf_event *event,
 	rcu_read_unlock();
 out:
 	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
 	if (!ret)
 		return;
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 614a557a0814b..25f9e00df9d44 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -267,16 +267,14 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
-	preempt_disable();
 	for (; f; f = f->prev) {
-		u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
+		u32 cur_ret = BPF_PROG_RUN_PIN_ON_CPU(f->prog, sd);
 
 		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
 			ret = cur_ret;
 			*match = f;
 		}
 	}
-	preempt_enable();
 	return ret;
 }
 #endif /* CONFIG_SECCOMP_FILTER */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 89bdac61233db..e80abded7b7af 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -83,7 +83,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	if (in_nmi()) /* not supported yet */
 		return 1;
 
-	preempt_disable();
+	migrate_disable();
 
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
 		/*
@@ -115,7 +115,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 
  out:
 	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
+	migrate_enable();
 
 	return ret;
 }
@@ -1330,9 +1330,7 @@ static __always_inline
 void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 {
 	rcu_read_lock();
-	preempt_disable();
 	(void) BPF_PROG_RUN(prog, args);
-	preempt_enable();
 	rcu_read_unlock();
 }
 
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 5ef3eccee27cb..07b37fea141d9 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -6660,14 +6660,14 @@ static int __run_one(const struct bpf_prog *fp, const void *data,
 	u64 start, finish;
 	int ret = 0, i;
 
-	preempt_disable();
+	migrate_disable();
 	start = ktime_get_ns();
 
 	for (i = 0; i < runs; i++)
 		ret = BPF_PROG_RUN(fp, data);
 
 	finish = ktime_get_ns();
-	preempt_enable();
+	migrate_enable();
 
 	*duration = finish - start;
 	do_div(*duration, runs);
diff --git a/localversion-rt b/localversion-rt
index d79dde624aaac..05c35cb580779 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt10
+-rt11
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 1153bbcdff721..cccd66cac3c1e 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -37,7 +37,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 		repeat = 1;
 
 	rcu_read_lock();
-	preempt_disable();
+	migrate_disable();
 	time_start = ktime_get_ns();
 	for (i = 0; i < repeat; i++) {
 		bpf_cgroup_storage_set(storage);
@@ -50,18 +50,18 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 
 		if (need_resched()) {
 			time_spent += ktime_get_ns() - time_start;
-			preempt_enable();
+			migrate_enable();
 			rcu_read_unlock();
 
 			cond_resched();
 
 			rcu_read_lock();
-			preempt_disable();
+			migrate_disable();
 			time_start = ktime_get_ns();
 		}
 	}
 	time_spent += ktime_get_ns() - time_start;
-	preempt_enable();
+	migrate_enable();
 	rcu_read_unlock();
 
 	do_div(time_spent, repeat);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 96b2566c298dd..7e2ae105d3b9d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -844,9 +844,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 		     (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
 	flow_keys->flags = flags;
 
-	preempt_disable();
-	result = BPF_PROG_RUN(prog, ctx);
-	preempt_enable();
+	result = BPF_PROG_RUN_PIN_ON_CPU(prog, ctx);
 
 	flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
 	flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ded2d52276786..47e6af669d592 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
 	struct bpf_prog *prog;
 	int ret;
 
-	preempt_disable();
 	rcu_read_lock();
 	prog = READ_ONCE(psock->progs.msg_parser);
 	if (unlikely(!prog)) {
@@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
 
 	sk_msg_compute_data_pointers(msg);
 	msg->sk = sk;
-	ret = BPF_PROG_RUN(prog, msg);
+	ret = BPF_PROG_RUN_PIN_ON_CPU(prog, msg);
 	ret = sk_psock_map_verd(ret, msg->sk_redir);
 	psock->apply_bytes = msg->apply_bytes;
 	if (ret == __SK_REDIRECT) {
@@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
 	}
 out:
 	rcu_read_unlock();
-	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
@@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
 
 	skb->sk = psock->sk;
 	bpf_compute_data_end_sk_skb(skb);
-	preempt_disable();
-	ret = BPF_PROG_RUN(prog, skb);
-	preempt_enable();
+	ret = BPF_PROG_RUN_PIN_ON_CPU(prog, skb);
 	/* strparser clones the skb before handing it to a upper layer,
 	 * meaning skb_orphan has been called. We NULL sk on the way out
 	 * to ensure we don't trigger a BUG_ON() in skb/sk operations
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index ea9e73428ed9c..4906c8f043afb 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -380,9 +380,7 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
 	struct bpf_prog *prog = psock->bpf_prog;
 	int res;
 
-	preempt_disable();
-	res = BPF_PROG_RUN(prog, skb);
-	preempt_enable();
+	res = BPF_PROG_RUN_PIN_ON_CPU(prog, skb);
 	return res;
 }
 

^ permalink raw reply related

* [PATCH AUTOSEL 4.9 049/141] net/wan/fsl_ucc_hdlc: reject muram offsets above 64K
From: Sasha Levin @ 2020-02-14 16:19 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Timur Tabi, netdev, Rasmus Villemoes, Li Yang,
	linuxppc-dev, David S . Miller, Qiang Zhao
In-Reply-To: <20200214162122.19794-1-sashal@kernel.org>

From: Rasmus Villemoes <linux@rasmusvillemoes.dk>

[ Upstream commit 148587a59f6b85831695e0497d9dd1af5f0495af ]

Qiang Zhao points out that these offsets get written to 16-bit
registers, and there are some QE platforms with more than 64K
muram. So it is possible that qe_muram_alloc() gives us an allocation
that can't actually be used by the hardware, so detect and reject
that.

Reported-by: Qiang Zhao <qiang.zhao@nxp.com>
Reviewed-by: Timur Tabi <timur@kernel.org>
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Li Yang <leoyang.li@nxp.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/wan/fsl_ucc_hdlc.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index af85a1b3135e2..87bf05a81db50 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -209,6 +209,11 @@ static int uhdlc_init(struct ucc_hdlc_private *priv)
 		ret = -ENOMEM;
 		goto free_riptr;
 	}
+	if (riptr != (u16)riptr || tiptr != (u16)tiptr) {
+		dev_err(priv->dev, "MURAM allocation out of addressable range\n");
+		ret = -ENOMEM;
+		goto free_tiptr;
+	}
 
 	/* Set RIPTR, TIPTR */
 	iowrite16be(riptr, &priv->ucc_pram->riptr);
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH v6 0/6] Optimize cgroup context switch
From: Ian Rogers @ 2020-02-14 19:32 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Namhyung Kim,
	Andrew Morton, Randy Dunlap, Masahiro Yamada, Shuah Khan,
	Krzysztof Kozlowski, Kees Cook, Paul E. McKenney,
	Masami Hiramatsu, Marco Elver, Kent Overstreet, Andy Shevchenko,
	Ard Biesheuvel, Kan Liang, LKML
In-Reply-To: <20200214075133.181299-1-irogers@google.com>

On a thread related to these patches Peter had previously asked for
what the performance numbers looked like. I've tested on Westmere and
Cascade Lake platforms. The benchmark is a set of processes in
different cgroups reading/writing to a file descriptor, where the read
context switches. To ensure the context switch, all the processes are
pinned to a particular CPU; the benchmark is tested to ensure the
expected context-switches match those performed. The benchmark
increases the number of perf events and cgroups, it also looks at the
effect of just monitoring 1 cgroup in an increasing set of cgroups.

Before the patches on Westmere if we do system wide profiling of 10
events and then increase the cgroups to 208 and monitor just one, the
context switch times go from 4.6us to 15.3us. If we monitor each
cgroup then the context switch times are 172.5us. With the patches,
the time for monitoring 1 cgroup goes from 4.6us to 14.9us, but when
monitoring all cgroups the context switch times are 14.1us. The small
speed up when monitoring 1 cgroup out of a set is that in most
context switches the O(n) search for an event in a cgroup is now
O(log(n)). When all cgroups are monitored the number of events in the
kernel is the product of the number of events and cgroups, giving a
larger value for 'n' and a more dramatic speed up - 172.5us becomes
14.9us.

In summary what we see for performance is that before the patches we
see context switch times being affected by the number of cgroups
monitored, after the patches there is still a context switch cost in
monitoring events, but it is similar whether 1 or all cgroups are
being monitored. This fits with the intuition of what the patches are
trying to do by avoiding searches of events that are for cgroups the
current task isn't within. The results are consistent but less dramatic
for smaller numbers of events and cgroups. We've not identified a slow
down from the patches, but there is a degree of noise in the timing
data. Broadly, with turbo disabled on the test machines the patches
make context switch performance the same or faster. For a more
representative number of events and cgroups, say 6 and 32, we see
context switch time improve from 29.4us to 13.2us when all cgroups are
monitored.

Thanks,
Ian


On Thu, Feb 13, 2020 at 11:51 PM Ian Rogers <irogers@google.com> wrote:
>
> Avoid iterating over all per-CPU events during cgroup changing context
> switches by organizing events by cgroup.
>
> To make an efficient set of iterators, introduce a min max heap
> utility with test.
>
> The v6 patch reduces the patch set by 4 patches, it updates the cgroup
> id and fixes part of the min_heap rename from v5.
>
> The v5 patch set renames min_max_heap to min_heap as suggested by
> Peter Zijlstra, it also addresses comments around preferring
> __always_inline over inline.
>
> The v4 patch set addresses review comments on the v3 patch set by
> Peter Zijlstra.
>
> These patches include a caching algorithm to improve the search for
> the first event in a group by Kan Liang <kan.liang@linux.intel.com> as
> well as rebasing hit "optimize event_filter_match during sched_in"
> from https://lkml.org/lkml/2019/8/7/771.
>
> The v2 patch set was modified by Peter Zijlstra in his perf/cgroup
> branch:
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git
>
> These patches follow Peter's reorganization and his fixes to the
> perf_cpu_context min_heap storage code.
>
> Ian Rogers (5):
>   lib: introduce generic min-heap
>   perf: Use min_heap in visit_groups_merge
>   perf: Add per perf_cpu_context min_heap storage
>   perf/cgroup: Grow per perf_cpu_context heap storage
>   perf/cgroup: Order events in RB tree by cgroup id
>
> Peter Zijlstra (1):
>   perf/cgroup: Reorder perf_cgroup_connect()
>
>  include/linux/min_heap.h   | 135 ++++++++++++++++++++
>  include/linux/perf_event.h |   7 ++
>  kernel/events/core.c       | 251 +++++++++++++++++++++++++++++++------
>  lib/Kconfig.debug          |  10 ++
>  lib/Makefile               |   1 +
>  lib/test_min_heap.c        | 194 ++++++++++++++++++++++++++++
>  6 files changed, 563 insertions(+), 35 deletions(-)
>  create mode 100644 include/linux/min_heap.h
>  create mode 100644 lib/test_min_heap.c
>
> --
> 2.25.0.265.gbab2e86ba0-goog
>

^ permalink raw reply

* [PATCH] kernel-devsrc: support 4.4+ ARM/ARM64 kernels
From: Ruslan Bilovol @ 2020-02-14 19:24 UTC (permalink / raw)
  To: openembedded-core; +Cc: xe-linux-external

Linux Kernel 4.4 is an LTS kernel so people may still
build it with OE.

Thus make copying of some files optional:
 - arm64 module.lds file first appeared with kernel v4.6 commit
   fd045f6cd98e "arm64: add support for module PLTs"
 - arm32 *.tbl files first appeared in kernel v4.10 in
   commit 96a8fae0fe09 "ARM: convert to generated
   system call tables"

Signed-off-by: Ruslan Bilovol <rbilovol@cisco.com>
---
 meta/recipes-kernel/linux/kernel-devsrc.bb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/meta/recipes-kernel/linux/kernel-devsrc.bb b/meta/recipes-kernel/linux/kernel-devsrc.bb
index 2fa4be67cc..2ac679e204 100644
--- a/meta/recipes-kernel/linux/kernel-devsrc.bb
+++ b/meta/recipes-kernel/linux/kernel-devsrc.bb
@@ -147,7 +147,7 @@ do_install() {
             cp -a --parents arch/arm64/kernel/vdso/note.S $kerneldir/build/
             cp -a --parents arch/arm64/kernel/vdso/gen_vdso_offsets.sh $kerneldir/build/
 
-            cp -a --parents arch/arm64/kernel/module.lds $kerneldir/build/
+            cp -a --parents arch/arm64/kernel/module.lds $kerneldir/build/ 2>/dev/null || :
 	fi
 
 	if [ "${ARCH}" = "powerpc" ]; then
@@ -187,7 +187,7 @@ do_install() {
 
 	# required for generate missing syscalls prepare phase
 	cp -a --parents $(find arch/x86 -type f -name "syscall_32.tbl") $kerneldir/build
-	cp -a --parents $(find arch/arm -type f -name "*.tbl") $kerneldir/build
+	cp -a --parents $(find arch/arm -type f -name "*.tbl") $kerneldir/build 2>/dev/null || :
 
 	if [ "${ARCH}" = "x86" ]; then
 	    # files for 'make prepare' to succeed with kernel-devel
-- 
2.17.1



^ permalink raw reply related

* Re: [PATCH v5 1/7] mm: pass task and mm to do_madvise
From: Minchan Kim @ 2020-02-14 19:31 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jann Horn, io-uring, Andrew Morton, LKML, linux-mm, Linux API,
	Oleksandr Natalenko, Suren Baghdasaryan, Tim Murray,
	Daniel Colascione, Sandeep Patil, Sonny Rao, Brian Geffon,
	Michal Hocko, Johannes Weiner, Shakeel Butt, John Dias,
	Joel Fernandes, sj38.park, Alexander Duyck
In-Reply-To: <93aadcc6-3ef5-4ea0-be6b-23c06862002e@kernel.dk>

On Fri, Feb 14, 2020 at 12:09:50PM -0700, Jens Axboe wrote:
> On 2/14/20 11:45 AM, Minchan Kim wrote:
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 63beda9bafc5..1c7e9cd6c8ce 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -2736,7 +2736,7 @@ static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
> >  	if (force_nonblock)
> >  		return -EAGAIN;
> >  
> > -	ret = do_madvise(ma->addr, ma->len, ma->advice);
> > +	ret = do_madvise(NULL, current->mm, ma->addr, ma->len, ma->advice);
> >  	if (ret < 0)
> >  		req_set_fail_links(req);
> >  	io_cqring_add_event(req, ret);
> 
> I think we want to use req->work.mm here - it'll be the same as
> current->mm at this point, but it makes it clear that we're using a
> grabbed mm.

Will fix at respin. Thanks for the review!

^ permalink raw reply

* Re: [PATCH v2 03/12] MIPS: CI20: defconfig: configure for supporting modules
From: H. Nikolaus Schaller @ 2020-02-14 19:30 UTC (permalink / raw)
  To: Paul Cercueil
  Cc: Paul Boddie, Rob Herring, Mark Rutland, Ralf Baechle, Paul Burton,
	David Airlie, Daniel Vetter, Andi Kleen, Miquel Raynal, Kees Cook,
	devicetree, linux-mips, linux-kernel, dri-devel, letux-kernel,
	kernel
In-Reply-To: <1581707415.3.7@crapouillou.net>

Hi Paul,

> Am 14.02.2020 um 20:10 schrieb Paul Cercueil <paul@crapouillou.net>:
> 
> Hi Nikolaus,
> 
> Patches 03-12 only touch the same two files - ci20.dts and ci20_defconfig.
> 
> Unless someone strongly disagrees, I'd suggest to squash all patches that touch each file together (except the ones with a Fixes tag), I don't think we really need that much granularity here.

It comes more from having developed these things quite independently and only collected for submission...

One patch I don't know how to handle: "MIPS: DTS: CI20: add DT node for IR sensor".
It is from 2015 and has a different author (some Alex Smith but the mail address seems to be broken).
This information and attribution will be lost if we squash them.

But I can do for V3 and will also fix the fixes tags by adding cc: stable :)

BR and thanks,
Nikolaus


> 
> -Paul
> 
> 
> Le ven., févr. 14, 2020 at 17:10, H. Nikolaus Schaller <hns@goldelico.com> a écrit :
>> Not all drivers need to be compiled into the kernel.
>> Support building and loading of kernel modules.
>> Signed-off-by: H. Nikolaus Schaller <hns@goldelico.com>
>> ---
>> arch/mips/configs/ci20_defconfig | 1 +
>> 1 file changed, 1 insertion(+)
>> diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig
>> index be41df2a81fb..e0d3c9d4c2ae 100644
>> --- a/arch/mips/configs/ci20_defconfig
>> +++ b/arch/mips/configs/ci20_defconfig
>> @@ -1,4 +1,5 @@
>> # CONFIG_LOCALVERSION_AUTO is not set
>> +CONFIG_MODULES=y
>> CONFIG_KERNEL_XZ=y
>> CONFIG_SYSVIPC=y
>> CONFIG_POSIX_MQUEUE=y
>> --
>> 2.23.0
> 
> 


^ permalink raw reply

* FYI nautilus branch is locked
From: Yuri Weinstein @ 2020-02-14 19:30 UTC (permalink / raw)
  To: Yuri Weinstein, dev, Development, Ceph, Abhishek Lekshmanan,
	Nathan Cutler, Casey Bodley, Patrick Donnelly, Neha Ojha,
	Durgin, Josh, David Zafman, Weil, Sage, Ramana Venkatesh Raja,
	Tamilarasi Muthamizhan, Dillaman, Jason, Sadeh-Weinraub, Yehuda,
	Lekshmanan, Abhishek, Ilya Dryomov, Jeff Layton, ceph-qe-team,
	Andrew Schoen

We are getting ready to test 14.2.9 and nautilus branch is locked for
merges until it's done.

sha1 - 4d5b84085009968f557baaa4209183f1374773cd

Nathan, Abhishek pls confirm.

Thank you
YuriW

^ permalink raw reply

* Re: get-lore-mbox: quickly grab full threads from lore
From: Kevin Hilman @ 2020-02-14 19:30 UTC (permalink / raw)
  To: Konstantin Ryabitsev, workflows
In-Reply-To: <20200201030105.k6akvbjpmlpcuiky@chatter.i7.local>

Konstantin Ryabitsev <konstantin@linuxfoundation.org> writes:

> I'd like your opinion on this quick helper script I wrote that uses any 
> message-id to grab a full thread from lore.kernel.org and save it as a 
> mbox file.

This is very useful, thank you!

One question/request: Is there a way for it to only grab a subset of a
series?  e.g. Some series contain patches that might end up going
through a couple different trees (e.g. DT patches typically take a
separate path than drivers) so as a maintainer for one of the
subsystems, I might want to only get a subset of the series into an
mbox, not the whole thing.

IOW, Right now even if I pass a msgid from the middle of the series, it
finds the whole series (which is cool!), but what if I want to apply
just that single patch?  Or even better, I might want to only apply
patches 3-5 and 9 from a 10-patch series.

Is this something do-able?

Kevin

^ permalink raw reply

* [PATCH v2 2/2] mm: fix long time stall from mm_populate
From: Minchan Kim @ 2020-02-14 19:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, Jan Kara, Matthew Wilcox, Josef Bacik,
	Johannes Weiner, Minchan Kim
In-Reply-To: <20200214192951.29430-1-minchan@kernel.org>

Basically, fault handler releases mmap_sem before requesting readahead
and then it is supposed to retry lookup the page from page cache with
FAULT_FLAG_TRIED so that it avoids the live lock of infinite retry.

However, what happens if the fault handler finds a page in the page
cache and the page has the readahead marker but is waiting under
writeback? Plus one more condition: it happens under mm_populate,
which repeats faulting unless it encounters an error. So let's assemble
the conditions below.

       CPU 1                                                        CPU 2

- first loop
    mm_populate
     for ()
       ..
       ret = populate_vma_page_range
         __get_user_pages
           faultin_page
             handle_mm_fault
               filemap_fault
                 do_async_mmap_readahead
                   if (PageReadahead(pageA))
                     maybe_unlock_mmap_for_io
                       up_read(mmap_sem)
					                    shrink_page_list
                                                              pageout
                                                                SetPageReclaim(=SetPageReadahead)(pageA)
                                                                writepage
                                                                  SetPageWriteback(pageA)

                     page_cache_async_readahead()
		       ClearPageReadahead(pageA)
                 do_async_mmap_readahead
		 lock_page_maybe_drop_mmap
		   goto out_retry

					                    the pageA is reclaimed
							    and new pageB is populated to the file offset
							    and finally has become PG_readahead

- second loop

	  __get_user_pages
           faultin_page
             handle_mm_fault
               filemap_fault
                 do_async_mmap_readahead
                   if (PageReadahead(pageB))
                     maybe_unlock_mmap_for_io
                       up_read(mmap_sem)
					                    shrink_page_list
                                                              pageout
                                                                SetPageReclaim(=SetPageReadahead)(pageB)
                                                                writepage
                                                                  SetPageWriteback(pageB)

                     page_cache_async_readahead()
		       ClearPageReadahead(pageB)
                 do_async_mmap_readahead
		 lock_page_maybe_drop_mmap
		   goto out_retry

It could be repeated forever, so it's a livelock. Even without involving
reclaim, it could happen if ra_pages becomes zero due to fadvise or other
threads sharing the same fd, one doing random access while the other is
sequential, because page_cache_async_readahead has the following condition
check, like the PageWriteback one, and ra_pages is never synchronized with
fadvise and shrink_readahead_size_eio from other threads.

void page_cache_async_readahead(struct address_space *mapping,
                           unsigned long req_size)
{
        /* no read-ahead */
        if (!ra->ra_pages)
                return;

Thus, we need to limit fault retry from mm_populate like page
fault handler.

Fixes: 6b4c9f446981 ("filemap: drop the mmap_sem for all blocking operations")
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/gup.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 1b521e0ac1de..6f6548c63ad5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1133,7 +1133,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
  *
  * This takes care of mlocking the pages too if VM_LOCKED is set.
  *
- * return 0 on success, negative error code on error.
+ * return number of pages pinned on success, negative error code on error.
  *
  * vma->vm_mm->mmap_sem must be held.
  *
@@ -1196,6 +1196,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 	struct vm_area_struct *vma = NULL;
 	int locked = 0;
 	long ret = 0;
+	bool tried = false;
 
 	end = start + len;
 
@@ -1226,14 +1227,18 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 		 * double checks the vma flags, so that it won't mlock pages
 		 * if the vma was already munlocked.
 		 */
-		ret = populate_vma_page_range(vma, nstart, nend, &locked);
+		ret = populate_vma_page_range(vma, nstart, nend,
+						tried ? NULL : &locked);
 		if (ret < 0) {
 			if (ignore_errors) {
 				ret = 0;
 				continue;	/* continue at next VMA */
 			}
 			break;
-		}
+		} else if (ret == 0)
+			tried = true;
+		else
+			tried = false;
 		nend = nstart + ret * PAGE_SIZE;
 		ret = 0;
 	}
-- 
2.25.0.265.gbab2e86ba0-goog



^ permalink raw reply related

* [PATCH v2 1/2] mm: make PageReadahead more strict
From: Minchan Kim @ 2020-02-14 19:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, Jan Kara, Matthew Wilcox, Josef Bacik,
	Johannes Weiner, Minchan Kim

Recently, I got some bug reports that a major page fault sometimes takes
several seconds. When I reviewed the drop-mmap_sem logic, I found several bugs.

   CPU 1                                                        CPU 2
mm_populate
 for ()
   ..
   ret = populate_vma_page_range
     __get_user_pages
       faultin_page
         handle_mm_fault
           filemap_fault
             do_async_mmap_readahead
                                                        shrink_page_list
                                                          pageout
                                                            SetPageReclaim(=SetPageReadahead)
                                                              writepage
                                                                SetPageWriteback
               if (PageReadahead(page))
                 maybe_unlock_mmap_for_io
                   up_read(mmap_sem)
                 page_cache_async_readahead()
                   if (PageWriteback(page))
                     return;

Here, since ret from populate_vma_page_range is zero, the loop continues
to run with the same address as the previous iteration. It will repeat the
loop until the page's writeout is done (i.e., PG_writeback or PG_reclaim
is clear).

We could fix the above specific case via adding PageWriteback

   ret = populate_vma_page_range
           ...
           ...
           filemap_fault
             do_async_mmap_readahead
               if (!PageWriteback(page) && PageReadahead(page))
                 maybe_unlock_mmap_for_io
                   up_read(mmap_sem)
                 page_cache_async_readahead()
                   if (PageWriteback(page))
                     return;

Furthermore, to prevent potential issues caused by sharing PG_readahead
with PG_reclaim, let's make page flag wrapper for PageReadahead
with description. With that, we could remove PageWriteback check
in page_cache_async_readahead, which is more clear for maintenance/
readability.

Fixes: 6b4c9f446981 ("filemap: drop the mmap_sem for all blocking operations")
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/page-flags.h | 28 ++++++++++++++++++++++++++--
 mm/readahead.c             |  6 ------
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1bf83c8fcaa7..f91a9b2a49bd 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -363,8 +363,32 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
 PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
 	TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
-PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
-	TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+
+SETPAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+CLEARPAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+
+/*
+ * Since PG_readahead is shared with PG_reclaim of the page flags,
+ * PageReadahead should double check whether it's readahead marker
+ * or PG_reclaim. It could be done by PageWriteback check because
+ * PG_reclaim is always with PG_writeback.
+ */
+static inline int PageReadahead(struct page *page)
+{
+	VM_BUG_ON_PGFLAGS(PageCompound(page), page);
+
+	return (page->flags & (1UL << PG_reclaim | 1UL << PG_writeback)) ==
+		(1UL << PG_reclaim);
+}
+
+/* Clear PG_readahead only if it's PG_readahead, not PG_reclaim */
+static inline int TestClearPageReadahead(struct page *page)
+{
+	VM_BUG_ON_PGFLAGS(PageCompound(page), page);
+
+	return !PageWriteback(page) ||
+			test_and_clear_bit(PG_reclaim, &page->flags);
+}
 
 #ifdef CONFIG_HIGHMEM
 /*
diff --git a/mm/readahead.c b/mm/readahead.c
index 2fe72cd29b47..85b15e5a1d7b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -553,12 +553,6 @@ page_cache_async_readahead(struct address_space *mapping,
 	if (!ra->ra_pages)
 		return;
 
-	/*
-	 * Same bit is used for PG_readahead and PG_reclaim.
-	 */
-	if (PageWriteback(page))
-		return;
-
 	ClearPageReadahead(page);
 
 	/*
-- 
2.25.0.265.gbab2e86ba0-goog



^ permalink raw reply related

* [PATCH AUTOSEL 4.9 012/141] powerpc/powernv/iov: Ensure the pdn for VFs always contains a valid PE number
From: Sasha Levin @ 2020-02-14 16:19 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Alexey Kardashevskiy, Oliver O'Halloran, linuxppc-dev,
	Sasha Levin
In-Reply-To: <20200214162122.19794-1-sashal@kernel.org>

From: Oliver O'Halloran <oohall@gmail.com>

[ Upstream commit 3b5b9997b331e77ce967eba2c4bc80dc3134a7fe ]

On pseries there is a bug with adding hotplugged devices to an IOMMU
group. For a number of dumb reasons fixing that bug first requires
re-working how VFs are configured on PowerNV. For background, on
PowerNV we use the pcibios_sriov_enable() hook to do two things:

  1. Create a pci_dn structure for each of the VFs, and
  2. Configure the PHB's internal BARs so the MMIO range for each VF
     maps to a unique PE.

Roughly speaking a PE is the hardware counterpart to a Linux IOMMU
group since all the devices in a PE share the same IOMMU table. A PE
also defines the set of devices that should be isolated in response to
a PCI error (i.e. bad DMA, UR/CA, AER events, etc). When isolated all
MMIO and DMA traffic to and from devices in the PE is blocked by the
root complex until the PE is recovered by the OS.

The requirement to block MMIO causes a giant headache because the P8
PHB generally uses a fixed mapping between MMIO addresses and PEs. As
a result we need to delay configuring the IOMMU groups for devices
until after MMIO resources are assigned. For physical devices (i.e.
non-VFs) the PE assignment is done in pcibios_setup_bridge() which is
called immediately after the MMIO resources for downstream
devices (and the bridge's windows) are assigned. For VFs the setup is
more complicated because:

  a) pcibios_setup_bridge() is not called again when VFs are activated, and
  b) The pci_dev for VFs are created by generic code which runs after
     pcibios_sriov_enable() is called.

The work around for this is a two step process:

  1. A fixup in pcibios_add_device() is used to initialise the cached
     pe_number in pci_dn, then
  2. A bus notifier then adds the device to the IOMMU group for the PE
     specified in pci_dn->pe_number.

A side effect of fixing the pseries bug mentioned in the first paragraph
is moving the fixup out of pcibios_add_device() and into
pcibios_bus_add_device(), which is called much later. This results in
step 2. failing because pci_dn->pe_number won't be initialised when
the bus notifier is run.

We can fix this by removing the need for the fixup. The PE for a VF is
known before the VF is even scanned so we can initialise
pci_dn->pe_number in pcibios_sriov_enable() instead. Unfortunately,
moving the initialisation causes two problems:

  1. We trip the WARN_ON() in the current fixup code, and
  2. The EEH core clears pdn->pe_number when recovering a VF and
     relies on the fixup to correctly re-set it.

The only justification for either of these is a comment in
eeh_rmv_device() suggesting that pdn->pe_number *must* be set to
IODA_INVALID_PE in order for the VF to be scanned. However, this
comment appears to have no basis in reality. Both bugs can be fixed by
just deleting the code.

Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20191028085424.12006-1-oohall@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kernel/eeh_driver.c          |  6 ------
 arch/powerpc/platforms/powernv/pci-ioda.c | 19 +++++++++++++++----
 arch/powerpc/platforms/powernv/pci.c      |  4 ----
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 620e08d4eb6e2..adac3dee4c57e 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -520,12 +520,6 @@ static void *eeh_rmv_device(void *data, void *userdata)
 
 		pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0);
 		edev->pdev = NULL;
-
-		/*
-		 * We have to set the VF PE number to invalid one, which is
-		 * required to plug the VF successfully.
-		 */
-		pdn->pe_number = IODA_INVALID_PE;
 #endif
 		if (rmv_data)
 			list_add(&edev->rmv_list, &rmv_data->edev_list);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3ec673b4ca6ce..b787a669a1e27 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1524,6 +1524,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 	/* Reserve PE for each VF */
 	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+		int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
+		int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
+		struct pci_dn *vf_pdn;
+
 		if (pdn->m64_single_mode)
 			pe_num = pdn->pe_num_map[vf_index];
 		else
@@ -1536,13 +1540,11 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		pe->pbus = NULL;
 		pe->parent_dev = pdev;
 		pe->mve_number = -1;
-		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
-			   pci_iov_virtfn_devfn(pdev, vf_index);
+		pe->rid = (vf_bus << 8) | vf_devfn;
 
 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
 			hose->global_number, pdev->bus->number,
-			PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
-			PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
+			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
 
 		if (pnv_ioda_configure_pe(phb, pe)) {
 			/* XXX What do we do here ? */
@@ -1556,6 +1558,15 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		list_add_tail(&pe->list, &phb->ioda.pe_list);
 		mutex_unlock(&phb->ioda.pe_list_mutex);
 
+		/* associate this pe to it's pdn */
+		list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
+			if (vf_pdn->busno == vf_bus &&
+			    vf_pdn->devfn == vf_devfn) {
+				vf_pdn->pe_number = pe_num;
+				break;
+			}
+		}
+
 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
 	}
 }
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 00dbf1e895a9d..2ed7627e991e0 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -856,16 +856,12 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 	struct pnv_phb *phb = hose->private_data;
 #ifdef CONFIG_PCI_IOV
 	struct pnv_ioda_pe *pe;
-	struct pci_dn *pdn;
 
 	/* Fix the VF pdn PE number */
 	if (pdev->is_virtfn) {
-		pdn = pci_get_pdn(pdev);
-		WARN_ON(pdn->pe_number != IODA_INVALID_PE);
 		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
 			if (pe->rid == ((pdev->bus->number << 8) |
 			    (pdev->devfn & 0xff))) {
-				pdn->pe_number = pe->pe_number;
 				pe->pdev = pdev;
 				break;
 			}
-- 
2.20.1


^ permalink raw reply related

* ✅ PASS: Test report for kernel 5.4.20-rc2-b06b66d.cki (stable)
From: CKI Project @ 2020-02-14 19:29 UTC (permalink / raw)
  To: Linux Stable maillist


Hello,

We ran automated tests on a recent commit from this kernel tree:

       Kernel repo: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
            Commit: b06b66d0f2c4 - Linux 5.4.20-rc2

The results of these automated tests are provided below.

    Overall result: PASSED
             Merge: OK
           Compile: OK
             Tests: OK

All kernel binaries, config files, and logs are available for download here:

  https://cki-artifacts.s3.us-east-2.amazonaws.com/index.html?prefix=datawarehouse/2020/02/13/436237

Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.

        ,-.   ,-.
       ( C ) ( K )  Continuous
        `-',-.`-'   Kernel
          ( I )     Integration
           `-'
______________________________________________________________________________

Compile testing
---------------

We compiled the kernel for 3 architectures:

    aarch64:
      make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg

    ppc64le:
      make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg

    x86_64:
      make options: -j30 INSTALL_MOD_STRIP=1 targz-pkg


Hardware testing
----------------
We booted each kernel and ran the following tests:

  aarch64:
    Host 1:

       ⚡ Internal infrastructure issues prevented one or more tests (marked
       with ⚡⚡⚡) from running on this architecture.
       This is not the fault of the kernel that was tested.

       ✅ Boot test
       ✅ xfstests - ext4
       ✅ xfstests - xfs
       ✅ selinux-policy: serge-testsuite
       ✅ lvm thinp sanity
       ✅ storage: software RAID testing
       🚧 ⚡⚡⚡ Storage blktests

    Host 2:

       ⚡ Internal infrastructure issues prevented one or more tests (marked
       with ⚡⚡⚡) from running on this architecture.
       This is not the fault of the kernel that was tested.

       ✅ Boot test
       ✅ Podman system integration test - as root
       ✅ Podman system integration test - as user
       ✅ LTP
       ✅ Loopdev Sanity
       ✅ Memory function: memfd_create
       ✅ AMTU (Abstract Machine Test Utility)
       ✅ Networking bridge: sanity
       ✅ Ethernet drivers sanity
       ✅ Networking MACsec: sanity
       ✅ Networking socket: fuzz
       ✅ Networking sctp-auth: sockopts test
       ⚡⚡⚡ Networking: igmp conformance test
       ✅ Networking route: pmtu
       ✅ Networking route_func - local
       ✅ Networking route_func - forward
       ✅ Networking TCP: keepalive test
       ✅ Networking UDP: socket
       ✅ Networking tunnel: geneve basic test
       ✅ Networking tunnel: gre basic
       ✅ L2TP basic test
       ✅ Networking tunnel: vxlan basic
       ✅ Networking ipsec: basic netns - transport
       ✅ Networking ipsec: basic netns - tunnel
       ✅ audit: audit testsuite test
       ✅ httpd: mod_ssl smoke sanity
       ✅ tuned: tune-processes-through-perf
       ✅ ALSA PCM loopback test
       ✅ ALSA Control (mixer) Userspace Element test
       ✅ storage: SCSI VPD
       ✅ trace: ftrace/tracer
       🚧 ✅ CIFS Connectathon
       🚧 ✅ POSIX pjd-fstest suites
       🚧 ✅ Memory function: kaslr
       🚧 ✅ LTP: openposix test suite
       🚧 ✅ Networking vnic: ipvlan/basic
       🚧 ✅ iotop: sanity
       🚧 ✅ Usex - version 1.9-29
       🚧 ✅ storage: dm/common

    Host 3:

       ⚡ Internal infrastructure issues prevented one or more tests (marked
       with ⚡⚡⚡) from running on this architecture.
       This is not the fault of the kernel that was tested.

       ✅ Boot test
       ✅ Podman system integration test - as root
       ✅ Podman system integration test - as user
       ✅ LTP
       ✅ Loopdev Sanity
       ✅ Memory function: memfd_create
       ✅ AMTU (Abstract Machine Test Utility)
       ✅ Networking bridge: sanity
       ✅ Ethernet drivers sanity
       ✅ Networking MACsec: sanity
       ✅ Networking socket: fuzz
       ✅ Networking sctp-auth: sockopts test
       ⚡⚡⚡ Networking: igmp conformance test
       ⚡⚡⚡ Networking route: pmtu
       ⚡⚡⚡ Networking route_func - local
       ⚡⚡⚡ Networking route_func - forward
       ⚡⚡⚡ Networking TCP: keepalive test
       ⚡⚡⚡ Networking UDP: socket
       ⚡⚡⚡ Networking tunnel: geneve basic test
       ⚡⚡⚡ Networking tunnel: gre basic
       ⚡⚡⚡ L2TP basic test
       ⚡⚡⚡ Networking tunnel: vxlan basic
       ⚡⚡⚡ Networking ipsec: basic netns - transport
       ⚡⚡⚡ Networking ipsec: basic netns - tunnel
       ⚡⚡⚡ audit: audit testsuite test
       ⚡⚡⚡ httpd: mod_ssl smoke sanity
       ⚡⚡⚡ tuned: tune-processes-through-perf
       ⚡⚡⚡ ALSA PCM loopback test
       ⚡⚡⚡ ALSA Control (mixer) Userspace Element test
       ⚡⚡⚡ storage: SCSI VPD
       ⚡⚡⚡ trace: ftrace/tracer
       🚧 ⚡⚡⚡ CIFS Connectathon
       🚧 ⚡⚡⚡ POSIX pjd-fstest suites
       🚧 ⚡⚡⚡ Memory function: kaslr
       🚧 ⚡⚡⚡ LTP: openposix test suite
       🚧 ⚡⚡⚡ Networking vnic: ipvlan/basic
       🚧 ⚡⚡⚡ iotop: sanity
       🚧 ⚡⚡⚡ Usex - version 1.9-29
       🚧 ⚡⚡⚡ storage: dm/common

    Host 4:
       ✅ Boot test
       ⏱  Podman system integration test - as root
       ⏱  Podman system integration test - as user
       ⏱  LTP
       ⏱  Loopdev Sanity
       ⏱  Memory function: memfd_create
       ⏱  AMTU (Abstract Machine Test Utility)
       ⏱  Networking bridge: sanity
       ⏱  Ethernet drivers sanity
       ⏱  Networking MACsec: sanity
       ⏱  Networking socket: fuzz
       ⏱  Networking sctp-auth: sockopts test
       ⏱  Networking: igmp conformance test
       ⏱  Networking route: pmtu
       ⏱  Networking route_func - local
       ⏱  Networking route_func - forward
       ⏱  Networking TCP: keepalive test
       ⏱  Networking UDP: socket
       ⏱  Networking tunnel: geneve basic test
       ⏱  Networking tunnel: gre basic
       ⏱  L2TP basic test
       ⏱  Networking tunnel: vxlan basic
       ⏱  Networking ipsec: basic netns - transport
       ⏱  Networking ipsec: basic netns - tunnel
       ⏱  audit: audit testsuite test
       ⏱  httpd: mod_ssl smoke sanity
       ⏱  tuned: tune-processes-through-perf
       ⏱  ALSA PCM loopback test
       ⏱  ALSA Control (mixer) Userspace Element test
       ⏱  storage: SCSI VPD
       ⏱  trace: ftrace/tracer
       ⏱  CIFS Connectathon
       ⏱  POSIX pjd-fstest suites
       ⏱  Memory function: kaslr
       ⏱  LTP: openposix test suite
       ⏱  Networking vnic: ipvlan/basic
       ⏱  iotop: sanity
       ⏱  Usex - version 1.9-29
       ⏱  storage: dm/common

  ppc64le:
    Host 1:

       ⚡ Internal infrastructure issues prevented one or more tests (marked
       with ⚡⚡⚡) from running on this architecture.
       This is not the fault of the kernel that was tested.

       ✅ Boot test
       ✅ Podman system integration test - as root
       ✅ Podman system integration test - as user
       ⚡⚡⚡ LTP
       ✅ Loopdev Sanity
       ✅ Memory function: memfd_create
       ✅ AMTU (Abstract Machine Test Utility)
       ✅ Networking bridge: sanity
       ✅ Ethernet drivers sanity
       ✅ Networking MACsec: sanity
       ✅ Networking socket: fuzz
       ✅ Networking sctp-auth: sockopts test
       ✅ Networking route: pmtu
       ✅ Networking route_func - local
       ✅ Networking route_func - forward
       ✅ Networking TCP: keepalive test
       ✅ Networking UDP: socket
       ✅ Networking tunnel: geneve basic test
       ✅ Networking tunnel: gre basic
       ✅ L2TP basic test
       ✅ Networking tunnel: vxlan basic
       ✅ Networking ipsec: basic netns - tunnel
       ✅ audit: audit testsuite test
       ✅ httpd: mod_ssl smoke sanity
       ✅ tuned: tune-processes-through-perf
       ✅ ALSA PCM loopback test
       ✅ ALSA Control (mixer) Userspace Element test
       ✅ trace: ftrace/tracer
       🚧 ✅ CIFS Connectathon
       🚧 ✅ POSIX pjd-fstest suites
       🚧 ✅ Memory function: kaslr
       🚧 ✅ LTP: openposix test suite
       🚧 ✅ Networking vnic: ipvlan/basic
       🚧 ✅ iotop: sanity
       🚧 ✅ Usex - version 1.9-29
       🚧 ✅ storage: dm/common

    Host 2:

       ⚡ Internal infrastructure issues prevented one or more tests (marked
       with ⚡⚡⚡) from running on this architecture.
       This is not the fault of the kernel that was tested.

       ✅ Boot test
       ✅ xfstests - ext4
       ✅ xfstests - xfs
       ✅ selinux-policy: serge-testsuite
       ✅ lvm thinp sanity
       ✅ storage: software RAID testing
       🚧 ✅ IPMI driver test
       🚧 ✅ IPMItool loop stress test
       🚧 ⚡⚡⚡ Storage blktests

    Host 3:
       ✅ Boot test
       ✅ Podman system integration test - as root
       ✅ Podman system integration test - as user
       ✅ LTP
       ✅ Loopdev Sanity
       ✅ Memory function: memfd_create
       ✅ AMTU (Abstract Machine Test Utility)
       ✅ Networking bridge: sanity
       ✅ Ethernet drivers sanity
       ✅ Networking MACsec: sanity
       ✅ Networking socket: fuzz
       ✅ Networking sctp-auth: sockopts test
       ✅ Networking route: pmtu
       ✅ Networking route_func - local
       ✅ Networking route_func - forward
       ✅ Networking TCP: keepalive test
       ✅ Networking UDP: socket
       ✅ Networking tunnel: geneve basic test
       ✅ Networking tunnel: gre basic
       ✅ L2TP basic test
       ✅ Networking tunnel: vxlan basic
       ✅ Networking ipsec: basic netns - tunnel
       ✅ audit: audit testsuite test
       ✅ httpd: mod_ssl smoke sanity
       ✅ tuned: tune-processes-through-perf
       ✅ ALSA PCM loopback test
       ✅ ALSA Control (mixer) Userspace Element test
       ✅ trace: ftrace/tracer
       🚧 ✅ CIFS Connectathon
       🚧 ✅ POSIX pjd-fstest suites
       🚧 ✅ Memory function: kaslr
       🚧 ✅ LTP: openposix test suite
       🚧 ✅ Networking vnic: ipvlan/basic
       🚧 ✅ iotop: sanity
       🚧 ✅ Usex - version 1.9-29
       🚧 ✅ storage: dm/common

  x86_64:
    Host 1:

       ⚡ Internal infrastructure issues prevented one or more tests (marked
       with ⚡⚡⚡) from running on this architecture.
       This is not the fault of the kernel that was tested.

       ✅ Boot test
       ✅ xfstests - ext4
       ✅ xfstests - xfs
       ✅ selinux-policy: serge-testsuite
       ✅ lvm thinp sanity
       ✅ storage: software RAID testing
       ✅ stress: stress-ng
       🚧 ✅ IOMMU boot test
       🚧 ⚡⚡⚡ IPMI driver test
       🚧 ✅ IPMItool loop stress test
       🚧 ⚡⚡⚡ Storage blktests

    Host 2:
       ✅ Boot test
       ✅ Storage SAN device stress - mpt3sas driver

    Host 3:
       ✅ Boot test
       ✅ Podman system integration test - as root
       ✅ Podman system integration test - as user
       ✅ LTP
       ✅ Loopdev Sanity
       ✅ Memory function: memfd_create
       ✅ AMTU (Abstract Machine Test Utility)
       ✅ Networking bridge: sanity
       ✅ Ethernet drivers sanity
       ✅ Networking MACsec: sanity
       ✅ Networking socket: fuzz
       ✅ Networking sctp-auth: sockopts test
       ✅ Networking: igmp conformance test
       ✅ Networking route: pmtu
       ✅ Networking route_func - local
       ✅ Networking route_func - forward
       ✅ Networking TCP: keepalive test
       ✅ Networking UDP: socket
       ✅ Networking tunnel: geneve basic test
       ✅ Networking tunnel: gre basic
       ✅ L2TP basic test
       ✅ Networking tunnel: vxlan basic
       ✅ Networking ipsec: basic netns - transport
       ✅ Networking ipsec: basic netns - tunnel
       ✅ audit: audit testsuite test
       ✅ httpd: mod_ssl smoke sanity
       ✅ tuned: tune-processes-through-perf
       ✅ pciutils: sanity smoke test
       ✅ ALSA PCM loopback test
       ✅ ALSA Control (mixer) Userspace Element test
       ✅ storage: SCSI VPD
       ✅ trace: ftrace/tracer
       🚧 ✅ CIFS Connectathon
       🚧 ✅ POSIX pjd-fstest suites
       🚧 ✅ Memory function: kaslr
       🚧 ✅ LTP: openposix test suite
       🚧 ✅ Networking vnic: ipvlan/basic
       🚧 ✅ iotop: sanity
       🚧 ✅ Usex - version 1.9-29
       🚧 ✅ storage: dm/common

    Host 4:
       ✅ Boot test
       ✅ Storage SAN device stress - megaraid_sas

  Test sources: https://github.com/CKI-project/tests-beaker
    💚 Pull requests are welcome for new tests or improvements to existing tests!

Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.

Testing timeout
---------------
We aim to provide a report within a reasonable timeframe. Tests that haven't
finished running yet are marked with ⏱.


^ permalink raw reply

* Linux hangs at ACPI init on Medion P15648 MD63490
From: Jan Engelhardt @ 2020-02-14 19:28 UTC (permalink / raw)
  To: linux-acpi

Greetings.


I have a problem with a certain x86 laptop, and judging from the
kernel's output, this looks very much like a broken ACPI table.
Versions tried are 5.3.8 (Fedora31 liveimage), 5.5.2 (openSUSE
Tumbleweed installer) and 5.6.0-rc1+
(b19e8c68470385dd2c5440876591fddb02c8c402; self compile), all
exhibiting the same hang.

The last messages emitted by 5.6.0-rc1+ are:

	ACPI: 11 ACPI AML tables successfully acquired and loaded
	ACPI: EC: EC started
	ACPI: EC: interrupt blocked
	ACPI: \: Used as first EC
	ACPI: \: GPE=0x10, IRQ=-1, EC_CMD/EC_SC=0x66, EC_DATA=0x62
	ACPI: EC: Boot ECDT EC used to handle transactions
	<hang>

The full boot procedure is made available at
http://inai.de/files/m921.mp4 [79MB].
Curiously, FreeBSD 12.1 can be booted without issues, so either they
already work around the issue, or don't trigger it in the first place.

After about 20 minutes, the kernel issues a stack trace.
http://inai.de/files/m922.mp4 [4.2M]; this seems to repeat every 
20 minutes:

	Task swapper blocked for more than 491 seconds.
	schedule
	schedule_timeout
	__down_timeout
	down_timeout
	acpi_os_wait_semaphore
	acpi_ex_system_wait_semaphore
	acpi_ev_acquire_global_lock
	acpi_ex_acquire_mutex_object
	acpi_ex_acquire_global_lock
	acpi_ex_write_data_to_field
	acpi_ex_store_object_to_node
	acpi_ex_store
	acpi_ex_opcode_1A_1T_1R
	acpi_ds_exec_end_op
	acpi_ps_parse_loop
	[a few frames more]

For comparison, on a (vastly) different laptop with proper firmware,
the EC messages go like this:

	<Fujitsu U728 for comparison>
	ACPI: EC: EC started
	ACPI: EC: interrupt blocked
	ACPI: \_SB_.PCI0.LPCB.EC__: Used as first EC
	ACPI: \_SB_.PCI0.LPCB.EC__: GPE=0x22, EC_CMD/EC_SC=0x66, EC_DATA=0x62
	ACPI: \_SB_.PCI0.LPCB.EC__: Boot DSDT EC used to handle transactions
	ACPI: Interpreter enabled

It kind of makes sense that, if "\" is seen as an EC on the Medion,
it is not going to work.

^ permalink raw reply

* Re: [PATCH 2/7] e2fsck: Fix indexed dir rehash failure with metadata_csum enabled
From: Andreas Dilger @ 2020-02-14 19:28 UTC (permalink / raw)
  To: Jan Kara; +Cc: Ted Tso, linux-ext4
In-Reply-To: <20200213101602.29096-3-jack@suse.cz>

[-- Attachment #1: Type: text/plain, Size: 1951 bytes --]

On Feb 13, 2020, at 3:15 AM, Jan Kara <jack@suse.cz> wrote:
> 
> E2fsck directory rehashing code can fail with ENOSPC due to a bug in
> ext2fs_htree_intnode_maxrecs() which fails to take metadata checksum
> into account and thus e.g. e2fsck can decide to create 1 indirect level
> of index tree when two are actually needed. Fix the logic to account for
> metadata checksum.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

Reviewed-by: Andreas Dilger <adilger@dilger.ca>

> ---
> lib/ext2fs/ext2fs.h | 10 +++++++---
> 1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
> index 93ecf29c568d..5fde3343b1f1 100644
> --- a/lib/ext2fs/ext2fs.h
> +++ b/lib/ext2fs/ext2fs.h
> @@ -1783,7 +1783,6 @@ extern blk_t ext2fs_group_first_block(ext2_filsys fs, dgrp_t group);
> extern blk_t ext2fs_group_last_block(ext2_filsys fs, dgrp_t group);
> extern blk_t ext2fs_inode_data_blocks(ext2_filsys fs,
> 				      struct ext2_inode *inode);
> -extern int ext2fs_htree_intnode_maxrecs(ext2_filsys fs, int blocks);
> extern unsigned int ext2fs_div_ceil(unsigned int a, unsigned int b);
> extern __u64 ext2fs_div64_ceil(__u64 a, __u64 b);
> extern int ext2fs_dirent_name_len(const struct ext2_dir_entry *entry);
> @@ -2015,9 +2014,14 @@ _INLINE_ blk_t ext2fs_inode_data_blocks(ext2_filsys fs,
> 	return (blk_t) ext2fs_inode_data_blocks2(fs, inode);
> }
> 
> -_INLINE_ int ext2fs_htree_intnode_maxrecs(ext2_filsys fs, int blocks)
> +static inline int ext2fs_htree_intnode_maxrecs(ext2_filsys fs, int blocks)
> {
> -	return blocks * ((fs->blocksize - 8) / sizeof(struct ext2_dx_entry));
> +	int csum_size = 0;
> +
> +	if (ext2fs_has_feature_metadata_csum(fs->super))
> +		csum_size = sizeof(struct ext2_dx_tail);
> +	return blocks * ((fs->blocksize - (8 + csum_size)) /
> +						sizeof(struct ext2_dx_entry));
> }
> 
> /*
> --
> 2.16.4
> 


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply


This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.