From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail.linutronix.de (146.0.238.70:993) by crypto-ml.lab.linutronix.de with IMAP4-SSL for ; 12 Jan 2019 03:04:56 -0000 Received: from mga07.intel.com ([134.134.136.100]) by Galois.linutronix.de with esmtps (TLS1.2:DHE_RSA_AES_256_CBC_SHA256:256) (Exim 4.80) (envelope-from ) id 1gi9bR-0004u1-RD for speck@linutronix.de; Sat, 12 Jan 2019 04:04:55 +0100 Date: Fri, 11 Jan 2019 19:04:49 -0800 From: Andi Kleen Subject: [MODERATED] Re: [PATCH v4 00/28] MDSv4 2 Message-ID: <20190112030449.GB6118@tassilo.jf.intel.com> References: MIME-Version: 1.0 In-Reply-To: Content-Type: multipart/mixed; boundary="rfwNdt5cNUUjB/69" Content-Disposition: inline To: speck@linutronix.de List-ID: --rfwNdt5cNUUjB/69 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Mailbox with the patches attached. -Andi --rfwNdt5cNUUjB/69 Content-Type: application/mbox Content-Disposition: attachment; filename="mdsv4.mbox" Content-Transfer-Encoding: quoted-printable =46rom 2a7a23c9c36cff225e2fcb80c09c3d369d9a7331 Mon Sep 17 00:00:00 2001=0A= =46rom: Andi Kleen =0ADate: Wed, 7 Nov 2018 16:08:39 -0= 800=0ASubject: [PATCH 01/28] x86/speculation/mds: Add basic bug infrastruct= ure for=0A MDS=0A=0AMDS is micro architectural data sampling, which is a si= de channel=0Aattack on internal buffers in Intel CPUs.=0A=0AMDS consists of= multiple sub-vulnerabilities:=0AMicroarchitectural Store Buffer Data Sampl= ing (MSBDS) (CVE-2018-12126)=0AMicroarchitectual Fill Buffer Data Sampling = (MFBDS) (CVE-2018-12130)=0AMicroarchitectual Load Port Data (MLPDS) (CVE-20= 18-12127),=0Awith the first leaking store data, and the second loads and so= metimes=0Astore data, and the third load data.=0A=0AThey all have the same = mitigations for single thread, so we lump them all=0Atogether as a single M= DS issue.=0A=0AThis patch adds the basic infrastructure to detect if the cu= rrent=0ACPU is affected by MDS, and if yes set the right BUG bits.=0A=0ASig= ned-off-by: Andi Kleen =0A---=0A arch/x86/include/asm/c= pufeatures.h | 2 ++=0A arch/x86/include/asm/msr-index.h | 1 +=0A arch/x= 86/kernel/cpu/common.c | 14 ++++++++++++++=0A 3 files changed, 17 ins= ertions(+)=0A=0Adiff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/= include/asm/cpufeatures.h=0Aindex 6d6122524711..233ca598826f 100644=0A--- a= /arch/x86/include/asm/cpufeatures.h=0A+++ b/arch/x86/include/asm/cpufeature= s.h=0A@@ -344,6 +344,7 @@=0A /* Intel-defined CPU features, CPUID level 0x0= 0000007:0 (EDX), word 18 */=0A #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2)= /* AVX-512 Neural Network Instructions */=0A #define X86_FEATURE_AVX512_4F= MAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */=0A+#de= fine X86_FEATURE_MD_CLEAR (18*32+10) /* Flush state on VERW */=0A #define = X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */=0A #define X86_FEATURE_= SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */=0A #define= X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Pr= edictors */=0A@@ -381,5 +382,6 @@=0A #define X86_BUG_SPECTRE_V2 X86_BUG(16= ) /* CPU is affected by Spectre variant 2 attack with indirect branches */= =0A #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by spe= culative store bypass attack */=0A #define X86_BUG_L1TF X86_BUG(18) /* CP= U is affected by L1 Terminal Fault */=0A+#define X86_BUG_MDS X86_BUG(19) = /* CPU is affected by Microarchitectural data sampling */=0A =0A #endif /* = _ASM_X86_CPUFEATURES_H */=0Adiff --git a/arch/x86/include/asm/msr-index.h b= /arch/x86/include/asm/msr-index.h=0Aindex 8e40c2446fd1..3e486d9d6e6c 100644= =0A--- a/arch/x86/include/asm/msr-index.h=0A+++ b/arch/x86/include/asm/msr-= index.h=0A@@ -77,6 +77,7 @@=0A * attack, so no Speculative Store = Bypass=0A * control required.=0A */=0A+#define ARCH_CAP= _MDS_NO (1 << 5) /* No Microarchitectural data sampling */=0A =0A #defi= ne MSR_IA32_FLUSH_CMD 0x0000010b=0A #define L1D_FLUSH (1 << 0) /*=0Adi= ff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c=0Ain= dex cb28e98a0659..0c900eb6f829 100644=0A--- a/arch/x86/kernel/cpu/common.c= =0A+++ b/arch/x86/kernel/cpu/common.c=0A@@ -998,6 +998,14 @@ static const _= _initconst struct x86_cpu_id cpu_no_l1tf[] =3D {=0A {}=0A };=0A =0A+static= const __initconst struct x86_cpu_id cpu_no_mds[] =3D {=0A+ /* in addition = to cpu_no_speculation */=0A+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMON= T },=0A+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X },=0A+ { X86_VEN= DOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS },=0A+ {}=0A+};=0A+=0A static v= oid __init cpu_set_bug_bits(struct cpuinfo_x86 *c)=0A {=0A u64 ia32_cap = =3D 0;=0A@@ -1019,6 +1027,12 @@ static void __init cpu_set_bug_bits(struct = cpuinfo_x86 *c)=0A if (ia32_cap & ARCH_CAP_IBRS_ALL)=0A setup_force_cpu_= cap(X86_FEATURE_IBRS_ENHANCED);=0A =0A+ if ((boot_cpu_data.x86_vendor =3D= =3D X86_VENDOR_INTEL &&=0A+ !x86_match_cpu(cpu_no_mds)) &&=0A+ !(ia= 32_cap & ARCH_CAP_MDS_NO) &&=0A+ !(ia32_cap & ARCH_CAP_RDCL_NO))=0A+ s= etup_force_cpu_bug(X86_BUG_MDS);=0A+=0A if (x86_match_cpu(cpu_no_meltdown)= )=0A return;=0A =0A-- =0A2.17.2=0A=0A=0AFrom edde5a24d37be41a4581e33d9f8e= 7f12f0cb2ad8 Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Fri, 11 Jan 2019 16:44:15 -0800=0ASubject: [PATCH 02/28] x86/spe= culation/mds: Add mds=3Doff=0A=0ANormally we execute VERW for clearing the = cpu unconditionally on kernel exits=0Athat might have touched sensitive. Ad= d a new flag to disable VERW usage.=0AThis is intended for systems that onl= y run trusted code and don't=0Awant the performance impact of the extra cle= aring.=0A=0AThis just sets the flag, actual implementation is in future pat= ches.=0A=0ASigned-off-by: Andi Kleen =0A---=0A Document= ation/admin-guide/kernel-parameters.txt | 3 +++=0A arch/x86/include/asm/cpu= features.h | 1 +=0A arch/x86/kernel/cpu/bugs.c = | 9 +++++++++=0A 3 files changed, 13 insertions(+)=0A=0Adiff --git a/= Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide= /kernel-parameters.txt=0Aindex b799bcf67d7b..9c967d0caeca 100644=0A--- a/Do= cumentation/admin-guide/kernel-parameters.txt=0A+++ b/Documentation/admin-g= uide/kernel-parameters.txt=0A@@ -2357,6 +2357,9 @@=0A Format: ,=0A Specifies range of consoles to be captured by the MDA.=0A =0A+ m= ds=3Doff [X86, Intel]=0A+ Disable workarounds for Micro-architectural Da= ta Sampling.=0A+=0A mem=3Dnn[KMG] [KNL,BOOT] Force usage of a specific amo= unt of memory=0A Amount of memory to be used when the kernel is not able= =0A to see the whole system memory or for test.=0Adiff --git a/arch/x86/= include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h=0Aindex 233c= a598826f..09347c6a8901 100644=0A--- a/arch/x86/include/asm/cpufeatures.h=0A= +++ b/arch/x86/include/asm/cpufeatures.h=0A@@ -221,6 +221,7 @@=0A #define X= 86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */=0A #defin= e X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion *= /=0A #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */=0A+#d= efine X86_FEATURE_NO_VERW ( 7*32+31) /* "" No VERW for MDS on kernel exit = */=0A =0A /* Virtualization flags: Linux defined, word 8 */=0A #define X86_= FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */=0Adiff --git a/arch/x= 86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c=0Aindex 8654b8b0c848..542= 6467143c9 100644=0A--- a/arch/x86/kernel/cpu/bugs.c=0A+++ b/arch/x86/kernel= /cpu/bugs.c=0A@@ -37,6 +37,7 @@=0A static void __init spectre_v2_select_mit= igation(void);=0A static void __init ssb_select_mitigation(void);=0A static= void __init l1tf_select_mitigation(void);=0A+static void __init mds_select= _mitigation(void);=0A =0A /* The base value of the SPEC_CTRL MSR that alway= s has to be preserved. */=0A u64 x86_spec_ctrl_base;=0A@@ -101,6 +102,8 @@ = void __init check_bugs(void)=0A =0A l1tf_select_mitigation();=0A =0A+ mds_= select_mitigation();=0A+=0A #ifdef CONFIG_X86_32=0A /*=0A * Check whethe= r we are able to run this kernel safely on SMP.=0A@@ -1058,6 +1061,12 @@ ea= rly_param("l1tf", l1tf_cmdline);=0A =0A #undef pr_fmt=0A =0A+static void md= s_select_mitigation(void)=0A+{=0A+ if (cmdline_find_option_bool(boot_comman= d_line, "mds=3Doff"))=0A+ setup_force_cpu_cap(X86_FEATURE_NO_VERW);=0A+}= =0A+=0A #ifdef CONFIG_SYSFS=0A =0A #define L1TF_DEFAULT_MSG "Mitigation: PT= E Inversion"=0A-- =0A2.17.2=0A=0A=0AFrom 9adffb04aca3afa85d3bf7e3de98da0c6d= 6c4b02 Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0AD= ate: Wed, 7 Nov 2018 16:12:17 -0800=0ASubject: [PATCH 03/28] x86/speculatio= n/mds: Support clearing CPU data on=0A kernel exit=0A=0AAdd infrastructure = for clearing CPU data on kernel exit=0A=0AInstead of clearing unconditional= ly we support clearing=0Alazily when some kernel subsystem touched sensitiv= e data=0Aand sets the new TIF_CLEAR_CPU flag.=0A=0AWe handle TIF_CLEAR_CPU = in kernel exit, similar to=0Aother kernel exit action flags.=0A=0AThe flush= ing is provided by new microcode as a new side=0Aeffect of the otherwise un= used VERW instruction.=0A=0ASo far this patch doesn't do anything, it relie= s on=0Alater patches to set TIF_CLEAR_CPU.=0A=0ASuggested-by: Linus Torvald= s =0ATested-by: Neelima Krishnan =0ASigned-off-by: Andi Kleen =0A---=0A= arch/x86/entry/common.c | 8 +++++++-=0A arch/x86/include/asm/c= learcpu.h | 23 +++++++++++++++++++++++=0A arch/x86/include/asm/thread_in= fo.h | 2 ++=0A 3 files changed, 32 insertions(+), 1 deletion(-)=0A create = mode 100644 arch/x86/include/asm/clearcpu.h=0A=0Adiff --git a/arch/x86/entr= y/common.c b/arch/x86/entry/common.c=0Aindex 7bc105f47d21..924f8dab2068 100= 644=0A--- a/arch/x86/entry/common.c=0A+++ b/arch/x86/entry/common.c=0A@@ -2= 9,6 +29,7 @@=0A #include =0A #include =0A #include= =0A+#include =0A #include =0A= #include =0A =0A@@ -132,7 +133,7 @@ static long syscall_= trace_enter(struct pt_regs *regs)=0A }=0A =0A #define EXIT_TO_USERMODE_LOOP= _FLAGS \=0A- (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \=0A+= (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | _TIF_CLEAR_CPU |\=0A= _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)=0A =0A= static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)= =0A@@ -170,6 +171,11 @@ static void exit_to_usermode_loop(struct pt_regs *r= egs, u32 cached_flags)=0A if (cached_flags & _TIF_USER_RETURN_NOTIFY)=0A = fire_user_return_notifiers();=0A =0A+ if (cached_flags & _TIF_CLEAR_CPU= ) {=0A+ clear_thread_flag(TIF_CLEAR_CPU);=0A+ clear_cpu();=0A+ }=0A+= =0A /* Disable IRQs and retry */=0A local_irq_disable();=0A =0Adiff --g= it a/arch/x86/include/asm/clearcpu.h b/arch/x86/include/asm/clearcpu.h=0Ane= w file mode 100644=0Aindex 000000000000..530ef619ac1b=0A--- /dev/null=0A+++= b/arch/x86/include/asm/clearcpu.h=0A@@ -0,0 +1,23 @@=0A+/* SPDX-License-Id= entifier: GPL-2.0 */=0A+#ifndef _ASM_CLEARCPU_H=0A+#define _ASM_CLEARCPU_H = 1=0A+=0A+#include =0A+#include =0A+#= include =0A+#include =0A+=0A+/*=0A+= * Clear CPU buffers to avoid side channels.=0A+ * We use microcode as a si= de effect of the obsolete VERW instruction=0A+ */=0A+=0A+static inline void= clear_cpu(void)=0A+{=0A+ unsigned kernel_ds =3D __KERNEL_DS;=0A+ /* Has to= be memory form, don't modify to use an register */=0A+ alternative_input("= verw %[kernelds]", "", X86_FEATURE_NO_VERW,=0A+ [kernelds] "m" (kernel_ds)= );=0A+}=0A+=0A+#endif=0Adiff --git a/arch/x86/include/asm/thread_info.h b/a= rch/x86/include/asm/thread_info.h=0Aindex e0eccbcb8447..0c1e3d71018e 100644= =0A--- a/arch/x86/include/asm/thread_info.h=0A+++ b/arch/x86/include/asm/th= read_info.h=0A@@ -95,6 +95,7 @@ struct thread_info {=0A #define TIF_MEMDIE = 20 /* is terminating due to OOM killer */=0A #define TIF_POLLING_NRFLAG 21= /* idle is polling for TIF_NEED_RESCHED */=0A #define TIF_IO_BITMAP 22 /*= uses I/O bitmap */=0A+#define TIF_CLEAR_CPU 23 /* clear CPU on kernel exi= t */=0A #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */= =0A #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */=0A #de= fine TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */=0A@@ -12= 3,6 +124,7 @@ struct thread_info {=0A #define _TIF_NOHZ (1 << TIF_NOHZ)=0A= #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)=0A #define _TIF_IO_B= ITMAP (1 << TIF_IO_BITMAP)=0A+#define _TIF_CLEAR_CPU (1 << TIF_CLEAR_CPU)= =0A #define _TIF_FORCED_TF (1 << TIF_FORCED_TF)=0A #define _TIF_BLOCKSTEP = (1 << TIF_BLOCKSTEP)=0A #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_U= PDATES)=0A-- =0A2.17.2=0A=0A=0AFrom 6bca3c2e17e4c6a272aef9a3dc6aae689723f57= 2 Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: = Fri, 14 Dec 2018 16:40:11 -0800=0ASubject: [PATCH 04/28] x86/speculation/md= s: Support mds=3Dfull=0A=0ASupport a new command line option to support unc= onditional flushing=0Aon each kernel exit. This is not enabled by default.= =0A=0ASigned-off-by: Andi Kleen =0A---=0A Documentation= /admin-guide/kernel-parameters.txt | 5 +++++=0A arch/x86/entry/common.c = | 7 ++++++-=0A arch/x86/include/asm/clearcpu.h = | 2 ++=0A arch/x86/kernel/cpu/bugs.c | 4 ++++= =0A 4 files changed, 17 insertions(+), 1 deletion(-)=0A=0Adiff --git a/Docu= mentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/ker= nel-parameters.txt=0Aindex 9c967d0caeca..5f5a8808c475 100644=0A--- a/Docume= ntation/admin-guide/kernel-parameters.txt=0A+++ b/Documentation/admin-guide= /kernel-parameters.txt=0A@@ -2360,6 +2360,11 @@=0A mds=3Doff [X86, Intel]= =0A Disable workarounds for Micro-architectural Data Sampling.=0A =0A+ m= ds=3Dfull [X86, Intel]=0A+ Always flush cpu buffers when exiting kernel f= or MDS.=0A+ Normally the kernel decides dynamically when flushing is=0A+ = needed or not.=0A+=0A mem=3Dnn[KMG] [KNL,BOOT] Force usage of a specific= amount of memory=0A Amount of memory to be used when the kernel is not = able=0A to see the whole system memory or for test.=0Adiff --git a/arch/= x86/entry/common.c b/arch/x86/entry/common.c=0Aindex 924f8dab2068..66c08e1d= 493a 100644=0A--- a/arch/x86/entry/common.c=0A+++ b/arch/x86/entry/common.c= =0A@@ -173,7 +173,9 @@ static void exit_to_usermode_loop(struct pt_regs *re= gs, u32 cached_flags)=0A =0A if (cached_flags & _TIF_CLEAR_CPU) {=0A c= lear_thread_flag(TIF_CLEAR_CPU);=0A- clear_cpu();=0A+ /* Don't do it tw= ice if forced */=0A+ if (!static_key_enabled(&force_cpu_clear))=0A+ cl= ear_cpu();=0A }=0A =0A /* Disable IRQs and retry */=0A@@ -217,6 +219,9 = @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)=0A = ti->status &=3D ~(TS_COMPAT|TS_I386_REGS_POKED);=0A #endif=0A =0A+ if (sta= tic_key_enabled(&force_cpu_clear))=0A+ clear_cpu();=0A+=0A user_enter_irq= off();=0A }=0A =0Adiff --git a/arch/x86/include/asm/clearcpu.h b/arch/x86/i= nclude/asm/clearcpu.h=0Aindex 530ef619ac1b..3b8ee76b9c07 100644=0A--- a/arc= h/x86/include/asm/clearcpu.h=0A+++ b/arch/x86/include/asm/clearcpu.h=0A@@ -= 20,4 +20,6 @@ static inline void clear_cpu(void)=0A [kernelds] "m" (kerne= l_ds));=0A }=0A =0A+DECLARE_STATIC_KEY_FALSE(force_cpu_clear);=0A+=0A #endi= f=0Adiff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c=0A= index 5426467143c9..40f7415dcd7e 100644=0A--- a/arch/x86/kernel/cpu/bugs.c= =0A+++ b/arch/x86/kernel/cpu/bugs.c=0A@@ -1061,10 +1061,14 @@ early_param("= l1tf", l1tf_cmdline);=0A =0A #undef pr_fmt=0A =0A+DEFINE_STATIC_KEY_FALSE(f= orce_cpu_clear);=0A+=0A static void mds_select_mitigation(void)=0A {=0A if= (cmdline_find_option_bool(boot_command_line, "mds=3Doff"))=0A setup_forc= e_cpu_cap(X86_FEATURE_NO_VERW);=0A+ if (cmdline_find_option_bool(boot_comma= nd_line, "mds=3Dfull"))=0A+ static_branch_enable(&force_cpu_clear);=0A }= =0A =0A #ifdef CONFIG_SYSFS=0A-- =0A2.17.2=0A=0A=0AFrom 021c5ba2a9fdae32605= 8dd16785b30c31546cd0f Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Wed, 7 Nov 2018 16:15:28 -0800=0ASubject: [PATCH 05/2= 8] x86/speculation/mds: Clear CPU buffers on entering idle=0A=0AWhen enteri= ng idle the internal state of the current CPU might=0Abecome visible to the= thread sibling because the CPU "frees" some=0Ainternal resources.=0A=0ATo = ensure there is no MDS leakage always clear the CPU state=0Abefore doing an= y idling. We only do this if SMT is enabled,=0Aas otherwise there is no lea= kage possible.=0A=0ANot needed for idle poll because it does not share reso= urces.=0A=0ASigned-off-by: Andi Kleen =0A---=0A arch/x8= 6/include/asm/clearcpu.h | 19 +++++++++++++++++++=0A arch/x86/kernel/acpi/c= state.c | 2 ++=0A arch/x86/kernel/kvm.c | 3 +++=0A arch/x86/k= ernel/process.c | 5 +++++=0A arch/x86/kernel/smpboot.c | 3 ++= +=0A drivers/acpi/acpi_pad.c | 2 ++=0A drivers/acpi/processor_idle= =2Ec | 3 +++=0A drivers/idle/intel_idle.c | 5 +++++=0A kernel/sch= ed/fair.c | 1 +=0A 9 files changed, 43 insertions(+)=0A=0Adiff= --git a/arch/x86/include/asm/clearcpu.h b/arch/x86/include/asm/clearcpu.h= =0Aindex 3b8ee76b9c07..b83ef1a5268f 100644=0A--- a/arch/x86/include/asm/cle= arcpu.h=0A+++ b/arch/x86/include/asm/clearcpu.h=0A@@ -20,6 +20,25 @@ static= inline void clear_cpu(void)=0A [kernelds] "m" (kernel_ds));=0A }=0A =0A+= /*=0A+ * Clear CPU buffers before going idle, so that no state is leaked to= SMT=0A+ * siblings taking over thread resources.=0A+ * Out of line to avoi= d include hell.=0A+ *=0A+ * Assumes that interrupts are disabled and only g= et reenabled=0A+ * before idle, otherwise the data from a racing interrupt = might not=0A+ * get cleared. There are some callers who violate this,=0A+ *= but they are only used in unattackable cases.=0A+ */=0A+=0A+static inline = void clear_cpu_idle(void)=0A+{=0A+ if (sched_smt_active()) {=0A+ clear_thr= ead_flag(TIF_CLEAR_CPU);=0A+ clear_cpu();=0A+ }=0A+}=0A+=0A DECLARE_STATIC= _KEY_FALSE(force_cpu_clear);=0A =0A #endif=0Adiff --git a/arch/x86/kernel/a= cpi/cstate.c b/arch/x86/kernel/acpi/cstate.c=0Aindex 158ad1483c43..48adea5a= facf 100644=0A--- a/arch/x86/kernel/acpi/cstate.c=0A+++ b/arch/x86/kernel/a= cpi/cstate.c=0A@@ -14,6 +14,7 @@=0A #include =0A #include= =0A #include =0A+#include =0A =0A /*=0A * Initialize bm_flags based on the CPU cache properties=0A= @@ -157,6 +158,7 @@ void __cpuidle acpi_processor_ffh_cstate_enter(struct a= cpi_processor_cx *cx)=0A unsigned int cpu =3D smp_processor_id();=0A stru= ct cstate_entry *percpu_entry;=0A =0A+ clear_cpu_idle();=0A percpu_entry = =3D per_cpu_ptr(cpu_cstate_entry, cpu);=0A mwait_idle_with_hints(percpu_en= try->states[cx->index].eax,=0A percpu_entry->states[= cx->index].ecx);=0Adiff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm= =2Ec=0Aindex ba4bfb7f6a36..c9206ad40a5b 100644=0A--- a/arch/x86/kernel/kvm.= c=0A+++ b/arch/x86/kernel/kvm.c=0A@@ -159,6 +159,7 @@ void kvm_async_pf_tas= k_wait(u32 token, int interrupt_kernel)=0A /*=0A * We cannot resched= ule. So halt.=0A */=0A+ clear_cpu_idle();=0A native_safe_halt();= =0A local_irq_disable();=0A }=0A@@ -785,6 +786,8 @@ static void kvm_wa= it(u8 *ptr, u8 val)=0A if (READ_ONCE(*ptr) !=3D val)=0A goto out;=0A =0A= + clear_cpu_idle();=0A+=0A /*=0A * halt until it's our turn and kicked. = Note that we do safe halt=0A * for irq enabled case to avoid hang when lo= ck info is overwritten=0Adiff --git a/arch/x86/kernel/process.c b/arch/x86/= kernel/process.c=0Aindex 90ae0ca51083..9d9f2d2b209d 100644=0A--- a/arch/x86= /kernel/process.c=0A+++ b/arch/x86/kernel/process.c=0A@@ -42,6 +42,7 @@=0A = #include =0A #include =0A #include =0A+#include =0A =0A #include "process.h"=0A =0A@@ -58= 9,6 +590,8 @@ void stop_this_cpu(void *dummy)=0A disable_local_APIC();=0A = mcheck_cpu_clear(this_cpu_ptr(&cpu_info));=0A =0A+ clear_cpu_idle();=0A+= =0A /*=0A * Use wbinvd on processors that support SME. This provides sup= port=0A * for performing a successful kexec when going from SME inactive= =0A@@ -675,6 +678,8 @@ static __cpuidle void mwait_idle(void)=0A mb(); /= * quirk */=0A }=0A =0A+ clear_cpu_idle();=0A+=0A __monitor((void *)&cu= rrent_thread_info()->flags, 0, 0);=0A if (!need_resched())=0A __sti_mw= ait(0, 0);=0Adiff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpbo= ot.c=0Aindex ccd1f2a8e557..c7fff6b09253 100644=0A--- a/arch/x86/kernel/smpb= oot.c=0A+++ b/arch/x86/kernel/smpboot.c=0A@@ -81,6 +81,7 @@=0A #include =0A #include =0A #include = =0A+#include =0A =0A /* representing HT siblings of each lo= gical CPU */=0A DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);= =0A@@ -1635,6 +1636,7 @@ static inline void mwait_play_dead(void)=0A wbinv= d();=0A =0A while (1) {=0A+ clear_cpu_idle();=0A /*=0A * The CLFLUSH= is a workaround for erratum AAI65 for=0A * the Xeon 7400 series. It's = not clear it is actually=0A@@ -1662,6 +1664,7 @@ void hlt_play_dead(void)= =0A wbinvd();=0A =0A while (1) {=0A+ clear_cpu_idle();=0A native_halt= ();=0A /*=0A * If NMI wants to wake up CPU0, start CPU0.=0Adiff --git = a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c=0Aindex a47676a55b84..2= dcbc38d0880 100644=0A--- a/drivers/acpi/acpi_pad.c=0A+++ b/drivers/acpi/acp= i_pad.c=0A@@ -27,6 +27,7 @@=0A #include =0A #include =0A #include =0A+#include =0A #include <= xen/xen.h>=0A =0A #define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad"=0A@@ -= 175,6 +176,7 @@ static int power_saving_thread(void *data)=0A tick_broad= cast_enable();=0A tick_broadcast_enter();=0A stop_critical_timings();= =0A+ clear_cpu_idle();=0A =0A mwait_idle_with_hints(power_saving_mwait= _eax, 1);=0A =0Adiff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/p= rocessor_idle.c=0Aindex b2131c4ea124..0342daa122fe 100644=0A--- a/drivers/a= cpi/processor_idle.c=0A+++ b/drivers/acpi/processor_idle.c=0A@@ -33,6 +33,7= @@=0A #include =0A #include =0A #include =0A+#include =0A =0A /*=0A * Include the ap= ic definitions for x86 to have the APIC timer related defines=0A@@ -120,6 += 121,7 @@ static const struct dmi_system_id processor_power_dmi_table[] =3D = {=0A */=0A static void __cpuidle acpi_safe_halt(void)=0A {=0A+ clear_cpu_i= dle();=0A if (!tif_need_resched()) {=0A safe_halt();=0A local_irq_disa= ble();=0A@@ -681,6 +683,7 @@ static int acpi_idle_play_dead(struct cpuidle_= device *dev, int index)=0A =0A ACPI_FLUSH_CPU_CACHE();=0A =0A+ clear_cpu_i= dle();=0A while (1) {=0A =0A if (cx->entry_method =3D=3D ACPI_CSTATE_HAL= T)=0Adiff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c=0Ai= ndex 8b5d85c91e9d..ddaa7603d53a 100644=0A--- a/drivers/idle/intel_idle.c=0A= +++ b/drivers/idle/intel_idle.c=0A@@ -65,6 +65,7 @@=0A #include =0A #include =0A #include =0A+#include =0A =0A #define INTEL_IDLE_VERSION "0.4.1"=0A =0A@@ -933,6 +93= 4,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,=0A }= =0A }=0A =0A+ clear_cpu_idle();=0A+=0A mwait_idle_with_hints(eax, ecx);= =0A =0A if (!static_cpu_has(X86_FEATURE_ARAT) && tick)=0A@@ -953,6 +956,8 = @@ static void intel_idle_s2idle(struct cpuidle_device *dev,=0A unsigned l= ong ecx =3D 1; /* break on interrupt flag */=0A unsigned long eax =3D flg2= MWAIT(drv->states[index].flags);=0A =0A+ clear_cpu_idle();=0A+=0A mwait_id= le_with_hints(eax, ecx);=0A }=0A =0Adiff --git a/kernel/sched/fair.c b/kern= el/sched/fair.c=0Aindex 50aa2aba69bd..b5a1bd4a1a46 100644=0A--- a/kernel/sc= hed/fair.c=0A+++ b/kernel/sched/fair.c=0A@@ -5980,6 +5980,7 @@ static inlin= e int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p=0A =0A= #ifdef CONFIG_SCHED_SMT=0A DEFINE_STATIC_KEY_FALSE(sched_smt_present);=0A+= EXPORT_SYMBOL(sched_smt_present);=0A =0A static inline void set_idle_cores(= int cpu, int val)=0A {=0A-- =0A2.17.2=0A=0A=0AFrom 8388d491eac74581e40abe20= 96e81213037482be Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Wed, 7 Nov 2018 16:17:11 -0800=0ASubject: [PATCH 06/28] x86/= speculation/mds: Add sysfs reporting=0A=0AReport mds mitigation state in sy= sfs vulnerabilities.=0A=0ASigned-off-by: Andi Kleen =0A= ---=0A .../ABI/testing/sysfs-devices-system-cpu | 1 +=0A arch/x86/= kernel/cpu/bugs.c | 16 ++++++++++++++++=0A drivers/ba= se/cpu.c | 8 ++++++++=0A 3 files changed, 25= insertions(+)=0A=0Adiff --git a/Documentation/ABI/testing/sysfs-devices-sy= stem-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu=0Aindex 9605d= bd4b5b5..2db5c3407fd6 100644=0A--- a/Documentation/ABI/testing/sysfs-device= s-system-cpu=0A+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu=0A@= @ -484,6 +484,7 @@ What: /sys/devices/system/cpu/vulnerabilities=0A /sys= /devices/system/cpu/vulnerabilities/spectre_v2=0A /sys/devices/system/cpu= /vulnerabilities/spec_store_bypass=0A /sys/devices/system/cpu/vulnerabili= ties/l1tf=0A+ /sys/devices/system/cpu/vulnerabilities/mds=0A Date: Januar= y 2018=0A Contact: Linux kernel mailing list = =0A Description: Information about CPU vulnerabilities=0Adiff --git a/arch/= x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c=0Aindex 40f7415dcd7e..58= 2b1cd019f7 100644=0A--- a/arch/x86/kernel/cpu/bugs.c=0A+++ b/arch/x86/kerne= l/cpu/bugs.c=0A@@ -1174,6 +1174,16 @@ static ssize_t cpu_show_common(struct= device *dev, struct device_attribute *attr=0A if (boot_cpu_has(X86_FEATU= RE_L1TF_PTEINV))=0A return l1tf_show_state(buf);=0A break;=0A+=0A+ cas= e X86_BUG_MDS:=0A+ /* Assumes Hypervisor exposed HT state to us if in gues= t */=0A+ if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) {=0A+ if (cpu_smt_contr= ol !=3D CPU_SMT_ENABLED)=0A+ return sprintf(buf, "Mitigation: microcode\= n");=0A+ return sprintf(buf, "Mitigation: microcode, HT vulnerable\n");= =0A+ }=0A+ return sprintf(buf, "Vulnerable\n");=0A+=0A default:=0A bre= ak;=0A }=0A@@ -1205,4 +1215,10 @@ ssize_t cpu_show_l1tf(struct device *dev= , struct device_attribute *attr, char *b=0A {=0A return cpu_show_common(de= v, attr, buf, X86_BUG_L1TF);=0A }=0A+=0A+ssize_t cpu_show_mds(struct device= *dev, struct device_attribute *attr, char *buf)=0A+{=0A+ return cpu_show_c= ommon(dev, attr, buf, X86_BUG_MDS);=0A+}=0A+=0A #endif=0Adiff --git a/drive= rs/base/cpu.c b/drivers/base/cpu.c=0Aindex eb9443d5bae1..2fd6ca1021c2 10064= 4=0A--- a/drivers/base/cpu.c=0A+++ b/drivers/base/cpu.c=0A@@ -546,11 +546,1= 8 @@ ssize_t __weak cpu_show_l1tf(struct device *dev,=0A return sprintf(bu= f, "Not affected\n");=0A }=0A =0A+ssize_t __weak cpu_show_mds(struct device= *dev,=0A+ struct device_attribute *attr, char *buf)=0A+{=0A+ return = sprintf(buf, "Not affected\n");=0A+}=0A+=0A static DEVICE_ATTR(meltdown, 04= 44, cpu_show_meltdown, NULL);=0A static DEVICE_ATTR(spectre_v1, 0444, cpu_s= how_spectre_v1, NULL);=0A static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spe= ctre_v2, NULL);=0A static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spe= c_store_bypass, NULL);=0A static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NUL= L);=0A+static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);=0A =0A static str= uct attribute *cpu_root_vulnerabilities_attrs[] =3D {=0A &dev_attr_meltdow= n.attr,=0A@@ -558,6 +565,7 @@ static struct attribute *cpu_root_vulnerabili= ties_attrs[] =3D {=0A &dev_attr_spectre_v2.attr,=0A &dev_attr_spec_store_= bypass.attr,=0A &dev_attr_l1tf.attr,=0A+ &dev_attr_mds.attr,=0A NULL=0A }= ;=0A =0A-- =0A2.17.2=0A=0A=0AFrom 12f1a5725f437f45abfe455cd344e1c037a7de2c = Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Fr= i, 14 Dec 2018 16:57:27 -0800=0ASubject: [PATCH 07/28] x86/speculation/mds:= Support mds=3Dfull for NMIs=0A=0ANMIs don't go through C code when exiting= to user space, so we need=0Ato add an assembler clear cpu for this case. O= nly used with=0Amds=3Dfull, because otherwise we assume NMIs don't touch=0A= other users or kernel sensitive data.=0A=0ASigned-off-by: Andi Kleen =0A---=0A arch/x86/entry/entry_64.S | 12 ++++++++++++= =0A arch/x86/include/asm/clearcpu.h | 11 +++++++++++=0A 2 files changed, 23= insertions(+)=0A=0Adiff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry= /entry_64.S=0Aindex 1f0efdb7b629..57f194e3e253 100644=0A--- a/arch/x86/entr= y/entry_64.S=0A+++ b/arch/x86/entry/entry_64.S=0A@@ -39,6 +39,7 @@=0A #incl= ude =0A #include =0A #include =0A+#include =0A =0A #include "calling.h"=0A =0A@@ -1407,= 6 +1408,17 @@ ENTRY(nmi)=0A movq $-1, %rsi=0A call do_nmi=0A =0A+ /*=0A+ = * Clear only when force clearing was enabled. Otherwise=0A+ * we assume N= MI code is not sensitive.=0A+ * If you don't have jump labels we always cl= ear too.=0A+ */=0A+#ifdef HAVE_JUMP_LABEL=0A+ STATIC_BRANCH_JMP l_yes=3D.L= no_clear_cpu key=3Dforce_cpu_clear, branch=3D1=0A+#endif=0A+ CLEAR_CPU=0A+.= Lno_clear_cpu:=0A+=0A /*=0A * Return back to user mode. We must *not* d= o the normal exit=0A * work, because we don't want to enable interrupts.= =0Adiff --git a/arch/x86/include/asm/clearcpu.h b/arch/x86/include/asm/clea= rcpu.h=0Aindex b83ef1a5268f..67c4e0d38802 100644=0A--- a/arch/x86/include/a= sm/clearcpu.h=0A+++ b/arch/x86/include/asm/clearcpu.h=0A@@ -2,6 +2,8 @@=0A = #ifndef _ASM_CLEARCPU_H=0A #define _ASM_CLEARCPU_H 1=0A =0A+#ifndef __ASSEM= BLY__=0A+=0A #include =0A #include = =0A #include =0A@@ -41,4 +43,13 @@ static inline void cl= ear_cpu_idle(void)=0A =0A DECLARE_STATIC_KEY_FALSE(force_cpu_clear);=0A =0A= +#else=0A+=0A+.macro CLEAR_CPU=0A+ ALTERNATIVE __stringify(push $__USER_DS = ; verw (% _ASM_SP ) ; add $8, % _ASM_SP ),\=0A+ "", X86_FEATURE_NO_VERW=0A= +.endm=0A+=0A+#endif=0A+=0A #endif=0A-- =0A2.17.2=0A=0A=0AFrom 7da7ec0940e3= 62ea937690352eaa421e1b12b8b9 Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Thu, 6 Dec 2018 16:49:30 -0800=0ASubject: [PATCH= 08/28] x86/speculation/mds: Support mds=3Dfull for 32bit NMI=0A=0AThe main= kernel exits on 32bit kernels are already handled by=0Aearlier patches.=0A= =0ABut for NMIs we need to clear in the assembler code because=0ANMIs don't= go through C code on exit, but they still=0Amight need to clear due to mds= =3Dfull=0A=0AThis could be handled with a static key like 64bit, but=0Afor = now just add an unconditional cpu clear on NMI exit.=0A=0ASigned-off-by: An= di Kleen =0A---=0A arch/x86/entry/entry_32.S | 6 ++++++= =0A 1 file changed, 6 insertions(+)=0A=0Adiff --git a/arch/x86/entry/entry_= 32.S b/arch/x86/entry/entry_32.S=0Aindex d309f30cf7af..28b640f37f8d 100644= =0A--- a/arch/x86/entry/entry_32.S=0A+++ b/arch/x86/entry/entry_32.S=0A@@ -= 45,6 +45,7 @@=0A #include =0A #include =0A #includ= e =0A+#include =0A =0A #include "calli= ng.h"=0A =0A@@ -1446,6 +1447,11 @@ ENTRY(nmi)=0A movl %ebx, %esp=0A =0A .L= nmi_return:=0A+ /*=0A+ * Only needed with mds=3Dfull=0A+ * But for now do= it unconditionally.=0A+ */=0A+ CLEAR_CPU=0A CHECK_AND_APPLY_ESPFIX=0A R= ESTORE_ALL_NMI cr3_reg=3D%edi pop=3D4=0A jmp .Lirq_return=0A-- =0A2.17.2= =0A=0A=0AFrom 27182b26ec0d7cc26459b1ebc68ebcc4f6743b45 Mon Sep 17 00:00:00 = 2001=0AFrom: Andi Kleen =0ADate: Wed, 12 Dec 2018 16:50= :07 -0800=0ASubject: [PATCH 09/28] x86/speculation/mds: Export MD_CLEAR CPU= ID to KVM=0A guests.=0A=0AExport the MD_CLEAR CPUID set by new microcode to= signal=0Athat VERW implements the clear cpu side effect to KVM guests.=0A= =0AAlso requires corresponding qemu patches=0A=0ATested-by: Neelima Krishna= n =0ASigned-off-by: Andi Kleen =0A---=0A arch/x86/kvm/cpuid.c | 3 ++-=0A 1 file changed, 2 insertions(= +), 1 deletion(-)=0A=0Adiff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpu= id.c=0Aindex bbffa6c54697..d61272f50aed 100644=0A--- a/arch/x86/kvm/cpuid.c= =0A+++ b/arch/x86/kvm/cpuid.c=0A@@ -409,7 +409,8 @@ static inline int __do_= cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,=0A /* cpuid 7.0.ed= x*/=0A const u32 kvm_cpuid_7_0_edx_x86_features =3D=0A F(AVX512_4VNNIW) = | F(AVX512_4FMAPS) | F(SPEC_CTRL) |=0A- F(SPEC_CTRL_SSBD) | F(ARCH_CAPABIL= ITIES) | F(INTEL_STIBP);=0A+ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(= INTEL_STIBP) |=0A+ F(MD_CLEAR);=0A =0A /* all calls to cpuid_count() shou= ld be made on the same cpu */=0A get_cpu();=0A-- =0A2.17.2=0A=0A=0AFrom af= c24f570651f02b914f13243540304ed7b357b5 Mon Sep 17 00:00:00 2001=0AFrom: And= i Kleen =0ADate: Fri, 14 Dec 2018 12:55:54 -0800=0ASubj= ect: [PATCH 10/28] mds: Add documentation for clear cpu usage=0A=0AIncludin= g the theory, and some guide lines for subsystem/driver=0Amaintainers.=0A= =0ASigned-off-by: Andi Kleen =0A---=0A Documentation/cl= earcpu.txt | 173 +++++++++++++++++++++++++++++++++++++=0A 1 file changed, 1= 73 insertions(+)=0A create mode 100644 Documentation/clearcpu.txt=0A=0Adiff= --git a/Documentation/clearcpu.txt b/Documentation/clearcpu.txt=0Anew file= mode 100644=0Aindex 000000000000..b204b1e7051c=0A--- /dev/null=0A+++ b/Doc= umentation/clearcpu.txt=0A@@ -0,0 +1,173 @@=0A+=0A+Security model for Micro= architectural Data Sampling=0A+=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=0A+=0A+Some CPUs can leave read or wri= tten data in internal buffers,=0A+which then later might be sampled through= side effects.=0A+For more details see CVE-2018-12126 CVE-2018-12130 CVE-20= 18-12127=0A+=0A+This can be avoided by explicitely clearing the CPU state.= =0A+=0A+We trying to avoid leaking data between different processes,=0A+and= also some sensitive data, like cryptographic data,=0A+or user data from ot= her processes.=0A+=0A+We support three modes:=0A+=0A+(1) mitigation off (md= s=3Doff)=0A+(2) clear only when needed (default)=0A+(3) clear on every kern= el exit, or guest entry (mds=3Dfull)=0A+=0A+(1) and (3) are trivial, the re= st of the document discusses (2)=0A+=0A+Basic requirements and assumptions= =0A+----------------------------------=0A+=0A+Kernel addresses and kernel t= emporary data are not sensitive.=0A+=0A+User data is sensitive, but only fo= r other processes.=0A+=0A+Kernel data is sensitive when it is cryptographic= keys.=0A+=0A+Guidance for driver/subsystem developers=0A+-----------------= -----------------------=0A+=0A+When you touch user supplied data of *other*= processes in system call=0A+context add lazy_clear_cpu().=0A+=0A+For the c= ases below we care only about data from other processes.=0A+Touching non cr= yptographic data from the current process is always allowed.=0A+=0A+Touchin= g only pointers to user data is always allowed.=0A+=0A+When your interrupt = does not touch user data directly consider marking=0A+it with IRQF_NO_USER.= =0A+=0A+When your tasklet does not touch user data directly consider markin= g=0A+it with TASKLET_NO_USER using tasklet_init_flags/or=0A+DECLARE_TASKLET= *_NOUSER.=0A+=0A+When your timer does not touch user data mark it with TIME= R_NO_USER.=0A+If it is a hrtimer mark it with HRTIMER_MODE_NO_USER.=0A+=0A+= When your irq poll handler does not touch user data, mark it=0A+with IRQ_PO= LL_F_NO_USER through irq_poll_init_flags.=0A+=0A+For networking code make s= ure to only touch user data through=0A+skb_push/put/copy [add more], unless= it is data from the current=0A+process. If that is not ensured add lazy_cl= ear_cpu or=0A+lazy_clear_cpu_interrupt. When the non skb data access is onl= y in a=0A+hardware interrupt controlled by the driver, it can rely on not= =0A+setting IRQF_NO_USER for that interrupt.=0A+=0A+Any cryptographic code = touching key data should use memzero_explicit=0A+or kzfree.=0A+=0A+If your = RCU callback touches user data add lazy_clear_cpu().=0A+=0A+These steps are= currently only needed for code that runs on MDS affected=0A+CPUs, which is= currently only x86. But might be worth being prepared=0A+if other architec= tures become affected too.=0A+=0A+Implementation details/assumptions=0A+---= -------------------------------=0A+=0A+If a system call touches data it is = for its own process, so does not=0A+need to be cleared, because it has alre= ady access to it.=0A+=0A+When context switching we clear data, unless the c= ontext switch=0A+is inside a process, or from/to idle. We also clear after = any=0A+context switches from kernel threads.=0A+=0A+Idle does not have sens= itive data, except for in interrupts, which=0A+are handled separately.=0A+= =0A+Cryptographic keys inside the kernel should be protected.=0A+We assume = they use kzfree() or memzero_explicit() to clear=0A+state, so these functio= ns trigger a cpu clear.=0A+=0A+Hard interrupts, tasklets, timers which can = run asynchronous are=0A+assumed to touch random user data, unless they have= been audited, and=0A+marked with NO_USER flags.=0A+=0A+Most interrupt hand= lers for modern devices should not touch=0A+user data because they rely on = DMA and only manipulate=0A+pointers. This needs auditing to confirm though.= =0A+=0A+For softirqs we assume that if they touch user data they use=0A+laz= y_clear_cpu()/lazy_clear_interrupt() as needed.=0A+Networking is handled th= rough skb_* below.=0A+Timer and Tasklets and IRQ poll are handled through o= pt-in.=0A+=0A+Scheduler softirq is assumed to not touch user data.=0A+=0A+B= lock softirq done callbacks are assumed to not touch user data.=0A+=0A+For = networking code, any skb functions that are likely=0A+touching non header p= acket data schedule a clear cpu at next=0A+kernel exit. This includes skb_c= opy and related, skb_put/push,=0A+checksum functions. We assume that any n= etworking code touching=0A+packet data uses these functions.=0A+=0A+[In pri= nciple packet data should be encrypted anyways for the wire,=0A+but we try = to avoid leaking it anyways]=0A+=0A+Some IO related functions like string P= IO and memcpy_from/to_io, or=0A+the software pci dma bounce function, which= touch data, schedule a=0A+buffer clear.=0A+=0A+We assume NMI/machine check= code does not touch other=0A+processes' data.=0A+=0A+Any buffer clearing i= s done lazily on next kernel exit, so can be=0A+triggered in fast paths.=0A= +=0A+Sandboxes=0A+---------=0A+=0A+We don't do anything special for seccomp= processes=0A+=0A+If there is a sandbox inside the process the process shou= ld take care=0A+itself of clearing its own sensitive data before running sa= ndbox=0A+code. This would include data touched by system calls.=0A+=0A+BPF= =0A+---=0A+=0A+Assume BPF execution does not touch other user's data, so do= es=0A+not need to schedule a clear for itself.=0A+=0A+BPF could attack the = rest of the kernel if it can successfully=0A+measure side channel side effe= cts.=0A+=0A+When the BPF program was loaded unprivileged, always clear the = CPU=0A+to prevent any exploits written in BPF using side channels to read= =0A+data leaked from other kernel code=0A+=0A+We only do this when running = in an interrupt, or if an clear cpu is=0A+already scheduled (which means fo= r example there was a context=0A+switch, or crypto operation before)=0A+=0A= +In process context we assume the code only accesses data of the=0A+current= user and check that the BPF running was loaded by the=0A+same user so even= if data leaked it would not cross privilege=0A+boundaries.=0A+=0A+Technica= lly we would only need to do this if the BPF program=0A+contains conditiona= l branches and loads dominated by them, but=0A+let's assume that near all d= o.=0A+=0A+This could be further optimized by allowing callers that do=0A+a = lot of individual BPF runs and are sure they don't touch=0A+other user's da= ta inbetween to do the clear only once=0A+at the beginning. We can add such= optimizations later based on=0A+profile data.=0A+=0A+Virtualization=0A+---= -----------=0A+=0A+When entering a guest in KVM we clear to avoid any leaka= ge to a guest.=0A+Normally this is done implicitely as part of the L1TF mit= igation.=0A+It relies on this being enabled. It also uses the "fast exit"= =0A+optimization that only clears if an interrupt or context switch=0A+happ= ened.=0A-- =0A2.17.2=0A=0A=0AFrom da8f40f77fc1e5a886402abb79c1e008df7bb188 = Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Tu= e, 18 Dec 2018 16:40:41 -0800=0ASubject: [PATCH 11/28] mds: Add preliminary= administrator documentation=0A=0AAdd a Documentation file for administrato= rs that describes MDS on a=0Ahigh level.=0A=0ASo far not covering SMT.=0A= =0ANeeds updates later for public URLs of supporting documentation.=0A=0ASi= gned-off-by: Andi Kleen =0A---=0A Documentation/admin-g= uide/mds.rst | 108 ++++++++++++++++++++++++++++++=0A 1 file changed, 108 in= sertions(+)=0A create mode 100644 Documentation/admin-guide/mds.rst=0A=0Adi= ff --git a/Documentation/admin-guide/mds.rst b/Documentation/admin-guide/md= s.rst=0Anew file mode 100644=0Aindex 000000000000..1f3021d20953=0A--- /dev/= null=0A+++ b/Documentation/admin-guide/mds.rst=0A@@ -0,0 +1,108 @@=0A+MDS -= Microarchitectural Data Sampling)=0A+=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=0A+=0A+Microarchitectural Data Sampling is a side channel vulnerabil= ity that=0A+allows an attacker to sample data that has been earlier used du= ring=0A+program execution. Internal buffers in the CPU may keep old data=0A= +for some limited time, which can the later be determined by an attacker=0A= +with side channel analysis. MDS can be used to occasionaly observe=0A+some= values accessed earlier, but it cannot be used to observe values=0A+not re= cently touched by other code running on the same core.=0A+=0A+It is difficu= lt to target particular data on a system using MDS,=0A+but attackers may be= able to infer secrets by collecting=0A+and analyzing large amounts of data= =2E MDS does not modify=0A+memory.=0A+=0A+MDS consists of multiple sub-vuln= erabilities:=0A+Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-= 2018-12126)=0A+Microarchitectual Fill Buffer Data Sampling (MFBDS) (CVE-201= 8-12130)=0A+Microarchitectual Load Port Data (MLPDS) (CVE-2018-12127),=0A+w= ith the first leaking store data, and the second loads and sometimes=0A+sto= re data, and the third load data.=0A+=0A+The effects and mitigations are si= milar for all three, so the Linux=0A+kernel handles and reports them all as= a single vulnerability called=0A+MDS. This also reduces the number of acro= nyms in use.=0A+=0A+Affected processors=0A+-------------------=0A+=0A+This = vulnerability affects a wide range of Intel processors.=0A+Not all CPUs are= affected by all of the sub vulnerabilities,=0A+however the kernel handles = it always the same.=0A+=0A+The vulnerability is not present in=0A+=0A+ -= Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)=0A+=0A+The kernel w= ill automatically detect future CPUs with hardware=0A+mitigations for these= issues and disable any workarounds.=0A+=0A+The kernel reports if the curre= nt CPU is vulnerable and any=0A+mitigations used in=0A+=0A+/sys/devices/sys= tem/cpu/vulnerabilities/mds=0A+=0A+Kernel mitigation=0A+-----------------= =0A+=0A+By default, the kernel automatically ensures no data leakage betwee= n=0A+different processes, or between kernel threads and interrupt handlers= =0A+and user processes, or from any cryptographic code in the kernel.=0A+= =0A+It does not isolate kernel code that only touches data of the=0A+curren= t process. If protecting such kernel code is desired,=0A+mds=3Dfull can be= specified.=0A+=0A+The mitigation is automatically enabled, but can be furt= her controlled=0A+with the command line options documented below.=0A+=0A+Th= e mitigation can be done with microcode support, requiring=0A+updated micro= code.=0A+=0A+The microcode should be loaded at early boot using the initrd.= Hot=0A+updating microcode will not enable the mitigations.=0A+=0A+Virtual = machine mitigation=0A+--------------------------=0A+=0A+The mitigation is e= nabled by default and controlled by the same options=0A+as L1TF cache clear= ing. See l1tf.rst for more details. In the default=0A+setting MDS for leaki= ng data out of the guest into other processes=0A+will be mitigated.=0A+=0A+= Kernel command line options=0A+---------------------------=0A+=0A+Normally = the kernel selects reasonable defaults and no special configuration=0A+is n= eeded. The default behavior can be overriden by the mds=3D kernel=0A+comman= d line options.=0A+=0A+These options can be specified in the boot loader. A= ny changes require a reboot.=0A+=0A+When the system only runs trusted code,= MDS mitigation can be disabled with=0A+mds=3Doff as a performance optimiza= tion.=0A+=0A+ - mds=3Doff Disable workarounds if the CPU is not affe= cted.=0A+=0A+By default the kernel only clears CPU data after execution=0A+= that is known or likely to have touched user data of other processes,=0A+or= cryptographic data. This relies on code audits done in the=0A+mainline Lin= ux kernel. When running unaudited large out of tree code,=0A+or binary driv= ers, who might violate these constraints it is possible=0A+to use mds=3Dful= l to always flush the CPU data on each kernel exit.=0A+=0A+ - mds=3Dfull = Always clear cpu state on exiting from kernel.=0A+=0A+TBD describe SMT= =0A+=0A+References=0A+----------=0A+=0A+Fore more details on the kernel int= ernal implementation of the MDS mitigations,=0A+please see Documentation/cl= earcpu.txt=0A+=0A+TBD Add URL for Intel white paper=0A+=0A+TBD add referenc= e to microcodes=0A-- =0A2.17.2=0A=0A=0AFrom 2ff6831f16391f3ec1ba7235e73269a= fafb62970 Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen = =0ADate: Wed, 12 Dec 2018 16:43:43 -0800=0ASubject: [PATCH 12/28] x86/specu= lation/mds: Introduce lazy_clear_cpu=0A=0AAdd basic infrastructure for code= to request CPU buffer clearing=0Aon the next kernel exit.=0A=0AWe have two= functions lazy_clear_cpu to request clearing,=0Aand lazy_clear_cpu_interru= pt to request clearing if running=0Ain an interrupt.=0A=0ANon architecture = specific code can include linux/clearcpu.h=0Aand use lazy_clear_cpu / lazy_= clear_interrupt. On x86=0Awe provide low level implementations that set the= TIF_CLEAR_CPU=0Abit.=0A=0ASigned-off-by: Andi Kleen = =0A---=0A arch/Kconfig | 3 +++=0A arch/x86/Kconfig = | 1 +=0A arch/x86/include/asm/clearcpu.h | 5 +++++=0A include/= linux/clearcpu.h | 36 +++++++++++++++++++++++++++++++++=0A 4 files c= hanged, 45 insertions(+)=0A create mode 100644 include/linux/clearcpu.h=0A= =0Adiff --git a/arch/Kconfig b/arch/Kconfig=0Aindex 4cfb6de48f79..e6b7bf917= 4aa 100644=0A--- a/arch/Kconfig=0A+++ b/arch/Kconfig=0A@@ -808,6 +808,9 @@ = config VMAP_STACK=0A the stack to map directly to the KASAN shadow map u= sing a formula=0A that is incorrect if the stack is in vmalloc space.=0A= =0A+config ARCH_HAS_CLEAR_CPU=0A+ def_bool n=0A+=0A config ARCH_OPTIONAL_K= ERNEL_RWX=0A def_bool n=0A =0Adiff --git a/arch/x86/Kconfig b/arch/x86/Kco= nfig=0Aindex 6185d4f33296..ccf05eff4151 100644=0A--- a/arch/x86/Kconfig=0A+= ++ b/arch/x86/Kconfig=0A@@ -84,6 +84,7 @@ config X86=0A select ARCH_WANT_B= ATCHED_UNMAP_TLB_FLUSH=0A select ARCH_WANTS_DYNAMIC_TASK_STRUCT=0A select= ARCH_WANTS_THP_SWAP if X86_64=0A+ select ARCH_HAS_CLEAR_CPU=0A select BU= ILDTIME_EXTABLE_SORT=0A select CLKEVT_I8253=0A select CLOCKSOURCE_VALIDAT= E_LAST_CYCLE=0Adiff --git a/arch/x86/include/asm/clearcpu.h b/arch/x86/incl= ude/asm/clearcpu.h=0Aindex 67c4e0d38802..35386628be6d 100644=0A--- a/arch/x= 86/include/asm/clearcpu.h=0A+++ b/arch/x86/include/asm/clearcpu.h=0A@@ -41,= 6 +41,11 @@ static inline void clear_cpu_idle(void)=0A }=0A }=0A =0A+stati= c inline void lazy_clear_cpu(void)=0A+{=0A+ set_thread_flag(TIF_CLEAR_CPU);= =0A+}=0A+=0A DECLARE_STATIC_KEY_FALSE(force_cpu_clear);=0A =0A #else=0Adiff= --git a/include/linux/clearcpu.h b/include/linux/clearcpu.h=0Anew file mod= e 100644=0Aindex 000000000000..63a6952b46fa=0A--- /dev/null=0A+++ b/include= /linux/clearcpu.h=0A@@ -0,0 +1,36 @@=0A+/* SPDX-License-Identifier: GPL-2.0= */=0A+#ifndef _LINUX_CLEARCPU_H=0A+#define _LINUX_CLEARCPU_H 1=0A+=0A+#inc= lude =0A+=0A+#ifdef CONFIG_ARCH_HAS_CLEAR_CPU=0A+#include = =0A+#else=0A+static inline void lazy_clear_cpu(void)=0A+{= =0A+}=0A+#endif=0A+=0A+/*=0A+ * Use this function when potentially touching= (reading or writing)=0A+ * user data in an interrupt. In this case schedul= e to clear the=0A+ * CPU buffers on kernel exit to avoid any potential side= channels.=0A+ *=0A+ * If not in an interrupt we assume the touched data be= longs to the=0A+ * current process and doesn't need to be cleared.=0A+ *=0A= + * This version is for code who might be in an interrupt.=0A+ * If you kno= w for sure you're in interrupt context call=0A+ * lazy_clear_cpu directly.= =0A+ *=0A+ * lazy_clear_cpu is reasonably cheap (just sets a bit) and=0A+ *= can be used in fast paths.=0A+ */=0A+static inline void lazy_clear_cpu_int= errupt(void)=0A+{=0A+ if (in_interrupt())=0A+ lazy_clear_cpu();=0A+}=0A+= =0A+#endif=0A-- =0A2.17.2=0A=0A=0AFrom 57421f0933b0367b54934c20cdeda3c202b9= 453a Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADat= e: Wed, 12 Dec 2018 16:49:56 -0800=0ASubject: [PATCH 13/28] x86/speculation= /mds: Schedule cpu clear on context=0A switch=0A=0AOn context switch we nee= d to schedule a cpu clear on the next=0Akernel exit when:=0A=0A- We're swit= ching between different processes=0A- We're switching from a kernel thread = that is not idle.=0AFor idle we assume only interrupts are sensitive, which= =0Aare already handled elsewhere. For kernel threads=0Alike work queue we a= ssume they might contain=0Asensitive (other user's or crypto) data.=0A=0ATh= e code hooks into the generic context switch, not=0Athe mm context switch, = because the mm context switch=0Adoesn't handle the idle thread case.=0A=0AT= his also transfers the clear cpu bit to the next task.=0A=0ATested-by: Neel= ima Krishnan =0ASigned-off-by: Andi Kleen =0A---=0A arch/x86/kernel/process.h | 27 +++++++++++++++++++= ++++++++=0A 1 file changed, 27 insertions(+)=0A=0Adiff --git a/arch/x86/ker= nel/process.h b/arch/x86/kernel/process.h=0Aindex 320ab978fb1f..52f97ccbf2d= c 100644=0A--- a/arch/x86/kernel/process.h=0A+++ b/arch/x86/kernel/process.= h=0A@@ -2,6 +2,7 @@=0A //=0A // Code shared between 32 and 64 bit=0A =0A+#i= nclude =0A #include =0A =0A void __switc= h_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);=0A@@ -29= ,6 +30,32 @@ static inline void switch_to_extra(struct task_struct *prev,= =0A }=0A }=0A =0A+ /*=0A+ * When we switch to a different process, or w= e switch=0A+ * from a kernel thread that was not idle, clear the CPU=0A+ = * buffers on next kernel exit.=0A+ *=0A+ * We assume that idle does not t= ouch user data, except=0A+ * for interrupts, which schedule their own clea= rs as needed.=0A+ * But other kernel threads, like work queues, might=0A+ = * touch user data, so flush in this case.=0A+ *=0A+ * This has to be her= e because switch_mm doesn't get=0A+ * called in the kernel thread case.=0A= + */=0A+ if (static_cpu_has(X86_BUG_MDS)) {=0A+ if (prev->pid && (next->m= m !=3D prev->mm || prev->mm =3D=3D NULL))=0A+ lazy_clear_cpu();=0A+ /*= =0A+ * Also transfer the clearcpu flag from the previous task.=0A+ * Ca= n be done non atomically because interrupts are off.=0A+ */=0A+ task_thr= ead_info(next)->status |=3D=0A+ task_thread_info(prev)->status & _TIF_CLE= AR_CPU;=0A+ task_thread_info(prev)->status &=3D ~_TIF_CLEAR_CPU;=0A+ }=0A+= =0A+=0A /*=0A * __switch_to_xtra() handles debug registers, i/o bitmaps,= =0A * speculation mitigations etc.=0A-- =0A2.17.2=0A=0A=0AFrom e78d8a60f0= 366b100ef3f1bb9bf333b093fe6a1e Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen = =0ADate: Fri, 14 Dec 2018 13:33:03 -0800=0ASubject: [PA= TCH 14/28] x86/speculation/mds: Add tracing for clear_cpu=0A=0AAdd trace po= ints for clear_cpu and lazy_clear_cpu. This is useful=0Afor debugging and p= erformance testing.=0A=0AThe trace points have to be partially out of line = to avoid=0Ainclude loops, but the fast path jump labels are inlined.=0A=0AT= he idle case cannot be traced because trace points=0Adon't like idle contex= t.=0A=0ASigned-off-by: Andi Kleen =0A---=0A arch/x86/in= clude/asm/clearcpu.h | 36 +++++++++++++++++++++++++--=0A arch/x86/inc= lude/asm/trace/clearcpu.h | 27 ++++++++++++++++++++=0A arch/x86/kernel/cpu/= bugs.c | 17 +++++++++++++=0A 3 files changed, 78 insertions(+), = 2 deletions(-)=0A create mode 100644 arch/x86/include/asm/trace/clearcpu.h= =0A=0Adiff --git a/arch/x86/include/asm/clearcpu.h b/arch/x86/include/asm/c= learcpu.h=0Aindex 35386628be6d..935b827a4175 100644=0A--- a/arch/x86/includ= e/asm/clearcpu.h=0A+++ b/arch/x86/include/asm/clearcpu.h=0A@@ -9,12 +9,35 @= @=0A #include =0A #include =0A =0A+= /*=0A+ * We cannot directly include the trace point header here=0A+ * becau= se it leads to include loops with other trace point=0A+ * files pulling thi= s one in. Define the static=0A+ * key manually here, which handles noping t= he fast path,=0A+ * and the actual tracing is done out of line.=0A+ */=0A+#= ifdef CONFIG_TRACEPOINTS=0A+#include =0A+#include =0A+=0A+extern struct tracepoint __tracepoint_clear_cpu;=0A+e= xtern struct tracepoint __tracepoint_lazy_clear_cpu;=0A+#define cc_tracepoi= nt_active(t) static_key_false(&(t).key)=0A+=0A+extern void do_trace_clear_c= pu(void);=0A+extern void do_trace_lazy_clear_cpu(void);=0A+#else=0A+#define= cc_tracepoint_active(t) false=0A+static inline void do_trace_clear_cpu(voi= d) {}=0A+static inline void do_trace_lazy_clear_cpu(void) {}=0A+#endif=0A+= =0A /*=0A * Clear CPU buffers to avoid side channels.=0A * We use microco= de as a side effect of the obsolete VERW instruction=0A */=0A =0A-static i= nline void clear_cpu(void)=0A+static inline void __clear_cpu(void)=0A {=0A = unsigned kernel_ds =3D __KERNEL_DS;=0A /* Has to be memory form, don't mo= dify to use an register */=0A@@ -22,6 +45,13 @@ static inline void clear_cp= u(void)=0A [kernelds] "m" (kernel_ds));=0A }=0A =0A+static inline void cl= ear_cpu(void)=0A+{=0A+ if (cc_tracepoint_active(__tracepoint_clear_cpu))=0A= + do_trace_clear_cpu();=0A+ __clear_cpu();=0A+}=0A+=0A /*=0A * Clear CPU = buffers before going idle, so that no state is leaked to SMT=0A * siblings= taking over thread resources.=0A@@ -37,12 +67,14 @@ static inline void cle= ar_cpu_idle(void)=0A {=0A if (sched_smt_active()) {=0A clear_thread_flag= (TIF_CLEAR_CPU);=0A- clear_cpu();=0A+ __clear_cpu();=0A }=0A }=0A =0A st= atic inline void lazy_clear_cpu(void)=0A {=0A+ if (cc_tracepoint_active(__t= racepoint_lazy_clear_cpu))=0A+ do_trace_lazy_clear_cpu();=0A set_thread_f= lag(TIF_CLEAR_CPU);=0A }=0A =0Adiff --git a/arch/x86/include/asm/trace/clea= rcpu.h b/arch/x86/include/asm/trace/clearcpu.h=0Anew file mode 100644=0Aind= ex 000000000000..e742b5cd8ee9=0A--- /dev/null=0A+++ b/arch/x86/include/asm/= trace/clearcpu.h=0A@@ -0,0 +1,27 @@=0A+#undef TRACE_SYSTEM=0A+#define TRACE= _SYSTEM clearcpu=0A+=0A+#if !defined(_TRACE_CLEARCPU_H) || defined(TRACE_HE= ADER_MULTI_READ)=0A+=0A+#include =0A+=0A+DECLARE_EVENT_= CLASS(clear_cpu,=0A+ TP_PROTO(int dummy),=0A+ TP_ARGS(dummy),=0A+= TP_STRUCT__entry(__field(int, dummy)),=0A+ TP_fast_assign(),=0A+= TP_printk("%d", __entry->dummy));=0A+=0A+DEFINE_EVENT(clear_cpu, clea= r_cpu, TP_PROTO(int dummy), TP_ARGS(dummy));=0A+DEFINE_EVENT(clear_cpu, laz= y_clear_cpu, TP_PROTO(int dummy), TP_ARGS(dummy));=0A+=0A+#define _TRACE_CL= EARCPU_H=0A+=0A+#undef TRACE_INCLUDE_PATH=0A+#define TRACE_INCLUDE_PATH asm= /trace/=0A+#undef TRACE_INCLUDE_FILE=0A+#define TRACE_INCLUDE_FILE clearcpu= =0A+#endif /* _TRACE_CLEARCPU_H */=0A+=0A+/* This part must be outside prot= ection */=0A+#include =0Adiff --git a/arch/x86/kernel= /cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c=0Aindex 582b1cd019f7..e54df06dd462= 100644=0A--- a/arch/x86/kernel/cpu/bugs.c=0A+++ b/arch/x86/kernel/cpu/bugs= =2Ec=0A@@ -1061,6 +1061,23 @@ early_param("l1tf", l1tf_cmdline);=0A =0A #un= def pr_fmt=0A =0A+#define CREATE_TRACE_POINTS=0A+#include =0A+=0A+void do_trace_clear_cpu(void)=0A+{=0A+ trace_clear_cpu(0);=0A+= }=0A+EXPORT_SYMBOL(do_trace_clear_cpu);=0A+EXPORT_TRACEPOINT_SYMBOL(clear_c= pu);=0A+=0A+void do_trace_lazy_clear_cpu(void)=0A+{=0A+ trace_lazy_clear_cp= u(0);=0A+}=0A+EXPORT_SYMBOL(do_trace_lazy_clear_cpu);=0A+EXPORT_TRACEPOINT_= SYMBOL(lazy_clear_cpu);=0A+=0A DEFINE_STATIC_KEY_FALSE(force_cpu_clear);=0A= =0A static void mds_select_mitigation(void)=0A-- =0A2.17.2=0A=0A=0AFrom 14= 630e87c2f4f0cfd1d5ba351677ec94028e682d Mon Sep 17 00:00:00 2001=0AFrom: And= i Kleen =0ADate: Fri, 14 Dec 2018 17:26:14 -0800=0ASubj= ect: [PATCH 15/28] mds: Force clear cpu on kernel preemption=0A=0AWhen the = kernel is preempted we need to force a cpu clear,=0Abecause the preemption = might happen before the code=0Ahas a chance to set TIF_CPU_CLEAR later.=0A= =0AWe cannot rely on kernel code setting the flag before=0Atouching sensiti= ve data: the flag setting could=0Abe implicit, like in memzero_explicit, wh= ich is always=0Acalled later.=0A=0ASigned-off-by: Andi Kleen =0A---=0A kernel/sched/core.c | 9 +++++++++=0A 1 file changed, 9 inse= rtions(+)=0A=0Adiff --git a/kernel/sched/core.c b/kernel/sched/core.c=0Aind= ex a674c7db2f29..b04918e9115c 100644=0A--- a/kernel/sched/core.c=0A+++ b/ke= rnel/sched/core.c=0A@@ -11,6 +11,8 @@=0A =0A #include =0A =0A= +#include =0A+=0A #include =0A #include = =0A =0A@@ -3619,6 +3621,13 @@ asmlinkage __visible void __sched = notrace preempt_schedule(void)=0A if (likely(!preemptible()))=0A return;= =0A =0A+ /*=0A+ * For kernel preemption we need to force a cpu clear=0A+ = * because it could happen before the code has a chance=0A+ * to set TIF_CL= EAR_CPU.=0A+ */=0A+ lazy_clear_cpu();=0A+=0A preempt_schedule_common();= =0A }=0A NOKPROBE_SYMBOL(preempt_schedule);=0A-- =0A2.17.2=0A=0A=0AFrom 29a= 16e98fca24ad5d542efd5bedbb93b779173b7 Mon Sep 17 00:00:00 2001=0AFrom: Andi= Kleen =0ADate: Wed, 12 Dec 2018 16:44:52 -0800=0ASubje= ct: [PATCH 16/28] mds: Schedule cpu clear for memzero_explicit and kzfree= =0A=0AAssume that any code using these functions is sensitive and shouldn't= =0Aleak any data.=0A=0AThis handles clearing for key data used in the kerne= l.=0A=0ASuggested-by: Linus Torvalds =0ASign= ed-off-by: Andi Kleen =0A---=0A lib/string.c | 6 ++= ++++=0A mm/slab_common.c | 5 ++++-=0A 2 files changed, 10 insertions(+), 1 = deletion(-)=0A=0Adiff --git a/lib/string.c b/lib/string.c=0Aindex 38e4ca08e= 757..9ce59dd86541 100644=0A--- a/lib/string.c=0A+++ b/lib/string.c=0A@@ -28= ,6 +28,7 @@=0A #include =0A #include =0A #inclu= de =0A+#include =0A =0A #include =0A #include =0A@@ -715,12 +716,17 @@ EXPORT_S= YMBOL(memset);=0A * necessary, memzero_explicit() should be used instead i= n=0A * order to prevent the compiler from optimising away zeroing.=0A *= =0A+ * As a side effect this may also trigger extra cleaning=0A+ * of CPU s= tate before the next kernel exit to avoid=0A+ * side channels.=0A+ *=0A * = memzero_explicit() doesn't need an arch-specific version as=0A * it just i= nvokes the one of memset() implicitly.=0A */=0A void memzero_explicit(void= *s, size_t count)=0A {=0A memset(s, 0, count);=0A+ lazy_clear_cpu();=0A = barrier_data(s);=0A }=0A EXPORT_SYMBOL(memzero_explicit);=0Adiff --git a/mm= /slab_common.c b/mm/slab_common.c=0Aindex 81732d05e74a..7b5e2e1318a2 100644= =0A--- a/mm/slab_common.c=0A+++ b/mm/slab_common.c=0A@@ -1576,6 +1576,9 @@ = EXPORT_SYMBOL(krealloc);=0A * Note: this function zeroes the whole allocat= ed buffer which can be a good=0A * deal bigger than the requested buffer s= ize passed to kmalloc(). So be=0A * careful when using this function in pe= rformance sensitive code.=0A+ *=0A+ * As a side effect this may also clear = CPU state later before the=0A+ * next kernel exit to avoid side channels.= =0A */=0A void kzfree(const void *p)=0A {=0A@@ -1585,7 +1588,7 @@ void kzf= ree(const void *p)=0A if (unlikely(ZERO_OR_NULL_PTR(mem)))=0A return;=0A= ks =3D ksize(mem);=0A- memset(mem, 0, ks);=0A+ memzero_explicit(mem, ks);= =0A kfree(mem);=0A }=0A EXPORT_SYMBOL(kzfree);=0A-- =0A2.17.2=0A=0A=0AFrom= 7b3c6596f1d23091e54d1261666a54625b6f31dc Mon Sep 17 00:00:00 2001=0AFrom: = Andi Kleen =0ADate: Wed, 12 Dec 2018 16:51:44 -0800=0AS= ubject: [PATCH 17/28] mds: Mark interrupts clear cpu, unless opted-out=0A= =0AInterrupts might touch user data from other processes=0Ain any context.= =0A=0ABy default we clear the CPU on the next kernel exit.=0A=0AAdd a new I= RQ_F_NO_USER interrupt flag. When the flag=0Ais not set on interrupt execut= ion we clear the cpu state on=0Anext kernel exit.=0A=0AThis allows interrup= ts to opt-out from the extra clearing=0Aoverhead, but is safe by default.= =0A=0AOver time as more interrupt code is audited it can set the opt-out.= =0A=0ASigned-off-by: Andi Kleen =0A---=0A include/linux= /interrupt.h | 2 ++=0A kernel/irq/handle.c | 8 ++++++++=0A kernel/irq= /manage.c | 1 +=0A 3 files changed, 11 insertions(+)=0A=0Adiff --git = a/include/linux/interrupt.h b/include/linux/interrupt.h=0Aindex c672f34235e= 7..291b7fee3afe 100644=0A--- a/include/linux/interrupt.h=0A+++ b/include/li= nux/interrupt.h=0A@@ -61,6 +61,7 @@=0A * interrupt handler = after suspending interrupts. For system=0A * wakeup devices= users need to implement wakeup detection in=0A * their int= errupt handlers.=0A+ * IRQF_NO_USER - Interrupt does not touch user data=0A= */=0A #define IRQF_SHARED 0x00000080=0A #define IRQF_PROBE_SHARED 0x0000= 0100=0A@@ -74,6 +75,7 @@=0A #define IRQF_NO_THREAD 0x00010000=0A #define I= RQF_EARLY_RESUME 0x00020000=0A #define IRQF_COND_SUSPEND 0x00040000=0A+#def= ine IRQF_NO_USER 0x00080000=0A =0A #define IRQF_TIMER (__IRQF_TIMER | IRQ= F_NO_SUSPEND | IRQF_NO_THREAD)=0A =0Adiff --git a/kernel/irq/handle.c b/ker= nel/irq/handle.c=0Aindex 38554bc35375..e5910938ce2b 100644=0A--- a/kernel/i= rq/handle.c=0A+++ b/kernel/irq/handle.c=0A@@ -13,6 +13,7 @@=0A #include =0A #include =0A #include =0A+#include =0A =0A #include = =0A =0A@@ -149,6 +150,13 @@ irqreturn_t __handle_irq_event_percpu(struct ir= q_desc *desc, unsigned int *flags=0A res =3D action->handler(irq, action-= >dev_id);=0A trace_irq_handler_exit(irq, action, res);=0A =0A+ /*=0A+ = * We aren't sure if the interrupt handler did or did not=0A+ * touch user= data. Schedule a cpu clear just in case.=0A+ */=0A+ if (!(action->flags= & IRQF_NO_USER))=0A+ lazy_clear_cpu();=0A+=0A if (WARN_ONCE(!irqs_disa= bled(),"irq %u handler %pF enabled interrupts\n",=0A irq, action->= handler))=0A local_irq_disable();=0Adiff --git a/kernel/irq/manage.c b/k= ernel/irq/manage.c=0Aindex a4888ce4667a..3f0c99240638 100644=0A--- a/kernel= /irq/manage.c=0A+++ b/kernel/irq/manage.c=0A@@ -1793,6 +1793,7 @@ EXPORT_SY= MBOL(free_irq);=0A *=0A * IRQF_SHARED Interrupt is shared=0A * IRQF_TRI= GGER_* Specify active edge(s) or level=0A+ * IRQF_NOUSER Does not touch u= ser data.=0A *=0A */=0A int request_threaded_irq(unsigned int irq, irq_ha= ndler_t handler,=0A-- =0A2.17.2=0A=0A=0AFrom 2e49da0730be61300ab1bab2f25418= b706f2cd26 Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen = =0ADate: Wed, 12 Dec 2018 16:46:09 -0800=0ASubject: [PATCH 18/28] mds: Clea= r cpu on all timers, unless the timer opts-out=0A=0ABy default we assume ti= mers might touch user data and schedule=0Aa cpu clear on next kernel exit.= =0A=0ASupport opt-outs where timer and hrtimer handlers can opt-in=0Athey t= hey don't touch any user data.=0A=0ANote this takes one bit from the timer = wheel index field away,=0Abut it seems there are less wheels available anyw= ays, so that=0Ashould be ok.=0A=0ASigned-off-by: Andi Kleen =0A---=0A include/linux/hrtimer.h | 4 ++++=0A include/linux/timer.h = | 9 ++++++---=0A kernel/time/hrtimer.c | 5 +++++=0A kernel/time/timer.c= | 8 ++++++++=0A 4 files changed, 23 insertions(+), 3 deletions(-)=0A= =0Adiff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h=0Aindex 2= e8957eac4d4..b32c76919f78 100644=0A--- a/include/linux/hrtimer.h=0A+++ b/in= clude/linux/hrtimer.h=0A@@ -32,6 +32,7 @@ struct hrtimer_cpu_base;=0A * = when starting the timer)=0A * HRTIMER_MODE_SOFT - Timer callback funct= ion will be executed in=0A * soft irq context=0A+ * HRTIMER_MODE_NO_U= SER - Handler does not touch user data.=0A */=0A enum hrtimer_mode {=0A = HRTIMER_MODE_ABS =3D 0x00,=0A@@ -48,6 +49,7 @@ enum hrtimer_mode {=0A HRTI= MER_MODE_ABS_PINNED_SOFT =3D HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,= =0A HRTIMER_MODE_REL_PINNED_SOFT =3D HRTIMER_MODE_REL_PINNED | HRTIMER_MOD= E_SOFT,=0A =0A+ HRTIMER_MODE_NO_USER =3D 0x08,=0A };=0A =0A /*=0A@@ -101,6 = +103,7 @@ enum hrtimer_restart {=0A * @state: state information (See bit v= alues above)=0A * @is_rel: Set if the timer was armed relative=0A * @is_s= oft: Set if hrtimer will be expired in soft interrupt context.=0A+ * @no_us= er: function does not touch user data.=0A *=0A * The hrtimer structure mu= st be initialized by hrtimer_init()=0A */=0A@@ -112,6 +115,7 @@ struct hrt= imer {=0A u8 state;=0A u8 is_rel;=0A u8 is_soft;=0A+ u8 no_u= ser;=0A };=0A =0A /**=0Adiff --git a/include/linux/timer.h b/include/linux/= timer.h=0Aindex 7b066fd38248..222e72432be3 100644=0A--- a/include/linux/tim= er.h=0A+++ b/include/linux/timer.h=0A@@ -56,10 +56,13 @@ struct timer_list = {=0A #define TIMER_DEFERRABLE 0x00080000=0A #define TIMER_PINNED 0x0010000= 0=0A #define TIMER_IRQSAFE 0x00200000=0A-#define TIMER_ARRAYSHIFT 22=0A-#d= efine TIMER_ARRAYMASK 0xFFC00000=0A+#define TIMER_NO_USER 0x00400000=0A+#= define TIMER_ARRAYSHIFT 23=0A+#define TIMER_ARRAYMASK 0xFF800000=0A =0A-#d= efine TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINN= ED | TIMER_IRQSAFE)=0A+#define TIMER_TRACE_FLAGMASK \=0A+ (TIMER_MIGRATING = | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE | \=0A+ TIMER_NO_USER)= =0A =0A #define __TIMER_INITIALIZER(_function, _flags) { \=0A .entry =3D= { .next =3D TIMER_ENTRY_STATIC }, \=0Adiff --git a/kernel/time/hrtimer.c b= /kernel/time/hrtimer.c=0Aindex f5cfa1b73d6f..e2c8776ba2a4 100644=0A--- a/ke= rnel/time/hrtimer.c=0A+++ b/kernel/time/hrtimer.c=0A@@ -42,6 +42,7 @@=0A #i= nclude =0A #include =0A #include =0A+#include =0A =0A #include =0A = =0A@@ -1276,6 +1277,7 @@ static void __hrtimer_init(struct hrtimer *timer, = clockid_t clock_id,=0A clock_id =3D CLOCK_MONOTONIC;=0A =0A base +=3D hr= timer_clockid_to_base(clock_id);=0A+ timer->no_user =3D !!(mode & HRTIMER_M= ODE_NO_USER);=0A timer->is_soft =3D softtimer;=0A timer->base =3D &cpu_ba= se->clock_base[base];=0A timerqueue_init(&timer->node);=0A@@ -1390,6 +1392= ,9 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,=0A trac= e_hrtimer_expire_exit(timer);=0A raw_spin_lock_irq(&cpu_base->lock);=0A = =0A+ if (!timer->no_user)=0A+ lazy_clear_cpu();=0A+=0A /*=0A * Note: We= clear the running state after enqueue_hrtimer and=0A * we do not reprogr= am the event hardware. Happens either in=0Adiff --git a/kernel/time/timer.c= b/kernel/time/timer.c=0Aindex 444156debfa0..e6ab6986ffc8 100644=0A--- a/ke= rnel/time/timer.c=0A+++ b/kernel/time/timer.c=0A@@ -43,6 +43,7 @@=0A #inclu= de =0A #include =0A #include =0A+#include =0A =0A #include =0A #= include =0A@@ -1338,6 +1339,13 @@ static void call_timer_fn(s= truct timer_list *timer, void (*fn)(struct timer_list=0A */=0A preempt= _count_set(count);=0A }=0A+=0A+ /*=0A+ * The timer might have touched use= r data. Schedule=0A+ * a cpu clear on the next kernel exit.=0A+ */=0A+ if= (!(timer->flags & TIMER_NO_USER))=0A+ lazy_clear_cpu();=0A }=0A =0A stati= c void expire_timers(struct timer_base *base, struct hlist_head *head)=0A--= =0A2.17.2=0A=0A=0AFrom f480148820a5d1cf0bb7013856a8039569fc4da4 Mon Sep 17= 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Thu, 13 Dec = 2018 11:28:55 -0800=0ASubject: [PATCH 19/28] mds: Clear CPU on tasklets, un= less opted-out=0A=0ABy default we assume tasklets might touch user data and= schedule=0Aa cpu clear on next kernel exit.=0A=0AAdd new interfaces to all= ow audited tasklets to opt-out.=0A=0ASigned-off-by: Andi Kleen =0A---=0A include/linux/interrupt.h | 16 +++++++++++++++-=0A kernel= /softirq.c | 25 +++++++++++++++++++------=0A 2 files changed, 34 i= nsertions(+), 7 deletions(-)=0A=0Adiff --git a/include/linux/interrupt.h b/= include/linux/interrupt.h=0Aindex 291b7fee3afe..81b852fb5ecf 100644=0A--- a= /include/linux/interrupt.h=0A+++ b/include/linux/interrupt.h=0A@@ -571,11 += 571,22 @@ struct tasklet_struct name =3D { NULL, 0, ATOMIC_INIT(0), func, d= ata }=0A #define DECLARE_TASKLET_DISABLED(name, func, data) \=0A struct tas= klet_struct name =3D { NULL, 0, ATOMIC_INIT(1), func, data }=0A =0A+#define= DECLARE_TASKLET_NOUSER(name, func, data) \=0A+struct tasklet_struct name = =3D { NULL, TASKLET_NO_USER, ATOMIC_INIT(0), func, data }=0A+=0A+#define DE= CLARE_TASKLET_DISABLED_NOUSER(name, func, data) \=0A+struct tasklet_struct = name =3D { NULL, TASKLET_NO_USER, ATOMIC_INIT(1), func, data }=0A =0A enum= =0A {=0A TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */=0A-= TASKLET_STATE_RUN /* Tasklet is running (SMP only) */=0A+ TASKLET_STATE_RU= N, /* Tasklet is running (SMP only) */=0A+=0A+ /*=0A+ * Set this flag when= the tasklet is known to not touch user data,=0A+ * so doesn't need extra = CPU state clearing.=0A+ */=0A+ TASKLET_NO_USER =3D 1 << 5,=0A };=0A =0A #= ifdef CONFIG_SMP=0A@@ -639,6 +650,9 @@ extern void tasklet_kill(struct task= let_struct *t);=0A extern void tasklet_kill_immediate(struct tasklet_struct= *t, unsigned int cpu);=0A extern void tasklet_init(struct tasklet_struct *= t,=0A void (*func)(unsigned long), unsigned long data);=0A+extern void = tasklet_init_flags(struct tasklet_struct *t,=0A+ void (*func)(unsigned l= ong), unsigned long data,=0A+ unsigned flags);=0A =0A struct tasklet_hrt= imer {=0A struct hrtimer timer;=0Adiff --git a/kernel/softirq.c b/kernel/= softirq.c=0Aindex d28813306b2c..fdd4e3be3db7 100644=0A--- a/kernel/softirq.= c=0A+++ b/kernel/softirq.c=0A@@ -26,6 +26,7 @@=0A #include =0A #include =0A #include =0A+#include =0A =0A #define CREATE_TRACE_POINTS=0A #include =0A@@ -522,6 +523,8 @@ static void tasklet_action_common(struct softir= q_action *a,=0A BUG();=0A t->func(t->data);=0A tasklet_unlock(= t);=0A+ if (!(t->state & TASKLET_NO_USER))=0A+ lazy_clear_cpu();=0A = continue;=0A }=0A tasklet_unlock(t);=0A@@ -546,15 +549,23 @@ stat= ic __latent_entropy void tasklet_hi_action(struct softirq_action *a)=0A ta= sklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);=0A }=0A = =0A-void tasklet_init(struct tasklet_struct *t,=0A- void (*func)(unsigne= d long), unsigned long data)=0A+void tasklet_init_flags(struct tasklet_stru= ct *t,=0A+ void (*func)(unsigned long), unsigned long data,=0A+ unsig= ned flags)=0A {=0A t->next =3D NULL;=0A- t->state =3D 0;=0A+ t->state =3D = flags;=0A atomic_set(&t->count, 0);=0A t->func =3D func;=0A t->data =3D = data;=0A }=0A+EXPORT_SYMBOL(tasklet_init_flags);=0A+=0A+void tasklet_init(s= truct tasklet_struct *t,=0A+ void (*func)(unsigned long), unsigned long = data)=0A+{=0A+ tasklet_init_flags(t, func, data, 0);=0A+}=0A EXPORT_SYMBOL(= tasklet_init);=0A =0A void tasklet_kill(struct tasklet_struct *t)=0A@@ -609= ,7 +620,8 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)= =0A * @ttimer: tasklet_hrtimer which is initialized=0A * @function: hrt= imer callback function which gets called from softirq context=0A * @which_= clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)=0A- * @mode: hrtimer mode= (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)=0A+ * @mode: hrtimer mode (HRTIMER_MO= DE_ABS/HRTIMER_MODE_REL),=0A+ * HRTIMER_MODE_NO_USER=0A */=0A void taskl= et_hrtimer_init(struct tasklet_hrtimer *ttimer,=0A enum hrtimer_restar= t (*function)(struct hrtimer *),=0A@@ -617,8 +629,9 @@ void tasklet_hrtimer= _init(struct tasklet_hrtimer *ttimer,=0A {=0A hrtimer_init(&ttimer->timer,= which_clock, mode);=0A ttimer->timer.function =3D __hrtimer_tasklet_tramp= oline;=0A- tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,=0A-= (unsigned long)ttimer);=0A+ tasklet_init_flags(&ttimer->tasklet, __t= asklet_hrtimer_trampoline,=0A+ (unsigned long)ttimer,=0A+ (mode= & HRTIMER_MODE_NO_USER) ? TASKLET_NO_USER : 0);=0A ttimer->function =3D f= unction;=0A }=0A EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);=0A-- =0A2.17.2=0A= =0A=0AFrom 83b8dadd27b981218b939af25e5936df8a5f43ae Mon Sep 17 00:00:00 200= 1=0AFrom: Andi Kleen =0ADate: Fri, 14 Dec 2018 13:29:35= -0800=0ASubject: [PATCH 20/28] mds: Clear CPU on irq poll, unless opted-ou= t=0A=0ABy default we assume that irq poll handlers running in the irq poll= =0Asoftirq might touch user data and we schedule a cpu clear on next=0Akern= el exit.=0A=0AAdd interfaces for audited handlers to declare that they are = safe.=0A=0ASigned-off-by: Andi Kleen =0A---=0A include/= linux/irq_poll.h | 2 ++=0A lib/irq_poll.c | 18 ++++++++++++++++-= -=0A 2 files changed, 18 insertions(+), 2 deletions(-)=0A=0Adiff --git a/in= clude/linux/irq_poll.h b/include/linux/irq_poll.h=0Aindex 16aaeccb65cb..5f1= 3582f1b8e 100644=0A--- a/include/linux/irq_poll.h=0A+++ b/include/linux/irq= _poll.h=0A@@ -15,6 +15,8 @@ struct irq_poll {=0A enum {=0A IRQ_POLL_F_SCHE= D =3D 0,=0A IRQ_POLL_F_DISABLE =3D 1,=0A+=0A+ IRQ_POLL_F_NO_USER =3D 1<<4,= =0A };=0A =0A extern void irq_poll_sched(struct irq_poll *);=0Adiff --git a= /lib/irq_poll.c b/lib/irq_poll.c=0Aindex 86a709954f5a..cb19431f53ec 100644= =0A--- a/lib/irq_poll.c=0A+++ b/lib/irq_poll.c=0A@@ -11,6 +11,7 @@=0A #incl= ude =0A #include =0A #include =0A+#include =0A =0A static unsigned int irq_poll_budget= __read_mostly =3D 256;=0A =0A@@ -111,6 +112,9 @@ static void __latent_entr= opy irq_poll_softirq(struct softirq_action *h)=0A =0A budget -=3D work;= =0A =0A+ if (!(iop->state & IRQ_POLL_F_NO_USER))=0A+ lazy_clear_cpu();= =0A+=0A local_irq_disable();=0A =0A /*=0A@@ -168,21 +172,31 @@ void irq= _poll_enable(struct irq_poll *iop)=0A EXPORT_SYMBOL(irq_poll_enable);=0A = =0A /**=0A- * irq_poll_init - Initialize this @iop=0A+ * irq_poll_init_flag= s - Initialize this @iop=0A * @iop: The parent iopoll structure=0A *= @weight: The default weight (or command completion budget)=0A * @poll_f= n: The handler to invoke=0A+ * @flags: IRQ_POLL_F_NO_USER if callback d= oes not touch user data.=0A *=0A * Description:=0A * Initialize and = enable this irq_poll structure.=0A **/=0A-void irq_poll_init(struct irq_po= ll *iop, int weight, irq_poll_fn *poll_fn)=0A+void irq_poll_init_flags(stru= ct irq_poll *iop, int weight, irq_poll_fn *poll_fn,=0A+ int flags)=0A {= =0A memset(iop, 0, sizeof(*iop));=0A INIT_LIST_HEAD(&iop->list);=0A iop-= >weight =3D weight;=0A iop->poll =3D poll_fn;=0A+ iop->state =3D flags;=0A= }=0A+EXPORT_SYMBOL(irq_poll_init_flags);=0A+=0A+void irq_poll_init(struct = irq_poll *iop, int weight, irq_poll_fn *poll_fn)=0A+{=0A+ return irq_poll_i= nit_flags(iop, weight, poll_fn, 0);=0A+}=0A+=0A EXPORT_SYMBOL(irq_poll_init= );=0A =0A static int irq_poll_cpu_dead(unsigned int cpu)=0A-- =0A2.17.2=0A= =0A=0AFrom 78ff683e5e057626174118b9d176befa8ff70f9a Mon Sep 17 00:00:00 200= 1=0AFrom: Andi Kleen =0ADate: Thu, 13 Dec 2018 11:28:23= -0800=0ASubject: [PATCH 21/28] mds: Clear cpu for string io/memcpy_*io in = interrupts=0A=0ASchedule a clear cpu on next kernel exit for string PIO=0Ao= r memcpy_from/to_io calls, when they are called in=0Ainterrupts.=0A=0AThe P= IO case is likely already handled by old drivers=0Anot opting in their inte= rrupt handlers to not clear,=0Abut let's do it just to be sure.=0A=0ASigned= -off-by: Andi Kleen =0A---=0A arch/x86/include/asm/io.h= | 3 +++=0A include/asm-generic/io.h | 3 +++=0A 2 files changed, 6 inserti= ons(+)=0A=0Adiff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/i= o.h=0Aindex 686247db3106..19e2208eaa94 100644=0A--- a/arch/x86/include/asm/= io.h=0A+++ b/arch/x86/include/asm/io.h=0A@@ -40,6 +40,7 @@=0A =0A #include = =0A #include =0A+#include =0A #include =0A #include =0A #includ= e =0A@@ -321,6 +322,7 @@ static inline void outs##bwl(= int port, const void *addr, unsigned long count) \=0A : "+S"(addr),= "+c"(count) \=0A : "d"(port) : "memory"); \=0A } \=0A+= lazy_clear_cpu_interrupt(); \=0A } \=0A \=0A static i= nline void ins##bwl(int port, void *addr, unsigned long count) \=0A@@ -337,= 6 +339,7 @@ static inline void ins##bwl(int port, void *addr, unsigned long= count) \=0A : "+D"(addr), "+c"(count) \=0A : "d"(port) := "memory"); \=0A } \=0A+ lazy_clear_cpu_interrupt(); \=0A }= =0A =0A BUILDIO(b, b, char)=0Adiff --git a/include/asm-generic/io.h b/inclu= de/asm-generic/io.h=0Aindex d356f802945a..cf58bceea042 100644=0A--- a/inclu= de/asm-generic/io.h=0A+++ b/include/asm-generic/io.h=0A@@ -14,6 +14,7 @@=0A= #include /* I/O is all done through memory accesses */=0A #in= clude /* for memset() and memcpy() */=0A #include =0A+#include =0A =0A #ifdef CONFIG_GENERIC_IOMAP= =0A #include =0A@@ -1115,6 +1116,7 @@ static inline vo= id memcpy_fromio(void *buffer,=0A size_t size)=0A {=0A memcpy(buffer,= __io_virt(addr), size);=0A+ lazy_clear_cpu_interrupt();=0A }=0A #endif=0A = =0A@@ -1132,6 +1134,7 @@ static inline void memcpy_toio(volatile void __iom= em *addr, const void *buffer,=0A size_t size)=0A {=0A memcpy(__i= o_virt(addr), buffer, size);=0A+ lazy_clear_cpu_interrupt();=0A }=0A #endif= =0A =0A-- =0A2.17.2=0A=0A=0AFrom 23d56823578c53edb4a63c25223aca8d84c20d39 M= on Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Thu= , 13 Dec 2018 11:29:09 -0800=0ASubject: [PATCH 22/28] mds: Schedule clear c= pu in swiotlb=0A=0ASchedule a cpu clear on next kernel exit for swiotlb run= ning=0Ain interrupt context, since it touches user data with the CPU.=0A=0A= Signed-off-by: Andi Kleen =0A---=0A kernel/dma/swiotlb.= c | 2 ++=0A 1 file changed, 2 insertions(+)=0A=0Adiff --git a/kernel/dma/sw= iotlb.c b/kernel/dma/swiotlb.c=0Aindex d6361776dc5c..e11ff1e45a4c 100644=0A= --- a/kernel/dma/swiotlb.c=0A+++ b/kernel/dma/swiotlb.c=0A@@ -34,6 +34,7 @@= =0A #include =0A #include =0A #in= clude =0A+#include =0A =0A #include <= asm/io.h>=0A #include =0A@@ -420,6 +421,7 @@ static void swiotlb= _bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,=0A } else {=0A memc= py(phys_to_virt(orig_addr), vaddr, size);=0A }=0A+ lazy_clear_cpu_interrup= t();=0A }=0A =0A phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,= =0A-- =0A2.17.2=0A=0A=0AFrom 58a7542c400ba68502b17fc4ed84d3f83669cb4e Mon S= ep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Wed, 12= Dec 2018 16:44:07 -0800=0ASubject: [PATCH 23/28] mds: Instrument skb funct= ions to clear cpu=0A automatically=0A=0AInstrument some strategic skbuff fu= nctions that either touch=0Apacket data directly, or are likely followed by= a user=0Adata touch like a memcpy, to schedule a cpu clear on next=0Akerne= l exit. This is only done inside interrupts,=0Aoutside we assume it only to= uches the current processes' data.=0A=0AIn principle network data should be= encrypted anyways,=0Abut it's better to not leak it.=0A=0AThis provides pr= otection for the network softirq.=0A=0ANeeds more auditing.=0A=0ASigned-off= -by: Andi Kleen =0A---=0A include/linux/skbuff.h | 2 += +=0A net/core/skbuff.c | 26 ++++++++++++++++++++++++++=0A 2 files chan= ged, 28 insertions(+)=0A=0Adiff --git a/include/linux/skbuff.h b/include/li= nux/skbuff.h=0Aindex 93f56fddd92a..5e147afa07e4 100644=0A--- a/include/linu= x/skbuff.h=0A+++ b/include/linux/skbuff.h=0A@@ -40,6 +40,7 @@=0A #include <= linux/in6.h>=0A #include =0A #include =0A+#i= nclude =0A =0A /* The interface for checksum offload betw= een the stack and networking drivers=0A * is as follows...=0A@@ -2093,6 +2= 094,7 @@ static inline void *__skb_put(struct sk_buff *skb, unsigned int le= n)=0A SKB_LINEAR_ASSERT(skb);=0A skb->tail +=3D len;=0A skb->len +=3D l= en;=0A+ lazy_clear_cpu_interrupt();=0A return tmp;=0A }=0A =0Adiff --git a= /net/core/skbuff.c b/net/core/skbuff.c=0Aindex 37317ffec146..eda9ef0ff63d 1= 00644=0A--- a/net/core/skbuff.c=0A+++ b/net/core/skbuff.c=0A@@ -1189,6 +118= 9,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)=0A if (!num= _frags)=0A goto release;=0A =0A+ /* Likely to copy user data */=0A+ lazy_= clear_cpu_interrupt();=0A+=0A new_frags =3D (__skb_pagelen(skb) + PAGE_SIZ= E - 1) >> PAGE_SHIFT;=0A for (i =3D 0; i < new_frags; i++) {=0A page =3D= alloc_page(gfp_mask);=0A@@ -1353,6 +1356,9 @@ struct sk_buff *skb_copy(con= st struct sk_buff *skb, gfp_t gfp_mask)=0A if (!n)=0A return NULL;=0A = =0A+ /* Copies user data */=0A+ lazy_clear_cpu_interrupt();=0A+=0A /* Set = the data pointer */=0A skb_reserve(n, headerlen);=0A /* Set the tail poin= ter and length */=0A@@ -1588,6 +1594,9 @@ struct sk_buff *skb_copy_expand(c= onst struct sk_buff *skb,=0A if (!n)=0A return NULL;=0A =0A+ /* May copy= user data */=0A+ lazy_clear_cpu_interrupt();=0A+=0A skb_reserve(n, newhea= droom);=0A =0A /* Set the tail pointer and length */=0A@@ -1676,6 +1685,8 = @@ EXPORT_SYMBOL(__skb_pad);=0A =0A void *pskb_put(struct sk_buff *skb, str= uct sk_buff *tail, int len)=0A {=0A+ /* Likely to be followed by a user dat= a copy */=0A+ lazy_clear_cpu_interrupt();=0A if (tail !=3D skb) {=0A skb= ->data_len +=3D len;=0A skb->len +=3D len;=0A@@ -1701,6 +1712,8 @@ void *= skb_put(struct sk_buff *skb, unsigned int len)=0A skb->len +=3D len;=0A = if (unlikely(skb->tail > skb->end))=0A skb_over_panic(skb, len, __builtin= _return_address(0));=0A+ /* Likely to be followed by a user data copy */=0A= + lazy_clear_cpu_interrupt();=0A return tmp;=0A }=0A EXPORT_SYMBOL(skb_put= );=0A@@ -1720,6 +1733,7 @@ void *skb_push(struct sk_buff *skb, unsigned int= len)=0A skb->len +=3D len;=0A if (unlikely(skb->data < skb->head))=0A = skb_under_panic(skb, len, __builtin_return_address(0));=0A+ /* No clear cp= u, assume this is only header data */=0A return skb->data;=0A }=0A EXPORT_= SYMBOL(skb_push);=0A@@ -2026,6 +2040,9 @@ int skb_copy_bits(const struct sk= _buff *skb, int offset, void *to, int len)=0A struct sk_buff *frag_iter;= =0A int i, copy;=0A =0A+ /* Copies user data */=0A+ lazy_clear_cpu_interru= pt();=0A+=0A if (offset > (int)skb->len - len)=0A goto fault;=0A =0A@@ -= 2387,6 +2404,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const= void *from, int len)=0A struct sk_buff *frag_iter;=0A int i, copy;=0A = =0A+ /* Copies user data */=0A+ lazy_clear_cpu_interrupt();=0A+=0A if (off= set > (int)skb->len - len)=0A goto fault;=0A =0A@@ -2467,6 +2487,9 @@ __w= sum __skb_checksum(const struct sk_buff *skb, int offset, int len,=0A stru= ct sk_buff *frag_iter;=0A int pos =3D 0;=0A =0A+ /* Reads packet data */= =0A+ lazy_clear_cpu_interrupt();=0A+=0A /* Checksum header. */=0A if (cop= y > 0) {=0A if (copy > len)=0A@@ -2559,6 +2582,9 @@ __wsum skb_copy_and_c= sum_bits(const struct sk_buff *skb, int offset,=0A struct sk_buff *frag_it= er;=0A int pos =3D 0;=0A =0A+ /* Reads packet data */=0A+ lazy_clear_cpu_i= nterrupt();=0A+=0A /* Copy header. */=0A if (copy > 0) {=0A if (copy > = len)=0A-- =0A2.17.2=0A=0A=0AFrom 6c367fc828b1d76d2020c98746356d42fce9fab5 M= on Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: Thu= , 13 Dec 2018 11:31:11 -0800=0ASubject: [PATCH 24/28] mds: Opt out tcp task= let to not touch user data=0A=0AMark the tcp tasklet as not needing an impl= icit cpu clear=0Aflush. If any is needed it will be triggered by the skb_*= =0Ahooks.=0A=0ASigned-off-by: Andi Kleen =0A---=0A net/= ipv4/tcp_output.c | 5 +++--=0A 1 file changed, 3 insertions(+), 2 deletions= (-)=0A=0Adiff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c=0Aindex= 730bc44dbad9..06bc635a54ca 100644=0A--- a/net/ipv4/tcp_output.c=0A+++ b/ne= t/ipv4/tcp_output.c=0A@@ -903,9 +903,10 @@ void __init tcp_tasklet_init(voi= d)=0A struct tsq_tasklet *tsq =3D &per_cpu(tsq_tasklet, i);=0A =0A INIT= _LIST_HEAD(&tsq->head);=0A- tasklet_init(&tsq->tasklet,=0A+ tasklet_init_= flags(&tsq->tasklet,=0A tcp_tasklet_func,=0A- (unsigned long= )tsq);=0A+ (unsigned long)tsq,=0A+ TASKLET_NO_USER);=0A }=0A= }=0A =0A-- =0A2.17.2=0A=0A=0AFrom 993bc736621e960d17409de11cf11342f3f89bf7= Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate: T= hu, 13 Dec 2018 11:30:30 -0800=0ASubject: [PATCH 25/28] mds: mark kernel/* = timers safe as not touching user=0A data=0A=0ASome preliminary auditing of = kernel/* shows no timers touch=0Aother processes' user data. Mark all the t= imers in kernel/*=0Aas not needed an implicit cpu clear.=0A=0AMore auditing= here would be useful.=0A=0ASigned-off-by: Andi Kleen = =0A---=0A kernel/events/core.c | 6 ++++--=0A kernel/fork.c = | 3 ++-=0A kernel/futex.c | 6 +++---=0A kernel/sched/core.c = | 5 +++--=0A kernel/sched/deadline.c | 6 ++++--=0A kernel/sched/f= air.c | 6 ++++--=0A kernel/sched/idle.c | 3 ++-=0A kernel/sch= ed/rt.c | 3 ++-=0A kernel/time/alarmtimer.c | 2 +-=0A kernel/tim= e/hrtimer.c | 6 +++---=0A kernel/time/posix-timers.c | 6 ++++--=0A ker= nel/time/sched_clock.c | 3 ++-=0A kernel/time/tick-sched.c | 6 ++++--=0A= kernel/watchdog.c | 3 ++-=0A 14 files changed, 40 insertions(+), = 24 deletions(-)=0A=0Adiff --git a/kernel/events/core.c b/kernel/events/core= =2Ec=0Aindex 3cd13a30f732..5d9a4ed0cf58 100644=0A--- a/kernel/events/core.c= =0A+++ b/kernel/events/core.c=0A@@ -1102,7 +1102,8 @@ static void __perf_mu= x_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)=0A cpuctx->hrtime= r_interval =3D ns_to_ktime(NSEC_PER_MSEC * interval);=0A =0A raw_spin_lock= _init(&cpuctx->hrtimer_lock);=0A- hrtimer_init(timer, CLOCK_MONOTONIC, HRTI= MER_MODE_ABS_PINNED);=0A+ hrtimer_init(timer, CLOCK_MONOTONIC,=0A+ HR= TIMER_MODE_ABS_PINNED|HRTIMER_MODE_NO_USER);=0A timer->function =3D perf_m= ux_hrtimer_handler;=0A }=0A =0A@@ -9202,7 +9203,8 @@ static void perf_sweve= nt_init_hrtimer(struct perf_event *event)=0A if (!is_sampling_event(event)= )=0A return;=0A =0A- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER= _MODE_REL);=0A+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC,=0A+ HRTI= MER_MODE_REL|HRTIMER_MODE_NO_USER);=0A hwc->hrtimer.function =3D perf_swev= ent_hrtimer;=0A =0A /*=0Adiff --git a/kernel/fork.c b/kernel/fork.c=0Ainde= x a60459947f18..d1edd0bce062 100644=0A--- a/kernel/fork.c=0A+++ b/kernel/fo= rk.c=0A@@ -1541,7 +1541,8 @@ static int copy_signal(unsigned long clone_fla= gs, struct task_struct *tsk)=0A =0A #ifdef CONFIG_POSIX_TIMERS=0A INIT_LIS= T_HEAD(&sig->posix_timers);=0A- hrtimer_init(&sig->real_timer, CLOCK_MONOTO= NIC, HRTIMER_MODE_REL);=0A+ hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC,= =0A+ HRTIMER_MODE_REL|HRTIMER_MODE_NO_USER);=0A sig->real_timer.func= tion =3D it_real_fn;=0A #endif=0A =0Adiff --git a/kernel/futex.c b/kernel/f= utex.c=0Aindex be3bff2315ff..4ac7a412f04b 100644=0A--- a/kernel/futex.c=0A+= ++ b/kernel/futex.c=0A@@ -2691,7 +2691,7 @@ static int futex_wait(u32 __use= r *uaddr, unsigned int flags, u32 val,=0A =0A hrtimer_init_on_stack(&to->= timer, (flags & FLAGS_CLOCKRT) ?=0A CLOCK_REALTIME : CLOCK_MONOTO= NIC,=0A- HRTIMER_MODE_ABS);=0A+ HRTIMER_MODE_ABS|HRTIMER_= MODE_NO_USER);=0A hrtimer_init_sleeper(to, current);=0A hrtimer_set_exp= ires_range_ns(&to->timer, *abs_time,=0A current->timer_slack_ns);= =0A@@ -2792,7 +2792,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsign= ed int flags,=0A if (time) {=0A to =3D &timeout;=0A hrtimer_init_on_st= ack(&to->timer, CLOCK_REALTIME,=0A- HRTIMER_MODE_ABS);=0A+ = HRTIMER_MODE_ABS|HRTIMER_MODE_NO_USER);=0A hrtimer_init_sleeper(to, cur= rent);=0A hrtimer_set_expires(&to->timer, *time);=0A }=0A@@ -3192,7 +319= 2,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int fla= gs,=0A to =3D &timeout;=0A hrtimer_init_on_stack(&to->timer, (flags & F= LAGS_CLOCKRT) ?=0A CLOCK_REALTIME : CLOCK_MONOTONIC,=0A- = HRTIMER_MODE_ABS);=0A+ HRTIMER_MODE_ABS|HRTIMER_MODE_NO_USER);=0A= hrtimer_init_sleeper(to, current);=0A hrtimer_set_expires_range_ns(&to= ->timer, *abs_time,=0A current->timer_slack_ns);=0Adiff --git a/k= ernel/sched/core.c b/kernel/sched/core.c=0Aindex b04918e9115c..6ca60c91cf30= 100644=0A--- a/kernel/sched/core.c=0A+++ b/kernel/sched/core.c=0A@@ -302,7= +302,7 @@ void hrtick_start(struct rq *rq, u64 delay)=0A */=0A delay = =3D max_t(u64, delay, 10000LL);=0A hrtimer_start(&rq->hrtick_timer, ns_to_= ktime(delay),=0A- HRTIMER_MODE_REL_PINNED);=0A+ HRTIMER_MODE_= REL_PINNED|HRTIMER_MODE_NO_USER);=0A }=0A #endif /* CONFIG_SMP */=0A =0A@@ = -316,7 +316,8 @@ static void hrtick_rq_init(struct rq *rq)=0A rq->hrtick_c= sd.info =3D rq;=0A #endif=0A =0A- hrtimer_init(&rq->hrtick_timer, CLOCK_MON= OTONIC, HRTIMER_MODE_REL);=0A+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTO= NIC,=0A+ HRTIMER_MODE_REL|HRTIMER_MODE_NO_USER);=0A rq->hrtick_timer= =2Efunction =3D hrtick;=0A }=0A #else /* CONFIG_SCHED_HRTICK */=0Adiff --gi= t a/kernel/sched/deadline.c b/kernel/sched/deadline.c=0Aindex fb8b7b5d745d.= =2Edce637e0b3bd 100644=0A--- a/kernel/sched/deadline.c=0A+++ b/kernel/sched= /deadline.c=0A@@ -1054,7 +1054,8 @@ void init_dl_task_timer(struct sched_dl= _entity *dl_se)=0A {=0A struct hrtimer *timer =3D &dl_se->dl_timer;=0A =0A= - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);=0A+ hrtimer_init(= timer, CLOCK_MONOTONIC,=0A+ HRTIMER_MODE_REL|HRTIMER_MODE_NO_USER);= =0A timer->function =3D dl_task_timer;=0A }=0A =0A@@ -1293,7 +1294,8 @@ vo= id init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)=0A {=0A stru= ct hrtimer *timer =3D &dl_se->inactive_timer;=0A =0A- hrtimer_init(timer, C= LOCK_MONOTONIC, HRTIMER_MODE_REL);=0A+ hrtimer_init(timer, CLOCK_MONOTONIC,= =0A+ HRTIMER_MODE_REL|HRTIMER_MODE_NO_USER);=0A timer->function =3D = inactive_task_timer;=0A }=0A =0Adiff --git a/kernel/sched/fair.c b/kernel/s= ched/fair.c=0Aindex b5a1bd4a1a46..b9d2a617b105 100644=0A--- a/kernel/sched/= fair.c=0A+++ b/kernel/sched/fair.c=0A@@ -4889,9 +4889,11 @@ void init_cfs_b= andwidth(struct cfs_bandwidth *cfs_b)=0A cfs_b->period =3D ns_to_ktime(def= ault_cfs_period());=0A =0A INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);=0A- h= rtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED)= ;=0A+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC,=0A+ HRTIMER= _MODE_ABS_PINNED|HRTIMER_MODE_NO_USER);=0A cfs_b->period_timer.function = =3D sched_cfs_period_timer;=0A- hrtimer_init(&cfs_b->slack_timer, CLOCK_MON= OTONIC, HRTIMER_MODE_REL);=0A+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONO= TONIC,=0A+ HRTIMER_MODE_REL|HRTIMER_MODE_NO_USER);=0A cfs_b->slack_t= imer.function =3D sched_cfs_slack_timer;=0A cfs_b->distribute_running =3D = 0;=0A }=0Adiff --git a/kernel/sched/idle.c b/kernel/sched/idle.c=0Aindex f5= 516bae0c1b..6a4cc46d8c4b 100644=0A--- a/kernel/sched/idle.c=0A+++ b/kernel/= sched/idle.c=0A@@ -330,7 +330,8 @@ void play_idle(unsigned long duration_ms= )=0A cpuidle_use_deepest_state(true);=0A =0A it.done =3D 0;=0A- hrtimer_i= nit_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);=0A+ hrtimer_ini= t_on_stack(&it.timer, CLOCK_MONOTONIC,=0A+ HRTIMER_MODE_REL|HRTIMER= _MODE_NO_USER);=0A it.timer.function =3D idle_inject_timer_fn;=0A hrtimer= _start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);=0A = =0Adiff --git a/kernel/sched/rt.c b/kernel/sched/rt.c=0Aindex e4f398ad9e73.= =2E24b90b260682 100644=0A--- a/kernel/sched/rt.c=0A+++ b/kernel/sched/rt.c= =0A@@ -46,7 +46,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 = period, u64 runtime)=0A raw_spin_lock_init(&rt_b->rt_runtime_lock);=0A =0A= hrtimer_init(&rt_b->rt_period_timer,=0A- CLOCK_MONOTONIC, HRTIMER_MODE_= REL);=0A+ CLOCK_MONOTONIC,=0A+ HRTIMER_MODE_REL|HRTIMER_MODE_NO_USE= R);=0A rt_b->rt_period_timer.function =3D sched_rt_period_timer;=0A }=0A = =0Adiff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c=0Aindex= 2c97e8c2d29f..f2efd9b5d0b7 100644=0A--- a/kernel/time/alarmtimer.c=0A+++ b= /kernel/time/alarmtimer.c=0A@@ -344,7 +344,7 @@ void alarm_init(struct alar= m *alarm, enum alarmtimer_type type,=0A enum alarmtimer_restart (*functio= n)(struct alarm *, ktime_t))=0A {=0A hrtimer_init(&alarm->timer, alarm_bas= es[type].base_clockid,=0A- HRTIMER_MODE_ABS);=0A+ HRTIMER_MODE_= ABS|HRTIMER_MODE_NO_USER);=0A __alarm_init(alarm, type, function);=0A }=0A= EXPORT_SYMBOL_GPL(alarm_init);=0Adiff --git a/kernel/time/hrtimer.c b/kern= el/time/hrtimer.c=0Aindex e2c8776ba2a4..58beefd3543a 100644=0A--- a/kernel/= time/hrtimer.c=0A+++ b/kernel/time/hrtimer.c=0A@@ -1713,7 +1713,7 @@ static= long __sched hrtimer_nanosleep_restart(struct restart_block *restart)=0A = int ret;=0A =0A hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid= ,=0A- HRTIMER_MODE_ABS);=0A+ HRTIMER_MODE_ABS|HRTIMER_MODE_NO_USER);= =0A hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);=0A =0A= ret =3D do_nanosleep(&t, HRTIMER_MODE_ABS);=0A@@ -1733,7 +1733,7 @@ long = hrtimer_nanosleep(const struct timespec64 *rqtp,=0A if (dl_task(current) |= | rt_task(current))=0A slack =3D 0;=0A =0A- hrtimer_init_on_stack(&t.time= r, clockid, mode);=0A+ hrtimer_init_on_stack(&t.timer, clockid, mode|HRTIME= R_MODE_NO_USER);=0A hrtimer_set_expires_range_ns(&t.timer, timespec64_to_k= time(*rqtp), slack);=0A ret =3D do_nanosleep(&t, mode);=0A if (ret !=3D -= ERESTART_RESTARTBLOCK)=0A@@ -1932,7 +1932,7 @@ schedule_hrtimeout_range_clo= ck(ktime_t *expires, u64 delta,=0A return -EINTR;=0A }=0A =0A- hrtimer_i= nit_on_stack(&t.timer, clock_id, mode);=0A+ hrtimer_init_on_stack(&t.timer,= clock_id, mode|HRTIMER_MODE_NO_USER);=0A hrtimer_set_expires_range_ns(&t.= timer, *expires, delta);=0A =0A hrtimer_init_sleeper(&t, current);=0Adiff = --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c=0Aindex 0e8= 4bb72a3da..0faf661cb4c8 100644=0A--- a/kernel/time/posix-timers.c=0A+++ b/k= ernel/time/posix-timers.c=0A@@ -464,7 +464,8 @@ static void release_posix_t= imer(struct k_itimer *tmr, int it_id_set)=0A =0A static int common_timer_cr= eate(struct k_itimer *new_timer)=0A {=0A- hrtimer_init(&new_timer->it.real.= timer, new_timer->it_clock, 0);=0A+ hrtimer_init(&new_timer->it.real.timer,= new_timer->it_clock,=0A+ HRTIMER_MODE_NO_USER);=0A return 0;=0A }=0A =0A= @@ -789,7 +790,8 @@ static void common_hrtimer_arm(struct k_itimer *timr, k= time_t expires,=0A if (timr->it_clock =3D=3D CLOCK_REALTIME)=0A timr->kc= lock =3D absolute ? &clock_realtime : &clock_monotonic;=0A =0A- hrtimer_ini= t(&timr->it.real.timer, timr->it_clock, mode);=0A+ hrtimer_init(&timr->it.r= eal.timer, timr->it_clock,=0A+ mode|HRTIMER_MODE_NO_USER);=0A timr->= it.real.timer.function =3D posix_timer_fn;=0A =0A if (!absolute)=0Adiff --= git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c=0Aindex 094b82c= a95e5..e0a59ed9199f 100644=0A--- a/kernel/time/sched_clock.c=0A+++ b/kernel= /time/sched_clock.c=0A@@ -249,7 +249,8 @@ void __init generic_sched_clock_i= nit(void)=0A * Start the timer to keep sched_clock() properly updated and= =0A * sets the initial epoch.=0A */=0A- hrtimer_init(&sched_clock_timer= , CLOCK_MONOTONIC, HRTIMER_MODE_REL);=0A+ hrtimer_init(&sched_clock_timer, = CLOCK_MONOTONIC,=0A+ HRTIMER_MODE_REL|HRTIMER_MODE_NO_USER);=0A sche= d_clock_timer.function =3D sched_clock_poll;=0A hrtimer_start(&sched_clock= _timer, cd.wrap_kt, HRTIMER_MODE_REL);=0A }=0Adiff --git a/kernel/time/tick= -sched.c b/kernel/time/tick-sched.c=0Aindex 6fa52cd6df0b..b95f6f1e7bc3 1006= 44=0A--- a/kernel/time/tick-sched.c=0A+++ b/kernel/time/tick-sched.c=0A@@ -= 1205,7 +1205,8 @@ static void tick_nohz_switch_to_nohz(void)=0A * Recycle= the hrtimer in ts, so we can share the=0A * hrtimer_forward with the hig= hres code.=0A */=0A- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTI= MER_MODE_ABS);=0A+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC,=0A+ = HRTIMER_MODE_ABS|HRTIMER_MODE_NO_USER);=0A /* Get the next period */=0A = next =3D tick_init_jiffy_update();=0A =0A@@ -1302,7 +1303,8 @@ void tick_s= etup_sched_timer(void)=0A /*=0A * Emulate tick processing via per-CPU hr= timers:=0A */=0A- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER= _MODE_ABS);=0A+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC,=0A+ H= RTIMER_MODE_ABS|HRTIMER_MODE_NO_USER);=0A ts->sched_timer.function =3D tic= k_sched_timer;=0A =0A /* Get the next period (per-CPU) */=0Adiff --git a/k= ernel/watchdog.c b/kernel/watchdog.c=0Aindex 977918d5d350..d3c9da0a4fce 100= 644=0A--- a/kernel/watchdog.c=0A+++ b/kernel/watchdog.c=0A@@ -483,7 +483,8 = @@ static void watchdog_enable(unsigned int cpu)=0A * Start the timer fir= st to prevent the NMI watchdog triggering=0A * before the timer has a cha= nce to fire.=0A */=0A- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MOD= E_REL);=0A+ hrtimer_init(hrtimer, CLOCK_MONOTONIC,=0A+ HRTIMER_MODE_REL|H= RTIMER_MODE_NO_USER);=0A hrtimer->function =3D watchdog_timer_fn;=0A hrti= mer_start(hrtimer, ns_to_ktime(sample_period),=0A HRTIMER_MODE_REL_= PINNED);=0A-- =0A2.17.2=0A=0A=0AFrom 27734e0d15075497bece467d55f46d3481a276= e2 Mon Sep 17 00:00:00 2001=0AFrom: Andi Kleen =0ADate:= Wed, 12 Dec 2018 16:51:22 -0800=0ASubject: [PATCH 26/28] mds: Mark AHCI in= terrupt as not needing cpu clear=0A=0AAHCI interrupt handlers never touch u= ser data with the CPU.=0A=0AJust to get the number of clears down on my tes= t system.=0A=0ASigned-off-by: Andi Kleen =0A---=0A driv= ers/ata/ahci.c | 2 +-=0A drivers/ata/ahci.h | 2 ++=0A drivers/ata/l= ibahci.c | 40 ++++++++++++++++++++++++----------------=0A 3 files changed, = 27 insertions(+), 17 deletions(-)=0A=0Adiff --git a/drivers/ata/ahci.c b/dr= ivers/ata/ahci.c=0Aindex 021ce46e2e57..1455ad89d2f9 100644=0A--- a/drivers/= ata/ahci.c=0A+++ b/drivers/ata/ahci.c=0A@@ -1865,7 +1865,7 @@ static int ah= ci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)=0A =0A = pci_set_master(pdev);=0A =0A- rc =3D ahci_host_activate(host, &ahci_sht);= =0A+ rc =3D ahci_host_activate_irqflags(host, &ahci_sht, IRQF_NO_USER);=0A = if (rc)=0A return rc;=0A =0Adiff --git a/drivers/ata/ahci.h b/drivers/at= a/ahci.h=0Aindex ef356e70e6de..42a3474f26b6 100644=0A--- a/drivers/ata/ahci= =2Eh=0A+++ b/drivers/ata/ahci.h=0A@@ -430,6 +430,8 @@ void ahci_set_em_mess= ages(struct ahci_host_priv *hpriv,=0A int ahci_reset_em(struct ata_host *ho= st);=0A void ahci_print_info(struct ata_host *host, const char *scc_s);=0A = int ahci_host_activate(struct ata_host *host, struct scsi_host_template *sh= t);=0A+int ahci_host_activate_irqflags(struct ata_host *host, struct scsi_h= ost_template *sht,=0A+ int irqflags);=0A void ahci_error_handler(struct = ata_port *ap);=0A u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_= masked);=0A =0Adiff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c= =0Aindex b5f57c69c487..b32664c7d8a1 100644=0A--- a/drivers/ata/libahci.c=0A= +++ b/drivers/ata/libahci.c=0A@@ -2548,7 +2548,8 @@ void ahci_set_em_messag= es(struct ahci_host_priv *hpriv,=0A EXPORT_SYMBOL_GPL(ahci_set_em_messages)= ;=0A =0A static int ahci_host_activate_multi_irqs(struct ata_host *host,=0A= - struct scsi_host_template *sht)=0A+ struct scsi_host_template *= sht,=0A+ int irqflags)=0A {=0A struct ahci_host_priv *hpriv =3D host-= >private_data;=0A int i, rc;=0A@@ -2571,7 +2572,7 @@ static int ahci_host_= activate_multi_irqs(struct ata_host *host,=0A }=0A =0A rc =3D devm_requ= est_irq(host->dev, irq, ahci_multi_irqs_intr_hard,=0A- 0, pp->irq_desc, = host->ports[i]);=0A+ irqflags, pp->irq_desc, host->ports[i]);=0A =0A i= f (rc)=0A return rc;=0A@@ -2581,18 +2582,8 @@ static int ahci_host_activ= ate_multi_irqs(struct ata_host *host,=0A return ata_host_register(host, sh= t);=0A }=0A =0A-/**=0A- * ahci_host_activate - start AHCI host, request IRQ= s and register it=0A- * @host: target ATA host=0A- * @sht: scsi_host_templa= te to use when registering the host=0A- *=0A- * LOCKING:=0A- * Inherited fr= om calling layer (may sleep).=0A- *=0A- * RETURNS:=0A- * 0 on success, -err= no otherwise.=0A- */=0A-int ahci_host_activate(struct ata_host *host, struc= t scsi_host_template *sht)=0A+int ahci_host_activate_irqflags(struct ata_ho= st *host, struct scsi_host_template *sht,=0A+ int irqflags)=0A {=0A str= uct ahci_host_priv *hpriv =3D host->private_data;=0A int irq =3D hpriv->ir= q;=0A@@ -2608,15 +2599,32 @@ int ahci_host_activate(struct ata_host *host, = struct scsi_host_template *sht)=0A return -EIO;=0A }=0A =0A- rc =3D a= hci_host_activate_multi_irqs(host, sht);=0A+ rc =3D ahci_host_activate_mul= ti_irqs(host, sht, irqflags);=0A } else {=0A rc =3D ata_host_activate(ho= st, irq, hpriv->irq_handler,=0A- IRQF_SHARED, sht);=0A+ = irqflags|IRQF_SHARED, sht);=0A }=0A =0A =0A return rc;=0A }=0A+EXPORT_SY= MBOL_GPL(ahci_host_activate_irqflags);=0A+=0A+/**=0A+ * ahci_host_activate = - start AHCI host, request IRQs and register it=0A+ * @host: target ATA hos= t=0A+ * @sht: scsi_host_template to use when registering the host=0A+ *=0A+= * LOCKING:=0A+ * Inherited from calling layer (may sleep).=0A+ *=0A+ * RET= URNS:=0A+ * 0 on success, -errno otherwise.=0A+ */=0A+int ahci_host_activat= e(struct ata_host *host, struct scsi_host_template *sht)=0A+{=0A+ return ah= ci_host_activate_irqflags(host, sht, 0);=0A+}=0A EXPORT_SYMBOL_GPL(ahci_hos= t_activate);=0A =0A MODULE_AUTHOR("Jeff Garzik");=0A-- =0A2.17.2=0A=0A=0AFr= om f1d6b6edf556c3c138d77fe38a5325cc0d656780 Mon Sep 17 00:00:00 2001=0AFrom= : Andi Kleen =0ADate: Fri, 14 Dec 2018 15:21:07 -0800= =0ASubject: [PATCH 27/28] mds: Mark ACPI interrupt as not needing cpu clear= =0A=0AACPI doesn't touch any user data, so doesn't need a cpu clear.=0A=0AS= igned-off-by: Andi Kleen =0A---=0A drivers/acpi/osl.c |= 3 ++-=0A 1 file changed, 2 insertions(+), 1 deletion(-)=0A=0Adiff --git a/= drivers/acpi/osl.c b/drivers/acpi/osl.c=0Aindex f29e427d0d1d..f31064134b37 = 100644=0A--- a/drivers/acpi/osl.c=0A+++ b/drivers/acpi/osl.c=0A@@ -572,7 +5= 72,8 @@ acpi_os_install_interrupt_handler(u32 gsi, acpi_osd_handler handler= ,=0A =0A acpi_irq_handler =3D handler;=0A acpi_irq_context =3D context;= =0A- if (request_irq(irq, acpi_irq, IRQF_SHARED, "acpi", acpi_irq)) {=0A+ i= f (request_irq(irq, acpi_irq, IRQF_SHARED|IRQF_NO_USER,=0A+ "acpi", acpi= _irq)) {=0A printk(KERN_ERR PREFIX "SCI (IRQ%d) allocation failed\n", irq= );=0A acpi_irq_handler =3D NULL;=0A return AE_NOT_ACQUIRED;=0A-- =0A2.1= 7.2=0A=0A=0AFrom a945f2853135a8bb22a443f1d8e00c66db13deb1 Mon Sep 17 00:00:= 00 2001=0AFrom: Andi Kleen =0ADate: Tue, 18 Dec 2018 16= :46:10 -0800=0ASubject: [PATCH 28/28] mds: Mitigate BPF=0A=0ABPF allows the= user to run untrusted code in the kernel.=0A=0ANormally MDS would allow so= me information leakage either=0Afrom other processes or sensitive kernel c= ode to the user=0Acontrolled BPF code. We cannot rule out that BPF code co= ntains=0Aan MDS exploit and it is difficult to pattern match.=0A=0AThe patc= h aims to add limited number of clear cpus=0Abefore BPF executions to make = EBPF executions safe.=0A=0AAssume BPF execution does not touch other user's= data, so does=0Anot need to schedule a clear for itself.=0A=0AFor EBPF pro= grams loaded privileged we never clear.=0A=0AWhen the BPF program was loade= d unprivileged clear the CPU=0Abefore the BPF execution, depending on the c= ontext it is running in:=0A=0AWe only do this when running in an interrupt,= or if an clear cpu is=0Aalready scheduled (which means for example there w= as a context=0Aswitch, or crypto operation before)=0A=0AIn process context = we check if the current process context=0Ahas the same userns+euid as the p= rocess who created the BPF.=0AThis handles the common seccomp filter case w= ithout=0Aany extra clears, but still adds clears when e.g. a socket=0Afilte= r runs on a socket inherited to a process with different user id.=0A=0AWe a= lso always clear when an earlier kernel subsystem scheduled=0Aa clear, e.g.= after a context switch or running crypto code.=0A=0ATechnically we would o= nly need to do this if the BPF program=0Acontains conditional branches and = loads dominated by them, but=0Alet's assume that near all do.=0A=0AFor exam= ple for running chromium with seccomp filters I see=0Aonly 15-18% of all sa= ndbox system calls have a clear, most=0Aare likely caused by context switch= es=0A=0AUnprivileged EBPF usages in interrupts currently always clear.=0A= =0AThis could be further optimized by allowing callers that do=0Aa lot of i= ndividual BPF runs and are sure they don't touch=0Aother user's data (that = is not accessible to the EBPF anyways)=0Ainbetween to do the clear only onc= e at the beginning. We can add=0Asuch optimizations later based on profile = data.=0A=0ASigned-off-by: Andi Kleen =0A---=0A arch/x86= /include/asm/clearbpf.h | 29 +++++++++++++++++++++++++++++=0A include/linux= /filter.h | 21 +++++++++++++++++++--=0A kernel/bpf/core.c = | 2 ++=0A 3 files changed, 50 insertions(+), 2 deletions(-)=0A creat= e mode 100644 arch/x86/include/asm/clearbpf.h=0A=0Adiff --git a/arch/x86/in= clude/asm/clearbpf.h b/arch/x86/include/asm/clearbpf.h=0Anew file mode 1006= 44=0Aindex 000000000000..dc1756722b48=0A--- /dev/null=0A+++ b/arch/x86/incl= ude/asm/clearbpf.h=0A@@ -0,0 +1,29 @@=0A+/* SPDX-License-Identifier: GPL-2.= 0 */=0A+#ifndef _ASM_CLEARBPF_H=0A+#define _ASM_CLEARBPF_H 1=0A+=0A+#includ= e =0A+#include =0A+#include =0A+=0A+/*=0A+ * When the BPF program was loaded unprivileged, clear th= e CPU=0A+ * to prevent any exploits written in BPF using side channels to r= ead=0A+ * data leaked from other kernel code. In some cases, like=0A+ * pro= cess context with the same uid, we can avoid it.=0A+ *=0A+ * See Documentat= ion/clearcpu.txt for more details.=0A+ */=0A+static inline void arch_bpf_pr= epare_nonpriv(kuid_t uid)=0A+{=0A+ if (!static_cpu_has(X86_BUG_MDS))=0A+ r= eturn;=0A+ if (in_interrupt() ||=0A+ test_thread_flag(TIF_CLEAR_CPU) ||=0A= + !uid_eq(current_euid(), uid)) {=0A+ clear_cpu();=0A+ clear_thread_flag= (TIF_CLEAR_CPU);=0A+ }=0A+}=0A+=0A+#endif=0Adiff --git a/include/linux/filt= er.h b/include/linux/filter.h=0Aindex ad106d845b22..b32547b4bd92 100644=0A-= -- a/include/linux/filter.h=0A+++ b/include/linux/filter.h=0A@@ -20,12 +20,= 21 @@=0A #include =0A #include =0A #i= nclude =0A+#include =0A =0A #include =0A =0A #include =0A #include =0A =0A+#ifdef CONFIG_ARCH_HAS_CLEAR_CPU=0A+#include =0A+#else=0A+static inline void arch_bpf_prepare_nonpriv(kuid_t uid)=0A= +{=0A+}=0A+#endif=0A+=0A struct sk_buff;=0A struct sock;=0A struct seccomp_= data;=0A@@ -490,7 +499,9 @@ struct bpf_prog {=0A blinded:1, /* Was blin= ded */=0A is_func:1, /* program is a bpf function */=0A kprobe_over= ride:1, /* Do we override a kprobe? */=0A- has_callchain_buf:1; /* callc= hain buffer allocated? */=0A+ has_callchain_buf:1, /* callchain buffer a= llocated? */=0A+ priv:1; /* Was loaded privileged */=0A+ kuid_t uid; = /* Original uid who created it */=0A enum bpf_prog_type type; /* Type of= BPF program */=0A enum bpf_attach_type expected_attach_type; /* For some = prog types */=0A u32 len; /* Number of filter blocks */=0A@@ -513,7 +52= 4,13 @@ struct sk_filter {=0A struct bpf_prog *prog;=0A };=0A =0A-#define = BPF_PROG_RUN(filter, ctx) (*(filter)->bpf_func)(ctx, (filter)->insnsi)=0A+= static inline unsigned _bpf_prog_run(const struct bpf_prog *bp, const void = *ctx)=0A+{=0A+ if (!bp->priv)=0A+ arch_bpf_prepare_nonpriv(bp->uid);=0A+ r= eturn bp->bpf_func(ctx, bp->insnsi);=0A+}=0A+#define BPF_PROG_RUN(filter, c= tx) _bpf_prog_run(filter, ctx)=0A =0A #define BPF_SKB_CB_LEN QDISC_CB_PRIV_= LEN=0A =0Adiff --git a/kernel/bpf/core.c b/kernel/bpf/core.c=0Aindex f908b9= 356025..67d845229d46 100644=0A--- a/kernel/bpf/core.c=0A+++ b/kernel/bpf/co= re.c=0A@@ -99,6 +99,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size,= gfp_t gfp_extra_flags)=0A fp->aux =3D aux;=0A fp->aux->prog =3D fp;=0A = fp->jit_requested =3D ebpf_jit_enabled();=0A+ fp->priv =3D !!capable(CAP_SY= S_ADMIN);=0A+ fp->uid =3D current_euid();=0A =0A INIT_LIST_HEAD_RCU(&fp->a= ux->ksym_lnode);=0A =0A-- =0A2.17.2=0A=0A --rfwNdt5cNUUjB/69--