LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v3 11/11] powerpc/64: use interrupt restart table to speed up return from interrupt
From: Nicholas Piggin @ 2021-06-16  1:20 UTC (permalink / raw)
  To: linuxppc-dev, Michael Ellerman
In-Reply-To: <87bl87tf86.fsf@mpe.ellerman.id.au>

Excerpts from Michael Ellerman's message of June 15, 2021 11:44 pm:
> Nicholas Piggin <npiggin@gmail.com> writes:
>> diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
>> index fe26f2fa0f3f..fbe94e2d5011 100644
>> --- a/arch/powerpc/lib/feature-fixups.c
>> +++ b/arch/powerpc/lib/feature-fixups.c
>> @@ -412,12 +430,19 @@ void do_entry_flush_fixups(enum l1d_flush_type types)
>>  	stop_machine(__do_entry_flush_fixups, &types, NULL);
>>  }
>>  
>> -void do_rfi_flush_fixups(enum l1d_flush_type types)
>> +static int __do_rfi_flush_fixups(void *data)
>>  {
>> +	enum l1d_flush_type types = *(enum l1d_flush_type *)data;
>>  	unsigned int instrs[3], *dest;
>>  	long *start, *end;
>>  	int i;
>>  
>> +	if (types & L1D_FLUSH_FALLBACK)
>> +		rfi_exit_not_reentrant = true;
>> +	else
>> +		rfi_exit_not_reentrant = false;
>> +	update_interrupt_exit();
> 
> This is not happy:

Ah, needs a static_key_enable_cpuslocked. I'll fix.

Thanks,
Nick

> 
> [    0.000000][    T0] ============================================
> [    0.000000][    T0] WARNING: possible recursive locking detected
> [    0.000000][    T0] 5.13.0-rc2-00118-gca433a3a44e3 #1 Not tainted
> [    0.000000][    T0] --------------------------------------------
> [    0.000000][    T0] swapper/0 is trying to acquire lock:
> [    0.000000][    T0] c00000000252fa10 (cpu_hotplug_lock){....}-{0:0}, at: static_key_enable+0x24/0x50
> [    0.000000][    T0]
> [    0.000000][    T0] but task is already holding lock:
> [    0.000000][    T0] c00000000252fa10 (cpu_hotplug_lock){....}-{0:0}, at: stop_machine+0x2c/0x60
> [    0.000000][    T0]
> [    0.000000][    T0] other info that might help us debug this:
> [    0.000000][    T0]  Possible unsafe locking scenario:
> [    0.000000][    T0]
> [    0.000000][    T0]        CPU0
> [    0.000000][    T0]        ----
> [    0.000000][    T0]   lock(cpu_hotplug_lock);
> [    0.000000][    T0]   lock(cpu_hotplug_lock);
> [    0.000000][    T0]
> [    0.000000][    T0]  *** DEADLOCK ***
> [    0.000000][    T0]
> [    0.000000][    T0]  May be due to missing lock nesting notation
> [    0.000000][    T0]
> [    0.000000][    T0] 1 lock held by swapper/0:
> [    0.000000][    T0]  #0: c00000000252fa10 (cpu_hotplug_lock){....}-{0:0}, at: stop_machine+0x2c/0x60
> [    0.000000][    T0]
> [    0.000000][    T0] stack backtrace:
> [    0.000000][    T0] CPU: 0 PID: 0 Comm: swapper Not tainted 5.13.0-rc2-00118-gca433a3a44e3 #1
> [    0.000000][    T0] Call Trace:
> [    0.000000][    T0] [c0000000027db8f0] [c00000000093dd28] dump_stack+0xec/0x144 (unreliable)
> [    0.000000][    T0] [c0000000027db940] [c0000000001ed5b4] __lock_acquire+0x1744/0x28b0
> [    0.000000][    T0] [c0000000027dba70] [c0000000001ef338] lock_acquire+0x128/0x600
> [    0.000000][    T0] [c0000000027dbb70] [c00000000015035c] cpus_read_lock+0x4c/0x170
> [    0.000000][    T0] [c0000000027dbba0] [c0000000003c2594] static_key_enable+0x24/0x50
> [    0.000000][    T0] [c0000000027dbbd0] [c0000000000ae87c] __do_rfi_flush_fixups+0x7c/0x300
> [    0.000000][    T0] [c0000000027dbc80] [c0000000002ab7e4] stop_machine_cpuslocked+0xe4/0x200
> [    0.000000][    T0] [c0000000027dbcf0] [c0000000002ab940] stop_machine+0x40/0x60
> [    0.000000][    T0] [c0000000027dbd30] [c0000000000aef30] do_rfi_flush_fixups+0x30/0x50
> [    0.000000][    T0] [c0000000027dbd60] [c000000000040890] setup_rfi_flush+0xa0/0x140
> [    0.000000][    T0] [c0000000027dbdd0] [c00000000201c6c4] pnv_setup_arch+0x304/0x4ac
> [    0.000000][    T0] [c0000000027dbe60] [c00000000200a31c] setup_arch+0x374/0x3c4
> [    0.000000][    T0] [c0000000027dbee0] [c000000002003d08] start_kernel+0xb0/0x790
> [    0.000000][    T0] [c0000000027dbf90] [c00000000000d79c] start_here_common+0x1c/0x600
> [    0.000000][    T0] rfi-flush: patched 12 locations (fallback displacement flush)
> 
> 
> cheers
> 

^ permalink raw reply

* Re: [PATCH v3 00/11] powerpc/64: fast interrupt exits
From: Nicholas Piggin @ 2021-06-16  1:25 UTC (permalink / raw)
  To: linuxppc-dev, Michael Ellerman
In-Reply-To: <878s3bte57.fsf@mpe.ellerman.id.au>

Excerpts from Michael Ellerman's message of June 16, 2021 12:07 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
>> This series attempts to improve the speed of interrupts and system calls
>> in three major ways.
> 
> With the full series applied I'm seeing various warnings.

Gah, sorry :( I've tested hash/radix guest host mambo qemu real hw...
Are they all hash, or qemu powernv radix?

Obviously not 64e since the interrupt rewrite, but it did seem to be
working (I'll fix and re test).

> I feel like I must be missing some lead up patch?

Shouldn't be, I would guess a config option maybe not marking srrs 
clobbered somewhere? I'm generally using the defconfigs plus some random 
smattering of debug options at any time.

I'll grab some .configs from you...

Thanks,
Nick

> 
> cheers
> 
> 
> mambo, p8, hash, be:
> 
> [    0.001038][    T0] ------------[ cut here ]------------
> [    0.001287][    T0] Interrupts were enabled early
> [    0.001510][    T0] WARNING: CPU: 0 PID: 0 at init/main.c:1004 .start_kernel+0x53c/0x720
> [    0.001902][    T0] Modules linked in:
> [    0.002084][    T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.13.0-rc2-00118-gca433a3a44e3-dirty #1
> [    0.002518][    T0] NIP:  c000000002004920 LR: c00000000200491c CTR: c0000000000e3cd0
> [    0.002887][    T0] REGS: c000000002acfc30 TRAP: 0700   Not tainted  (5.13.0-rc2-00118-gca433a3a44e3-dirty)
> [    0.003342][    T0] MSR:  9000000002029032 <SF,HV,VEC,EE,ME,IR,DR,RI>  CR: 28002222  XER: 22000000
> [    0.003818][    T0] CFAR: 000000003002afb0 IRQMASK: 0
> [    0.003818][    T0] GPR00: c00000000200491c c000000002acfed0 c000000002ad1a00 000000000000001d
> [    0.003818][    T0] GPR04: 0000000000000001 c0000000027645b0 0000000000000001 c0000000027645e0
> [    0.003818][    T0] GPR08: 00000000fd5c0000 c000000002764560 c000000002764560 9000000032001032
> [    0.003818][    T0] GPR12: 0000000028002222 c000000002cd0000 0000000000000000 0000000000000000
> [    0.003818][    T0] GPR16: 0000000031c10000 0000000030003180 0000000030000000 0000000000000000
> [    0.003818][    T0] GPR20: 0000000000000000 0000000000000000 000000003012e4eb 0000000030110ece
> [    0.003818][    T0] GPR24: 00000000301105f5 c000000000000000 0000000020010000 c000000002090008
> [    0.003818][    T0] GPR28: c000000002090008 c0000000011ae620 c0000000011ae620 c000000002090008
> [    0.007455][    T0] NIP [c000000002004920] .start_kernel+0x53c/0x720
> [    0.007765][    T0] LR [c00000000200491c] .start_kernel+0x538/0x720
> [    0.008072][    T0] Call Trace:
> [    0.008223][    T0] [c000000002acfed0] [c00000000200491c] .start_kernel+0x538/0x720 (unreliable)
> [    0.008655][    T0] [c000000002acff90] [c00000000000dd2c] start_here_common+0x1c/0x670
> [    0.009041][    T0] Instruction dump:
> [    0.009219][    T0] 60000000 4a23dadd 60000000 48035821 60000000 892d0932 71290001 40820018
> [    0.009659][    T0] 3c62fe6d 3863f810 4a171751 60000000 <0fe00000> 39200000 3d420003 38600000
> [    0.010111][    T0] ---[ end trace e2dce2fbc72d04fa ]---
> 
> 
> mambo, p9, hash, be:
> 
> [   17.668403][  T267] ------------[ cut here ]------------
> [   17.668433][  T267] WARNING: CPU: 1 PID: 267 at arch/powerpc/include/asm/book3s/64/kup.h:304 .arch_local_irq_restore+0x190/0x1a0
> [   17.668503][  T267] Modules linked in:
> [   17.668532][  T267] CPU: 1 PID: 267 Comm: halt Tainted: G        W         5.13.0-rc2-00118-gca433a3a44e3-dirty #1
> [   17.668588][  T267] NIP:  c000000000016680 LR: c00000000002f804 CTR: c0000000005901c0
> [   17.668632][  T267] REGS: c000000009f339b0 TRAP: 0700   Tainted: G        W          (5.13.0-rc2-00118-gca433a3a44e3-dirty)
> [   17.668686][  T267] MSR:  9000000000021032 <SF,HV,ME,IR,DR,RI>  CR: 48244222  XER: 04000000
> [   17.668777][  T267] CFAR: c00000000002fdcc IRQMASK: 3
> [   17.668777][  T267] GPR00: c00000000002f804 c000000009f33c50 c000000002ad1a00 0000000000000000
> [   17.668777][  T267] GPR04: 0000000000000000 0000000000000020 00000000000000b7 00000000000c9495
> [   17.668777][  T267] GPR08: 0000000000000008 fcffffffffffffff 3cffffffffffffff 0000000000000003
> [   17.668777][  T267] GPR12: 0000000028244228 c0000000fffff300 0000000000000000 0000000000000000
> [   17.668777][  T267] GPR16: 0000000000000000 0000000000000000 0000000000000000 00000000100a0a18
> [   17.668777][  T267] GPR20: 00007fffdf62ffac 00000000100a09d8 00000100011405f0 0000000002002000
> [   17.668777][  T267] GPR24: 0000000000000002 0000000000000001 0000000002802000 0000000000000000
> [   17.668777][  T267] GPR28: c00000000cc56080 0000000000000000 0000000000000000 3cffffffffffffff
> [   17.669291][  T267] NIP [c000000000016680] .arch_local_irq_restore+0x190/0x1a0
> [   17.669343][  T267] LR [c00000000002f804] .interrupt_exit_user_prepare_main+0x144/0x2b0
> [   17.669394][  T267] Call Trace:
> [   17.669418][  T267] [c000000009f33c50] [c000000009f33ce0] 0xc000000009f33ce0 (unreliable)
> [   17.669475][  T267] [c000000009f33cd0] [c00000000002f804] .interrupt_exit_user_prepare_main+0x144/0x2b0
> [   17.669538][  T267] [c000000009f33d90] [c00000000002fe08] .syscall_exit_restart+0x68/0x80
> [   17.669596][  T267] [c000000009f33e10] [c00000000000d168] system_call_common+0x278/0x2dc
> [   17.669652][  T267] --- interrupt: c00 at 0x7fffad5b867c
> [   17.669688][  T267] NIP:  00007fffad5b867c LR: 00000000100992a0 CTR: 0000000000000000
> [   17.669731][  T267] REGS: c000000009f33e80 TRAP: 0c00   Tainted: G        W          (5.13.0-rc2-00118-gca433a3a44e3-dirty)
> [   17.669785][  T267] MSR:  900000000280f032 <SF,HV,VEC,VSX,EE,PR,FP,ME,IR,DR,RI>  CR: 28244222  XER: 00000000
> [   17.669910][  T267] IRQMASK: 0
> [   17.669910][  T267] GPR00: 0000000000000006 00007fffd8758060 00007fffad687300 0000000000000000
> [   17.669910][  T267] GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [   17.669910][  T267] GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [   17.669910][  T267] GPR12: 0000000000000000 00007fffad70a3a0 0000000000000000 0000000000000000
> [   17.669910][  T267] GPR16: 0000000000000000 0000000000000000 0000000000000000 00000000100a0a18
> [   17.669910][  T267] GPR20: 00007fffdf62ffac 00000000100a09d8 00000100011405f0 00000000100e24d0
> [   17.669910][  T267] GPR24: 00000000100b91e5 00000000100c1e14 0000010019e002a0 00000000100c1e1e
> [   17.669910][  T267] GPR28: 00007fffd875821b 0000000000000004 00007fffd8758210 0000000000000000
> [   17.670410][  T267] NIP [00007fffad5b867c] 0x7fffad5b867c
> [   17.670446][  T267] LR [00000000100992a0] 0x100992a0
> [   17.670480][  T267] --- interrupt: c00
> [   17.670507][  T267] Instruction dump:
> [   17.670535][  T267] 4e800020 60000000 60000000 60000000 39200000 992d0932 39400000 992d0933
> [   17.670627][  T267] 614a8002 7d410164 4e800020 60000000 <0fe00000> 4bfffefc 60000000 60000000
> [   17.670719][  T267] ---[ end trace 5ba1b854b2957ce3 ]---
> 
> 
> qemu powernv:
> 
> [    3.446262][   T51] init[51]: segfault (11) at 0 nip 0 lr 1000ca78 code 1 in busybox[10000000+d0000]
> [    3.448351][   T51] init[51]: code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
> [    3.449598][   T51] init[51]: code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
> [    3.460559][   T52] init[52]: segfault (11) at 0 nip 0 lr 1000ca78 code 1 in busybox[10000000+d0000]
> [    3.461409][   T52] init[52]: code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
> [    3.462496][   T52] init[52]: code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
> 
> ... to infinity.
> 
> 
> qemu 64e gets to running /init but gets stuck there with no further output.
> 
> 
> qemu pseries, p8, be:
> 
> [    3.836653][   T75] S20urandom (75) used greatest stack depth: 9792 bytes left
> [    4.718806][    C0] ------------[ cut here ]------------
> [    4.718971][    C0] irq 17 handler .vp_interrupt+0x0/0xc0 enabled interrupts
> [    4.720956][    C0] WARNING: CPU: 0 PID: 99 at kernel/irq/handle.c:159 .__handle_irq_event_percpu+0x230/0x270
> [    4.721113][    C0] Modules linked in:
> [    4.721502][    C0] CPU: 0 PID: 99 Comm: ip Not tainted 5.13.0-rc2-00118-gca433a3a44e3-dirty #1
> [    4.721861][    C0] NIP:  c0000000002108b0 LR: c0000000002108ac CTR: c0000000009bb220
> [    4.721943][    C0] REGS: c0000000fffeba00 TRAP: 0700   Not tainted  (5.13.0-rc2-00118-gca433a3a44e3-dirty)
> [    4.722081][    C0] MSR:  8000000000029032 <SF,EE,ME,IR,DR,RI>  CR: 28002247  XER: 20000000
> [    4.722404][    C0] CFAR: c0000000001760d0 IRQMASK: 0
> [    4.722404][    C0] GPR00: c0000000002108ac c0000000fffebca0 c000000002ad1a00 0000000000000038
> [    4.722404][    C0] GPR04: 0000000000000001 c0000000027645b0 0000000000000027 c0000000ff906e10
> [    4.722404][    C0] GPR08: 0000000000000023 ffffffffffffffd8 0000000000000027 c0000000011d7b40
> [    4.722404][    C0] GPR12: 0000000028002247 c000000002cd0000 0000000000000000 0000000000000000
> [    4.722404][    C0] GPR16: 0000000000000000 0000000000000000 0000000000000000 00000000100b7866
> [    4.722404][    C0] GPR20: 00000000100b7933 00000000100b5562 0000000000000000 0000000000000001
> [    4.722404][    C0] GPR24: 0000000000000002 0000000000000001 0000000000000011 c000000008891800
> [    4.722404][    C0] GPR28: 0000000000000011 c0000000fffebdd4 c00000000814a880 0000000000000000
> [    4.723412][    C0] NIP [c0000000002108b0] .__handle_irq_event_percpu+0x230/0x270
> [    4.723497][    C0] LR [c0000000002108ac] .__handle_irq_event_percpu+0x22c/0x270
> [    4.723719][    C0] Call Trace:
> [    4.723907][    C0] [c0000000fffebca0] [c0000000002108ac] .__handle_irq_event_percpu+0x22c/0x270 (unreliable)
> [    4.724205][    C0] [c0000000fffebd60] [c000000000210ae0] .handle_irq_event+0x130/0x190
> [    4.724309][    C0] [c0000000fffebe00] [c0000000002182bc] .handle_fasteoi_irq+0xbc/0x310
> [    4.724397][    C0] [c0000000fffebea0] [c00000000020ea74] .generic_handle_irq+0x54/0x80
> [    4.724575][    C0] [c0000000fffebf10] [c000000000015770] .__do_irq+0x70/0x1c0
> [    4.724667][    C0] [c0000000fffebf90] [c0000000000159d4] .do_IRQ+0x114/0x250
> [    4.724764][    C0] [c00000000a0730c0] [c00000000001596c] .do_IRQ+0xac/0x250
> [    4.724848][    C0] [c00000000a073160] [c000000000009230] hardware_interrupt_common_virt+0x1b0/0x1c0
> [    4.724952][    C0] --- interrupt: 500 at .virtnet_send_command+0x138/0x1b0
> [    4.725031][    C0] NIP:  c000000000b4db68 LR: c000000000b4db58 CTR: c000000000984910
> [    4.725090][    C0] REGS: c00000000a0731d0 TRAP: 0500   Not tainted  (5.13.0-rc2-00118-gca433a3a44e3-dirty)
> [    4.725156][    C0] MSR:  8000000000009032 <SF,EE,ME,IR,DR,RI>  CR: 24002444  XER: 20000000
> [    4.725299][    C0] CFAR: c00000000097e110 IRQMASK: 0
> [    4.725299][    C0] GPR00: c000000000b4db58 c00000000a073470 c000000002ad1a00 0000000000000000
> [    4.725299][    C0] GPR04: c00000000a0734e4 0000000000000000 0000000000000000 0000000000000000
> [    4.725299][    C0] GPR08: 0000000000000000 0000000000000000 0000000000000000 c00c00000002bb48
> [    4.725299][    C0] GPR12: 0000000024002442 c000000002cd0000 0000000000000000 0000000000000000
> [    4.725299][    C0] GPR16: 0000000000000000 0000000000000000 0000000000000000 00000000100b7866
> [    4.725299][    C0] GPR20: 00000000100b7933 00000000100b5562 00000000100b78c8 c000000008d89000
> [    4.725299][    C0] GPR24: 0000000000000003 0000000000001002 c00000000a0734e8 0000000000000002
> [    4.725299][    C0] GPR28: 0000000000000000 0000000000000000 c00000000a073508 c00000000ac41900
> [    4.726021][    C0] NIP [c000000000b4db68] .virtnet_send_command+0x138/0x1b0
> [    4.726091][    C0] LR [c000000000b4db58] .virtnet_send_command+0x128/0x1b0
> [    4.726166][    C0] --- interrupt: 500
> [    4.726226][    C0] [c00000000a073580] [c000000000b4ee2c] .virtnet_set_rx_mode+0xdc/0x3d0
> [    4.726312][    C0] [c00000000a073670] [c000000000d98bac] .__dev_set_rx_mode+0x8c/0x120
> [    4.726397][    C0] [c00000000a073700] [c000000000d98e50] .__dev_open+0x1a0/0x220
> [    4.726477][    C0] [c00000000a0737b0] [c000000000d994ac] .__dev_change_flags+0x23c/0x2e0
> [    4.726557][    C0] [c00000000a073880] [c000000000d99598] .dev_change_flags+0x48/0xa0
> [    4.726639][    C0] [c00000000a073910] [c000000000ec2b18] .devinet_ioctl+0x358/0xa00
> [    4.726729][    C0] [c00000000a0739f0] [c000000000ec5b1c] .inet_ioctl+0x23c/0x340
> [    4.726808][    C0] [c00000000a073b00] [c000000000d52c98] .sock_do_ioctl+0x68/0x1c0
> [    4.726891][    C0] [c00000000a073bd0] [c000000000d55ef4] .sock_ioctl+0x394/0x670
> [    4.726971][    C0] [c00000000a073cc0] [c0000000004d3e2c] .__se_sys_ioctl+0xdc/0x140
> [    4.727081][    C0] [c00000000a073d60] [c00000000002faec] .system_call_exception+0x17c/0x2e0
> [    4.727169][    C0] [c00000000a073e10] [c00000000000cfe4] system_call_common+0xf4/0x2dc
> [    4.727260][    C0] --- interrupt: c00 at 0x7fff7f9bf2f0
> [    4.727630][    C0] NIP:  00007fff7f9bf2f0 LR: 000000001000c6b4 CTR: 0000000000000000
> [    4.727692][    C0] REGS: c00000000a073e80 TRAP: 0c00   Not tainted  (5.13.0-rc2-00118-gca433a3a44e3-dirty)
> [    4.727758][    C0] MSR:  800000000280f032 <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI>  CR: 22000248  XER: 00000000
> [    4.727956][    C0] IRQMASK: 0
> [    4.727956][    C0] GPR00: 0000000000000036 00007ffffba7db00 00007fff7fa87300 0000000000000003
> [    4.727956][    C0] GPR04: 0000000000008914 00007ffffba7dc60 00000000100b554e 0000000000000000
> [    4.727956][    C0] GPR08: 0000000000000003 0000000000000000 0000000000000000 0000000000000000
> [    4.727956][    C0] GPR12: 0000000000000000 00007fff7fb0a3a0 0000000000000000 0000000000000000
> [    4.727956][    C0] GPR16: 0000000000000000 0000000000000000 0000000000000000 00000000100b7866
> [    4.727956][    C0] GPR20: 00000000100b7933 00000000100b5562 00000000100b78c8 0000000000000000
> [    4.727956][    C0] GPR24: 0000000000000000 ffffffffffffffff ffffffffffffffff 0000000000000000
> [    4.727956][    C0] GPR28: 00007ffffba7ff69 0000000000000001 0000000000000003 00000000100b554e
> [    4.728666][    C0] NIP [00007fff7f9bf2f0] 0x7fff7f9bf2f0
> [    4.728719][    C0] LR [000000001000c6b4] 0x1000c6b4
> [    4.728770][    C0] --- interrupt: c00
> [    4.728909][    C0] Instruction dump:
> [    4.729071][    C0] 38636c58 4bffbab9 60000000 4bfffed4 e8be0000 3c62fe70 3d22ffe7 7f44d378
> [    4.729251][    C0] 38636c18 9ae9f062 4bf657c1 60000000 <0fe00000> 9b2d0932 4bfffe70 60000000
> [    4.729494][    C0] ---[ end trace 0d7d391060ba35b4 ]---
> qemu-system-ppc64: terminating on signal 1 from pid 462523 (
> 

^ permalink raw reply

* Re: [PATCH v7 00/11] Speedup mremap on ppc64
From: Nicholas Piggin @ 2021-06-16  1:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Aneesh Kumar K.V, linux-mm@kvack.org, kaleshsingh@google.com,
	joel@joelfernandes.org, Kirill A . Shutemov,
	akpm@linux-foundation.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <CAHk-=wipa02d8tN-fCYJ=iH915yHtFr6wEDBcOeFtawVVF4niQ@mail.gmail.com>

Excerpts from Linus Torvalds's message of June 9, 2021 3:10 am:
> On Mon, Jun 7, 2021 at 3:10 AM Nick Piggin <npiggin@gmail.com> wrote:
>>
>> I'd really rather not do this, I'm not sure if micro benchmark captures everything.
> 
> I don't much care what powerpc code does _itnernally_ for this
> architecture-specific mis-design issue, but I really don't want to see
> more complex generic interfaces unless you have better hard numbers
> for them.
> 
> So far the numbers are: "no observable difference".
> 
> It would have to be not just observable, but actually meaningful for
> me to go "ok, we'll add this crazy flag that nobody else cares about".

Fair enough, will have to try get more numbers then I suppose.

> 
> And honestly, from everything I've seen on page table walker caches:
> they are great, but once you start remapping big ranges and
> invallidating megabytes of TLB's, the walker caches just aren't going
> to be your issue.

Remapping big ranges is going to have to invalidate intermediate caches
(aka PWC), so is unmapping. So we're stuck with the big hammer PWC 
invalidate there anyway.

It's mprotect and friends that would care here, possibly some THP thing...
but I guess those are probably down the list a little way.

I'm a bit less concerned about the PWCs that might be caching the regions
of the big mprotect() we just did, and more concerned about the effect 
of flushing all unrelated caches. Including on all other CPUs a threaded
program is running on. HANA, Java, are threaded and do mremaps, 
unfortunately.


> 
> But: numbers talk.  I'd take the sane generic interfaces as a first
> cut. If somebody then has really compelling numbers, we can _then_
> look at that "optimize for odd page table walker cache situation"
> case.

Yep okay. It's not the end of the world (or if it is we'd be able to get
numbers presumably).

> And in the meantime, maybe you can talk to the hardware people and
> tell them that you want the "flush range" capability to work right,
> and that if the walker cache is <i>so</i> important they shouldn't
> have made it a all-or-nothing flush.

I have, more than once :(

Fixing that would fix munmap etc cases as well, so yeah.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v19 05/13] of: Add a common kexec FDT setup function
From: Michael Ellerman @ 2021-06-16  2:23 UTC (permalink / raw)
  To: Rob Herring, nramas
  Cc: Mark Rutland, tao.li, Mimi Zohar, Paul Mackerras,
	Vincenzo Frascino, Frank Rowand, Sasha Levin, Stephen Rothwell,
	Will Deacon, Masahiro Yamada, James Morris, AKASHI Takahiro,
	Geert Uytterhoeven, Linux ARM, Catalin Marinas, Serge E. Hallyn,
	open list:OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS,
	Pavel Tatashin, Prakhar Srivastava, Hsin-Yi Wang, Allison Randal,
	Christophe Leroy, Matthias Brugger, balajib, dmitry.kasatkin,
	Linux Kernel Mailing List, James Morse, Greg KH, Joe Perches,
	linux-integrity, linuxppc-dev, Thiago Jung Bauermann
In-Reply-To: <CAL_JsqJEucP043eViq0Y1kAeqWNTqP5fLjfjz7+ksYx7QP_V5w@mail.gmail.com>

Rob Herring <robh@kernel.org> writes:
> On Tue, Jun 15, 2021 at 10:13 AM nramas <nramas@linux.microsoft.com> wrote:
>>
>> On Tue, 2021-06-15 at 08:01 -0600, Rob Herring wrote:
>> > On Tue, Jun 15, 2021 at 6:18 AM Geert Uytterhoeven <
>> > geert@linux-m68k.org> wrote:
>> > >
>> > > > +void *of_kexec_alloc_and_setup_fdt(const struct kimage *image,
>> > > > +                                  unsigned long
>> > > > initrd_load_addr,
>> > > > +                                  unsigned long initrd_len,
>> > > > +                                  const char *cmdline, size_t
>> > > > extra_fdt_size)
>> > > > +{
>> > > > +       /* Did we boot using an initrd? */
>> > > > +       prop = fdt_getprop(fdt, chosen_node, "linux,initrd-
>> > > > start", NULL);
>> > > > +       if (prop) {
>> > > > +               u64 tmp_start, tmp_end, tmp_size;
>> > > > +
>> > > > +               tmp_start = fdt64_to_cpu(*((const fdt64_t *)
>> > > > prop));
>> > > > +
>> > > > +               prop = fdt_getprop(fdt, chosen_node,
>> > > > "linux,initrd-end", NULL);
>> > > > +               if (!prop) {
>> > > > +                       ret = -EINVAL;
>> > > > +                       goto out;
>> > > > +               }
>> > > > +
>> > > > +               tmp_end = fdt64_to_cpu(*((const fdt64_t *)
>> > > > prop));
>> > >
>> > > Some kernel code assumes "linux,initrd-{start,end}" are 64-bit,
>> > > other code assumes 32-bit.
>> >
>> > It can be either. The above code was a merge of arm64 and powerpc >> > both
>> > of which use 64-bit and still only runs on those arches. It looks >> > like
>> > some powerpc platforms may use 32-bit, but this would have been >> > broken
>> > before.

>> of_kexec_alloc_and_setup_fdt() is called from elf_64.c (in
>> arch/powerpc/kexec) which is for 64-bit powerpc platform only.
>
> 64-bit PPC could be writing 32-bit property values. The architecture
> size doesn't necessarily matter. And if the values came from the
> bootloader, who knows what size it used.
>
> This code is 32-bit powerpc only?:
>
> arch/powerpc/boot/main.c-       /* Tell the kernel initrd address via device tree */
> arch/powerpc/boot/main.c:       setprop_val(chosen, "linux,initrd-start", (u32)(initrd_addr));
> arch/powerpc/boot/main.c-       setprop_val(chosen, "linux,initrd-end", (u32)(initrd_addr+initrd_size));

Historically that code was always built 32-bit, even when used with a
64-bit kernel.

These days it is also built 64-bit (for ppc64le).

It looks like the drivers/of/fdt.c code can handle either 64 or 32-bit,
so I guess that's why it seems to be working.

Although I'm not sure how much testing the 64-bit case gets, because the
distros tend to just use the vmlinux.

cheers

^ permalink raw reply

* Re: [PATCH v4 1/2] module: add elf_check_module_arch for module specific elf arch checks
From: Michael Ellerman @ 2021-06-16  2:37 UTC (permalink / raw)
  To: Jessica Yu, Nicholas Piggin
  Cc: Michal Suchánek, linuxppc-dev, linux-kernel
In-Reply-To: <YMiaZOqhHck9iy0n@p200300cbcf109700df096d564fe976c3.dip0.t-ipconnect.de>

Jessica Yu <jeyu@kernel.org> writes:
> +++ Nicholas Piggin [15/06/21 12:05 +1000]:
>>Excerpts from Jessica Yu's message of June 14, 2021 10:06 pm:
>>> +++ Nicholas Piggin [11/06/21 19:39 +1000]:
>>>>The elf_check_arch() function is used to test usermode binaries, but
>>>>kernel modules may have more specific requirements. powerpc would like
>>>>to test for ABI version compatibility.
>>>>
>>>>Add an arch-overridable function elf_check_module_arch() that defaults
>>>>to elf_check_arch() and use it in elf_validity_check().
>>>>
>>>>Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>>>>[np: split patch, added changelog]
>>>>Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>>>---
>>>> include/linux/moduleloader.h | 5 +++++
>>>> kernel/module.c              | 2 +-
>>>> 2 files changed, 6 insertions(+), 1 deletion(-)
>>>>
>>>>diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
>>>>index 9e09d11ffe5b..fdc042a84562 100644
>>>>--- a/include/linux/moduleloader.h
>>>>+++ b/include/linux/moduleloader.h
>>>>@@ -13,6 +13,11 @@
>>>>  * must be implemented by each architecture.
>>>>  */
>>>>
>>>>+// Allow arch to optionally do additional checking of module ELF header
>>>>+#ifndef elf_check_module_arch
>>>>+#define elf_check_module_arch elf_check_arch
>>>>+#endif
>>>
>>> Hi Nicholas,
>>>
>>> Why not make elf_check_module_arch() consistent with the other
>>> arch-specific functions? Please see module_frob_arch_sections(),
>>> module_{init,exit}_section(), etc in moduleloader.h. That is, they are
>>> all __weak functions that are overridable by arches. We can maybe make
>>> elf_check_module_arch() a weak symbol, available for arches to
>>> override if they want to perform additional elf checks. Then we don't
>>> have to have this one-off #define.

>>Like this? I like it. Good idea.
>
> Yeah! Also, maybe we can alternatively make elf_check_module_arch() a
> separate check entirely so that the powerpc implementation doesn't
> have to include that extra elf_check_arch() call. Something like this maybe?

My thinking for making elf_check_module_arch() the only hook was that
conceivably you might not want/need to call elf_check_arch() from
elf_check_module_arch().

So having a single module specific hook allows arch code to decide
how to implement the check, which may or may not involve calling
elf_check_arch(), but that becomes an arch implementation detail.

It's also one arch hook instead of two (although elf_check_arch()
already exists).

But I don't feel that strongly either way, whatever you prefer.

cheers

^ permalink raw reply

* Re: [PATCH v4 1/2] module: add elf_check_module_arch for module specific elf arch checks
From: Michael Ellerman @ 2021-06-16  2:39 UTC (permalink / raw)
  To: Segher Boessenkool, Jessica Yu
  Cc: Michal Suchánek, linuxppc-dev, linux-kernel, Nicholas Piggin
In-Reply-To: <20210615143038.GH5077@gate.crashing.org>

Segher Boessenkool <segher@kernel.crashing.org> writes:
> On Tue, Jun 15, 2021 at 03:41:00PM +0200, Jessica Yu wrote:
>> +++ Segher Boessenkool [15/06/21 07:50 -0500]:
>> >On Tue, Jun 15, 2021 at 02:17:40PM +0200, Jessica Yu wrote:
>> >>+int __weak elf_check_module_arch(Elf_Ehdr *hdr)
>> >>+{
>> >>+       return 1;
>> >>+}
>> >
>> >But is this a good idea?  It isn't useful to be able to attempt to load
>> >a module not compiled for your architecture, and it increases the attack
>> >surface tremendously.  These checks are one of the few things that can
>> >*not* be weak symbols, imo.
>> 
>> Hm, could you please elaborate a bit more? This patchset is adding
>> extra Elf header checks specifically for powerpc, and the module
>> loader usually provides arch-specific hooks via weak symbols. We are
>> just providing an new hook here, which should act as a no-op if it
>> isn't used.
>> 
>> So if an architecture wants to provide extra header checks, it can do
>> so by overriding the new weak symbol. Otherwise, the weak function acts as
>> a noop. We also already have the existing elf_check_arch() check for each
>> arch and that is *not* a weak symbol.
>
> The way I read your patch the default elf_check_module_arch does not
> call elf_check_arch?  Is that clearly called elsewhere and I'm just
> dumb again?  Sorry for the distraction in that case :-/

Yeah elf_check_arch() is already called from elf_validity_check(), and
that call would remain.

cheers

^ permalink raw reply

* Re: [PATCH] powerpc: Fix initrd corruption with relative jump labels
From: Michael Ellerman @ 2021-06-16  2:40 UTC (permalink / raw)
  To: Roman Bolshakov
  Cc: Linus Torvalds, linuxppc-dev, groug, a.kovaleva, linux-kernel,
	dja
In-Reply-To: <YMit22vwhmeK5BvK@SPB-NB-133.local>

Roman Bolshakov <r.bolshakov@yadro.com> writes:
> On Mon, Jun 14, 2021 at 11:14:40PM +1000, Michael Ellerman wrote:
>> Commit b0b3b2c78ec0 ("powerpc: Switch to relative jump labels") switched
>> us to using relative jump labels. That involves changing the code,
>> target and key members in struct jump_entry to be relative to the
>> address of the jump_entry, rather than absolute addresses.
>> 
>> We have two static inlines that create a struct jump_entry,
>> arch_static_branch() and arch_static_branch_jump(), as well as an asm
>> macro ARCH_STATIC_BRANCH, which is used by the pseries-only hypervisor
>> tracing code.
>> 
>> Unfortunately we missed updating the key to be a relative reference in
>> ARCH_STATIC_BRANCH.
>> 
>> That causes a pseries kernel to have a handful of jump_entry structs
>> with bad key values. Instead of being a relative reference they instead
>> hold the full address of the key.
>> 
>> However the code doesn't expect that, it still adds the key value to the
>> address of the jump_entry (see jump_entry_key()) expecting to get a
>> pointer to a key somewhere in kernel data.
>> 
>> The table of jump_entry structs sits in rodata, which comes after the
>> kernel text. In a typical build this will be somewhere around 15MB. The
>> address of the key will be somewhere in data, typically around 20MB.
>> Adding the two values together gets us a pointer somewhere around 45MB.
>> 
>> We then call static_key_set_entries() with that bad pointer and modify
>> some members of the struct static_key we think we are pointing at.
>> 
>> A pseries kernel is typically ~30MB in size, so writing to ~45MB won't
>> corrupt the kernel itself. However if we're booting with an initrd,
>> depending on the size and exact location of the initrd, we can corrupt
>> the initrd. Depending on how exactly we corrupt the initrd it can either
>> cause the system to not boot, or just corrupt one of the files in the
>> initrd.
>> 
>> The fix is simply to make the key value relative to the jump_entry
>> struct in the ARCH_STATIC_BRANCH macro.
>> 
>> Fixes: b0b3b2c78ec0 ("powerpc: Switch to relative jump labels")
>> Reported-by: Anastasia Kovaleva <a.kovaleva@yadro.com>
>> Reported-by: Roman Bolshakov <r.bolshakov@yadro.com>
>> Reported-by: Greg Kurz <groug@kaod.org>
>> Reported-by: Daniel Axtens <dja@axtens.net>
>> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>> ---
>>  arch/powerpc/include/asm/jump_label.h | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h
>> index 2d5c6bec2b4f..93ce3ec25387 100644
>> --- a/arch/powerpc/include/asm/jump_label.h
>> +++ b/arch/powerpc/include/asm/jump_label.h
>> @@ -50,7 +50,7 @@ static __always_inline bool arch_static_branch_jump(struct static_key *key, bool
>>  1098:	nop;					\
>>  	.pushsection __jump_table, "aw";	\
>>  	.long 1098b - ., LABEL - .;		\
>> -	FTR_ENTRY_LONG KEY;			\
>> +	FTR_ENTRY_LONG KEY - .;			\
>>  	.popsection
>>  #endif
>>  
>> -- 
>> 2.25.1
>> 
>
> Thanks for fixing the issue, Michael.
>
> Perhaps the fix should go to v5.13-rc7 if Linus plans to do it. It'd be
> good to avoid broken initrd on pseries guests in the release.

Yes it's in my fixes branch and I'll ask Linus to pull that before the
next rc.

cheers

^ permalink raw reply

* [PATCH 8/8] membarrier: Rewrite sync_core_before_usermode() and improve documentation
From: Andy Lutomirski @ 2021-06-16  3:21 UTC (permalink / raw)
  To: x86
  Cc: Catalin Marinas, Will Deacon, linux-mm, Peter Zijlstra, LKML,
	Nicholas Piggin, Dave Hansen, Paul Mackerras, stable,
	Andy Lutomirski, Mathieu Desnoyers, Andrew Morton, linuxppc-dev,
	linux-arm-kernel
In-Reply-To: <cover.1623813516.git.luto@kernel.org>

The old sync_core_before_usermode() comments suggested that a non-icache-syncing
return-to-usermode instruction is x86-specific and that all other
architectures automatically notice cross-modified code on return to
userspace.

This is misleading.  The incantation needed to modify code from one
CPU and execute it on another CPU is highly architecture dependent.
On x86, according to the SDM, one must modify the code, issue SFENCE
if the modification was WC or nontemporal, and then issue a "serializing
instruction" on the CPU that will execute the code.  membarrier() can do
the latter.

On arm64 and powerpc, one must flush the icache and then flush the pipeline
on the target CPU, although the CPU manuals don't necessarily use this
language.

So let's drop any pretense that we can have a generic way to define or
implement membarrier's SYNC_CORE operation and instead require all
architectures to define the helper and supply their own documentation as to
how to use it.  This means x86, arm64, and powerpc for now.  Let's also
rename the function from sync_core_before_usermode() to
membarrier_sync_core_before_usermode() because the precise flushing details
may very well be specific to membarrier, and even the concept of
"sync_core" in the kernel is mostly an x86-ism.

(It may well be the case that, on real x86 processors, synchronizing the
 icache (which requires no action at all) and "flushing the pipeline" is
 sufficient, but trying to use this language would be confusing at best.
 LFENCE does something awfully like "flushing the pipeline", but the SDM
 does not permit LFENCE as an alternative to a "serializing instruction"
 for this purpose.)

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: x86@kernel.org
Cc: stable@vger.kernel.org
Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE")
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 .../membarrier-sync-core/arch-support.txt     | 68 ++++++-------------
 arch/arm64/include/asm/sync_core.h            | 19 ++++++
 arch/powerpc/include/asm/sync_core.h          | 14 ++++
 arch/x86/Kconfig                              |  1 -
 arch/x86/include/asm/sync_core.h              |  7 +-
 arch/x86/kernel/alternative.c                 |  2 +-
 arch/x86/kernel/cpu/mce/core.c                |  2 +-
 arch/x86/mm/tlb.c                             |  3 +-
 drivers/misc/sgi-gru/grufault.c               |  2 +-
 drivers/misc/sgi-gru/gruhandles.c             |  2 +-
 drivers/misc/sgi-gru/grukservices.c           |  2 +-
 include/linux/sched/mm.h                      |  1 -
 include/linux/sync_core.h                     | 21 ------
 init/Kconfig                                  |  3 -
 kernel/sched/membarrier.c                     | 15 ++--
 15 files changed, 75 insertions(+), 87 deletions(-)
 create mode 100644 arch/arm64/include/asm/sync_core.h
 create mode 100644 arch/powerpc/include/asm/sync_core.h
 delete mode 100644 include/linux/sync_core.h

diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 883d33b265d6..41c9ebcb275f 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -5,51 +5,25 @@
 #
 # Architecture requirements
 #
-# * arm/arm64/powerpc
 #
-# Rely on implicit context synchronization as a result of exception return
-# when returning from IPI handler, and when returning to user-space.
-#
-# * x86
-#
-# x86-32 uses IRET as return from interrupt, which takes care of the IPI.
-# However, it uses both IRET and SYSEXIT to go back to user-space. The IRET
-# instruction is core serializing, but not SYSEXIT.
-#
-# x86-64 uses IRET as return from interrupt, which takes care of the IPI.
-# However, it can return to user-space through either SYSRETL (compat code),
-# SYSRETQ, or IRET.
-#
-# Given that neither SYSRET{L,Q}, nor SYSEXIT, are core serializing, we rely
-# instead on write_cr3() performed by switch_mm() to provide core serialization
-# after changing the current mm, and deal with the special case of kthread ->
-# uthread (temporarily keeping current mm into active_mm) by issuing a
-# sync_core_before_usermode() in that specific case.
-#
-    -----------------------
-    |         arch |status|
-    -----------------------
-    |       alpha: | TODO |
-    |         arc: | TODO |
-    |         arm: |  ok  |
-    |       arm64: |  ok  |
-    |        csky: | TODO |
-    |       h8300: | TODO |
-    |     hexagon: | TODO |
-    |        ia64: | TODO |
-    |        m68k: | TODO |
-    |  microblaze: | TODO |
-    |        mips: | TODO |
-    |       nds32: | TODO |
-    |       nios2: | TODO |
-    |    openrisc: | TODO |
-    |      parisc: | TODO |
-    |     powerpc: |  ok  |
-    |       riscv: | TODO |
-    |        s390: | TODO |
-    |          sh: | TODO |
-    |       sparc: | TODO |
-    |          um: | TODO |
-    |         x86: |  ok  |
-    |      xtensa: | TODO |
-    -----------------------
+# An architecture that wants to support
+# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE needs to define precisely what it
+# is supposed to do and implement membarrier_sync_core_before_usermode() to
+# make it do that.  Then it can select ARCH_HAS_MEMBARRIER_SYNC_CORE via
+# Kconfig.Unfortunately, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is not a
+# fantastic API and may not make sense on all architectures.  Once an
+# architecture meets these requirements,
+#
+# On x86, a program can safely modify code, issue
+# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, and then execute that code, via
+# the modified address or an alias, from any thread in the calling process.
+#
+# On arm64, a program can modify code, flush the icache as needed, and issue
+# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE to force a "context synchronizing
+# event", aka pipeline flush on all CPUs that might run the calling process.
+# Then the program can execute the modified code as long as it is executed
+# from an address consistent with the icache flush and the CPU's cache type.
+#
+# On powerpc, a program can use MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
+# similarly to arm64.  It would be nice if the powerpc maintainers could
+# add a more clear explanantion.
diff --git a/arch/arm64/include/asm/sync_core.h b/arch/arm64/include/asm/sync_core.h
new file mode 100644
index 000000000000..74996bf533bb
--- /dev/null
+++ b/arch/arm64/include/asm/sync_core.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARM64_SYNC_CORE_H
+#define _ASM_ARM64_SYNC_CORE_H
+
+#include <asm/barrier.h>
+
+/*
+ * On arm64, anyone trying to use membarrier() to handle JIT code is
+ * required to first flush the icache and then do SYNC_CORE.  All that's
+ * needed after the icache flush is to execute a "context synchronization
+ * event".  Right now, ERET does this, and we are guaranteed to ERET before
+ * any user code runs.  If Linux ever programs the CPU to make ERET stop
+ * being a context synchronizing event, then this will need to be adjusted.
+ */
+static inline void membarrier_sync_core_before_usermode(void)
+{
+}
+
+#endif /* _ASM_ARM64_SYNC_CORE_H */
diff --git a/arch/powerpc/include/asm/sync_core.h b/arch/powerpc/include/asm/sync_core.h
new file mode 100644
index 000000000000..589fdb34beab
--- /dev/null
+++ b/arch/powerpc/include/asm/sync_core.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_SYNC_CORE_H
+#define _ASM_POWERPC_SYNC_CORE_H
+
+#include <asm/barrier.h>
+
+/*
+ * XXX: can a powerpc person put an appropriate comment here?
+ */
+static inline void membarrier_sync_core_before_usermode(void)
+{
+}
+
+#endif /* _ASM_POWERPC_SYNC_CORE_H */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b44190..f010897a1e8a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -89,7 +89,6 @@ config X86
 	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
-	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAS_DEBUG_WX
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
index ab7382f92aff..c665b453969a 100644
--- a/arch/x86/include/asm/sync_core.h
+++ b/arch/x86/include/asm/sync_core.h
@@ -89,11 +89,10 @@ static inline void sync_core(void)
 }
 
 /*
- * Ensure that a core serializing instruction is issued before returning
- * to user-mode. x86 implements return to user-space through sysexit,
- * sysrel, and sysretq, which are not core serializing.
+ * Ensure that the CPU notices any instruction changes before the next time
+ * it returns to usermode.
  */
-static inline void sync_core_before_usermode(void)
+static inline void membarrier_sync_core_before_usermode(void)
 {
 	/* With PTI, we unconditionally serialize before running user code. */
 	if (static_cpu_has(X86_FEATURE_PTI))
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6974b5174495..52ead5f4fcdc 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -17,7 +17,7 @@
 #include <linux/kprobes.h>
 #include <linux/mmu_context.h>
 #include <linux/bsearch.h>
-#include <linux/sync_core.h>
+#include <asm/sync_core.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index bf7fe87a7e88..4a577980d4d1 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -41,12 +41,12 @@
 #include <linux/irq_work.h>
 #include <linux/export.h>
 #include <linux/set_memory.h>
-#include <linux/sync_core.h>
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
 
 #include <asm/intel-family.h>
 #include <asm/processor.h>
+#include <asm/sync_core.h>
 #include <asm/traps.h>
 #include <asm/tlbflush.h>
 #include <asm/mce.h>
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 59488d663e68..35b622fd2ed1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -11,6 +11,7 @@
 #include <linux/sched/mm.h>
 
 #include <asm/tlbflush.h>
+#include <asm/sync_core.h>
 #include <asm/mmu_context.h>
 #include <asm/nospec-branch.h>
 #include <asm/cache.h>
@@ -538,7 +539,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			 */
 			if (unlikely(atomic_read(&next->membarrier_state) &
 				     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))
-				sync_core_before_usermode();
+				membarrier_sync_core_before_usermode();
 #endif
 
 			return;
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index 723825524ea0..48fd5b101de1 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -20,8 +20,8 @@
 #include <linux/io.h>
 #include <linux/uaccess.h>
 #include <linux/security.h>
-#include <linux/sync_core.h>
 #include <linux/prefetch.h>
+#include <asm/sync_core.h>
 #include "gru.h"
 #include "grutables.h"
 #include "grulib.h"
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
index 1d75d5e540bc..c8cba1c1b00f 100644
--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -16,7 +16,7 @@
 #define GRU_OPERATION_TIMEOUT	(((cycles_t) local_cpu_data->itc_freq)*10)
 #define CLKS2NSEC(c)		((c) *1000000000 / local_cpu_data->itc_freq)
 #else
-#include <linux/sync_core.h>
+#include <asm/sync_core.h>
 #include <asm/tsc.h>
 #define GRU_OPERATION_TIMEOUT	((cycles_t) tsc_khz*10*1000)
 #define CLKS2NSEC(c)		((c) * 1000000 / tsc_khz)
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
index 0ea923fe6371..ce03ff3f7c3a 100644
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -16,10 +16,10 @@
 #include <linux/miscdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
-#include <linux/sync_core.h>
 #include <linux/uaccess.h>
 #include <linux/delay.h>
 #include <linux/export.h>
+#include <asm/sync_core.h>
 #include <asm/io_apic.h>
 #include "gru.h"
 #include "grulib.h"
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c6eebbafadb0..845db11190cd 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -7,7 +7,6 @@
 #include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
-#include <linux/sync_core.h>
 
 /*
  * Routines for handling mm_structs
diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
deleted file mode 100644
index 013da4b8b327..000000000000
--- a/include/linux/sync_core.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SYNC_CORE_H
-#define _LINUX_SYNC_CORE_H
-
-#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
-#include <asm/sync_core.h>
-#else
-/*
- * This is a dummy sync_core_before_usermode() implementation that can be used
- * on all architectures which return to user-space through core serializing
- * instructions.
- * If your architecture returns to user-space through non-core-serializing
- * instructions, you need to write your own functions.
- */
-static inline void sync_core_before_usermode(void)
-{
-}
-#endif
-
-#endif /* _LINUX_SYNC_CORE_H */
-
diff --git a/init/Kconfig b/init/Kconfig
index 1ea12c64e4c9..e5d552b0823e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2377,9 +2377,6 @@ source "kernel/Kconfig.locks"
 config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	bool
 
-config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
-	bool
-
 # It may be useful for an architecture to override the definitions of the
 # SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h>
 # and the COMPAT_ variants in <linux/compat.h>, in particular to use a
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index c32c32a2441e..f72a6ab3fac2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -5,6 +5,9 @@
  * membarrier system call
  */
 #include "sched.h"
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#include <asm/sync_core.h>
+#endif
 
 /*
  * The basic principle behind the regular memory barrier mode of membarrier()
@@ -221,6 +224,7 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
 static void ipi_sync_core(void *info)
 {
 	/*
@@ -230,13 +234,14 @@ static void ipi_sync_core(void *info)
 	 * the big comment at the top of this file.
 	 *
 	 * A sync_core() would provide this guarantee, but
-	 * sync_core_before_usermode() might end up being deferred until
-	 * after membarrier()'s smp_mb().
+	 * membarrier_sync_core_before_usermode() might end up being deferred
+	 * until after membarrier()'s smp_mb().
 	 */
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 
-	sync_core_before_usermode();
+	membarrier_sync_core_before_usermode();
 }
+#endif
 
 static void ipi_rseq(void *info)
 {
@@ -368,12 +373,14 @@ static int membarrier_private_expedited(int flags, int cpu_id)
 	smp_call_func_t ipi_func = ipi_mb;
 
 	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
-		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+#ifndef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
 			return -EINVAL;
+#else
 		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
 		ipi_func = ipi_sync_core;
+#endif
 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
 		if (!IS_ENABLED(CONFIG_RSEQ))
 			return -EINVAL;
-- 
2.31.1


^ permalink raw reply related

* [PATCH 6/8] powerpc/membarrier: Remove special barrier on mm switch
From: Andy Lutomirski @ 2021-06-16  3:21 UTC (permalink / raw)
  To: x86
  Cc: linux-mm, Peter Zijlstra, LKML, Nicholas Piggin, Dave Hansen,
	Paul Mackerras, Andy Lutomirski, Mathieu Desnoyers, Andrew Morton,
	linuxppc-dev
In-Reply-To: <cover.1623813516.git.luto@kernel.org>

powerpc did the following on some, but not all, paths through
switch_mm_irqs_off():

       /*
        * Only need the full barrier when switching between processes.
        * Barrier when switching from kernel to userspace is not
        * required here, given that it is implied by mmdrop(). Barrier
        * when switching from userspace to kernel is not needed after
        * store to rq->curr.
        */
       if (likely(!(atomic_read(&next->membarrier_state) &
                    (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
                     MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
               return;

This is puzzling: if !prev, then one might expect that we are switching
from kernel to user, not user to kernel, which is inconsistent with the
comment.  But this is all nonsense, because the one and only caller would
never have prev == NULL and would, in fact, OOPS if prev == NULL.

In any event, this code is unnecessary, since the new generic
membarrier_finish_switch_mm() provides the same barrier without arch help.

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/powerpc/include/asm/membarrier.h | 27 ---------------------------
 arch/powerpc/mm/mmu_context.c         |  2 --
 2 files changed, 29 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/membarrier.h

diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
deleted file mode 100644
index 6e20bb5c74ea..000000000000
--- a/arch/powerpc/include/asm/membarrier.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef _ASM_POWERPC_MEMBARRIER_H
-#define _ASM_POWERPC_MEMBARRIER_H
-
-static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
-					     struct mm_struct *next,
-					     struct task_struct *tsk)
-{
-	/*
-	 * Only need the full barrier when switching between processes.
-	 * Barrier when switching from kernel to userspace is not
-	 * required here, given that it is implied by mmdrop(). Barrier
-	 * when switching from userspace to kernel is not needed after
-	 * store to rq->curr.
-	 */
-	if (likely(!(atomic_read(&next->membarrier_state) &
-		     (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
-		      MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
-		return;
-
-	/*
-	 * The membarrier system call requires a full memory barrier
-	 * after storing to rq->curr, before going back to user-space.
-	 */
-	smp_mb();
-}
-
-#endif /* _ASM_POWERPC_MEMBARRIER_H */
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index a857af401738..8daa95b3162b 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -85,8 +85,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 	if (new_on_cpu)
 		radix_kvm_prefetch_workaround(next);
-	else
-		membarrier_arch_switch_mm(prev, next, tsk);
 
 	/*
 	 * The actual HW switching method differs between the various
-- 
2.31.1


^ permalink raw reply related

* Re: Oops (NULL pointer) with 'perf record' of selftest 'null_syscall'
From: Madhavan Srinivasan @ 2021-06-16  3:23 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <c141a18c-b18d-b775-1848-527c35a1c433@csgroup.eu>


On 6/15/21 8:35 PM, Christophe Leroy wrote:
> For your information, I'm getting the following Oops. Detected with 
> 5.13-rc6, it also oopses on 5.12 and 5.11.
> Runs ok on 5.10. I'm starting bisecting now.


Thanks for reporting, got the issue. What has happened in this case is 
that, pmu device is not registered
and trying to access the instruction point which will land in 
perf_instruction_pointer(). And recently I have added
a workaround patch for power10 DD1 which has caused this breakage. My 
bad. We are working on a fix patch
for the same and will post it out. Sorry again.

Maddy


>
> root@vgoippro:/tmp# perf record /root/null_syscall
> [  285.559987] BUG: Kernel NULL pointer dereference on read at 0x00000040
> [  285.566533] Faulting instruction address: 0xc0021f0c
> [  285.571486] Oops: Kernel access of bad area, sig: 11 [#1]
> [  285.576872] BE PAGE_SIZE=4K PREEMPT CMPCPRO
> [  285.581080] SAF3000 DIE NOTIFICATION
> [  285.584661] CPU: 0 PID: 442 Comm: null_syscall Not tainted 
> 5.13.0-rc6-s3k-dev-01645-g7649ee3d2957 #5164
> [  285.594035] NIP:  c0021f0c LR: c00e8ad8 CTR: c00d8a5c
> [  285.599074] REGS: e67757d0 TRAP: 0300   Not tainted 
> (5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
> [  285.607576] MSR:  00001032 <ME,IR,DR,RI>  CR: 44775b18 XER: 20000000
> [  285.614063] DAR: 00000040 DSISR: 20000000
> [  285.614063] GPR00: c00e8810 e6775880 c1c52640 e6775b20 7cb36ae0 
> f0000028 43ebeedc 5ccc47d0
> [  285.614063] GPR08: 00000000 00000900 e6775b20 00000001 00000000 
> 1025b2c0 10013088 10012ee0
> [  285.614063] GPR16: b0000000 00000007 00000001 c00deb64 00000042 
> 00000001 78db7b23 c0b13200
> [  285.614063] GPR24: 00000000 00000000 00000000 e6775b20 c13b8560 
> 00000107 e6775940 e67758e8
> [  285.651693] NIP [c0021f0c] perf_instruction_pointer+0x10/0x60
> [  285.657460] LR [c00e8ad8] perf_prepare_sample+0x344/0x674
> [  285.662859] Call Trace:
> [  285.665301] [e6775880] [c00e8810] perf_prepare_sample+0x7c/0x674 
> (unreliable)
> [  285.672452] [e67758c0] [c00e8e44] perf_event_output_forward+0x3c/0x94
> [  285.678903] [e6775910] [c00dea8c] __perf_event_overflow+0x74/0x14c
> [  285.685108] [e6775930] [c00dec5c] perf_swevent_hrtimer+0xf8/0x170
> [  285.691217] [e6775a40] [c008c8d0] 
> __hrtimer_run_queues.constprop.0+0x160/0x318
> [  285.698456] [e6775a90] [c008d94c] hrtimer_interrupt+0x148/0x3b0
> [  285.704394] [e6775ae0] [c000c0c0] timer_interrupt+0xc4/0x22c
> [  285.710067] [e6775b10] [c00046f0] Decrementer_virt+0xb8/0xbc
> [  285.715744] --- interrupt: 900 at pagecache_get_page+0x210/0x430
> [  285.721764] NIP:  c00f52a8 LR: c00f5408 CTR: c00f59d8
> [  285.726805] REGS: e6775b20 TRAP: 0900   Not tainted 
> (5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
> [  285.735306] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 28422d68  XER: 
> 00000000
> [  285.742056]
> [  285.742056] GPR00: c00f513c e6775bd0 c1c52640 c1c52640 00000000 
> 00000000 00000000 c1382c38
> [  285.742056] GPR08: 00000000 00000000 00000001 00000000 88482d68 
> 1025b2c0 10013088 10012ee0
> [  285.742056] GPR16: b0000000 00000007 00000001 10012ee0 c18187ac 
> c0b87800 61c88647 c0c18c00
> [  285.742056] GPR24: 00000001 00000003 00000000 00000002 c18187a8 
> 00100cca 00000044 00000000
> [  285.777079] NIP [c00f52a8] pagecache_get_page+0x210/0x430
> [  285.782482] LR [c00f5408] pagecache_get_page+0x370/0x430
> [  285.787796] --- interrupt: 900
> [  285.790843] [e6775bd0] [c00f513c] pagecache_get_page+0xa4/0x430 
> (unreliable)
> [  285.797910] [e6775c30] [c00f5ca8] filemap_fault+0x2d0/0x8e8
> [  285.803500] [e6775ca0] [c012d244] __do_fault+0x4c/0xd8
> [  285.808666] [e6775cb0] [c0130f64] handle_mm_fault+0x274/0x10b8
> [  285.814517] [e6775d30] [c0014f58] do_page_fault+0x1d4/0x67c
> [  285.820117] [e6775d60] [c000424c] DataAccess_virt+0xd4/0xe4
> [  285.825707] --- interrupt: 300 at __arch_clear_user+0x10/0xcc
> [  285.831458] NIP:  c001a3cc LR: c01d5cfc CTR: 00000000
> [  285.836497] REGS: e6775d70 TRAP: 0300   Not tainted 
> (5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
> [  285.845000] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 48004264  XER: 
> 20000000
> [  285.851751] DAR: 10012ee0 DSISR: 22000000
> [  285.851751] GPR00: c01d53fc e6775e20 c1c52640 00000000 00000120 
> 00000008 c136241c 00000000
> [  285.851751] GPR08: 00000000 9ffed120 10012ee0 00000004 28004868 
> 1025b2c0 10013088 10012ee0
> [  285.851751] GPR16: b0000000 00000007 00000001 10012ee0 10000000 
> 10012d0c 10000000 c1d74240
> [  285.851751] GPR24: 10012ee0 00000000 c1345e80 c1343dc0 10000b38 
> 00000000 c132ec00 c1386a00
> [  285.889384] NIP [c001a3cc] __arch_clear_user+0x10/0xcc
> [  285.894527] LR [c01d5cfc] load_elf_binary+0xec4/0x1340
> [  285.899682] --- interrupt: 300
> [  285.902730] [e6775e20] [c01d53fc] load_elf_binary+0x5c4/0x1340 
> (unreliable)
> [  285.909713] [e6775ea0] [c0163258] bprm_execve+0x200/0x55c
> [  285.915138] [e6775ef0] [c0163e00] do_execveat_common+0x178/0x1f4
> [  285.921162] [e6775f20] [c0165558] sys_execve+0x40/0x58
> [  285.926321] [e6775f40] [c001404c] ret_from_syscall+0x0/0x28
> [  285.931917] --- interrupt: c00 at 0xfc3ce78
> [  285.936097] NIP:  0fc3ce78 LR: 0fc3d7cc CTR: c01657cc
> [  285.941135] REGS: e6775f50 TRAP: 0c00   Not tainted 
> (5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
> [  285.949636] MSR:  0000d032 <EE,PR,ME,IR,DR,RI>  CR: 22004868  XER: 
> 20000000
> [  285.956655]
> [  285.956655] GPR00: 0000000b afab1bf0 a7d77a50 afab6ee1 afab64c8 
> 104bd9b0 fefefeff 7f7f7f7f
> [  285.956655] GPR08: afab6ee0 00000000 006df8f9 0000011d 24004864 
> 1025b2c0 10231a50 10249108
> [  285.956655] GPR16: 104beeb0 10254830 105dd3f4 10250000 1018a124 
> 10188448 10234e58 00000000
> [  285.956655] GPR24: 10231ae0 00000003 00000001 104bd9b0 afab64c8 
> afab6ee1 0fd25244 afab1bf0
> [  285.991684] NIP [0fc3ce78] 0xfc3ce78
> [  285.995257] LR [0fc3d7cc] 0xfc3d7cc
> [  285.998742] --- interrupt: c00
> [  286.001789] Instruction dump:
> [  286.004757] 3d20c07d 80a40010 3889847c 486bc444 80630084 546397fe 
> 38630001 4e800020
> [  286.012556] 3d20c0b9 7c6a1b78 810915e4 812300a0 <81080040> 55290036 
> 2c090f00 5508056a
> [  286.020531] ---[ end trace e381e6fcc2db5226 ]---

^ permalink raw reply

* [powerpc:fixes-test] BUILD SUCCESS 478036c4cd1a16e613a2f883d79c03cf187faacb
From: kernel test robot @ 2021-06-16  3:26 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git fixes-test
branch HEAD: 478036c4cd1a16e613a2f883d79c03cf187faacb  powerpc: Fix initrd corruption with relative jump labels

elapsed time: 721m

configs tested: 153
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
powerpc                      chrp32_defconfig
powerpc                  mpc866_ads_defconfig
ia64                        generic_defconfig
arm                       versatile_defconfig
arm                         lpc32xx_defconfig
mips                     cu1000-neo_defconfig
arm                         socfpga_defconfig
powerpc                     sbc8548_defconfig
arm                  colibri_pxa270_defconfig
m68k                        stmark2_defconfig
powerpc                        icon_defconfig
xtensa                    smp_lx200_defconfig
mips                          ath79_defconfig
powerpc                  iss476-smp_defconfig
h8300                            allyesconfig
mips                  cavium_octeon_defconfig
um                           x86_64_defconfig
arm                          pcm027_defconfig
powerpc                      ppc64e_defconfig
xtensa                  audio_kc705_defconfig
xtensa                generic_kc705_defconfig
powerpc                 mpc8540_ads_defconfig
arm                        oxnas_v6_defconfig
arm                        multi_v7_defconfig
s390                                defconfig
arm                            lart_defconfig
sh                          sdk7786_defconfig
arm                          collie_defconfig
powerpc                 mpc85xx_cds_defconfig
nds32                               defconfig
powerpc                          g5_defconfig
arm                            hisi_defconfig
xtensa                          iss_defconfig
powerpc                     kmeter1_defconfig
i386                                defconfig
sparc                       sparc32_defconfig
powerpc64                        alldefconfig
mips                 decstation_r4k_defconfig
powerpc                   lite5200b_defconfig
arm                        neponset_defconfig
openrisc                            defconfig
sh                        sh7763rdp_defconfig
mips                      bmips_stb_defconfig
um                            kunit_defconfig
arm                            zeus_defconfig
m68k                          atari_defconfig
arm                        mvebu_v5_defconfig
ia64                            zx1_defconfig
powerpc                     ksi8560_defconfig
ia64                             alldefconfig
h8300                       h8s-sim_defconfig
x86_64                           allyesconfig
arm                           u8500_defconfig
powerpc                       holly_defconfig
sh                        sh7785lcr_defconfig
powerpc                      ep88xc_defconfig
arm64                            alldefconfig
arc                    vdk_hs38_smp_defconfig
m68k                         amcore_defconfig
microblaze                      mmu_defconfig
m68k                       m5275evb_defconfig
arm                        keystone_defconfig
arm                         palmz72_defconfig
powerpc                mpc7448_hpc2_defconfig
powerpc                        fsp2_defconfig
s390                             alldefconfig
riscv             nommu_k210_sdcard_defconfig
powerpc                        warp_defconfig
powerpc                     tqm8560_defconfig
powerpc                      ppc44x_defconfig
m68k                          sun3x_defconfig
arm                          exynos_defconfig
sh                           se7343_defconfig
arm                              alldefconfig
arm                         hackkit_defconfig
arm                       imx_v6_v7_defconfig
powerpc                      katmai_defconfig
arm                           viper_defconfig
m68k                             allmodconfig
s390                          debug_defconfig
powerpc                 mpc832x_mds_defconfig
sparc                       sparc64_defconfig
riscv                          rv32_defconfig
x86_64                            allnoconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
s390                             allmodconfig
parisc                           allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                             allyesconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a001-20210615
x86_64               randconfig-a004-20210615
x86_64               randconfig-a002-20210615
x86_64               randconfig-a003-20210615
x86_64               randconfig-a006-20210615
x86_64               randconfig-a005-20210615
i386                 randconfig-a002-20210615
i386                 randconfig-a006-20210615
i386                 randconfig-a004-20210615
i386                 randconfig-a001-20210615
i386                 randconfig-a005-20210615
i386                 randconfig-a003-20210615
i386                 randconfig-a015-20210615
i386                 randconfig-a013-20210615
i386                 randconfig-a016-20210615
i386                 randconfig-a012-20210615
i386                 randconfig-a014-20210615
i386                 randconfig-a011-20210615
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
x86_64                    rhel-8.3-kselftests
um                             i386_defconfig
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                      rhel-8.3-kbuiltin
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-b001-20210615
x86_64               randconfig-a015-20210615
x86_64               randconfig-a011-20210615
x86_64               randconfig-a012-20210615
x86_64               randconfig-a014-20210615
x86_64               randconfig-a016-20210615
x86_64               randconfig-a013-20210615

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* [powerpc:next-test] BUILD REGRESSION 103bf32b0d2dd8b8a4d3d9ebdded5ba4e8263e6a
From: kernel test robot @ 2021-06-16  3:37 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test
branch HEAD: 103bf32b0d2dd8b8a4d3d9ebdded5ba4e8263e6a  powerpc/interrupt: Remove prep_irq_for_user_exit()

Error/Warning ids grouped by kconfigs:

gcc_recent_errors
|-- powerpc-allyesconfig
|   |-- arch-powerpc-kernel-interrupt.c:warning:no-previous-prototype-for-interrupt_exit_kernel_restart
|   |-- arch-powerpc-kernel-interrupt.c:warning:no-previous-prototype-for-interrupt_exit_user_restart
|   |-- arch-powerpc-kernel-interrupt.c:warning:no-previous-prototype-for-syscall_exit_restart
|   `-- arch-powerpc-lib-restart_table.c:warning:no-previous-prototype-for-search_kernel_restart_table
`-- x86_64-allnoconfig
    `-- arch-powerpc-kernel-interrupt_64.S:asm-head-.h-is-included-more-than-once.

clang_recent_errors
`-- powerpc-randconfig-r012-20210615
    |-- arch-powerpc-kernel-rtasd.c:error:unused-variable-rtas_log_proc_ops-Werror-Wunused-const-variable
    |-- arch-powerpc-platforms-pseries-lpar.c:error:unused-variable-vcpudispatch_stats_freq_proc_ops-Werror-Wunused-const-variable
    |-- arch-powerpc-platforms-pseries-lpar.c:error:unused-variable-vcpudispatch_stats_proc_ops-Werror-Wunused-const-variable
    |-- arch-powerpc-platforms-pseries-reconfig.c:error:unused-variable-ofdt_proc_ops-Werror-Wunused-const-variable
    `-- drivers-misc-cxl-native.c:error:unused-function-detach_spa-Werror-Wunused-function

elapsed time: 730m

configs tested: 94
configs skipped: 2

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
xtensa                         virt_defconfig
arm                            lart_defconfig
sh                           se7619_defconfig
ia64                      gensparse_defconfig
powerpc                 mpc836x_mds_defconfig
mips                     cu1000-neo_defconfig
m68k                       m5275evb_defconfig
arm                        keystone_defconfig
arm                         palmz72_defconfig
powerpc                mpc7448_hpc2_defconfig
s390                          debug_defconfig
powerpc                 mpc832x_mds_defconfig
arm                        mvebu_v5_defconfig
sparc                       sparc64_defconfig
powerpc                 mpc85xx_cds_defconfig
x86_64                            allnoconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
nds32                               defconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
s390                             allmodconfig
parisc                           allyesconfig
s390                                defconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
i386                 randconfig-a002-20210615
i386                 randconfig-a006-20210615
i386                 randconfig-a004-20210615
i386                 randconfig-a001-20210615
i386                 randconfig-a005-20210615
i386                 randconfig-a003-20210615
i386                 randconfig-a015-20210615
i386                 randconfig-a013-20210615
i386                 randconfig-a016-20210615
i386                 randconfig-a012-20210615
i386                 randconfig-a014-20210615
i386                 randconfig-a011-20210615
x86_64               randconfig-a001-20210615
x86_64               randconfig-a004-20210615
x86_64               randconfig-a002-20210615
x86_64               randconfig-a003-20210615
x86_64               randconfig-a006-20210615
x86_64               randconfig-a005-20210615
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                          rv32_defconfig
riscv                            allmodconfig
um                           x86_64_defconfig
um                             i386_defconfig
um                            kunit_defconfig
x86_64                           allyesconfig
x86_64                    rhel-8.3-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                      rhel-8.3-kbuiltin
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-b001-20210615
x86_64               randconfig-a015-20210615
x86_64               randconfig-a011-20210615
x86_64               randconfig-a012-20210615
x86_64               randconfig-a014-20210615
x86_64               randconfig-a016-20210615
x86_64               randconfig-a013-20210615

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* [powerpc:next] BUILD SUCCESS ddf4a7bcd09439e82c4d6f959f4ff6c53f07466f
From: kernel test robot @ 2021-06-16  3:37 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
branch HEAD: ddf4a7bcd09439e82c4d6f959f4ff6c53f07466f  powerpc/tau: Remove superfluous parameter in alloc_workqueue() call

elapsed time: 730m

configs tested: 146
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
powerpc                      chrp32_defconfig
powerpc                  mpc866_ads_defconfig
ia64                        generic_defconfig
arm                       versatile_defconfig
arm                         lpc32xx_defconfig
mips                     cu1000-neo_defconfig
arm                         socfpga_defconfig
powerpc                     sbc8548_defconfig
arm                  colibri_pxa270_defconfig
m68k                        stmark2_defconfig
powerpc                        icon_defconfig
xtensa                    smp_lx200_defconfig
mips                          ath79_defconfig
powerpc                  iss476-smp_defconfig
h8300                            allyesconfig
mips                  cavium_octeon_defconfig
um                           x86_64_defconfig
arm                          pcm027_defconfig
powerpc                      ppc64e_defconfig
xtensa                  audio_kc705_defconfig
xtensa                generic_kc705_defconfig
powerpc                 mpc8540_ads_defconfig
arm                        oxnas_v6_defconfig
s390                                defconfig
arm                        multi_v7_defconfig
arm                            lart_defconfig
sh                          sdk7786_defconfig
arm                          collie_defconfig
powerpc                 mpc85xx_cds_defconfig
nds32                               defconfig
powerpc                          g5_defconfig
arm                            hisi_defconfig
xtensa                          iss_defconfig
powerpc                     kmeter1_defconfig
i386                                defconfig
sparc                       sparc32_defconfig
powerpc64                        alldefconfig
mips                 decstation_r4k_defconfig
powerpc                   lite5200b_defconfig
arm                        neponset_defconfig
openrisc                            defconfig
sh                        sh7763rdp_defconfig
mips                      bmips_stb_defconfig
um                            kunit_defconfig
arm                            zeus_defconfig
m68k                          atari_defconfig
arm                        mvebu_v5_defconfig
ia64                            zx1_defconfig
powerpc                     ksi8560_defconfig
ia64                             alldefconfig
h8300                       h8s-sim_defconfig
x86_64                           allyesconfig
arm                           u8500_defconfig
powerpc                       holly_defconfig
sh                        sh7785lcr_defconfig
powerpc                      ep88xc_defconfig
arm64                            alldefconfig
arc                    vdk_hs38_smp_defconfig
m68k                         amcore_defconfig
microblaze                      mmu_defconfig
m68k                       m5275evb_defconfig
arm                        keystone_defconfig
arm                         palmz72_defconfig
powerpc                mpc7448_hpc2_defconfig
powerpc                        fsp2_defconfig
s390                             alldefconfig
riscv             nommu_k210_sdcard_defconfig
arm                         hackkit_defconfig
arm                       imx_v6_v7_defconfig
powerpc                      katmai_defconfig
arm                           viper_defconfig
s390                          debug_defconfig
powerpc                 mpc832x_mds_defconfig
sparc                       sparc64_defconfig
riscv                          rv32_defconfig
x86_64                            allnoconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
s390                             allmodconfig
parisc                           allyesconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a001-20210615
x86_64               randconfig-a004-20210615
x86_64               randconfig-a002-20210615
x86_64               randconfig-a003-20210615
x86_64               randconfig-a006-20210615
x86_64               randconfig-a005-20210615
i386                 randconfig-a002-20210615
i386                 randconfig-a006-20210615
i386                 randconfig-a004-20210615
i386                 randconfig-a001-20210615
i386                 randconfig-a005-20210615
i386                 randconfig-a003-20210615
i386                 randconfig-a015-20210615
i386                 randconfig-a013-20210615
i386                 randconfig-a016-20210615
i386                 randconfig-a012-20210615
i386                 randconfig-a014-20210615
i386                 randconfig-a011-20210615
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
x86_64                    rhel-8.3-kselftests
um                             i386_defconfig
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                      rhel-8.3-kbuiltin
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-b001-20210615
x86_64               randconfig-a015-20210615
x86_64               randconfig-a011-20210615
x86_64               randconfig-a012-20210615
x86_64               randconfig-a014-20210615
x86_64               randconfig-a016-20210615
x86_64               randconfig-a013-20210615

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* Re: Oops (NULL pointer) with 'perf record' of selftest 'null_syscall'
From: Athira Rajeev @ 2021-06-16  3:40 UTC (permalink / raw)
  To: Madhavan Srinivasan, Christophe Leroy; +Cc: linuxppc-dev
In-Reply-To: <3388922c-0224-e4aa-f7b7-4fea43e060f9@linux.ibm.com>



> On 16-Jun-2021, at 8:53 AM, Madhavan Srinivasan <maddy@linux.ibm.com> wrote:
> 
> 
> On 6/15/21 8:35 PM, Christophe Leroy wrote:
>> For your information, I'm getting the following Oops. Detected with 5.13-rc6, it also oopses on 5.12 and 5.11.
>> Runs ok on 5.10. I'm starting bisecting now.
> 
> 
> Thanks for reporting, got the issue. What has happened in this case is that, pmu device is not registered
> and trying to access the instruction point which will land in perf_instruction_pointer(). And recently I have added
> a workaround patch for power10 DD1 which has caused this breakage. My bad. We are working on a fix patch
> for the same and will post it out. Sorry again.
> 

Hi Christophe,

Can you please try with below patch in your environment and test if it works for you.

From 55d3afc9369dfbe28a7152c8e9f856c11c7fe43d Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Tue, 15 Jun 2021 22:28:11 -0400
Subject: [PATCH] powerpc/perf: Fix crash with 'perf_instruction_pointer' when
 pmu is not set

On systems without any specific PMU driver support registered, running
perf record causes oops:

[   38.841073] NIP [c00000000013af54] perf_instruction_pointer+0x24/0x100
[   38.841079] LR [c0000000003c7358] perf_prepare_sample+0x4e8/0x820
[   38.841085] --- interrupt: 300
[   38.841088] [c00000001cf03440] [c0000000003c6ef8] perf_prepare_sample+0x88/0x820 (unreliable)
[   38.841096] [c00000001cf034a0] [c0000000003c76d0] perf_event_output_forward+0x40/0xc0
[   38.841104] [c00000001cf03520] [c0000000003b45e8] __perf_event_overflow+0x88/0x1b0
[   38.841112] [c00000001cf03570] [c0000000003b480c] perf_swevent_hrtimer+0xfc/0x1a0
[   38.841119] [c00000001cf03740] [c0000000002399cc] __hrtimer_run_queues+0x17c/0x380
[   38.841127] [c00000001cf037c0] [c00000000023a5f8] hrtimer_interrupt+0x128/0x2f0
[   38.841135] [c00000001cf03870] [c00000000002962c] timer_interrupt+0x13c/0x370
[   38.841143i] [c00000001cf038d0] [c000000000009ba4] decrementer_common_virt+0x1a4/0x1b0
[   38.841151] --- interrupt: 900 at copypage_power7+0xd4/0x1c0

During perf record session, perf_instruction_pointer() is called to
capture the sample ip. This function in core-book3s accesses ppmu->flags.
If a platform specific PMU driver is not registered, ppmu is set to NULL
and accessing its members results in a crash. Fix this crash by checking
if ppmu is set.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reported-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/perf/core-book3s.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 16d4d1b6a1ff..816756588cb7 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2254,7 +2254,7 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs)
 	bool use_siar = regs_use_siar(regs);
 	unsigned long siar = mfspr(SPRN_SIAR);
 
-	if (ppmu->flags & PPMU_P10_DD1) {
+	if (ppmu && ppmu->flags & PPMU_P10_DD1) {
 		if (siar)
 			return siar;
 		else
-- 
2.27.0


Thanks
Athira

> Maddy
> 
> 
>> 
>> root@vgoippro:/tmp# perf record /root/null_syscall
>> [  285.559987] BUG: Kernel NULL pointer dereference on read at 0x00000040
>> [  285.566533] Faulting instruction address: 0xc0021f0c
>> [  285.571486] Oops: Kernel access of bad area, sig: 11 [#1]
>> [  285.576872] BE PAGE_SIZE=4K PREEMPT CMPCPRO
>> [  285.581080] SAF3000 DIE NOTIFICATION
>> [  285.584661] CPU: 0 PID: 442 Comm: null_syscall Not tainted 5.13.0-rc6-s3k-dev-01645-g7649ee3d2957 #5164
>> [  285.594035] NIP:  c0021f0c LR: c00e8ad8 CTR: c00d8a5c
>> [  285.599074] REGS: e67757d0 TRAP: 0300   Not tainted (5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
>> [  285.607576] MSR:  00001032 <ME,IR,DR,RI>  CR: 44775b18 XER: 20000000
>> [  285.614063] DAR: 00000040 DSISR: 20000000
>> [  285.614063] GPR00: c00e8810 e6775880 c1c52640 e6775b20 7cb36ae0 f0000028 43ebeedc 5ccc47d0
>> [  285.614063] GPR08: 00000000 00000900 e6775b20 00000001 00000000 1025b2c0 10013088 10012ee0
>> [  285.614063] GPR16: b0000000 00000007 00000001 c00deb64 00000042 00000001 78db7b23 c0b13200
>> [  285.614063] GPR24: 00000000 00000000 00000000 e6775b20 c13b8560 00000107 e6775940 e67758e8
>> [  285.651693] NIP [c0021f0c] perf_instruction_pointer+0x10/0x60
>> [  285.657460] LR [c00e8ad8] perf_prepare_sample+0x344/0x674
>> [  285.662859] Call Trace:
>> [  285.665301] [e6775880] [c00e8810] perf_prepare_sample+0x7c/0x674 (unreliable)
>> [  285.672452] [e67758c0] [c00e8e44] perf_event_output_forward+0x3c/0x94
>> [  285.678903] [e6775910] [c00dea8c] __perf_event_overflow+0x74/0x14c
>> [  285.685108] [e6775930] [c00dec5c] perf_swevent_hrtimer+0xf8/0x170
>> [  285.691217] [e6775a40] [c008c8d0] __hrtimer_run_queues.constprop.0+0x160/0x318
>> [  285.698456] [e6775a90] [c008d94c] hrtimer_interrupt+0x148/0x3b0
>> [  285.704394] [e6775ae0] [c000c0c0] timer_interrupt+0xc4/0x22c
>> [  285.710067] [e6775b10] [c00046f0] Decrementer_virt+0xb8/0xbc
>> [  285.715744] --- interrupt: 900 at pagecache_get_page+0x210/0x430
>> [  285.721764] NIP:  c00f52a8 LR: c00f5408 CTR: c00f59d8
>> [  285.726805] REGS: e6775b20 TRAP: 0900   Not tainted (5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
>> [  285.735306] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 28422d68  XER: 00000000
>> [  285.742056]
>> [  285.742056] GPR00: c00f513c e6775bd0 c1c52640 c1c52640 00000000 00000000 00000000 c1382c38
>> [  285.742056] GPR08: 00000000 00000000 00000001 00000000 88482d68 1025b2c0 10013088 10012ee0
>> [  285.742056] GPR16: b0000000 00000007 00000001 10012ee0 c18187ac c0b87800 61c88647 c0c18c00
>> [  285.742056] GPR24: 00000001 00000003 00000000 00000002 c18187a8 00100cca 00000044 00000000
>> [  285.777079] NIP [c00f52a8] pagecache_get_page+0x210/0x430
>> [  285.782482] LR [c00f5408] pagecache_get_page+0x370/0x430
>> [  285.787796] --- interrupt: 900
>> [  285.790843] [e6775bd0] [c00f513c] pagecache_get_page+0xa4/0x430 (unreliable)
>> [  285.797910] [e6775c30] [c00f5ca8] filemap_fault+0x2d0/0x8e8
>> [  285.803500] [e6775ca0] [c012d244] __do_fault+0x4c/0xd8
>> [  285.808666] [e6775cb0] [c0130f64] handle_mm_fault+0x274/0x10b8
>> [  285.814517] [e6775d30] [c0014f58] do_page_fault+0x1d4/0x67c
>> [  285.820117] [e6775d60] [c000424c] DataAccess_virt+0xd4/0xe4
>> [  285.825707] --- interrupt: 300 at __arch_clear_user+0x10/0xcc
>> [  285.831458] NIP:  c001a3cc LR: c01d5cfc CTR: 00000000
>> [  285.836497] REGS: e6775d70 TRAP: 0300   Not tainted (5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
>> [  285.845000] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 48004264  XER: 20000000
>> [  285.851751] DAR: 10012ee0 DSISR: 22000000
>> [  285.851751] GPR00: c01d53fc e6775e20 c1c52640 00000000 00000120 00000008 c136241c 00000000
>> [  285.851751] GPR08: 00000000 9ffed120 10012ee0 00000004 28004868 1025b2c0 10013088 10012ee0
>> [  285.851751] GPR16: b0000000 00000007 00000001 10012ee0 10000000 10012d0c 10000000 c1d74240
>> [  285.851751] GPR24: 10012ee0 00000000 c1345e80 c1343dc0 10000b38 00000000 c132ec00 c1386a00
>> [  285.889384] NIP [c001a3cc] __arch_clear_user+0x10/0xcc
>> [  285.894527] LR [c01d5cfc] load_elf_binary+0xec4/0x1340
>> [  285.899682] --- interrupt: 300
>> [  285.902730] [e6775e20] [c01d53fc] load_elf_binary+0x5c4/0x1340 (unreliable)
>> [  285.909713] [e6775ea0] [c0163258] bprm_execve+0x200/0x55c
>> [  285.915138] [e6775ef0] [c0163e00] do_execveat_common+0x178/0x1f4
>> [  285.921162] [e6775f20] [c0165558] sys_execve+0x40/0x58
>> [  285.926321] [e6775f40] [c001404c] ret_from_syscall+0x0/0x28
>> [  285.931917] --- interrupt: c00 at 0xfc3ce78
>> [  285.936097] NIP:  0fc3ce78 LR: 0fc3d7cc CTR: c01657cc
>> [  285.941135] REGS: e6775f50 TRAP: 0c00   Not tainted (5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
>> [  285.949636] MSR:  0000d032 <EE,PR,ME,IR,DR,RI>  CR: 22004868  XER: 20000000
>> [  285.956655]
>> [  285.956655] GPR00: 0000000b afab1bf0 a7d77a50 afab6ee1 afab64c8 104bd9b0 fefefeff 7f7f7f7f
>> [  285.956655] GPR08: afab6ee0 00000000 006df8f9 0000011d 24004864 1025b2c0 10231a50 10249108
>> [  285.956655] GPR16: 104beeb0 10254830 105dd3f4 10250000 1018a124 10188448 10234e58 00000000
>> [  285.956655] GPR24: 10231ae0 00000003 00000001 104bd9b0 afab64c8 afab6ee1 0fd25244 afab1bf0
>> [  285.991684] NIP [0fc3ce78] 0xfc3ce78
>> [  285.995257] LR [0fc3d7cc] 0xfc3d7cc
>> [  285.998742] --- interrupt: c00
>> [  286.001789] Instruction dump:
>> [  286.004757] 3d20c07d 80a40010 3889847c 486bc444 80630084 546397fe 38630001 4e800020
>> [  286.012556] 3d20c0b9 7c6a1b78 810915e4 812300a0 <81080040> 55290036 2c090f00 5508056a
>> [  286.020531] ---[ end trace e381e6fcc2db5226 ]---


^ permalink raw reply related

* [PATCH v11 00/12] Restricted DMA
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman

This series implements mitigations for lack of DMA access control on
systems without an IOMMU, which could result in the DMA accessing the
system memory at unexpected times and/or unexpected addresses, possibly
leading to data leakage or corruption.

For example, we plan to use the PCI-e bus for Wi-Fi and that PCI-e bus is
not behind an IOMMU. As PCI-e, by design, gives the device full access to
system memory, a vulnerability in the Wi-Fi firmware could easily escalate
to a full system exploit (remote wifi exploits: [1a], [1b] that shows a
full chain of exploits; [2], [3]).

To mitigate the security concerns, we introduce restricted DMA. Restricted
DMA utilizes the existing swiotlb to bounce streaming DMA in and out of a
specially allocated region and does memory allocation from the same region.
The feature on its own provides a basic level of protection against the DMA
overwriting buffer contents at unexpected times. However, to protect
against general data leakage and system memory corruption, the system needs
to provide a way to restrict the DMA to a predefined memory region (this is
usually done at firmware level, e.g. MPU in ATF on some ARM platforms [4]).

[1a] https://googleprojectzero.blogspot.com/2017/04/over-air-exploiting-broadcoms-wi-fi_4.html
[1b] https://googleprojectzero.blogspot.com/2017/04/over-air-exploiting-broadcoms-wi-fi_11.html
[2] https://blade.tencent.com/en/advisories/qualpwn/
[3] https://www.bleepingcomputer.com/news/security/vulnerabilities-found-in-highly-popular-firmware-for-wifi-chips/
[4] https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/mediatek/mt8183/drivers/emi_mpu/emi_mpu.c#L132

v11:
- Rebase against swiotlb devel/for-linus-5.14
- s/mempry/memory/g
- exchange the order of patch 09/12 and 10/12
https://lore.kernel.org/patchwork/cover/1446882/

v10:
Address the comments in v9 to
  - fix the dev->dma_io_tlb_mem assignment
  - propagate swiotlb_force setting into io_tlb_default_mem->force
  - move set_memory_decrypted out of swiotlb_init_io_tlb_mem
  - move debugfs_dir declaration into the main CONFIG_DEBUG_FS block
  - add swiotlb_ prefix to find_slots and release_slots
  - merge the 3 alloc/free related patches
  - move the CONFIG_DMA_RESTRICTED_POOL later

v9:
Address the comments in v7 to
  - set swiotlb active pool to dev->dma_io_tlb_mem
  - get rid of get_io_tlb_mem
  - dig out the device struct for is_swiotlb_active
  - move debugfs_create_dir out of swiotlb_create_debugfs
  - do set_memory_decrypted conditionally in swiotlb_init_io_tlb_mem
  - use IS_ENABLED in kernel/dma/direct.c
  - fix redefinition of 'of_dma_set_restricted_buffer'
https://lore.kernel.org/patchwork/cover/1445081/

v8:
- Fix reserved-memory.txt and add the reg property in example.
- Fix sizeof for of_property_count_elems_of_size in
  drivers/of/address.c#of_dma_set_restricted_buffer.
- Apply Will's suggestion to try the OF node having DMA configuration in
  drivers/of/address.c#of_dma_set_restricted_buffer.
- Fix typo in the comment of drivers/of/address.c#of_dma_set_restricted_buffer.
- Add error message for PageHighMem in
  kernel/dma/swiotlb.c#rmem_swiotlb_device_init and move it to
  rmem_swiotlb_setup.
- Fix the message string in rmem_swiotlb_setup.
https://lore.kernel.org/patchwork/cover/1437112/

v7:
Fix debugfs, PageHighMem and comment style in rmem_swiotlb_device_init
https://lore.kernel.org/patchwork/cover/1431031/

v6:
Address the comments in v5
https://lore.kernel.org/patchwork/cover/1423201/

v5:
Rebase on latest linux-next
https://lore.kernel.org/patchwork/cover/1416899/

v4:
- Fix spinlock bad magic
- Use rmem->name for debugfs entry
- Address the comments in v3
https://lore.kernel.org/patchwork/cover/1378113/

v3:
Using only one reserved memory region for both streaming DMA and memory
allocation.
https://lore.kernel.org/patchwork/cover/1360992/

v2:
Building on top of swiotlb.
https://lore.kernel.org/patchwork/cover/1280705/

v1:
Using dma_map_ops.
https://lore.kernel.org/patchwork/cover/1271660/

Claire Chang (12):
  swiotlb: Refactor swiotlb init functions
  swiotlb: Refactor swiotlb_create_debugfs
  swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used
  swiotlb: Update is_swiotlb_buffer to add a struct device argument
  swiotlb: Update is_swiotlb_active to add a struct device argument
  swiotlb: Use is_dev_swiotlb_force for swiotlb data bouncing
  swiotlb: Move alloc_size to swiotlb_find_slots
  swiotlb: Refactor swiotlb_tbl_unmap_single
  swiotlb: Add restricted DMA alloc/free support
  swiotlb: Add restricted DMA pool initialization
  dt-bindings: of: Add restricted DMA pool
  of: Add plumbing for restricted DMA pool

 .../reserved-memory/reserved-memory.txt       |  36 ++-
 drivers/base/core.c                           |   4 +
 drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   2 +-
 drivers/gpu/drm/nouveau/nouveau_ttm.c         |   2 +-
 drivers/iommu/dma-iommu.c                     |  12 +-
 drivers/of/address.c                          |  33 +++
 drivers/of/device.c                           |   3 +
 drivers/of/of_private.h                       |   6 +
 drivers/pci/xen-pcifront.c                    |   2 +-
 drivers/xen/swiotlb-xen.c                     |   2 +-
 include/linux/device.h                        |   4 +
 include/linux/swiotlb.h                       |  40 ++-
 kernel/dma/Kconfig                            |  14 +
 kernel/dma/direct.c                           |  60 +++--
 kernel/dma/direct.h                           |   8 +-
 kernel/dma/swiotlb.c                          | 255 +++++++++++++-----
 16 files changed, 380 insertions(+), 103 deletions(-)

-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply

* [PATCH v11 01/12] swiotlb: Refactor swiotlb init functions
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Add a new function, swiotlb_init_io_tlb_mem, for the io_tlb_mem struct
initialization to make the code reusable.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 49 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 52e2ac526757..3ba0f08a39a1 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -168,9 +168,28 @@ void __init swiotlb_update_mem_attributes(void)
 	memset(vaddr, 0, bytes);
 }
 
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
+				    unsigned long nslabs, bool late_alloc)
 {
+	void *vaddr = phys_to_virt(start);
 	unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+
+	mem->nslabs = nslabs;
+	mem->start = start;
+	mem->end = mem->start + bytes;
+	mem->index = 0;
+	mem->late_alloc = late_alloc;
+	spin_lock_init(&mem->lock);
+	for (i = 0; i < mem->nslabs; i++) {
+		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+		mem->slots[i].alloc_size = 0;
+	}
+	memset(vaddr, 0, bytes);
+}
+
+int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+{
 	struct io_tlb_mem *mem;
 	size_t alloc_size;
 
@@ -186,16 +205,8 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 	if (!mem)
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
-	mem->nslabs = nslabs;
-	mem->start = __pa(tlb);
-	mem->end = mem->start + bytes;
-	mem->index = 0;
-	spin_lock_init(&mem->lock);
-	for (i = 0; i < mem->nslabs; i++) {
-		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
-		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
-		mem->slots[i].alloc_size = 0;
-	}
+
+	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
 
 	io_tlb_default_mem = mem;
 	if (verbose)
@@ -282,8 +293,8 @@ swiotlb_late_init_with_default_size(size_t default_size)
 int
 swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 {
-	unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
 	struct io_tlb_mem *mem;
+	unsigned long bytes = nslabs << IO_TLB_SHIFT;
 
 	if (swiotlb_force == SWIOTLB_NO_FORCE)
 		return 0;
@@ -297,20 +308,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	if (!mem)
 		return -ENOMEM;
 
-	mem->nslabs = nslabs;
-	mem->start = virt_to_phys(tlb);
-	mem->end = mem->start + bytes;
-	mem->index = 0;
-	mem->late_alloc = 1;
-	spin_lock_init(&mem->lock);
-	for (i = 0; i < mem->nslabs; i++) {
-		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
-		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
-		mem->slots[i].alloc_size = 0;
-	}
-
+	swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
 	set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
-	memset(tlb, 0, bytes);
 
 	io_tlb_default_mem = mem;
 	swiotlb_print_info();
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 02/12] swiotlb: Refactor swiotlb_create_debugfs
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Split the debugfs creation to make the code reusable for supporting
different bounce buffer pools.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 3ba0f08a39a1..af416bcd1914 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -670,19 +670,26 @@ bool is_swiotlb_active(void)
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
 #ifdef CONFIG_DEBUG_FS
+static struct dentry *debugfs_dir;
 
-static int __init swiotlb_create_debugfs(void)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
 {
-	struct io_tlb_mem *mem = io_tlb_default_mem;
-
-	if (!mem)
-		return 0;
-	mem->debugfs = debugfs_create_dir("swiotlb", NULL);
 	debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
 	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+	struct io_tlb_mem *mem = io_tlb_default_mem;
+
+	debugfs_dir = debugfs_create_dir("swiotlb", NULL);
+	if (mem) {
+		mem->debugfs = debugfs_dir;
+		swiotlb_create_debugfs_files(mem);
+	}
 	return 0;
 }
 
-late_initcall(swiotlb_create_debugfs);
+late_initcall(swiotlb_create_default_debugfs);
 
 #endif
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 03/12] swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Always have the pointer to the swiotlb pool used in struct device. This
could help simplify the code for other pools.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/base/core.c    | 4 ++++
 include/linux/device.h | 4 ++++
 kernel/dma/swiotlb.c   | 8 ++++----
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index f29839382f81..cb3123e3954d 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,6 +27,7 @@
 #include <linux/netdevice.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/swiotlb.h>
 #include <linux/sysfs.h>
 #include <linux/dma-map-ops.h> /* for dma_default_coherent */
 
@@ -2736,6 +2737,9 @@ void device_initialize(struct device *dev)
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 	dev->dma_coherent = dma_default_coherent;
 #endif
+#ifdef CONFIG_SWIOTLB
+	dev->dma_io_tlb_mem = io_tlb_default_mem;
+#endif
 }
 EXPORT_SYMBOL_GPL(device_initialize);
 
diff --git a/include/linux/device.h b/include/linux/device.h
index ba660731bd25..240d652a0696 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -416,6 +416,7 @@ struct dev_links_info {
  * @dma_pools:	Dma pools (if dma'ble device).
  * @dma_mem:	Internal for coherent mem override.
  * @cma_area:	Contiguous memory area for dma allocations
+ * @dma_io_tlb_mem: Pointer to the swiotlb pool used.  Not for driver use.
  * @archdata:	For arch-specific additions.
  * @of_node:	Associated device tree node.
  * @fwnode:	Associated device node supplied by platform firmware.
@@ -518,6 +519,9 @@ struct device {
 #ifdef CONFIG_DMA_CMA
 	struct cma *cma_area;		/* contiguous memory area for dma
 					   allocations */
+#endif
+#ifdef CONFIG_SWIOTLB
+	struct io_tlb_mem *dma_io_tlb_mem;
 #endif
 	/* arch specific additions */
 	struct dev_archdata	archdata;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index af416bcd1914..a9f5c08dd94a 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -339,7 +339,7 @@ void __init swiotlb_exit(void)
 static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
 			   enum dma_data_direction dir)
 {
-	struct io_tlb_mem *mem = io_tlb_default_mem;
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 	int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
 	unsigned int offset = (tlb_addr - mem->start) & (IO_TLB_SIZE - 1);
 	phys_addr_t orig_addr = mem->slots[index].orig_addr;
@@ -430,7 +430,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
 static int find_slots(struct device *dev, phys_addr_t orig_addr,
 		size_t alloc_size)
 {
-	struct io_tlb_mem *mem = io_tlb_default_mem;
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 	unsigned long boundary_mask = dma_get_seg_boundary(dev);
 	dma_addr_t tbl_dma_addr =
 		phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -507,7 +507,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		size_t mapping_size, size_t alloc_size,
 		enum dma_data_direction dir, unsigned long attrs)
 {
-	struct io_tlb_mem *mem = io_tlb_default_mem;
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 	unsigned int offset = swiotlb_align_offset(dev, orig_addr);
 	unsigned int i;
 	int index;
@@ -558,7 +558,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 			      size_t mapping_size, enum dma_data_direction dir,
 			      unsigned long attrs)
 {
-	struct io_tlb_mem *mem = io_tlb_default_mem;
+	struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem;
 	unsigned long flags;
 	unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
 	int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 04/12] swiotlb: Update is_swiotlb_buffer to add a struct device argument
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Update is_swiotlb_buffer to add a struct device argument. This will be
useful later to allow for different pools.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c | 12 ++++++------
 drivers/xen/swiotlb-xen.c |  2 +-
 include/linux/swiotlb.h   |  7 ++++---
 kernel/dma/direct.c       |  6 +++---
 kernel/dma/direct.h       |  6 +++---
 5 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 3087d9fa6065..10997ef541f8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -507,7 +507,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
 
 	__iommu_dma_unmap(dev, dma_addr, size);
 
-	if (unlikely(is_swiotlb_buffer(phys)))
+	if (unlikely(is_swiotlb_buffer(dev, phys)))
 		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 
@@ -578,7 +578,7 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
 	}
 
 	iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask);
-	if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys))
+	if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys))
 		swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs);
 	return iova;
 }
@@ -749,7 +749,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_cpu(phys, size, dir);
 
-	if (is_swiotlb_buffer(phys))
+	if (is_swiotlb_buffer(dev, phys))
 		swiotlb_sync_single_for_cpu(dev, phys, size, dir);
 }
 
@@ -762,7 +762,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
 		return;
 
 	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
-	if (is_swiotlb_buffer(phys))
+	if (is_swiotlb_buffer(dev, phys))
 		swiotlb_sync_single_for_device(dev, phys, size, dir);
 
 	if (!dev_is_dma_coherent(dev))
@@ -783,7 +783,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
 		if (!dev_is_dma_coherent(dev))
 			arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
 
-		if (is_swiotlb_buffer(sg_phys(sg)))
+		if (is_swiotlb_buffer(dev, sg_phys(sg)))
 			swiotlb_sync_single_for_cpu(dev, sg_phys(sg),
 						    sg->length, dir);
 	}
@@ -800,7 +800,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
 		return;
 
 	for_each_sg(sgl, sg, nelems, i) {
-		if (is_swiotlb_buffer(sg_phys(sg)))
+		if (is_swiotlb_buffer(dev, sg_phys(sg)))
 			swiotlb_sync_single_for_device(dev, sg_phys(sg),
 						       sg->length, dir);
 
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 4c89afc0df62..0c6ed09f8513 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -100,7 +100,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr)
 	 * in our domain. Therefore _only_ check address within our domain.
 	 */
 	if (pfn_valid(PFN_DOWN(paddr)))
-		return is_swiotlb_buffer(paddr);
+		return is_swiotlb_buffer(dev, paddr);
 	return 0;
 }
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 216854a5e513..d1f3d95881cd 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_SWIOTLB_H
 #define __LINUX_SWIOTLB_H
 
+#include <linux/device.h>
 #include <linux/dma-direction.h>
 #include <linux/init.h>
 #include <linux/types.h>
@@ -101,9 +102,9 @@ struct io_tlb_mem {
 };
 extern struct io_tlb_mem *io_tlb_default_mem;
 
-static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 {
-	struct io_tlb_mem *mem = io_tlb_default_mem;
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 
 	return mem && paddr >= mem->start && paddr < mem->end;
 }
@@ -115,7 +116,7 @@ bool is_swiotlb_active(void);
 void __init swiotlb_adjust_size(unsigned long size);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
-static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 {
 	return false;
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f737e3347059..84c9feb5474a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -343,7 +343,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 	for_each_sg(sgl, sg, nents, i) {
 		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
-		if (unlikely(is_swiotlb_buffer(paddr)))
+		if (unlikely(is_swiotlb_buffer(dev, paddr)))
 			swiotlb_sync_single_for_device(dev, paddr, sg->length,
 						       dir);
 
@@ -369,7 +369,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 		if (!dev_is_dma_coherent(dev))
 			arch_sync_dma_for_cpu(paddr, sg->length, dir);
 
-		if (unlikely(is_swiotlb_buffer(paddr)))
+		if (unlikely(is_swiotlb_buffer(dev, paddr)))
 			swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
 						    dir);
 
@@ -504,7 +504,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
 bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	return !dev_is_dma_coherent(dev) ||
-		is_swiotlb_buffer(dma_to_phys(dev, dma_addr));
+	       is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr));
 }
 
 /**
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 50afc05b6f1d..13e9e7158d94 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -56,7 +56,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
 {
 	phys_addr_t paddr = dma_to_phys(dev, addr);
 
-	if (unlikely(is_swiotlb_buffer(paddr)))
+	if (unlikely(is_swiotlb_buffer(dev, paddr)))
 		swiotlb_sync_single_for_device(dev, paddr, size, dir);
 
 	if (!dev_is_dma_coherent(dev))
@@ -73,7 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
 		arch_sync_dma_for_cpu_all();
 	}
 
-	if (unlikely(is_swiotlb_buffer(paddr)))
+	if (unlikely(is_swiotlb_buffer(dev, paddr)))
 		swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
 
 	if (dir == DMA_FROM_DEVICE)
@@ -113,7 +113,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
-	if (unlikely(is_swiotlb_buffer(phys)))
+	if (unlikely(is_swiotlb_buffer(dev, phys)))
 		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 #endif /* _KERNEL_DMA_DIRECT_H */
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 05/12] swiotlb: Update is_swiotlb_active to add a struct device argument
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Update is_swiotlb_active to add a struct device argument. This will be
useful later to allow for different pools.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +-
 drivers/gpu/drm/nouveau/nouveau_ttm.c        | 2 +-
 drivers/pci/xen-pcifront.c                   | 2 +-
 include/linux/swiotlb.h                      | 4 ++--
 kernel/dma/direct.c                          | 2 +-
 kernel/dma/swiotlb.c                         | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index a9d65fc8aa0e..4b7afa0fc85d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
 
 	max_order = MAX_ORDER;
 #ifdef CONFIG_SWIOTLB
-	if (is_swiotlb_active()) {
+	if (is_swiotlb_active(obj->base.dev->dev)) {
 		unsigned int max_segment;
 
 		max_segment = swiotlb_max_segment();
diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c
index 9662522aa066..be15bfd9e0ee 100644
--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c
@@ -321,7 +321,7 @@ nouveau_ttm_init(struct nouveau_drm *drm)
 	}
 
 #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86)
-	need_swiotlb = is_swiotlb_active();
+	need_swiotlb = is_swiotlb_active(dev->dev);
 #endif
 
 	ret = ttm_bo_device_init(&drm->ttm.bdev, &nouveau_bo_driver,
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index b7a8f3a1921f..0d56985bfe81 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -693,7 +693,7 @@ static int pcifront_connect_and_init_dma(struct pcifront_device *pdev)
 
 	spin_unlock(&pcifront_dev_lock);
 
-	if (!err && !is_swiotlb_active()) {
+	if (!err && !is_swiotlb_active(&pdev->xdev->dev)) {
 		err = pci_xen_swiotlb_init_late();
 		if (err)
 			dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n");
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d1f3d95881cd..dd1c30a83058 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -112,7 +112,7 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
-bool is_swiotlb_active(void);
+bool is_swiotlb_active(struct device *dev);
 void __init swiotlb_adjust_size(unsigned long size);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
@@ -132,7 +132,7 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev)
 	return SIZE_MAX;
 }
 
-static inline bool is_swiotlb_active(void)
+static inline bool is_swiotlb_active(struct device *dev)
 {
 	return false;
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 84c9feb5474a..7a88c34d0867 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -495,7 +495,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
 size_t dma_direct_max_mapping_size(struct device *dev)
 {
 	/* If SWIOTLB is active, use its maximum mapping size */
-	if (is_swiotlb_active() &&
+	if (is_swiotlb_active(dev) &&
 	    (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
 		return swiotlb_max_mapping_size(dev);
 	return SIZE_MAX;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index a9f5c08dd94a..101abeb0a57d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -663,9 +663,9 @@ size_t swiotlb_max_mapping_size(struct device *dev)
 	return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
 }
 
-bool is_swiotlb_active(void)
+bool is_swiotlb_active(struct device *dev)
 {
-	return io_tlb_default_mem != NULL;
+	return dev->dma_io_tlb_mem != NULL;
 }
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 06/12] swiotlb: Use is_dev_swiotlb_force for swiotlb data bouncing
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Propagate the swiotlb_force setting into io_tlb_default_mem->force and
use it to determine whether to bounce the data or not. This will be
useful later to allow for different pools.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 11 +++++++++++
 kernel/dma/direct.c     |  2 +-
 kernel/dma/direct.h     |  2 +-
 kernel/dma/swiotlb.c    |  4 ++++
 4 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index dd1c30a83058..efcd56e3a16c 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -84,6 +84,7 @@ extern enum swiotlb_force swiotlb_force;
  *		unmap calls.
  * @debugfs:	The dentry to debugfs.
  * @late_alloc:	%true if allocated using the page allocator
+ * @force:      %true if swiotlb is forced
  */
 struct io_tlb_mem {
 	phys_addr_t start;
@@ -94,6 +95,7 @@ struct io_tlb_mem {
 	spinlock_t lock;
 	struct dentry *debugfs;
 	bool late_alloc;
+	bool force;
 	struct io_tlb_slot {
 		phys_addr_t orig_addr;
 		size_t alloc_size;
@@ -109,6 +111,11 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 	return mem && paddr >= mem->start && paddr < mem->end;
 }
 
+static inline bool is_dev_swiotlb_force(struct device *dev)
+{
+	return dev->dma_io_tlb_mem->force;
+}
+
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
@@ -120,6 +127,10 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 {
 	return false;
 }
+static inline bool is_dev_swiotlb_force(struct device *dev)
+{
+	return false;
+}
 static inline void swiotlb_exit(void)
 {
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 7a88c34d0867..3713461d6fe0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -496,7 +496,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
 {
 	/* If SWIOTLB is active, use its maximum mapping size */
 	if (is_swiotlb_active(dev) &&
-	    (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+	    (dma_addressing_limited(dev) || is_dev_swiotlb_force(dev)))
 		return swiotlb_max_mapping_size(dev);
 	return SIZE_MAX;
 }
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 13e9e7158d94..6c4d13caceb1 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -87,7 +87,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 	phys_addr_t phys = page_to_phys(page) + offset;
 	dma_addr_t dma_addr = phys_to_dma(dev, phys);
 
-	if (unlikely(swiotlb_force == SWIOTLB_FORCE))
+	if (is_dev_swiotlb_force(dev))
 		return swiotlb_map(dev, phys, size, dir, attrs);
 
 	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 101abeb0a57d..a9907ac262fc 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -179,6 +179,10 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	mem->end = mem->start + bytes;
 	mem->index = 0;
 	mem->late_alloc = late_alloc;
+
+	if (swiotlb_force == SWIOTLB_FORCE)
+		mem->force = true;
+
 	spin_lock_init(&mem->lock);
 	for (i = 0; i < mem->nslabs; i++) {
 		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 07/12] swiotlb: Move alloc_size to swiotlb_find_slots
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Rename find_slots to swiotlb_find_slots and move the maintenance of
alloc_size to it for better code reusability later.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index a9907ac262fc..037772724b3c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -431,8 +431,8 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
  * Find a suitable number of IO TLB entries size that will fit this request and
  * allocate a buffer from that IO TLB pool.
  */
-static int find_slots(struct device *dev, phys_addr_t orig_addr,
-		size_t alloc_size)
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+			      size_t alloc_size)
 {
 	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 	unsigned long boundary_mask = dma_get_seg_boundary(dev);
@@ -487,8 +487,11 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
 	return -1;
 
 found:
-	for (i = index; i < index + nslots; i++)
+	for (i = index; i < index + nslots; i++) {
 		mem->slots[i].list = 0;
+		mem->slots[i].alloc_size =
+			alloc_size - ((i - index) << IO_TLB_SHIFT);
+	}
 	for (i = index - 1;
 	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
 	     mem->slots[i].list; i--)
@@ -529,7 +532,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		return (phys_addr_t)DMA_MAPPING_ERROR;
 	}
 
-	index = find_slots(dev, orig_addr, alloc_size + offset);
+	index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset);
 	if (index == -1) {
 		if (!(attrs & DMA_ATTR_NO_WARN))
 			dev_warn_ratelimited(dev,
@@ -543,11 +546,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	 * This is needed when we sync the memory.  Then we sync the buffer if
 	 * needed.
 	 */
-	for (i = 0; i < nr_slots(alloc_size + offset); i++) {
+	for (i = 0; i < nr_slots(alloc_size + offset); i++)
 		mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
-		mem->slots[index + i].alloc_size =
-			alloc_size - (i << IO_TLB_SHIFT);
-	}
 	tlb_addr = slot_addr(mem->start, index) + offset;
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
 	    (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 08/12] swiotlb: Refactor swiotlb_tbl_unmap_single
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Add a new function, swiotlb_release_slots, to make the code reusable for
supporting different bounce buffer pools.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 037772724b3c..fec4934b9926 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -555,27 +555,15 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	return tlb_addr;
 }
 
-/*
- * tlb_addr is the physical address of the bounce buffer to unmap.
- */
-void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
-			      size_t mapping_size, enum dma_data_direction dir,
-			      unsigned long attrs)
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 {
-	struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem;
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 	unsigned long flags;
-	unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+	unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
 	int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
 	int nslots = nr_slots(mem->slots[index].alloc_size + offset);
 	int count, i;
 
-	/*
-	 * First, sync the memory before unmapping the entry
-	 */
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-	    (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-		swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
-
 	/*
 	 * Return the buffer to the free list by setting the corresponding
 	 * entries to indicate the number of contiguous entries available.
@@ -610,6 +598,23 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	spin_unlock_irqrestore(&mem->lock, flags);
 }
 
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+			      size_t mapping_size, enum dma_data_direction dir,
+			      unsigned long attrs)
+{
+	/*
+	 * First, sync the memory before unmapping the entry
+	 */
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+	    (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+		swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+
+	swiotlb_release_slots(dev, tlb_addr);
+}
+
 void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
 		size_t size, enum dma_data_direction dir)
 {
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 09/12] swiotlb: Add restricted DMA alloc/free support
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Add the functions, swiotlb_{alloc,free} to support the memory allocation
from restricted DMA pool.

The restricted DMA pool is preferred if available.

Note that since coherent allocation needs remapping, one must set up
another device coherent pool by shared-dma-pool and use
dma_alloc_from_dev_coherent instead for atomic coherent allocation.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 15 +++++++++++++
 kernel/dma/direct.c     | 50 ++++++++++++++++++++++++++++++-----------
 kernel/dma/swiotlb.c    | 45 +++++++++++++++++++++++++++++++++++--
 3 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index efcd56e3a16c..2d5ec670e064 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -156,4 +156,19 @@ static inline void swiotlb_adjust_size(unsigned long size)
 extern void swiotlb_print_info(void);
 extern void swiotlb_set_max_segment(unsigned int);
 
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+struct page *swiotlb_alloc(struct device *dev, size_t size);
+bool swiotlb_free(struct device *dev, struct page *page, size_t size);
+#else
+static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+	return NULL;
+}
+static inline bool swiotlb_free(struct device *dev, struct page *page,
+				size_t size)
+{
+	return false;
+}
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
+
 #endif /* __LINUX_SWIOTLB_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 3713461d6fe0..da0e09621230 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,6 +75,15 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 		min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
 }
 
+static void __dma_direct_free_pages(struct device *dev, struct page *page,
+				    size_t size)
+{
+	if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
+	    swiotlb_free(dev, page, size))
+		return;
+	dma_free_contiguous(dev, page, size);
+}
+
 static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		gfp_t gfp)
 {
@@ -86,7 +95,16 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 
 	gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
 					   &phys_limit);
-	page = dma_alloc_contiguous(dev, size, gfp);
+	if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) {
+		page = swiotlb_alloc(dev, size);
+		if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+			__dma_direct_free_pages(dev, page, size);
+			return NULL;
+		}
+	}
+
+	if (!page)
+		page = dma_alloc_contiguous(dev, size, gfp);
 	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
 		dma_free_contiguous(dev, page, size);
 		page = NULL;
@@ -142,7 +160,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 		gfp |= __GFP_NOWARN;
 
 	if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
-	    !force_dma_unencrypted(dev)) {
+	    !force_dma_unencrypted(dev) && !is_dev_swiotlb_force(dev)) {
 		page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
 		if (!page)
 			return NULL;
@@ -155,18 +173,23 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 	}
 
 	if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-	    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-	    !dev_is_dma_coherent(dev))
+	    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) &&
+	    !is_dev_swiotlb_force(dev))
 		return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
 
 	/*
 	 * Remapping or decrypting memory may block. If either is required and
 	 * we can't block, allocate the memory from the atomic pools.
+	 * If restricted DMA (i.e., is_dev_swiotlb_force) is required, one must
+	 * set up another device coherent pool by shared-dma-pool and use
+	 * dma_alloc_from_dev_coherent instead.
 	 */
 	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
 	    !gfpflags_allow_blocking(gfp) &&
 	    (force_dma_unencrypted(dev) ||
-	     (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev))))
+	     (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+	      !dev_is_dma_coherent(dev))) &&
+	    !is_dev_swiotlb_force(dev))
 		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
 	/* we always manually zero the memory once we are done */
@@ -237,7 +260,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 			return NULL;
 	}
 out_free_pages:
-	dma_free_contiguous(dev, page, size);
+	__dma_direct_free_pages(dev, page, size);
 	return NULL;
 }
 
@@ -247,15 +270,15 @@ void dma_direct_free(struct device *dev, size_t size,
 	unsigned int page_order = get_order(size);
 
 	if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
-	    !force_dma_unencrypted(dev)) {
+	    !force_dma_unencrypted(dev) && !is_dev_swiotlb_force(dev)) {
 		/* cpu_addr is a struct page cookie, not a kernel address */
 		dma_free_contiguous(dev, cpu_addr, size);
 		return;
 	}
 
 	if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-	    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-	    !dev_is_dma_coherent(dev)) {
+	    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) &&
+	    !is_dev_swiotlb_force(dev)) {
 		arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
 		return;
 	}
@@ -273,7 +296,7 @@ void dma_direct_free(struct device *dev, size_t size,
 	else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
 		arch_dma_clear_uncached(cpu_addr, size);
 
-	dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
+	__dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
 }
 
 struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -283,7 +306,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 	void *ret;
 
 	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
-	    force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp))
+	    force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
+	    !is_dev_swiotlb_force(dev))
 		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
 	page = __dma_direct_alloc_pages(dev, size, gfp);
@@ -310,7 +334,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 	*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
 	return page;
 out_free_pages:
-	dma_free_contiguous(dev, page, size);
+	__dma_direct_free_pages(dev, page, size);
 	return NULL;
 }
 
@@ -329,7 +353,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
 	if (force_dma_unencrypted(dev))
 		set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
 
-	dma_free_contiguous(dev, page, size);
+	__dma_direct_free_pages(dev, page, size);
 }
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index fec4934b9926..6ad85b48f101 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -462,8 +462,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 
 	index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
 	do {
-		if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
-		    (orig_addr & iotlb_align_mask)) {
+		if (orig_addr &&
+		    (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
+			    (orig_addr & iotlb_align_mask)) {
 			index = wrap_index(mem, index + 1);
 			continue;
 		}
@@ -702,3 +703,43 @@ static int __init swiotlb_create_default_debugfs(void)
 late_initcall(swiotlb_create_default_debugfs);
 
 #endif
+
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+	phys_addr_t tlb_addr;
+	int index;
+
+	/*
+	 * Skip io_tlb_default_mem since swiotlb_alloc doesn't support atomic
+	 * coherent allocation. Otherwise might break existing devices.
+	 * One must set up another device coherent pool by shared-dma-pool and
+	 * use dma_alloc_from_dev_coherent instead for atomic coherent
+	 * allocation to avoid memory remapping.
+	 */
+	if (!mem || mem == io_tlb_default_mem)
+		return NULL;
+
+	index = swiotlb_find_slots(dev, 0, size);
+	if (index == -1)
+		return NULL;
+
+	tlb_addr = slot_addr(mem->start, index);
+
+	return pfn_to_page(PFN_DOWN(tlb_addr));
+}
+
+bool swiotlb_free(struct device *dev, struct page *page, size_t size)
+{
+	phys_addr_t tlb_addr = page_to_phys(page);
+
+	if (!is_swiotlb_buffer(dev, tlb_addr))
+		return false;
+
+	swiotlb_release_slots(dev, tlb_addr);
+
+	return true;
+}
+
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related

* [PATCH v11 10/12] swiotlb: Add restricted DMA pool initialization
From: Claire Chang @ 2021-06-16  3:52 UTC (permalink / raw)
  To: Rob Herring, mpe, Joerg Roedel, Will Deacon, Frank Rowand,
	Konrad Rzeszutek Wilk, boris.ostrovsky, jgross, Christoph Hellwig,
	Marek Szyprowski
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, mingo, jxgao, sstabellini,
	Saravana Kannan, xypron.glpk, Rafael J . Wysocki,
	Bartosz Golaszewski, bskeggs, linux-pci, xen-devel,
	Thierry Reding, intel-gfx, matthew.auld, linux-devicetree, daniel,
	airlied, maarten.lankhorst, linuxppc-dev, jani.nikula,
	Nicolas Boichat, rodrigo.vivi, bhelgaas, tientzu, Dan Williams,
	Andy Shevchenko, Greg KH, Randy Dunlap, lkml, tfiga,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, Robin Murphy, bauerman
In-Reply-To: <20210616035240.840463-1-tientzu@chromium.org>

Add the initialization function to create restricted DMA pools from
matching reserved-memory nodes.

Regardless of swiotlb setting, the restricted DMA pool is preferred if
available.

The restricted DMA pools provide a basic level of protection against the
DMA overwriting buffer contents at unexpected times. However, to protect
against general data leakage and system memory corruption, the system
needs to provide a way to lock down the memory access, e.g., MPU.

Signed-off-by: Claire Chang <tientzu@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h |  3 +-
 kernel/dma/Kconfig      | 14 ++++++++
 kernel/dma/swiotlb.c    | 75 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 2d5ec670e064..9616346b727f 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -73,7 +73,8 @@ extern enum swiotlb_force swiotlb_force;
  *		range check to see if the memory was in fact allocated by this
  *		API.
  * @nslabs:	The number of IO TLB blocks (in groups of 64) between @start and
- *		@end. This is command line adjustable via setup_io_tlb_npages.
+ *		@end. For default swiotlb, this is command line adjustable via
+ *		setup_io_tlb_npages.
  * @used:	The number of used IO TLB block.
  * @list:	The free list describing the number of free entries available
  *		from each index.
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 77b405508743..3e961dc39634 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -80,6 +80,20 @@ config SWIOTLB
 	bool
 	select NEED_DMA_MAP_STATE
 
+config DMA_RESTRICTED_POOL
+	bool "DMA Restricted Pool"
+	depends on OF && OF_RESERVED_MEM
+	select SWIOTLB
+	help
+	  This enables support for restricted DMA pools which provide a level of
+	  DMA memory protection on systems with limited hardware protection
+	  capabilities, such as those lacking an IOMMU.
+
+	  For more information see
+	  <Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt>
+	  and <kernel/dma/swiotlb.c>.
+	  If unsure, say "n".
+
 #
 # Should be selected if we can mmap non-coherent mappings to userspace.
 # The only thing that is really required is a way to set an uncached bit
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 6ad85b48f101..f3f271f7e272 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -39,6 +39,13 @@
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
 #endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/slab.h>
+#endif
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -742,4 +749,72 @@ bool swiotlb_free(struct device *dev, struct page *page, size_t size)
 	return true;
 }
 
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+				    struct device *dev)
+{
+	struct io_tlb_mem *mem = rmem->priv;
+	unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+	/*
+	 * Since multiple devices can share the same pool, the private data,
+	 * io_tlb_mem struct, will be initialized by the first device attached
+	 * to it.
+	 */
+	if (!mem) {
+		mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL);
+		if (!mem)
+			return -ENOMEM;
+
+		swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
+		mem->force = true;
+		set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+				     rmem->size >> PAGE_SHIFT);
+
+		rmem->priv = mem;
+
+		if (IS_ENABLED(CONFIG_DEBUG_FS)) {
+			mem->debugfs =
+				debugfs_create_dir(rmem->name, debugfs_dir);
+			swiotlb_create_debugfs_files(mem);
+		}
+	}
+
+	dev->dma_io_tlb_mem = mem;
+
+	return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+					struct device *dev)
+{
+	dev->dma_io_tlb_mem = io_tlb_default_mem;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+	.device_init = rmem_swiotlb_device_init,
+	.device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+	unsigned long node = rmem->fdt_node;
+
+	if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+	    of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+	    of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+	    of_get_flat_dt_prop(node, "no-map", NULL))
+		return -EINVAL;
+
+	if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+		pr_err("Restricted DMA pool must be accessible within the linear mapping.");
+		return -EINVAL;
+	}
+
+	rmem->ops = &rmem_swiotlb_ops;
+	pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
+		&rmem->base, (unsigned long)rmem->size / SZ_1M);
+	return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
 #endif /* CONFIG_DMA_RESTRICTED_POOL */
-- 
2.32.0.272.g935e593368-goog


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox