LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re[2]: PS3 platform is broken on Linux 3.7.0
From: Phileas Fogg @ 2013-02-10  8:59 UTC (permalink / raw)
  To: Geoff Levand; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1358206641.4489.8.camel@smoke>

IEhpLAoKaSBmb3VuZCB3aGVyZSB0aGUgcHJvYmxlbSBsaWVzLgpJIGFsc28gcHJpbnRlZCBzb21l
IHZhbHVlcyBpbiBwczNfaHB0ZV9pbnNlcnQgd2l0aCBhbmQgd2l0aG91dCA2NFRCIHN1cHBvcnQs
IGkgdXNlZCBPcGVuV1JUIHdpdGggTGludXggMy43LjYgZm9yIHRlc3RpbmcuCgpTb21lIHZhbHVl
cyB3aXRob3V0IDY0VEIgc3VwcG9ydDoKLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0t
LS0tLS0tLS0tLS0tLS0tLQoKW8KgwqDCoCAwLjA2MDQ4N10gUlBDOiBSZWdpc3RlcmVkIG5hbWVk
IFVOSVggc29ja2V0IHRyYW5zcG9ydCBtb2R1bGUuClvCoMKgwqAgMC4wNjA1MTFdIFJQQzogUmVn
aXN0ZXJlZCB1ZHAgdHJhbnNwb3J0IG1vZHVsZS4KW8KgwqDCoCAwLjA2MDY3Ml0gUlBDOiBSZWdp
c3RlcmVkIHRjcCB0cmFuc3BvcnQgbW9kdWxlLgpbwqDCoMKgIDAuMDYwODczXSBSUEM6IFJlZ2lz
dGVyZWQgdGNwIE5GU3Y0LjEgYmFja2NoYW5uZWwgdHJhbnNwb3J0IG1vZHVsZS4KW8KgwqDCoCAw
LjA2MTA4MF0gaW5pdGNhbGwgLmluaXRfc3VucnBjKzB4MC8weGJjIHJldHVybmVkIDAgYWZ0ZXIg
Nzg0IHVzZWNzClvCoMKgwqAgMC4wNjEyODBdIGNhbGxpbmfCoCAucG9wdWxhdGVfcm9vdGZzKzB4
MC8weDEyMCBAIDEKW8KgwqDCoCAwLjA2MTY4M10gcHMzX2hwdGVfaW5zZXJ0OnJlc3VsdD0wIHZw
bj1mMDliODlhZjUwMTAxIHBhPWQ0ZTAwMDAgaXg9ZGZhMCB2PWYwOWI4OWFmNTAwMSByPTZjMDA1
ZDRlMDE5NCBwc2l6ZT0wIHNzaXplPTAgbHBhcj02YzAwNWQ0ZTAwMDAKW8KgwqDCoCAwLjA2MTcz
M10gcHMzX2hwdGVfaW5zZXJ0OnJlc3VsdD0wIHZwbj1mMDliODlhZjUwMTAyIHBhPWQ0ZTEwMDAg
aXg9ZGZiOCB2PWYwOWI4OWFmNTAwMSByPTZjMDA1ZDRlMTE5NCBwc2l6ZT0wIHNzaXplPTAgbHBh
cj02YzAwNWQ0ZTEwMDAKW8KgwqDCoCAwLjA2MTg5NV0gcHMzX2hwdGVfaW5zZXJ0OnJlc3VsdD0w
IHZwbj1mMDliODlhZjUwMTAzIHBhPWQ0ZTIwMDAgaXg9ZGZiMCB2PWYwOWI4OWFmNTAwMSByPTZj
MDA1ZDRlMjE5NCBwc2l6ZT0wIHNzaXplPTAgbHBhcj02YzAwNWQ0ZTIwMDAKCgpTb21lIHZhbHVl
cyB3aXRoIDY0VEIgc3VwcG9ydDoKLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0t
LS0tLS0tLS0tLS0tLQoKW8KgwqDCoCAwLjA3NjQ3N10gY2FsbGluZ8KgIC5pbml0X3N1bnJwYysw
eDAvMHhiYyBAIDEKW8KgwqDCoCAwLjA3Njk5Ml0gUlBDOiBSZWdpc3RlcmVkIG5hbWVkIFVOSVgg
c29ja2V0IHRyYW5zcG9ydCBtb2R1bGUuClvCoMKgwqAgMC4wNzcwMTddIFJQQzogUmVnaXN0ZXJl
ZCB1ZHAgdHJhbnNwb3J0IG1vZHVsZS4KW8KgwqDCoCAwLjA3NzA3Nl0gUlBDOiBSZWdpc3RlcmVk
IHRjcCB0cmFuc3BvcnQgbW9kdWxlLgpbwqDCoMKgIDAuMDc3Mjc3XSBSUEM6IFJlZ2lzdGVyZWQg
dGNwIE5GU3Y0LjEgYmFja2NoYW5uZWwgdHJhbnNwb3J0IG1vZHVsZS4KW8KgwqDCoCAwLjA3NzQ4
NF0gaW5pdGNhbGwgLmluaXRfc3VucnBjKzB4MC8weGJjIHJldHVybmVkIDAgYWZ0ZXIgNzg0IHVz
ZWNzClvCoMKgwqAgMC4wNzc2ODRdIGNhbGxpbmfCoCAucG9wdWxhdGVfcm9vdGZzKzB4MC8weDEy
MCBAIDEKW8KgwqDCoCAwLjA3ODEyNl0gcHMzX2hwdGVfaW5zZXJ0OnJlc3VsdD0tMTcgdnBuPTI1
MDA4Njg0ZDgwMTAxIHBhPWQ1NjcwMDAgaXg9MmVjOCB2PTI1MDA4Njg0ZDgwMDEgcj02YzAwNWQ1
NjcxOTQgcHNpemU9MCBzc2l6ZT0wIGxwYXI9NmMwMDVkNTY3MDAwClvCoMKgwqAgMC4wNzgxNjRd
IHBzM19ocHRlX2luc2VydDpyZXN1bHQ9LTE3IHZwbj0yNTAwODY4NGQ4MDEwMSBwYT1kNTY3MDAw
IGl4PTJlYzggdj0yNTAwODY4NGQ4MDAxIHI9NmMwMDVkNTY3MTk0IHBzaXplPTAgc3NpemU9MCBs
cGFyPTZjMDA1ZDU2NzAwMApbwqDCoMKgIDAuMDc4Mjg3XSAtLS0tLS0tLS0tLS1bIGN1dCBoZXJl
IF0tLS0tLS0tLS0tLS0KW8KgwqDCoCAwLjA3ODQ4Ml0gS2VybmVsIEJVRyBhdCBjMDAwMDAwMDAw
MDJjYjNjIFt2ZXJib3NlIGRlYnVnIGluZm8gdW5hdmFpbGFibGVdClvCoMKgwqAgMC4wNzg2ODZd
IE9vcHM6IEV4Y2VwdGlvbiBpbiBrZXJuZWwgbW9kZSwgc2lnOiA1IFsjMV0KW8KgwqDCoCAwLjA3
ODg4M10gU01QIE5SX0NQVVM9MiBQUzMKW8KgwqDCoCAwLjA3OTA4NF0gTW9kdWxlcyBsaW5rZWQg
aW46ClvCoMKgwqAgMC4wNzkyODddIE5JUDogYzAwMDAwMDAwMDAyY2IzYyBMUjogYzAwMDAwMDAw
MDAyY2IzOCBDVFI6IDAwMDAwMDAwMDAyZmZjMzgKW8KgwqDCoCAwLjA3OTQ4OV0gUkVHUzogYzAw
MDAwMDAwZDA0ZjBlMCBUUkFQOiAwNzAwwqDCoCBOb3QgdGFpbnRlZMKgICgzLjcuNikKW8KgwqDC
oCAwLjA3OTY4N10gTVNSOiA4MDAwMDAwMDAwMDIwMDMyIDxTRixJUixEUixSST7CoCBDUjogMjIw
MDAwMjLCoCBYRVI6IDAwMDAwMDAwClvCoMKgwqAgMC4wNzk4ODhdIFNPRlRFOiAwClvCoMKgwqAg
MC4wODAwOTBdIFRBU0sgPSBjMDAwMDAwMDBkMDQ5MDYwWzFdICdzd2FwcGVyLzEnIFRIUkVBRDog
YzAwMDAwMDAwZDA0YzAwMCBDUFU6IDEKR1BSMDA6IGMwMDAwMDAwMDAwMmNiMzggYzAwMDAwMDAw
ZDA0ZjM2MCBjMDAwMDAwMDAxMmVjOGQwIDAwMDAwMDAwMDAwMDAwODEgCkdQUjA0OiAwMDAwMDAw
MDAwMDAwMDAwIDAwMDAwMDAwMDAwMDAwMDAgMDAwMDAwMDAwMDAwMDAwMCAwMDAwMDAwMDAwMDAw
MDAwIApHUFIwODogMDAwMDAwMDAwMDAwMDAwMCBjMDAwMDAwMDAxMjRjZTEwIDAwMDAwMDAwMDAw
MDAwMDAgYzAwMDAwMDAwMDAyYmNmMCAKR1BSMTI6IDAwMDAwMDAwMjIwMDAwMjIgYzAwMDAwMDAw
N2ZmZTI4MCBjMDAwMDAwMDAwMDA4Yzk0IGMwMDAwMDAwMDA1Y2JhMDAgCgoKCkFuZCBub3cgdGFr
ZSBhIGxvb2sgYXQgJ3YnIHZhbHVlcyBpbiBib3RoIGNhc2VzLgoKV2l0aG91dCA2NFRCIHN1cHBv
cnQ6ICAgdj1mMDliODlhZjUwMDEKV2l0aCA2NFRCIHN1cHBvcnQ6IHY9MjUwMDg2ODRkODAwMQoK
TnVtYmVyIG9mIGxlYWRpbmcgemVyb3MgaW4gZjA5Yjg5YWY1MDAxIGlzIDE2LgpOdW1iZXIgb2Yg
bGVhZGluZyB6ZXJvcyBpbiAyNTAwODY4NGQ4MDAxIGlzIDE0LgoKQW5kIHRoYXQncyB3aHkgbHYx
X2luc2VydF9odGFiX2VudHJ5IGZhaWxzIHdpdGggLTE3IHdoaWNoIG1lYW5zIExWMV9JTExFR0FM
X1BBUkFNRVRFUl9WQUxVRSBiZWNhdXNlCnRoZSBIeXBlcnZpc29yIG9mIFBTMyBjaGVja3MgJ0FW
UE4nIHZhbHVlcyBmb3IgbnVtYmVyIG9mIGxlYWRpbmcgemVyb3MgYW5kIGFsbG93cyBhdCBsZWFz
dCAxNSBiaXRzIHdoaWNoIGluIGNhc2UKb2YgJ3YnIHZhbHVlIDI1MDA4Njg0ZDgwMDEgaXMgdG9v
IHNtYWxsIG9mIGNvdXJzZS4KCk5vdCBzdXJlIGhvdyB0byBmaXggaXQgaW4gY3VycmVudCBMaW51
eCBrZXJuZWwuIFlvdSBndXlzIGtub3cgaXQgYmV0dGVyIHRoYW4gbWUuCgpSZWdhcmRzCgoKCgrQ
n9C+0L3QtdC00LXQu9GM0L3QuNC6LCAxNCDRj9C90LLQsNGA0Y8gMjAxMywgMTU6MzcgLTA4OjAw
INC+0YIgR2VvZmYgTGV2YW5kIDxnZW9mZkBpbmZyYWRlYWQub3JnPjoKPkhpLAo+Cj5PbiBGcmks
IDIwMTMtMDEtMTEgYXQgMTg6MTIgLTA4MDAsIEdlb2ZmIExldmFuZCB3cm90ZToKPj4gSSBjaGVj
a2VkIHRoZXNlLCBhbmQgTWljaGFlbCdzIDQwNzgyMWEzNGZjZTg5YjRmMGIwMzFkYmFiNWNlYzdk
MDU5ZjQ2YmMKPj4gZG9lcyBpbmRlZWQgY2F1c2UgdGhlIExWMSBoeXBlcnZpc29yIHRvIHBhbmlj
IGVhcmx5LCBhbmQgaWYgdGhhdCBpcwo+PiByZXZlcnRlZCwgQW5lZXNoJ3MgMDQ4ZWUwOTkzZWM4
MzYwYWJiMGI1MWJkZjhmODcyMWU5ZWQ2MmVjNCBoaXRzIGEgQlVHLgo+Cj5KdXN0IHRvIGdpdmUg
YW4gdXBkYXRlLCBJIGRpZCBhIGxpdHRsZSBtb3JlIHdvcmsgb24gaXQgYW5kIGZvdW5kIHRoYXQK
PnRoZSBjYWxsIHRvIGx2MV9pbnNlcnRfaHRhYl9lbnRyeSgpIGluc2lkZSBwczNfaHB0ZV9pbnNl
cnQoKSBpcwo+ZmFpbGluZy4KPgo+wqDCoMKgIGh0dHA6Ly9naXQua2VybmVsLm9yZy8/cD1saW51
eC9rZXJuZWwvZ2l0L2dlb2ZmL3BzMy1saW51eC5naXQ7YT1ibG9iO2Y9YXJjaC9wb3dlcnBjL3Bs
YXRmb3Jtcy9wczMvaHRhYi5jO2hiPUhFQUQjbDcwCj4KPlRoZSB2YWx1ZXMgb2YgdGhlIHZhcmlh
YmxlcyBwcmludGVkIGFsbCBsb29rIHN0cmFuZ2UgY29tcGFyZWQgd2l0aAo+Y29tbWl0IDA0OGVl
MDk5MyByZXZlcnRlZC4gIEknbGwgdHJ5IGRvIHNvbWUgbW9yZSB3b3JrIG9uIGl0IHRoaXMKPndl
ZWsuCj4KPi1HZW9mZgo+Cj4KPgo+Cj4KPl9fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19f
X19fX19fX19fX19fX19fCj5MaW51eHBwYy1kZXYgbWFpbGluZyBsaXN0Cj5MaW51eHBwYy1kZXZA
bGlzdHMub3psYWJzLm9yZwo+aHR0cHM6Ly9saXN0cy5vemxhYnMub3JnL2xpc3RpbmZvL2xpbnV4
cHBjLWRldgoK

^ permalink raw reply

* Re[3]: PS3 platform is broken on Linux 3.7.0
From: Phileas Fogg @ 2013-02-10  9:16 UTC (permalink / raw)
  To: Geoff Levand, linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1360486777.923599245@f85.mail.ru>

[-- Attachment #1: Type: text/plain, Size: 5185 bytes --]

 And another note.
I took a look at the MMU chapter in the Cell Architecture handbook and indeed the first 15 bits in VA are treated as 0 by the hardware.

Quote:

1. High-order bits above 65 bits in the 80-bit virtual address (VA[0:14]) are not implemented. The hardware always
   treats these bits as `0'. Software must not set these bits to any other value than `0' or the results are undefined in
   the PPE.


regards


Воскресенье, 10 февраля 2013, 12:59 +04:00 от Phileas Fogg <phileas-fogg@mail.ru>:
>Hi,
>
>i found where the problem lies.
>I also printed some values in ps3_hpte_insert with and without 64TB support, i used OpenWRT with Linux 3.7.6 for testing.
>
>Some values without 64TB support:
>-------------------------------------------------
>
>[    0.060487] RPC: Registered named UNIX socket transport module.
>[    0.060511] RPC: Registered udp transport module.
>[    0.060672] RPC: Registered tcp transport module.
>[    0.060873] RPC: Registered tcp NFSv4.1 backchannel transport module.
>[    0.061080] initcall .init_sunrpc+0x0/0xbc returned 0 after 784 usecs
>[    0.061280] calling  .populate_rootfs+0x0/0x120 @ 1
>[    0.061683] ps3_hpte_insert:result=0 vpn=f09b89af50101 pa=d4e0000 ix=dfa0 v=f09b89af5001 r=6c005d4e0194 psize=0 ssize=0 lpar=6c005d4e0000
>[    0.061733] ps3_hpte_insert:result=0 vpn=f09b89af50102 pa=d4e1000 ix=dfb8 v=f09b89af5001 r=6c005d4e1194 psize=0 ssize=0 lpar=6c005d4e1000
>[    0.061895] ps3_hpte_insert:result=0 vpn=f09b89af50103 pa=d4e2000 ix=dfb0 v=f09b89af5001 r=6c005d4e2194 psize=0 ssize=0 lpar=6c005d4e2000
>
>
>Some values with 64TB support:
>-------------------------------------------------
>
>[    0.076477] calling  .init_sunrpc+0x0/0xbc @ 1
>[    0.076992] RPC: Registered named UNIX socket transport module.
>[    0.077017] RPC: Registered udp transport module.
>[    0.077076] RPC: Registered tcp transport module.
>[    0.077277] RPC: Registered tcp NFSv4.1 backchannel transport module.
>[    0.077484] initcall .init_sunrpc+0x0/0xbc returned 0 after 784 usecs
>[    0.077684] calling  .populate_rootfs+0x0/0x120 @ 1
>[    0.078126] ps3_hpte_insert:result=-17 vpn=25008684d80101 pa=d567000 ix=2ec8 v=25008684d8001 r=6c005d567194 psize=0 ssize=0 lpar=6c005d567000
>[    0.078164] ps3_hpte_insert:result=-17 vpn=25008684d80101 pa=d567000 ix=2ec8 v=25008684d8001 r=6c005d567194 psize=0 ssize=0 lpar=6c005d567000
>[    0.078287] ------------[ cut here ]------------
>[    0.078482] Kernel BUG at c00000000002cb3c [verbose debug info unavailable]
>[    0.078686] Oops: Exception in kernel mode, sig: 5 [#1]
>[    0.078883] SMP NR_CPUS=2 PS3
>[    0.079084] Modules linked in:
>[    0.079287] NIP: c00000000002cb3c LR: c00000000002cb38 CTR: 00000000002ffc38
>[    0.079489] REGS: c00000000d04f0e0 TRAP: 0700   Not tainted  (3.7.6)
>[    0.079687] MSR: 8000000000020032 <SF,IR,DR,RI>  CR: 22000022  XER: 00000000
>[    0.079888] SOFTE: 0
>[    0.080090] TASK = c00000000d049060[1] 'swapper/1' THREAD: c00000000d04c000 CPU: 1
>GPR00: c00000000002cb38 c00000000d04f360 c0000000012ec8d0 0000000000000081 
>GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
>GPR08: 0000000000000000 c00000000124ce10 0000000000000000 c00000000002bcf0 
>GPR12: 0000000022000022 c000000007ffe280 c000000000008c94 c0000000005cba00 
>
>
>
>And now take a look at 'v' values in both cases.
>
>Without 64TB support:   v=f09b89af5001
>With 64TB support: v=25008684d8001
>
>Number of leading zeros in f09b89af5001 is 16.
>Number of leading zeros in 25008684d8001 is 14.
>
>And that's why lv1_insert_htab_entry fails with -17 which means LV1_ILLEGAL_PARAMETER_VALUE because
>the Hypervisor of PS3 checks 'AVPN' values for number of leading zeros and allows at least 15 bits which in case
>of 'v' value 25008684d8001 is too small of course.
>
>Not sure how to fix it in current Linux kernel. You guys know it better than me.
>
>Regards
>
>
>
>
>Понедельник, 14 января 2013, 15:37 -08:00 от Geoff Levand < geoff@infradead.org >:
>>Hi,
>>
>>On Fri, 2013-01-11 at 18:12 -0800, Geoff Levand wrote:
>>> I checked these, and Michael's 407821a34fce89b4f0b031dbab5cec7d059f46bc
>>> does indeed cause the LV1 hypervisor to panic early, and if that is
>>> reverted, Aneesh's 048ee0993ec8360abb0b51bdf8f8721e9ed62ec4 hits a BUG.
>>
>>Just to give an update, I did a little more work on it and found that
>>the call to lv1_insert_htab_entry() inside ps3_hpte_insert() is
>>failing.
>>
>>     http://git.kernel.org/?p=linux/kernel/git/geoff/ps3-linux.git;a=blob;f=arch/powerpc/platforms/ps3/htab.c;hb=HEAD#l70
>>
>>The values of the variables printed all look strange compared with
>>commit 048ee0993 reverted.  I'll try do some more work on it this
>>week.
>>
>>-Geoff
>>
>>
>>
>>
>>
>>_______________________________________________
>>Linuxppc-dev mailing list
>>Linuxppc-dev@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@lists.ozlabs.org
>https://lists.ozlabs.org/listinfo/linuxppc-dev


[-- Attachment #2: Type: text/html, Size: 6880 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 1/5] powerpc: Syscall hooks for context tracking subsystem
From: Frederic Weisbecker @ 2013-02-10 10:32 UTC (permalink / raw)
  To: Li Zhong; +Cc: paulmck, linuxppc-dev, linux-kernel, paulus
In-Reply-To: <1359714465-6297-2-git-send-email-zhong@linux.vnet.ibm.com>

2013/2/1 Li Zhong <zhong@linux.vnet.ibm.com>:
> This is the syscall slow path hooks for context tracking subsystem,
> corresponding to
> [PATCH] x86: Syscall hooks for userspace RCU extended QS
>   commit bf5a3c13b939813d28ce26c01425054c740d6731
>
> TIF_MEMDIE is moved to the second 16-bits (with value 17), as it seems there
> is no asm code using it. TIF_NOHZ is added to _TIF_SYCALL_T_OR_A, so it is
> better for it to be in the same 16 bits with others in the group, so in the
> asm code, andi. with this group could work.
>
> Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>

Looks good. Thanks.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>

> ---
>  arch/powerpc/include/asm/thread_info.h |    7 +++++--
>  arch/powerpc/kernel/ptrace.c           |    5 +++++
>  2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
> index 406b7b9..414a261 100644
> --- a/arch/powerpc/include/asm/thread_info.h
> +++ b/arch/powerpc/include/asm/thread_info.h
> @@ -97,7 +97,7 @@ static inline struct thread_info *current_thread_info(void)
>  #define TIF_PERFMON_CTXSW      6       /* perfmon needs ctxsw calls */
>  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
>  #define TIF_SINGLESTEP         8       /* singlestepping active */
> -#define TIF_MEMDIE             9       /* is terminating due to OOM killer */
> +#define TIF_NOHZ               9       /* in adaptive nohz mode */
>  #define TIF_SECCOMP            10      /* secure computing */
>  #define TIF_RESTOREALL         11      /* Restore all regs (implies NOERROR) */
>  #define TIF_NOERROR            12      /* Force successful syscall return */
> @@ -106,6 +106,7 @@ static inline struct thread_info *current_thread_info(void)
>  #define TIF_SYSCALL_TRACEPOINT 15      /* syscall tracepoint instrumentation */
>  #define TIF_EMULATE_STACK_STORE        16      /* Is an instruction emulation
>                                                 for stack store? */
> +#define TIF_MEMDIE             17      /* is terminating due to OOM killer */
>
>  /* as above, but as bit values */
>  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
> @@ -124,8 +125,10 @@ static inline struct thread_info *current_thread_info(void)
>  #define _TIF_UPROBE            (1<<TIF_UPROBE)
>  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
> +#define _TIF_NOHZ              (1<<TIF_NOHZ)
>  #define _TIF_SYSCALL_T_OR_A    (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
> -                                _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT)
> +                                _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
> +                                _TIF_NOHZ)
>
>  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
>                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE)
> diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
> index c497000..62238dd 100644
> --- a/arch/powerpc/kernel/ptrace.c
> +++ b/arch/powerpc/kernel/ptrace.c
> @@ -32,6 +32,7 @@
>  #include <trace/syscall.h>
>  #include <linux/hw_breakpoint.h>
>  #include <linux/perf_event.h>
> +#include <linux/context_tracking.h>
>
>  #include <asm/uaccess.h>
>  #include <asm/page.h>
> @@ -1745,6 +1746,8 @@ long do_syscall_trace_enter(struct pt_regs *regs)
>  {
>         long ret = 0;
>
> +       user_exit();
> +
>         secure_computing_strict(regs->gpr[0]);
>
>         if (test_thread_flag(TIF_SYSCALL_TRACE) &&
> @@ -1789,4 +1792,6 @@ void do_syscall_trace_leave(struct pt_regs *regs)
>         step = test_thread_flag(TIF_SINGLESTEP);
>         if (step || test_thread_flag(TIF_SYSCALL_TRACE))
>                 tracehook_report_syscall_exit(regs, step);
> +
> +       user_enter();
>  }
> --
> 1.7.9.5
>

^ permalink raw reply

* Re[2]: PS3 platform is broken on Linux 3.7.0
From: Phileas Fogg @ 2013-02-10 11:45 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1355954017.5397.53.camel@pasglop>

Cj5PbiBGcmksIDIwMTItMTItMTQgYXQgMTY6MzUgKzA0MDAsIFBoaWxlYXMgRm9nZyB3cm90ZToK
Pj4gSGksCj4+IAo+PiBJIHdhbnRlZCB0byBicmluZyB0byB5b3VyIGF0dGVudGlvbiB0aGUgZmFj
dCB0aGF0IHRoZSBQUzMgcGxhdGZvcm0gaXMgYnJva2VuIG9uIExpbnV4IDMuNy4wLgo+PiAKPj4g
aSdtIG5vdCBhYmxlIHRvIGJvb3QgTGludXggMy43LjAgb24gbXkgUFMzIHNsaW0uIExpbnV4IDMu
Ni4xMCBib290cyBqdXN0IGZpbmUgYnV0IG5vdCAzLjcuMAo+PiBXaGVuIGkgdHJ5IHRvIGJvb3Qg
TGludXggMy43LjAgdGhlbiBteSBQUzMgIHNodXRzIGRvd24uCj4+IAo+PiBTbyBpIGNsb25lZCB0
aGUgTGludXggcG93ZXJwYyBHSVQgcmVwb3NpdG9yeSBhbmQgdHJpZWQgdG8gZmluZCBvdXQgd2hp
Y2ggY29tbWl0cyBicm9rZSB0aGUgUFMzIHBsYXRmb3JtLgo+PiBBZnRlciBzb21lIHRpbWUgSSB0
cmFja2VkIGl0IGRvd24gdG8gMiBjb21taXRzOgo+Cj5BbmVlc2gsIGRvIHlvdSBoYXZlIGFueSBp
ZGVhIHdoYXQgbWlnaHQgYmUgZ29pbmcgb24gdGhlcmUgPyBDYW4geW91IGxvb2sKPmF0IHRoZSBQ
UzMgaGFzaCBjb2RlID8gSXQncyBhIGJpdCBkaWZmZXJlbnQgZnJvbSB0aGUgcmVzdCwgeW91IG1p
Z2h0Cj5oYXZlIG1pc3NlZCBhbiB1cGRhdGUgb3IgdHdvLi4uCj4KPk1pY2hhZWwsIHNhbWUgZGVh
bCB3aXRoIFBBQ0EuLi4KPgo+Q2hlZXJzLAo+QmVuLgoKCkkgZGVidWdnZWQgdGhlIGlzc3VlIHdp
dGggdGhlIHBhbmljIG9uIFBBQ0EgYWNjZXNzIG9uIFBTMyBhcmNoIGFuZCBmb3VuZCBvdXQgdGhh
dCBpdCBwYW5pY3MgaW4KCmFyY2gvcG93ZXJwYy9rZXJuZWwvc2V0dXBfNjQuYyAtPiBlYXJseV9z
ZXR1cCAtPiB1ZGJnX2Vhcmx5X2luaXQgLT4gcmVnaXN0ZXJfZWFybHlfdWRiZ19jb25zb2xlIC0+
IGNvbnNvbGVfbG9jayAtPiBkb3duIC0+IHJhd19zcGluX3VubG9ja19pcnFyZXN0b3JlCgpJdCBw
YW5pY3Mgb25seSBpZiBpIGVuYWJsZSBsb2NrIGRlYnVnZ2luZyBpbiBrZXJuZWwuCgpJIHN1Z2dl
c3QgdGhlIGZvbGxvd2luZyBwYXRjaCB0byBmaXggdGhlIGlzc3VlOgoKLS0tIGFyY2gvcG93ZXJw
Yy9rZXJuZWwvc2V0dXBfNjQuYy5vbGQJMjAxMy0wMi0xMCAxMzozOTo0NS4xNDcxMzE1NDcgKzAx
MDAKKysrIGFyY2gvcG93ZXJwYy9rZXJuZWwvc2V0dXBfNjQuYwkyMDEzLTAyLTEwIDEzOjQwOjUx
LjY5NzEzNTQxOSArMDEwMApAQCAtMTg2LDYgKzE4Niw5IEBACiAJaW5pdGlhbGlzZV9wYWNhKCZi
b290X3BhY2EsIDApOwogCXNldHVwX3BhY2EoJmJvb3RfcGFjYSk7CiAKKwkvKiBBbGxvdyBwZXJj
cHUgYWNjZXNzZXMgdG8gIndvcmsiIHVudGlsIHdlIHNldHVwIHBlcmNwdSBkYXRhICovCisJZ2V0
X3BhY2EoKS0+ZGF0YV9vZmZzZXQgPSAwOworCiAJLyogSW5pdGlhbGl6ZSBsb2NrZGVwIGVhcmx5
IG9yIGVsc2Ugc3BpbmxvY2tzIHdpbGwgYmxvdyAqLwogCWxvY2tkZXBfaW5pdCgpOwogCkBAIC0y
MDgsOCArMjExLDYgQEAKIAogCS8qIEZpeCB1cCBwYWNhIGZpZWxkcyByZXF1aXJlZCBmb3IgdGhl
IGJvb3QgY3B1ICovCiAJZ2V0X3BhY2EoKS0+Y3B1X3N0YXJ0ID0gMTsKLQkvKiBBbGxvdyBwZXJj
cHUgYWNjZXNzZXMgdG8gIndvcmsiIHVudGlsIHdlIHNldHVwIHBlcmNwdSBkYXRhICovCi0JZ2V0
X3BhY2EoKS0+ZGF0YV9vZmZzZXQgPSAwOwogCiAJLyogUHJvYmUgdGhlIG1hY2hpbmUgdHlwZSAq
LwogCXByb2JlX21hY2hpbmUoKTsKCgoKCg==

^ permalink raw reply

* Re[3]: PS3 platform is broken on Linux 3.7.0
From: Phileas Fogg @ 2013-02-10 12:17 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1360496719.267074236@f380.i.mail.ru>

IFBsZWFzZSBpZ25vcmUgdGhlIHByZXZpb3VzIHBhdGNoIHRvIGZpeCB0aGUgUEFDQSBpc3N1ZSBv
biBQUzMgYXJjaC4KVGhpcyBpcyB0aGUgY29ycmVjdCBvbmU6CgotLS0gYS9hcmNoL3Bvd2VycGMv
a2VybmVsL3NldHVwXzY0LmMJMjAxMy0wMi0xMCAxMzo1NjoxMi44MDM4NTU2NzMgKzAxMDAKKysr
IGIvYXJjaC9wb3dlcnBjL2tlcm5lbC9zZXR1cF82NC5jCTIwMTMtMDItMTAgMTQ6MDc6MjIuODcw
NTYxMzIyICswMTAwCkBAIC0xODYsNiArMTg2LDkgQEAKIAlpbml0aWFsaXNlX3BhY2EoJmJvb3Rf
cGFjYSwgMCk7CiAJc2V0dXBfcGFjYSgmYm9vdF9wYWNhKTsKIAorCS8qIEFsbG93IHBlcmNwdSBh
Y2Nlc3NlcyB0byAid29yayIgdW50aWwgd2Ugc2V0dXAgcGVyY3B1IGRhdGEgKi8KKwlib290X3Bh
Y2EuZGF0YV9vZmZzZXQgPSAwOworCiAJLyogSW5pdGlhbGl6ZSBsb2NrZGVwIGVhcmx5IG9yIGVs
c2Ugc3BpbmxvY2tzIHdpbGwgYmxvdyAqLwogCWxvY2tkZXBfaW5pdCgpOwogCgoKCgrQktC+0YHQ
utGA0LXRgdC10L3RjNC1LCAxMCDRhNC10LLRgNCw0LvRjyAyMDEzLCAxNTo0NSArMDQ6MDAg0L7R
giBQaGlsZWFzIEZvZ2cgPHBoaWxlYXMtZm9nZ0BtYWlsLnJ1PjoKPgo+Pk9uIEZyaSwgMjAxMi0x
Mi0xNCBhdCAxNjozNSArMDQwMCwgUGhpbGVhcyBGb2dnIHdyb3RlOgo+Pj4gSGksCj4+PiAKPj4+
IEkgd2FudGVkIHRvIGJyaW5nIHRvIHlvdXIgYXR0ZW50aW9uIHRoZSBmYWN0IHRoYXQgdGhlIFBT
MyBwbGF0Zm9ybSBpcyBicm9rZW4gb24gTGludXggMy43LjAuCj4+PiAKPj4+IGknbSBub3QgYWJs
ZSB0byBib290IExpbnV4IDMuNy4wIG9uIG15IFBTMyBzbGltLiBMaW51eCAzLjYuMTAgYm9vdHMg
anVzdCBmaW5lIGJ1dCBub3QgMy43LjAKPj4+IFdoZW4gaSB0cnkgdG8gYm9vdCBMaW51eCAzLjcu
MCB0aGVuIG15IFBTMyAgc2h1dHMgZG93bi4KPj4+IAo+Pj4gU28gaSBjbG9uZWQgdGhlIExpbnV4
IHBvd2VycGMgR0lUIHJlcG9zaXRvcnkgYW5kIHRyaWVkIHRvIGZpbmQgb3V0IHdoaWNoIGNvbW1p
dHMgYnJva2UgdGhlIFBTMyBwbGF0Zm9ybS4KPj4+IEFmdGVyIHNvbWUgdGltZSBJIHRyYWNrZWQg
aXQgZG93biB0byAyIGNvbW1pdHM6Cj4+Cj4+QW5lZXNoLCBkbyB5b3UgaGF2ZSBhbnkgaWRlYSB3
aGF0IG1pZ2h0IGJlIGdvaW5nIG9uIHRoZXJlID8gQ2FuIHlvdSBsb29rCj4+YXQgdGhlIFBTMyBo
YXNoIGNvZGUgPyBJdCdzIGEgYml0IGRpZmZlcmVudCBmcm9tIHRoZSByZXN0LCB5b3UgbWlnaHQK
Pj5oYXZlIG1pc3NlZCBhbiB1cGRhdGUgb3IgdHdvLi4uCj4+Cj4+TWljaGFlbCwgc2FtZSBkZWFs
IHdpdGggUEFDQS4uLgo+Pgo+PkNoZWVycywKPj5CZW4uCj4KPgo+SSBkZWJ1Z2dlZCB0aGUgaXNz
dWUgd2l0aCB0aGUgcGFuaWMgb24gUEFDQSBhY2Nlc3Mgb24gUFMzIGFyY2ggYW5kIGZvdW5kIG91
dCB0aGF0IGl0IHBhbmljcyBpbgo+Cj5hcmNoL3Bvd2VycGMva2VybmVsL3NldHVwXzY0LmMgLT4g
ZWFybHlfc2V0dXAgLT4gdWRiZ19lYXJseV9pbml0IC0+IHJlZ2lzdGVyX2Vhcmx5X3VkYmdfY29u
c29sZSAtPiBjb25zb2xlX2xvY2sgLT4gZG93biAtPiByYXdfc3Bpbl91bmxvY2tfaXJxcmVzdG9y
ZQo+Cj5JdCBwYW5pY3Mgb25seSBpZiBpIGVuYWJsZSBsb2NrIGRlYnVnZ2luZyBpbiBrZXJuZWwu
Cj4KPkkgc3VnZ2VzdCB0aGUgZm9sbG93aW5nIHBhdGNoIHRvIGZpeCB0aGUgaXNzdWU6Cj4KPi0t
LSBhcmNoL3Bvd2VycGMva2VybmVsL3NldHVwXzY0LmMub2xkCTIwMTMtMDItMTAgMTM6Mzk6NDUu
MTQ3MTMxNTQ3ICswMTAwCj4rKysgYXJjaC9wb3dlcnBjL2tlcm5lbC9zZXR1cF82NC5jCTIwMTMt
MDItMTAgMTM6NDA6NTEuNjk3MTM1NDE5ICswMTAwCj5AQCAtMTg2LDYgKzE4Niw5IEBACj7CoAlp
bml0aWFsaXNlX3BhY2EoJmJvb3RfcGFjYSwgMCk7Cj7CoAlzZXR1cF9wYWNhKCZib290X3BhY2Ep
Owo+wqAKPisJLyogQWxsb3cgcGVyY3B1IGFjY2Vzc2VzIHRvICJ3b3JrIiB1bnRpbCB3ZSBzZXR1
cCBwZXJjcHUgZGF0YSAqLwo+KwlnZXRfcGFjYSgpLT5kYXRhX29mZnNldCA9IDA7Cj4rCj7CoAkv
KiBJbml0aWFsaXplIGxvY2tkZXAgZWFybHkgb3IgZWxzZSBzcGlubG9ja3Mgd2lsbCBibG93ICov
Cj7CoAlsb2NrZGVwX2luaXQoKTsKPsKgCj5AQCAtMjA4LDggKzIxMSw2IEBACj7CoAo+wqAJLyog
Rml4IHVwIHBhY2EgZmllbGRzIHJlcXVpcmVkIGZvciB0aGUgYm9vdCBjcHUgKi8KPsKgCWdldF9w
YWNhKCktPmNwdV9zdGFydCA9IDE7Cj4tCS8qIEFsbG93IHBlcmNwdSBhY2Nlc3NlcyB0byAid29y
ayIgdW50aWwgd2Ugc2V0dXAgcGVyY3B1IGRhdGEgKi8KPi0JZ2V0X3BhY2EoKS0+ZGF0YV9vZmZz
ZXQgPSAwOwo+wqAKPsKgCS8qIFByb2JlIHRoZSBtYWNoaW5lIHR5cGUgKi8KPsKgCXByb2JlX21h
Y2hpbmUoKTsKPgo+Cj4KPgo+X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19f
X19fX19fX18KPkxpbnV4cHBjLWRldiBtYWlsaW5nIGxpc3QKPkxpbnV4cHBjLWRldkBsaXN0cy5v
emxhYnMub3JnCj5odHRwczovL2xpc3RzLm96bGFicy5vcmcvbGlzdGluZm8vbGludXhwcGMtZGV2
Cgo=

^ permalink raw reply

* Re: [RFC PATCH 2/5] powerpc: Exception hooks for context tracking subsystem
From: Frederic Weisbecker @ 2013-02-10 14:10 UTC (permalink / raw)
  To: Li Zhong; +Cc: paulmck, linuxppc-dev, linux-kernel, paulus
In-Reply-To: <1359714465-6297-3-git-send-email-zhong@linux.vnet.ibm.com>

2013/2/1 Li Zhong <zhong@linux.vnet.ibm.com>:
> This is the exception hooks for context tracking subsystem, including
> data access, program check, single step, instruction breakpoint, machine check,
> alignment, fp unavailable, altivec assist, unknown exception, whose handlers
> might use RCU.
>
> This patch corresponds to
> [PATCH] x86: Exception hooks for userspace RCU extended QS
>   commit 6ba3c97a38803883c2eee489505796cb0a727122
>
> Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>

Looks good!

I guess we should move exception_enter/exit definition to the generic
code. They should be the same for all archs after all. Also we are
relying on user_mode(regs) but this may be buggy with some corner
cases. For example if an exception happen after a call to user_exit()
(on syscall exit) but before we actually resume in userspace, the
exception will exit in kernel mode from the context tracking POV.

So instead on relying on the regs, which are not sync with the context
tracking state, we should use something like:

prev_state = exception_enter();
...
exception_exit(prev_state);

Also preempt_schedule_irq() is concerned as well by this problem. So I
should convert it to that scheme as well. I'm going to prepare some
patches.

Feel free to merge this patch in the powerpc tree, I'll do the
conversion along the way.

Thanks.

^ permalink raw reply

* Re: Re[3]: PS3 platform is broken on Linux 3.7.0
From: Aneesh Kumar K.V @ 2013-02-10 15:46 UTC (permalink / raw)
  To: Phileas Fogg, Benjamin Herrenschmidt, linuxppc-dev
In-Reply-To: <1360498641.346858866@f116.mail.ru>

Phileas Fogg <phileas-fogg@mail.ru> writes:

>  Please ignore the previous patch to fix the PACA issue on PS3 arch.
> This is the correct one:
>
> --- a/arch/powerpc/kernel/setup_64.c	2013-02-10 13:56:12.803855673 +0100
> +++ b/arch/powerpc/kernel/setup_64.c	2013-02-10 14:07:22.870561322 +0100
> @@ -186,6 +186,9 @@
>  	initialise_paca(&boot_paca, 0);
>  	setup_paca(&boot_paca);
>  
> +	/* Allow percpu accesses to "work" until we setup percpu data */
> +	boot_paca.data_offset = 0;
> +
>  	/* Initialize lockdep early or else spinlocks will blow */
>  	lockdep_init();
>  
>

commit 466921c5a4669f4315528a25f9afd66601ce2c04 is done to fix the
lockdep related issue on ppc64. So this may need little bit more
explanation. So if we explicitly use boot_paca, do we still need the
changes in the above commit ?

-aneesh

^ permalink raw reply

* Re: Re[3]: PS3 platform is broken on Linux 3.7.0
From: Aneesh Kumar K.V @ 2013-02-10 16:11 UTC (permalink / raw)
  To: Phileas Fogg, Geoff Levand, linuxppc-dev
In-Reply-To: <1360487778.293641922@f350.mail.ru>

Phileas Fogg <phileas-fogg@mail.ru> writes:

>  And another note.
> I took a look at the MMU chapter in the Cell Architecture handbook and in=
deed the first 15 bits in VA are treated as 0 by the hardware.
>
> Quote:
>
> 1. High-order bits above 65 bits in the 80-bit virtual address (VA[0:14])=
 are not implemented. The hardware always
> =C2=A0=C2=A0 treats these bits as `0'. Software must not set these bits t=
o any other value than `0' or the results are undefined in
> =C2=A0=C2=A0 the PPE.
>
>

True, we missed the below part of ISA doc:

ISA doc says

"On implementations that support a virtual address size
of only n bits, n < 78, bits 0:77-n of the AVA field must be
zeros. "

The Cell document I found at=20

https://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/7A77CCDF14FE70D58=
52575CA0074E8ED/$file/CellBE_Handbook_v1.12_3Apr09_pub.pdf

gives=20

Virtual Address (VA) Size -> 65 bits

So as per ISA, bits 0:12 should be zero, which should make 0:14 of PTE
fields zero for Cell.

I will try to do a patch.=20

Thanks for debugging this.

-aneesh

^ permalink raw reply

* Re[5]: PS3 platform is broken on Linux 3.7.0
From: Phileas Fogg @ 2013-02-10 17:51 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <87mwvc41fc.fsf@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 1978 bytes --]


>Phileas Fogg < phileas-fogg@mail.ru > writes:
>
>>  Please ignore the previous patch to fix the PACA issue on PS3 arch.
>> This is the correct one:
>>
>> --- a/arch/powerpc/kernel/setup_64.c	2013-02-10 13:56:12.803855673 +0100
>> +++ b/arch/powerpc/kernel/setup_64.c	2013-02-10 14:07:22.870561322 +0100
>> @@ -186,6 +186,9 @@
>>  	initialise_paca(&boot_paca, 0);
>>  	setup_paca(&boot_paca);
>> 
>> +	/* Allow percpu accesses to "work" until we setup percpu data */
>> +	boot_paca.data_offset = 0;
>> +
>>  	/* Initialize lockdep early or else spinlocks will blow */
>>  	lockdep_init();
>> 
>>
>
>commit 466921c5a4669f4315528a25f9afd66601ce2c04 is done to fix the
>lockdep related issue on ppc64. So this may need little bit more
>explanation. So if we explicitly use boot_paca, do we still need the
>changes in the above commit ?
>
>-aneesh
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@lists.ozlabs.org
>https://lists.ozlabs.org/listinfo/linuxppc-dev

Ok, here is the next PACA fix test.

I tested the following patch with Linux 3.8.0-rc7 on PS3 arch and still getting panics.

Patch:

--- arch/powerpc/kernel/setup_64.c.old    2013-02-10 19:34:53.787366191 +0100
+++ arch/powerpc/kernel/setup_64.c    2013-02-10 19:35:38.834035478 +0100
@@ -186,6 +186,9 @@
     initialise_paca(&boot_paca, 0);
     setup_paca(&boot_paca);
 
+    /* Allow percpu accesses to "work" until we setup percpu data */
+    boot_paca.data_offset = 0;
+
     /* Initialize lockdep early or else spinlocks will blow */
     lockdep_init();
 
@@ -208,8 +211,6 @@
 
     /* Fix up paca fields required for the boot cpu */
     get_paca()->cpu_start = 1;
-    /* Allow percpu accesses to "work" until we setup percpu data */
-    get_paca()->data_offset = 0;
 
     /* Probe the machine type */
     probe_machine();



It seems that 'boot_paca' and 'get_paca()' refer to different PACAs.

regards 



[-- Attachment #2: Type: text/html, Size: 3151 bytes --]

^ permalink raw reply

* Re: [PATCH v5 04/45] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks
From: Oleg Nesterov @ 2013-02-10 18:06 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-doc, peterz, fweisbec, mingo, linux-arch, linux,
	xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt, rjw,
	namhyung, tglx, linux-arm-kernel, netdev, linux-kernel, sbw,
	Srivatsa S. Bhat, tj, akpm, linuxppc-dev
In-Reply-To: <20130208231017.GK2666@linux.vnet.ibm.com>

On 02/08, Paul E. McKenney wrote:
>
> On Tue, Jan 22, 2013 at 01:03:53PM +0530, Srivatsa S. Bhat wrote:
> >
> >  void percpu_read_unlock(struct percpu_rwlock *pcpu_rwlock)
> >  {
> > -	read_unlock(&pcpu_rwlock->global_rwlock);
>
> We need an smp_mb() here to keep the critical section ordered before the
> this_cpu_dec() below.  Otherwise, if a writer shows up just after we
> exit the fastpath, that writer is not guaranteed to see the effects of
> our critical section.  Equivalently, the prior read-side critical section
> just might see some of the writer's updates, which could be a bit of
> a surprise to the reader.

Agreed, we should not assume that a "reader" doesn't write. And we should
ensure that this "read" section actually completes before this_cpu_dec().

> > +	/*
> > +	 * We never allow heterogeneous nesting of readers. So it is trivial
> > +	 * to find out the kind of reader we are, and undo the operation
> > +	 * done by our corresponding percpu_read_lock().
> > +	 */
> > +	if (__this_cpu_read(*pcpu_rwlock->reader_refcnt)) {
> > +		this_cpu_dec(*pcpu_rwlock->reader_refcnt);
> > +		smp_wmb(); /* Paired with smp_rmb() in sync_reader() */
>
> Given an smp_mb() above, I don't understand the need for this smp_wmb().
> Isn't the idea that if the writer sees ->reader_refcnt decremented to
> zero, it also needs to see the effects of the corresponding reader's
> critical section?

I am equally confused ;)

OTOH, we can probably aboid any barrier if reader_nested_percpu() == T.

> > +static void announce_writer_inactive(struct percpu_rwlock *pcpu_rwlock)
> > +{
> > +   unsigned int cpu;
> > +
> > +   drop_writer_signal(pcpu_rwlock, smp_processor_id());
>
> Why do we drop ourselves twice?  More to the point, why is it important to
> drop ourselves first?

And don't we need mb() _before_ we clear ->writer_signal ?

> > +static inline void sync_reader(struct percpu_rwlock *pcpu_rwlock,
> > +			       unsigned int cpu)
> > +{
> > +	smp_rmb(); /* Paired with smp_[w]mb() in percpu_read_[un]lock() */
>
> As I understand it, the purpose of this memory barrier is to ensure
> that the stores in drop_writer_signal() happen before the reads from
> ->reader_refcnt in reader_uses_percpu_refcnt(), thus preventing the
> race between a new reader attempting to use the fastpath and this writer
> acquiring the lock.  Unless I am confused, this must be smp_mb() rather
> than smp_rmb().

And note that before sync_reader() we call announce_writer_active() which
already adds mb() before sync_all_readers/sync_reader, so this rmb() looks
unneeded.

But, at the same time, could you confirm that we do not need another mb()
after sync_all_readers() in percpu_write_lock() ? I mean, without mb(),
can't this reader_uses_percpu_refcnt() LOAD leak into the critical section
protected by ->global_rwlock? Then this LOAD can be re-ordered with other
memory operations done by the writer.

Srivatsa, I think that the code would be more understandable if you kill
the helpers like sync_reader/raise_writer_signal. Perhaps even all "write"
helpers, I am not sure. At least, it seems to me that all barriers should
be moved to percpu_write_lock/unlock. But I won't insist of course, up to
you.

And cosmetic nit... How about

	struct xxx {
		unsigned long	reader_refcnt;
		bool		writer_signal;
	}

	struct percpu_rwlock {
		struct xxx __percpu	*xxx;
		rwlock_t		global_rwlock;
	};

?

This saves one alloc_percpu() and ensures that reader_refcnt/writer_signal
are always in the same cache-line.

Oleg.

^ permalink raw reply

* Re: [PATCH v5 04/45] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks
From: Srivatsa S. Bhat @ 2013-02-10 18:38 UTC (permalink / raw)
  To: paulmck
  Cc: linux-doc, peterz, fweisbec, linux-kernel, walken, mingo,
	linux-arch, linux, xiaoguangrong, wangyun, nikunj, linux-pm,
	rusty, rostedt, rjw, Namhyung Kim, tglx, linux-arm-kernel, netdev,
	oleg, sbw, Tejun Heo, akpm, linuxppc-dev
In-Reply-To: <20130208224742.GJ2666@linux.vnet.ibm.com>

On 02/09/2013 04:17 AM, Paul E. McKenney wrote:
> On Tue, Jan 29, 2013 at 08:12:37PM +0900, Namhyung Kim wrote:
>> On Thu, 24 Jan 2013 10:00:04 +0530, Srivatsa S. Bhat wrote:
>>> On 01/24/2013 01:27 AM, Tejun Heo wrote:
>>>> On Thu, Jan 24, 2013 at 01:03:52AM +0530, Srivatsa S. Bhat wrote:
>>>>> CPU 0                          CPU 1
>>>>>
>>>>> read_lock(&rwlock)
>>>>>
>>>>>                               write_lock(&rwlock) //spins, because CPU 0
>>>>>                               //has acquired the lock for read
>>>>>
>>>>> read_lock(&rwlock)
>>>>>    ^^^^^
>>>>> What happens here? Does CPU 0 start spinning (and hence deadlock) or will
>>>>> it continue realizing that it already holds the rwlock for read?
>>>>
>>>> I don't think rwlock allows nesting write lock inside read lock.
>>>> read_lock(); write_lock() will always deadlock.
>>>>
>>>
>>> Sure, I understand that :-) My question was, what happens when *two* CPUs
>>> are involved, as in, the read_lock() is invoked only on CPU 0 whereas the
>>> write_lock() is invoked on CPU 1.
>>>
>>> For example, the same scenario shown above, but with slightly different
>>> timing, will NOT result in a deadlock:
>>>
>>> Scenario 2:
>>>   CPU 0                                CPU 1
>>>
>>> read_lock(&rwlock)
>>>
>>>
>>> read_lock(&rwlock) //doesn't spin
>>>
>>>                                     write_lock(&rwlock) //spins, because CPU 0
>>>                                     //has acquired the lock for read
>>>
>>>
>>> So I was wondering whether the "fairness" logic of rwlocks would cause
>>> the second read_lock() to spin (in the first scenario shown above) because
>>> a writer is already waiting (and hence new readers should spin) and thus
>>> cause a deadlock.
>>
>> In my understanding, current x86 rwlock does basically this (of course,
>> in an atomic fashion):
>>
>>
>> #define RW_LOCK_BIAS 0x10000
>>
>> rwlock_init(rwlock)
>> {
>> 	rwlock->lock = RW_LOCK_BIAS;
>> }
>>
>> arch_read_lock(rwlock)
>> {
>> retry:
>> 	if (--rwlock->lock >= 0)
>> 		return;
>>
>>         rwlock->lock++;
>>         while (rwlock->lock < 1)
>>         	continue;
>>
>>         goto retry;
>> }
>>
>> arch_write_lock(rwlock)
>> {
>> retry:
>> 	if ((rwlock->lock -= RW_LOCK_BIAS) == 0)
>>         	return;
>>
>>         rwlock->lock += RW_LOCK_BIAS;
>> 	while (rwlock->lock != RW_LOCK_BIAS)
>> 		continue;
>>
>>         goto retry;
>> }
>>
>>
>> So I can't find where the 'fairness' logic comes from..
> 
> I looked through several of the rwlock implementations, and in all of
> them the writer backs off if it sees readers -- or refrains from asserting
> ownership of the lock to begin with.
> 
> So it should be OK to use rwlock as shown in the underlying patch.
> 

Thanks a lot for confirming that Paul! So I guess we can use rwlocks
as it is, since its behaviour suits our needs perfectly. So I won't tinker
with atomic counters for a while, atleast not until someone starts making
rwlocks fair.. ;-)

Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v5 05/45] percpu_rwlock: Make percpu-rwlocks IRQ-safe, optimally
From: Oleg Nesterov @ 2013-02-10 18:42 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: linux-doc, peterz, fweisbec, mingo, linux-arch, linux,
	xiaoguangrong, wangyun, paulmck, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, linux-kernel, sbw,
	tj, akpm, linuxppc-dev
In-Reply-To: <20130122073400.13822.52336.stgit@srivatsabhat.in.ibm.com>

only one cosmetic nit...

On 01/22, Srivatsa S. Bhat wrote:
>
> +#define READER_PRESENT		(1UL << 16)
> +#define READER_REFCNT_MASK	(READER_PRESENT - 1)
> +
>  #define reader_uses_percpu_refcnt(pcpu_rwlock, cpu)			\
>  		(ACCESS_ONCE(per_cpu(*((pcpu_rwlock)->reader_refcnt), cpu)))
>
>  #define reader_nested_percpu(pcpu_rwlock)				\
> -			(__this_cpu_read(*((pcpu_rwlock)->reader_refcnt)) > 1)
> +	(__this_cpu_read(*((pcpu_rwlock)->reader_refcnt)) & READER_REFCNT_MASK)
>
>  #define writer_active(pcpu_rwlock)					\
>  			(__this_cpu_read(*((pcpu_rwlock)->writer_signal)))

I think this all can go to lib/percpu-rwlock.c. Nobody needs to know
these implementation details.

Oleg.

^ permalink raw reply

* Re: [PATCH v5 04/45] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks
From: Srivatsa S. Bhat @ 2013-02-10 19:10 UTC (permalink / raw)
  To: paulmck
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw, tj,
	akpm, linuxppc-dev
In-Reply-To: <20130208231017.GK2666@linux.vnet.ibm.com>

On 02/09/2013 04:40 AM, Paul E. McKenney wrote:
> On Tue, Jan 22, 2013 at 01:03:53PM +0530, Srivatsa S. Bhat wrote:
>> Using global rwlocks as the backend for per-CPU rwlocks helps us avoid many
>> lock-ordering related problems (unlike per-cpu locks). However, global
>> rwlocks lead to unnecessary cache-line bouncing even when there are no
>> writers present, which can slow down the system needlessly.
>>
[...]
> 
> Looks pretty close!  Some comments interspersed below.  Please either
> fix the code or my confusion, as the case may be.  ;-)
> 

Sure :-)

>> ---
>>
>>  include/linux/percpu-rwlock.h |   10 +++
>>  lib/percpu-rwlock.c           |  128 ++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 136 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/percpu-rwlock.h b/include/linux/percpu-rwlock.h
>> index 8dec8fe..6819bb8 100644
>> --- a/include/linux/percpu-rwlock.h
>> +++ b/include/linux/percpu-rwlock.h
>> @@ -68,4 +68,14 @@ extern void percpu_free_rwlock(struct percpu_rwlock *);
>>  	__percpu_init_rwlock(pcpu_rwlock, #pcpu_rwlock, &rwlock_key);	\
>>  })
>>
>> +#define reader_uses_percpu_refcnt(pcpu_rwlock, cpu)			\
>> +		(ACCESS_ONCE(per_cpu(*((pcpu_rwlock)->reader_refcnt), cpu)))
>> +
>> +#define reader_nested_percpu(pcpu_rwlock)				\
>> +			(__this_cpu_read(*((pcpu_rwlock)->reader_refcnt)) > 1)
>> +
>> +#define writer_active(pcpu_rwlock)					\
>> +			(__this_cpu_read(*((pcpu_rwlock)->writer_signal)))
>> +
>>  #endif
>> +
>> diff --git a/lib/percpu-rwlock.c b/lib/percpu-rwlock.c
>> index 80dad93..992da5c 100644
>> --- a/lib/percpu-rwlock.c
>> +++ b/lib/percpu-rwlock.c
>> @@ -64,21 +64,145 @@ void percpu_free_rwlock(struct percpu_rwlock *pcpu_rwlock)
>>
>>  void percpu_read_lock(struct percpu_rwlock *pcpu_rwlock)
>>  {
>> -	read_lock(&pcpu_rwlock->global_rwlock);
>> +	preempt_disable();
>> +
>> +	/* First and foremost, let the writer know that a reader is active */
>> +	this_cpu_inc(*pcpu_rwlock->reader_refcnt);
>> +
>> +	/*
>> +	 * If we are already using per-cpu refcounts, it is not safe to switch
>> +	 * the synchronization scheme. So continue using the refcounts.
>> +	 */
>> +	if (reader_nested_percpu(pcpu_rwlock)) {
>> +		goto out;
>> +	} else {
>> +		/*
>> +		 * The write to 'reader_refcnt' must be visible before we
>> +		 * read 'writer_signal'.
>> +		 */
>> +		smp_mb(); /* Paired with smp_rmb() in sync_reader() */
>> +
>> +		if (likely(!writer_active(pcpu_rwlock))) {
>> +			goto out;
>> +		} else {
>> +			/* Writer is active, so switch to global rwlock. */
>> +			read_lock(&pcpu_rwlock->global_rwlock);
>> +
>> +			/*
>> +			 * We might have raced with a writer going inactive
>> +			 * before we took the read-lock. So re-evaluate whether
>> +			 * we still need to hold the rwlock or if we can switch
>> +			 * back to per-cpu refcounts. (This also helps avoid
>> +			 * heterogeneous nesting of readers).
>> +			 */
>> +			if (writer_active(pcpu_rwlock))
> 
> The above writer_active() can be reordered with the following this_cpu_dec(),
> strange though it might seem.  But this is OK because holding the rwlock
> is conservative.  But might be worth a comment.
> 

Ok..

>> +				this_cpu_dec(*pcpu_rwlock->reader_refcnt);
>> +			else
> 
> In contrast, no reordering can happen here because read_unlock() is
> required to keep the critical section underneath the lock.
>

Ok..
 
>> +				read_unlock(&pcpu_rwlock->global_rwlock);
>> +		}
>> +	}
>> +
>> +out:
>> +	/* Prevent reordering of any subsequent reads */
>> +	smp_rmb();
> 
> This should be smp_mb().  "Readers" really can do writes.  Hence the
> name lglock -- "local/global" rather than "reader/writer".
> 

Ok!

[ We were trying to avoid full memory barriers in get/put_online_cpus_atomic()
in the fastpath, as far as possible... Now it feels like all that discussion
was for nothing :-( ]

>>  }
>>
>>  void percpu_read_unlock(struct percpu_rwlock *pcpu_rwlock)
>>  {
>> -	read_unlock(&pcpu_rwlock->global_rwlock);
> 
> We need an smp_mb() here to keep the critical section ordered before the
> this_cpu_dec() below.  Otherwise, if a writer shows up just after we
> exit the fastpath, that writer is not guaranteed to see the effects of
> our critical section.  Equivalently, the prior read-side critical section
> just might see some of the writer's updates, which could be a bit of
> a surprise to the reader.
> 

Ok, will add it.

>> +	/*
>> +	 * We never allow heterogeneous nesting of readers. So it is trivial
>> +	 * to find out the kind of reader we are, and undo the operation
>> +	 * done by our corresponding percpu_read_lock().
>> +	 */
>> +	if (__this_cpu_read(*pcpu_rwlock->reader_refcnt)) {
>> +		this_cpu_dec(*pcpu_rwlock->reader_refcnt);
>> +		smp_wmb(); /* Paired with smp_rmb() in sync_reader() */
> 
> Given an smp_mb() above, I don't understand the need for this smp_wmb().
> Isn't the idea that if the writer sees ->reader_refcnt decremented to
> zero, it also needs to see the effects of the corresponding reader's
> critical section?
>

Not sure what you meant, but my idea here was that the writer should see
the reader_refcnt falling to zero as soon as possible, to avoid keeping the
writer waiting in a tight loop for longer than necessary.
I might have been a little over-zealous to use lighter memory barriers though,
(given our lengthy discussions in the previous versions to reduce the memory
barrier overheads), so the smp_wmb() used above might be wrong.

So, are you saying that the smp_mb() you indicated above would be enough
to make the writer observe the 1->0 transition of reader_refcnt immediately?
 
> Or am I missing something subtle here?  In any case, if this smp_wmb()
> really is needed, there should be some subsequent write that the writer
> might observe.  From what I can see, there is no subsequent write from
> this reader that the writer cares about.
> 

I thought the smp_wmb() here and the smp_rmb() at the writer would ensure
immediate reflection of the reader state at the writer side... Please correct
me if my understanding is incorrect.

>> +	} else {
>> +		read_unlock(&pcpu_rwlock->global_rwlock);
>> +	}
>> +
>> +	preempt_enable();
>> +}
>> +
>> +static inline void raise_writer_signal(struct percpu_rwlock *pcpu_rwlock,
>> +				       unsigned int cpu)
>> +{
>> +	per_cpu(*pcpu_rwlock->writer_signal, cpu) = true;
>> +}
>> +
>> +static inline void drop_writer_signal(struct percpu_rwlock *pcpu_rwlock,
>> +				      unsigned int cpu)
>> +{
>> +	per_cpu(*pcpu_rwlock->writer_signal, cpu) = false;
>> +}
>> +
>> +static void announce_writer_active(struct percpu_rwlock *pcpu_rwlock)
>> +{
>> +	unsigned int cpu;
>> +
>> +	for_each_online_cpu(cpu)
>> +		raise_writer_signal(pcpu_rwlock, cpu);
>> +
>> +	smp_mb(); /* Paired with smp_rmb() in percpu_read_[un]lock() */
>> +}
>> +
>> +static void announce_writer_inactive(struct percpu_rwlock *pcpu_rwlock)
>> +{
>> +	unsigned int cpu;
>> +
>> +	drop_writer_signal(pcpu_rwlock, smp_processor_id());
> 
> Why do we drop ourselves twice?  More to the point, why is it important to
> drop ourselves first?
> 

I don't see where we are dropping ourselves twice. Note that we are no longer
in the cpu_online_mask, so the 'for' loop below won't include us. So we need
to manually drop ourselves. It doesn't matter whether we drop ourselves first
or later.

>> +
>> +	for_each_online_cpu(cpu)
>> +		drop_writer_signal(pcpu_rwlock, cpu);
>> +
>> +	smp_mb(); /* Paired with smp_rmb() in percpu_read_[un]lock() */
>> +}
>> +
>> +/*
>> + * Wait for the reader to see the writer's signal and switch from percpu
>> + * refcounts to global rwlock.
>> + *
>> + * If the reader is still using percpu refcounts, wait for him to switch.
>> + * Else, we can safely go ahead, because either the reader has already
>> + * switched over, or the next reader that comes along on that CPU will
>> + * notice the writer's signal and will switch over to the rwlock.
>> + */
>> +static inline void sync_reader(struct percpu_rwlock *pcpu_rwlock,
>> +			       unsigned int cpu)
>> +{
>> +	smp_rmb(); /* Paired with smp_[w]mb() in percpu_read_[un]lock() */
> 
> As I understand it, the purpose of this memory barrier is to ensure
> that the stores in drop_writer_signal() happen before the reads from
> ->reader_refcnt in reader_uses_percpu_refcnt(),

No, that was not what I intended. announce_writer_inactive() already does
a full smp_mb() after calling drop_writer_signal().

I put the smp_rmb() here and the smp_wmb() at the reader side (after updates
to the ->reader_refcnt) to reflect the state change of ->reader_refcnt
immediately at the writer, so that the writer doesn't have to keep spinning
unnecessarily still referring to the old (non-zero) value of ->reader_refcnt.
Or perhaps I am confused about how to use memory barriers properly.. :-(

> thus preventing the
> race between a new reader attempting to use the fastpath and this writer
> acquiring the lock.  Unless I am confused, this must be smp_mb() rather
> than smp_rmb().
> 
> Also, why not just have a single smp_mb() at the beginning of
> sync_all_readers() instead of executing one barrier per CPU?

Well, since my intention was to help the writer see the update (->reader_refcnt
dropping to zero) ASAP, I kept the multiple smp_rmb()s.

> 
>> +
>> +	while (reader_uses_percpu_refcnt(pcpu_rwlock, cpu))
>> +		cpu_relax();
>> +}
>> +
>> +static void sync_all_readers(struct percpu_rwlock *pcpu_rwlock)
>> +{
>> +	unsigned int cpu;
>> +
>> +	for_each_online_cpu(cpu)
>> +		sync_reader(pcpu_rwlock, cpu);
>>  }
>>
>>  void percpu_write_lock(struct percpu_rwlock *pcpu_rwlock)
>>  {
>> +	/*
>> +	 * Tell all readers that a writer is becoming active, so that they
>> +	 * start switching over to the global rwlock.
>> +	 */
>> +	announce_writer_active(pcpu_rwlock);
>> +	sync_all_readers(pcpu_rwlock);
>>  	write_lock(&pcpu_rwlock->global_rwlock);
>>  }
>>
>>  void percpu_write_unlock(struct percpu_rwlock *pcpu_rwlock)
>>  {
>> +	/*
>> +	 * Inform all readers that we are done, so that they can switch back
>> +	 * to their per-cpu refcounts. (We don't need to wait for them to
>> +	 * see it).
>> +	 */
>> +	announce_writer_inactive(pcpu_rwlock);
>>  	write_unlock(&pcpu_rwlock->global_rwlock);
>>  }
>>
>>

Thanks a lot for your detailed review and comments! :-)

Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v5 04/45] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks
From: Srivatsa S. Bhat @ 2013-02-10 19:24 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-doc, peterz, fweisbec, mingo, linux-arch, linux,
	xiaoguangrong, wangyun, Paul E. McKenney, nikunj, linux-pm, rusty,
	rostedt, rjw, namhyung, tglx, linux-arm-kernel, netdev,
	linux-kernel, sbw, tj, akpm, linuxppc-dev
In-Reply-To: <20130210180607.GA1375@redhat.com>

On 02/10/2013 11:36 PM, Oleg Nesterov wrote:
> On 02/08, Paul E. McKenney wrote:
>>
>> On Tue, Jan 22, 2013 at 01:03:53PM +0530, Srivatsa S. Bhat wrote:
>>>
>>>  void percpu_read_unlock(struct percpu_rwlock *pcpu_rwlock)
>>>  {
>>> -	read_unlock(&pcpu_rwlock->global_rwlock);
>>
>> We need an smp_mb() here to keep the critical section ordered before the
>> this_cpu_dec() below.  Otherwise, if a writer shows up just after we
>> exit the fastpath, that writer is not guaranteed to see the effects of
>> our critical section.  Equivalently, the prior read-side critical section
>> just might see some of the writer's updates, which could be a bit of
>> a surprise to the reader.
> 
> Agreed, we should not assume that a "reader" doesn't write. And we should
> ensure that this "read" section actually completes before this_cpu_dec().
>

Right, will fix.
 
>>> +	/*
>>> +	 * We never allow heterogeneous nesting of readers. So it is trivial
>>> +	 * to find out the kind of reader we are, and undo the operation
>>> +	 * done by our corresponding percpu_read_lock().
>>> +	 */
>>> +	if (__this_cpu_read(*pcpu_rwlock->reader_refcnt)) {
>>> +		this_cpu_dec(*pcpu_rwlock->reader_refcnt);
>>> +		smp_wmb(); /* Paired with smp_rmb() in sync_reader() */
>>
>> Given an smp_mb() above, I don't understand the need for this smp_wmb().
>> Isn't the idea that if the writer sees ->reader_refcnt decremented to
>> zero, it also needs to see the effects of the corresponding reader's
>> critical section?
> 
> I am equally confused ;)
> 
> OTOH, we can probably aboid any barrier if reader_nested_percpu() == T.
> 

Good point! Will add that optimization, thank you!

> 
>>> +static void announce_writer_inactive(struct percpu_rwlock *pcpu_rwlock)
>>> +{
>>> +   unsigned int cpu;
>>> +
>>> +   drop_writer_signal(pcpu_rwlock, smp_processor_id());
>>
>> Why do we drop ourselves twice?  More to the point, why is it important to
>> drop ourselves first?
> 
> And don't we need mb() _before_ we clear ->writer_signal ?
> 

Oh, right! Or, how about moving announce_writer_inactive() to _after_
write_unlock()?

>>> +static inline void sync_reader(struct percpu_rwlock *pcpu_rwlock,
>>> +			       unsigned int cpu)
>>> +{
>>> +	smp_rmb(); /* Paired with smp_[w]mb() in percpu_read_[un]lock() */
>>
>> As I understand it, the purpose of this memory barrier is to ensure
>> that the stores in drop_writer_signal() happen before the reads from
>> ->reader_refcnt in reader_uses_percpu_refcnt(), thus preventing the
>> race between a new reader attempting to use the fastpath and this writer
>> acquiring the lock.  Unless I am confused, this must be smp_mb() rather
>> than smp_rmb().
> 
> And note that before sync_reader() we call announce_writer_active() which
> already adds mb() before sync_all_readers/sync_reader, so this rmb() looks
> unneeded.
> 

My intention was to help the writer see the ->reader_refcnt drop to zero
ASAP; hence I used smp_wmb() at reader and smp_rmb() here at the writer.
Please correct me if my understanding of memory barriers is wrong here..

> But, at the same time, could you confirm that we do not need another mb()
> after sync_all_readers() in percpu_write_lock() ? I mean, without mb(),
> can't this reader_uses_percpu_refcnt() LOAD leak into the critical section
> protected by ->global_rwlock? Then this LOAD can be re-ordered with other
> memory operations done by the writer.
> 

Hmm.. it appears that we need a smp_mb() there.

> 
> 
> Srivatsa, I think that the code would be more understandable if you kill
> the helpers like sync_reader/raise_writer_signal. Perhaps even all "write"
> helpers, I am not sure. At least, it seems to me that all barriers should
> be moved to percpu_write_lock/unlock. But I won't insist of course, up to
> you.
> 

Sure, sure. Even Tejun pointed out that those helpers are getting in the way
of readability. I'll get rid of them in the next version.

> And cosmetic nit... How about
> 
> 	struct xxx {
> 		unsigned long	reader_refcnt;
> 		bool		writer_signal;
> 	}
> 
> 	struct percpu_rwlock {
> 		struct xxx __percpu	*xxx;
> 		rwlock_t		global_rwlock;
> 	};
> 
> ?
> 
> This saves one alloc_percpu() and ensures that reader_refcnt/writer_signal
> are always in the same cache-line.
>

Ok, that sounds better. Will make that change. Thanks a lot Oleg!

Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v5 05/45] percpu_rwlock: Make percpu-rwlocks IRQ-safe, optimally
From: Srivatsa S. Bhat @ 2013-02-10 19:27 UTC (permalink / raw)
  To: paulmck
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw, tj,
	akpm, linuxppc-dev
In-Reply-To: <20130208234403.GL2666@linux.vnet.ibm.com>

On 02/09/2013 05:14 AM, Paul E. McKenney wrote:
> On Tue, Jan 22, 2013 at 01:04:11PM +0530, Srivatsa S. Bhat wrote:
>> If interrupt handlers can also be readers, then one of the ways to make
>> per-CPU rwlocks safe, is to disable interrupts at the reader side before
>> trying to acquire the per-CPU rwlock and keep it disabled throughout the
>> duration of the read-side critical section.
[...]
>> -void percpu_read_lock(struct percpu_rwlock *pcpu_rwlock)
>> +void percpu_read_lock_irqsafe(struct percpu_rwlock *pcpu_rwlock)
>>  {
>>  	preempt_disable();
>>
>>  	/* First and foremost, let the writer know that a reader is active */
>> -	this_cpu_inc(*pcpu_rwlock->reader_refcnt);
>> +	this_cpu_add(*pcpu_rwlock->reader_refcnt, READER_PRESENT);
>>
>>  	/*
>>  	 * If we are already using per-cpu refcounts, it is not safe to switch
>>  	 * the synchronization scheme. So continue using the refcounts.
>>  	 */
>>  	if (reader_nested_percpu(pcpu_rwlock)) {
>> -		goto out;
>> +		this_cpu_inc(*pcpu_rwlock->reader_refcnt);
> 
> Hmmm...  If the reader is nested, it -doesn't- need the memory barrier at
> the end of this function.  If there is lots of nesting, it might be
> worth getting rid of it.
> 

Yes, good point! Will get rid of it.

Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v5 05/45] percpu_rwlock: Make percpu-rwlocks IRQ-safe, optimally
From: Srivatsa S. Bhat @ 2013-02-10 19:30 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-doc, peterz, fweisbec, mingo, linux-arch, linux,
	xiaoguangrong, wangyun, paulmck, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, linux-kernel, sbw,
	tj, akpm, linuxppc-dev
In-Reply-To: <20130210184209.GA3041@redhat.com>

On 02/11/2013 12:12 AM, Oleg Nesterov wrote:
> only one cosmetic nit...
> 
> On 01/22, Srivatsa S. Bhat wrote:
>>
>> +#define READER_PRESENT		(1UL << 16)
>> +#define READER_REFCNT_MASK	(READER_PRESENT - 1)
>> +
>>  #define reader_uses_percpu_refcnt(pcpu_rwlock, cpu)			\
>>  		(ACCESS_ONCE(per_cpu(*((pcpu_rwlock)->reader_refcnt), cpu)))
>>
>>  #define reader_nested_percpu(pcpu_rwlock)				\
>> -			(__this_cpu_read(*((pcpu_rwlock)->reader_refcnt)) > 1)
>> +	(__this_cpu_read(*((pcpu_rwlock)->reader_refcnt)) & READER_REFCNT_MASK)
>>
>>  #define writer_active(pcpu_rwlock)					\
>>  			(__this_cpu_read(*((pcpu_rwlock)->writer_signal)))
> 
> I think this all can go to lib/percpu-rwlock.c. Nobody needs to know
> these implementation details.
> 

Ok, will move them.
 
Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v5 06/45] percpu_rwlock: Allow writers to be readers, and add lockdep annotations
From: Srivatsa S. Bhat @ 2013-02-10 19:32 UTC (permalink / raw)
  To: paulmck
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw, tj,
	akpm, linuxppc-dev
In-Reply-To: <20130208234754.GM2666@linux.vnet.ibm.com>

On 02/09/2013 05:17 AM, Paul E. McKenney wrote:
> On Tue, Jan 22, 2013 at 01:04:23PM +0530, Srivatsa S. Bhat wrote:
>> CPU hotplug (which will be the first user of per-CPU rwlocks) has a special
>> requirement with respect to locking: the writer, after acquiring the per-CPU
>> rwlock for write, must be allowed to take the same lock for read, without
>> deadlocking and without getting complaints from lockdep. In comparison, this
>> is similar to what get_online_cpus()/put_online_cpus() does today: it allows
>> a hotplug writer (who holds the cpu_hotplug.lock mutex) to invoke it without
>> locking issues, because it silently returns if the caller is the hotplug
>> writer itself.
>>
>> This can be easily achieved with per-CPU rwlocks as well (even without a
>> "is this a writer?" check) by incrementing the per-CPU refcount of the writer
>> immediately after taking the global rwlock for write, and then decrementing
>> the per-CPU refcount before releasing the global rwlock.
>> This ensures that any reader that comes along on that CPU while the writer is
>> active (on that same CPU), notices the non-zero value of the nested counter
>> and assumes that it is a nested read-side critical section and proceeds by
>> just incrementing the refcount. Thus we prevent the reader from taking the
>> global rwlock for read, which prevents the writer from deadlocking itself.
>>
>> Add that support and teach lockdep about this special locking scheme so
>> that it knows that this sort of usage is valid. Also add the required lockdep
>> annotations to enable it to detect common locking problems with per-CPU
>> rwlocks.
> 
> Very nice!  The write-side interrupt disabling ensures that the task
> stays on CPU, as required.
> 
> One request: Could we please have a comment explaining the reasons for
> the writer incrementing and decrementing the reader reference count?
> 
> It looked really really strange to me until I came back and read the
> commit log.  ;-)
> 

Sure :-)

Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v5 07/45] CPU hotplug: Provide APIs to prevent CPU offline from atomic context
From: Srivatsa S. Bhat @ 2013-02-10 19:34 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, paulmck, nikunj, linux-pm, rusty,
	rostedt, rjw, tglx, linux-arm-kernel, netdev, oleg, sbw, tj, akpm,
	linuxppc-dev
In-Reply-To: <87mwvsuw68.fsf@sejong.aot.lge.com>

Hi Namhyung,

On 01/29/2013 03:51 PM, Namhyung Kim wrote:
> Hi Srivatsa,
> 
> On Tue, 22 Jan 2013 13:04:54 +0530, Srivatsa S. Bhat wrote:
>> @@ -246,15 +291,21 @@ struct take_cpu_down_param {
>>  static int __ref take_cpu_down(void *_param)
>>  {
>>  	struct take_cpu_down_param *param = _param;
>> -	int err;
>> +	unsigned long flags;
>> +	int err = 0;
> 
> It seems no need to set 'err' to 0.
>

Sorry for the late reply. This mail got buried in my inbox and
I hadn't noticed it until now.. :-(

I'll remove the unnecessary initialization. Thank you!

Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v5 09/45] smp, cpu hotplug: Fix smp_call_function_*() to prevent CPU offline properly
From: Srivatsa S. Bhat @ 2013-02-10 19:41 UTC (permalink / raw)
  To: paulmck
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw, tj,
	akpm, linuxppc-dev
In-Reply-To: <20130209000717.GP2666@linux.vnet.ibm.com>

On 02/09/2013 05:37 AM, Paul E. McKenney wrote:
> On Tue, Jan 22, 2013 at 01:05:10PM +0530, Srivatsa S. Bhat wrote:
>> Once stop_machine() is gone from the CPU offline path, we won't be able to
>> depend on preempt_disable() to prevent CPUs from going offline from under us.
>>
>> Use the get/put_online_cpus_atomic() APIs to prevent CPUs from going offline,
>> while invoking from atomic context.
>>
>> Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
> 
> Would it make sense for get_online_cpus_atomic() to return the current
> CPU number?

Hmm, I'm not so sure. I tried to model it after get_online_cpus(), which doesn't
return anything (for other reasons, of course..)

Moreover, a function name like *_cpu_* returning the CPU number would be intuitive.
But a name such as *_cpus_* (plural) returning a CPU number might appear confusing..

And also I don't think it'll make a huge improvement in the callers.. (We might
be better off avoiding an smp_processor_id() if possible, since this function could
be called in very hot paths).. So I don't see a strong case for returning the
CPU number. But let me know if you think it'll still be worth it for some reason...

>  Looks good otherwise.
> 

Thank you very much for the detailed review, Paul!

Regards,
Srivatsa S. Bhat

> 
>> ---
>>
>>  kernel/smp.c |   40 ++++++++++++++++++++++++++--------------
>>  1 file changed, 26 insertions(+), 14 deletions(-)
>>
>> diff --git a/kernel/smp.c b/kernel/smp.c
>> index 29dd40a..f421bcc 100644
>> --- a/kernel/smp.c
>> +++ b/kernel/smp.c
>> @@ -310,7 +310,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
>>  	 * prevent preemption and reschedule on another processor,
>>  	 * as well as CPU removal
>>  	 */
>> -	this_cpu = get_cpu();
>> +	get_online_cpus_atomic();
>> +	this_cpu = smp_processor_id();
>>
>>  	/*
>>  	 * Can deadlock when called with interrupts disabled.
>> @@ -342,7 +343,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
>>  		}
>>  	}
>>
>> -	put_cpu();
>> +	put_online_cpus_atomic();
>>
>>  	return err;
>>  }
>> @@ -371,8 +372,10 @@ int smp_call_function_any(const struct cpumask *mask,
>>  	const struct cpumask *nodemask;
>>  	int ret;
>>
>> +	get_online_cpus_atomic();
>>  	/* Try for same CPU (cheapest) */
>> -	cpu = get_cpu();
>> +	cpu = smp_processor_id();
>> +
>>  	if (cpumask_test_cpu(cpu, mask))
>>  		goto call;
>>
>> @@ -388,7 +391,7 @@ int smp_call_function_any(const struct cpumask *mask,
>>  	cpu = cpumask_any_and(mask, cpu_online_mask);
>>  call:
>>  	ret = smp_call_function_single(cpu, func, info, wait);
>> -	put_cpu();
>> +	put_online_cpus_atomic();
>>  	return ret;
>>  }
>>  EXPORT_SYMBOL_GPL(smp_call_function_any);
>> @@ -409,25 +412,28 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
>>  	unsigned int this_cpu;
>>  	unsigned long flags;
>>
>> -	this_cpu = get_cpu();
>> +	get_online_cpus_atomic();
>> +
>> +	this_cpu = smp_processor_id();
>> +
>>  	/*
>>  	 * Can deadlock when called with interrupts disabled.
>>  	 * We allow cpu's that are not yet online though, as no one else can
>>  	 * send smp call function interrupt to this cpu and as such deadlocks
>>  	 * can't happen.
>>  	 */
>> -	WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
>> +	WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled()
>>  		     && !oops_in_progress);
>>
>>  	if (cpu == this_cpu) {
>>  		local_irq_save(flags);
>>  		data->func(data->info);
>>  		local_irq_restore(flags);
>> -	} else {
>> +	} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
>>  		csd_lock(data);
>>  		generic_exec_single(cpu, data, wait);
>>  	}
>> -	put_cpu();
>> +	put_online_cpus_atomic();
>>  }
>>
>>  /**
>> @@ -451,6 +457,8 @@ void smp_call_function_many(const struct cpumask *mask,
>>  	unsigned long flags;
>>  	int refs, cpu, next_cpu, this_cpu = smp_processor_id();
>>
>> +	get_online_cpus_atomic();
>> +
>>  	/*
>>  	 * Can deadlock when called with interrupts disabled.
>>  	 * We allow cpu's that are not yet online though, as no one else can
>> @@ -467,17 +475,18 @@ void smp_call_function_many(const struct cpumask *mask,
>>
>>  	/* No online cpus?  We're done. */
>>  	if (cpu >= nr_cpu_ids)
>> -		return;
>> +		goto out_unlock;
>>
>>  	/* Do we have another CPU which isn't us? */
>>  	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
>>  	if (next_cpu == this_cpu)
>> -		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
>> +		next_cpu = cpumask_next_and(next_cpu, mask,
>> +						cpu_online_mask);
>>
>>  	/* Fastpath: do that cpu by itself. */
>>  	if (next_cpu >= nr_cpu_ids) {
>>  		smp_call_function_single(cpu, func, info, wait);
>> -		return;
>> +		goto out_unlock;
>>  	}
>>
>>  	data = &__get_cpu_var(cfd_data);
>> @@ -523,7 +532,7 @@ void smp_call_function_many(const struct cpumask *mask,
>>  	/* Some callers race with other cpus changing the passed mask */
>>  	if (unlikely(!refs)) {
>>  		csd_unlock(&data->csd);
>> -		return;
>> +		goto out_unlock;
>>  	}
>>
>>  	raw_spin_lock_irqsave(&call_function.lock, flags);
>> @@ -554,6 +563,9 @@ void smp_call_function_many(const struct cpumask *mask,
>>  	/* Optionally wait for the CPUs to complete */
>>  	if (wait)
>>  		csd_lock_wait(&data->csd);
>> +
>> +out_unlock:
>> +	put_online_cpus_atomic();
>>  }
>>  EXPORT_SYMBOL(smp_call_function_many);
>>
>> @@ -574,9 +586,9 @@ EXPORT_SYMBOL(smp_call_function_many);
>>   */
>>  int smp_call_function(smp_call_func_t func, void *info, int wait)
>>  {
>> -	preempt_disable();
>> +	get_online_cpus_atomic();
>>  	smp_call_function_many(cpu_online_mask, func, info, wait);
>> -	preempt_enable();
>> +	put_online_cpus_atomic();
>>
>>  	return 0;
>>  }
>>

^ permalink raw reply

* Re: [PATCH v5 14/45] rcu, CPU hotplug: Fix comment referring to stop_machine()
From: Srivatsa S. Bhat @ 2013-02-10 19:43 UTC (permalink / raw)
  To: paulmck
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw, tj,
	akpm, linuxppc-dev
In-Reply-To: <20130209001439.GQ2666@linux.vnet.ibm.com>

On 02/09/2013 05:44 AM, Paul E. McKenney wrote:
> On Tue, Jan 22, 2013 at 01:06:34PM +0530, Srivatsa S. Bhat wrote:
>> Don't refer to stop_machine() in the CPU hotplug path, since we are going
>> to get rid of it. Also, move the comment referring to callback adoption
>> to the CPU_DEAD case, because that's where it happens now.
>>
>> Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
> 
> Ouch!  That comment is indeed obsolete and must die.
> 
> I queued this to -rcu with your Signed-off-by.  However, I omitted
> the added comment, as it is imcomplete -- it is easy to look at
> rcu_cleanup_dead_cpu() to see what it does.
>

Cool! I'll drop it from my patch series then. Thank you!

Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v5 44/45] CPU hotplug, stop_machine: Decouple CPU hotplug from stop_machine() in Kconfig
From: Srivatsa S. Bhat @ 2013-02-10 19:45 UTC (permalink / raw)
  To: paulmck
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw, tj,
	akpm, linuxppc-dev
In-Reply-To: <20130209001554.GR2666@linux.vnet.ibm.com>

On 02/09/2013 05:45 AM, Paul E. McKenney wrote:
> On Tue, Jan 22, 2013 at 01:15:22PM +0530, Srivatsa S. Bhat wrote:
>> ... and also cleanup a comment that refers to CPU hotplug being dependent on
>> stop_machine().
>>
>> Cc: David Howells <dhowells@redhat.com>
>> Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
> 
> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> (Hey, I thought I owed myself an easy one!)
> 

Haha ;-)

Regards,
Srivatsa S. Bhat

>> ---
>>
>>  include/linux/stop_machine.h |    2 +-
>>  init/Kconfig                 |    2 +-
>>  2 files changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
>> index 3b5e910..ce2d3c4 100644
>> --- a/include/linux/stop_machine.h
>> +++ b/include/linux/stop_machine.h
>> @@ -120,7 +120,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
>>   * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
>>   *
>>   * Description: This is a special version of the above, which assumes cpus
>> - * won't come or go while it's being called.  Used by hotplug cpu.
>> + * won't come or go while it's being called.
>>   */
>>  int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
>>
>> diff --git a/init/Kconfig b/init/Kconfig
>> index be8b7f5..048a0c5 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -1711,7 +1711,7 @@ config INIT_ALL_POSSIBLE
>>  config STOP_MACHINE
>>  	bool
>>  	default y
>> -	depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU
>> +	depends on (SMP && MODULE_UNLOAD)
>>  	help
>>  	  Need stop_machine() primitive.
>>
>>

^ permalink raw reply

* Re: [PATCH v5 04/45] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks
From: Paul E. McKenney @ 2013-02-10 19:47 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw, tj,
	akpm, linuxppc-dev
In-Reply-To: <5117F0C0.2030605@linux.vnet.ibm.com>

On Mon, Feb 11, 2013 at 12:40:56AM +0530, Srivatsa S. Bhat wrote:
> On 02/09/2013 04:40 AM, Paul E. McKenney wrote:
> > On Tue, Jan 22, 2013 at 01:03:53PM +0530, Srivatsa S. Bhat wrote:
> >> Using global rwlocks as the backend for per-CPU rwlocks helps us avoid many
> >> lock-ordering related problems (unlike per-cpu locks). However, global
> >> rwlocks lead to unnecessary cache-line bouncing even when there are no
> >> writers present, which can slow down the system needlessly.
> >>
> [...]
> > 
> > Looks pretty close!  Some comments interspersed below.  Please either
> > fix the code or my confusion, as the case may be.  ;-)
> > 
> 
> Sure :-)
> 
> >> ---
> >>
> >>  include/linux/percpu-rwlock.h |   10 +++
> >>  lib/percpu-rwlock.c           |  128 ++++++++++++++++++++++++++++++++++++++++-
> >>  2 files changed, 136 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/linux/percpu-rwlock.h b/include/linux/percpu-rwlock.h
> >> index 8dec8fe..6819bb8 100644
> >> --- a/include/linux/percpu-rwlock.h
> >> +++ b/include/linux/percpu-rwlock.h
> >> @@ -68,4 +68,14 @@ extern void percpu_free_rwlock(struct percpu_rwlock *);
> >>  	__percpu_init_rwlock(pcpu_rwlock, #pcpu_rwlock, &rwlock_key);	\
> >>  })
> >>
> >> +#define reader_uses_percpu_refcnt(pcpu_rwlock, cpu)			\
> >> +		(ACCESS_ONCE(per_cpu(*((pcpu_rwlock)->reader_refcnt), cpu)))
> >> +
> >> +#define reader_nested_percpu(pcpu_rwlock)				\
> >> +			(__this_cpu_read(*((pcpu_rwlock)->reader_refcnt)) > 1)
> >> +
> >> +#define writer_active(pcpu_rwlock)					\
> >> +			(__this_cpu_read(*((pcpu_rwlock)->writer_signal)))
> >> +
> >>  #endif
> >> +
> >> diff --git a/lib/percpu-rwlock.c b/lib/percpu-rwlock.c
> >> index 80dad93..992da5c 100644
> >> --- a/lib/percpu-rwlock.c
> >> +++ b/lib/percpu-rwlock.c
> >> @@ -64,21 +64,145 @@ void percpu_free_rwlock(struct percpu_rwlock *pcpu_rwlock)
> >>
> >>  void percpu_read_lock(struct percpu_rwlock *pcpu_rwlock)
> >>  {
> >> -	read_lock(&pcpu_rwlock->global_rwlock);
> >> +	preempt_disable();
> >> +
> >> +	/* First and foremost, let the writer know that a reader is active */
> >> +	this_cpu_inc(*pcpu_rwlock->reader_refcnt);
> >> +
> >> +	/*
> >> +	 * If we are already using per-cpu refcounts, it is not safe to switch
> >> +	 * the synchronization scheme. So continue using the refcounts.
> >> +	 */
> >> +	if (reader_nested_percpu(pcpu_rwlock)) {
> >> +		goto out;
> >> +	} else {
> >> +		/*
> >> +		 * The write to 'reader_refcnt' must be visible before we
> >> +		 * read 'writer_signal'.
> >> +		 */
> >> +		smp_mb(); /* Paired with smp_rmb() in sync_reader() */
> >> +
> >> +		if (likely(!writer_active(pcpu_rwlock))) {
> >> +			goto out;
> >> +		} else {
> >> +			/* Writer is active, so switch to global rwlock. */
> >> +			read_lock(&pcpu_rwlock->global_rwlock);
> >> +
> >> +			/*
> >> +			 * We might have raced with a writer going inactive
> >> +			 * before we took the read-lock. So re-evaluate whether
> >> +			 * we still need to hold the rwlock or if we can switch
> >> +			 * back to per-cpu refcounts. (This also helps avoid
> >> +			 * heterogeneous nesting of readers).
> >> +			 */
> >> +			if (writer_active(pcpu_rwlock))
> > 
> > The above writer_active() can be reordered with the following this_cpu_dec(),
> > strange though it might seem.  But this is OK because holding the rwlock
> > is conservative.  But might be worth a comment.
> > 
> 
> Ok..
> 
> >> +				this_cpu_dec(*pcpu_rwlock->reader_refcnt);
> >> +			else
> > 
> > In contrast, no reordering can happen here because read_unlock() is
> > required to keep the critical section underneath the lock.
> >
> 
> Ok..
> 
> >> +				read_unlock(&pcpu_rwlock->global_rwlock);
> >> +		}
> >> +	}
> >> +
> >> +out:
> >> +	/* Prevent reordering of any subsequent reads */
> >> +	smp_rmb();
> > 
> > This should be smp_mb().  "Readers" really can do writes.  Hence the
> > name lglock -- "local/global" rather than "reader/writer".
> > 
> 
> Ok!
> 
> [ We were trying to avoid full memory barriers in get/put_online_cpus_atomic()
> in the fastpath, as far as possible... Now it feels like all that discussion
> was for nothing :-( ]
> 
> >>  }
> >>
> >>  void percpu_read_unlock(struct percpu_rwlock *pcpu_rwlock)
> >>  {
> >> -	read_unlock(&pcpu_rwlock->global_rwlock);
> > 
> > We need an smp_mb() here to keep the critical section ordered before the
> > this_cpu_dec() below.  Otherwise, if a writer shows up just after we
> > exit the fastpath, that writer is not guaranteed to see the effects of
> > our critical section.  Equivalently, the prior read-side critical section
> > just might see some of the writer's updates, which could be a bit of
> > a surprise to the reader.
> > 
> 
> Ok, will add it.
> 
> >> +	/*
> >> +	 * We never allow heterogeneous nesting of readers. So it is trivial
> >> +	 * to find out the kind of reader we are, and undo the operation
> >> +	 * done by our corresponding percpu_read_lock().
> >> +	 */
> >> +	if (__this_cpu_read(*pcpu_rwlock->reader_refcnt)) {
> >> +		this_cpu_dec(*pcpu_rwlock->reader_refcnt);
> >> +		smp_wmb(); /* Paired with smp_rmb() in sync_reader() */
> > 
> > Given an smp_mb() above, I don't understand the need for this smp_wmb().
> > Isn't the idea that if the writer sees ->reader_refcnt decremented to
> > zero, it also needs to see the effects of the corresponding reader's
> > critical section?
> >
> 
> Not sure what you meant, but my idea here was that the writer should see
> the reader_refcnt falling to zero as soon as possible, to avoid keeping the
> writer waiting in a tight loop for longer than necessary.
> I might have been a little over-zealous to use lighter memory barriers though,
> (given our lengthy discussions in the previous versions to reduce the memory
> barrier overheads), so the smp_wmb() used above might be wrong.
> 
> So, are you saying that the smp_mb() you indicated above would be enough
> to make the writer observe the 1->0 transition of reader_refcnt immediately?
> 
> > Or am I missing something subtle here?  In any case, if this smp_wmb()
> > really is needed, there should be some subsequent write that the writer
> > might observe.  From what I can see, there is no subsequent write from
> > this reader that the writer cares about.
> 
> I thought the smp_wmb() here and the smp_rmb() at the writer would ensure
> immediate reflection of the reader state at the writer side... Please correct
> me if my understanding is incorrect.

Ah, but memory barriers are not so much about making data move faster
through the machine, but more about making sure that ordering constraints
are met.  After all, memory barriers cannot make electrons flow faster
through silicon.  You should therefore use memory barriers only to
constrain ordering, not to try to expedite electrons.

> >> +	} else {
> >> +		read_unlock(&pcpu_rwlock->global_rwlock);
> >> +	}
> >> +
> >> +	preempt_enable();
> >> +}
> >> +
> >> +static inline void raise_writer_signal(struct percpu_rwlock *pcpu_rwlock,
> >> +				       unsigned int cpu)
> >> +{
> >> +	per_cpu(*pcpu_rwlock->writer_signal, cpu) = true;
> >> +}
> >> +
> >> +static inline void drop_writer_signal(struct percpu_rwlock *pcpu_rwlock,
> >> +				      unsigned int cpu)
> >> +{
> >> +	per_cpu(*pcpu_rwlock->writer_signal, cpu) = false;
> >> +}
> >> +
> >> +static void announce_writer_active(struct percpu_rwlock *pcpu_rwlock)
> >> +{
> >> +	unsigned int cpu;
> >> +
> >> +	for_each_online_cpu(cpu)
> >> +		raise_writer_signal(pcpu_rwlock, cpu);
> >> +
> >> +	smp_mb(); /* Paired with smp_rmb() in percpu_read_[un]lock() */
> >> +}
> >> +
> >> +static void announce_writer_inactive(struct percpu_rwlock *pcpu_rwlock)
> >> +{
> >> +	unsigned int cpu;
> >> +
> >> +	drop_writer_signal(pcpu_rwlock, smp_processor_id());
> > 
> > Why do we drop ourselves twice?  More to the point, why is it important to
> > drop ourselves first?
> 
> I don't see where we are dropping ourselves twice. Note that we are no longer
> in the cpu_online_mask, so the 'for' loop below won't include us. So we need
> to manually drop ourselves. It doesn't matter whether we drop ourselves first
> or later.

Good point, apologies for my confusion!  Still worth a commment, though.

> >> +
> >> +	for_each_online_cpu(cpu)
> >> +		drop_writer_signal(pcpu_rwlock, cpu);
> >> +
> >> +	smp_mb(); /* Paired with smp_rmb() in percpu_read_[un]lock() */
> >> +}
> >> +
> >> +/*
> >> + * Wait for the reader to see the writer's signal and switch from percpu
> >> + * refcounts to global rwlock.
> >> + *
> >> + * If the reader is still using percpu refcounts, wait for him to switch.
> >> + * Else, we can safely go ahead, because either the reader has already
> >> + * switched over, or the next reader that comes along on that CPU will
> >> + * notice the writer's signal and will switch over to the rwlock.
> >> + */
> >> +static inline void sync_reader(struct percpu_rwlock *pcpu_rwlock,
> >> +			       unsigned int cpu)
> >> +{
> >> +	smp_rmb(); /* Paired with smp_[w]mb() in percpu_read_[un]lock() */
> > 
> > As I understand it, the purpose of this memory barrier is to ensure
> > that the stores in drop_writer_signal() happen before the reads from
> > ->reader_refcnt in reader_uses_percpu_refcnt(),
> 
> No, that was not what I intended. announce_writer_inactive() already does
> a full smp_mb() after calling drop_writer_signal().
> 
> I put the smp_rmb() here and the smp_wmb() at the reader side (after updates
> to the ->reader_refcnt) to reflect the state change of ->reader_refcnt
> immediately at the writer, so that the writer doesn't have to keep spinning
> unnecessarily still referring to the old (non-zero) value of ->reader_refcnt.
> Or perhaps I am confused about how to use memory barriers properly.. :-(

Sadly, no, memory barriers don't make electrons move faster.  So you
should only need the one -- the additional memory barriers are just
slowing things down.

> > thus preventing the
> > race between a new reader attempting to use the fastpath and this writer
> > acquiring the lock.  Unless I am confused, this must be smp_mb() rather
> > than smp_rmb().
> > 
> > Also, why not just have a single smp_mb() at the beginning of
> > sync_all_readers() instead of executing one barrier per CPU?
> 
> Well, since my intention was to help the writer see the update (->reader_refcnt
> dropping to zero) ASAP, I kept the multiple smp_rmb()s.

At least you were consistent.  ;-)

> >> +
> >> +	while (reader_uses_percpu_refcnt(pcpu_rwlock, cpu))
> >> +		cpu_relax();
> >> +}
> >> +
> >> +static void sync_all_readers(struct percpu_rwlock *pcpu_rwlock)
> >> +{
> >> +	unsigned int cpu;
> >> +
> >> +	for_each_online_cpu(cpu)
> >> +		sync_reader(pcpu_rwlock, cpu);
> >>  }
> >>
> >>  void percpu_write_lock(struct percpu_rwlock *pcpu_rwlock)
> >>  {
> >> +	/*
> >> +	 * Tell all readers that a writer is becoming active, so that they
> >> +	 * start switching over to the global rwlock.
> >> +	 */
> >> +	announce_writer_active(pcpu_rwlock);
> >> +	sync_all_readers(pcpu_rwlock);
> >>  	write_lock(&pcpu_rwlock->global_rwlock);
> >>  }
> >>
> >>  void percpu_write_unlock(struct percpu_rwlock *pcpu_rwlock)
> >>  {
> >> +	/*
> >> +	 * Inform all readers that we are done, so that they can switch back
> >> +	 * to their per-cpu refcounts. (We don't need to wait for them to
> >> +	 * see it).
> >> +	 */
> >> +	announce_writer_inactive(pcpu_rwlock);
> >>  	write_unlock(&pcpu_rwlock->global_rwlock);
> >>  }
> >>
> >>
> 
> Thanks a lot for your detailed review and comments! :-)

It will be good to get this in!

							Thanx, Paul

^ permalink raw reply

* Re: [PATCH v5 04/45] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks
From: Oleg Nesterov @ 2013-02-10 19:50 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: linux-doc, peterz, fweisbec, mingo, linux-arch, linux,
	xiaoguangrong, wangyun, Paul E. McKenney, nikunj, linux-pm, rusty,
	rostedt, rjw, namhyung, tglx, linux-arm-kernel, netdev,
	linux-kernel, sbw, tj, akpm, linuxppc-dev
In-Reply-To: <5117F403.1050300@linux.vnet.ibm.com>

On 02/11, Srivatsa S. Bhat wrote:
>
> On 02/10/2013 11:36 PM, Oleg Nesterov wrote:
> >>> +static void announce_writer_inactive(struct percpu_rwlock *pcpu_rwlock)
> >>> +{
> >>> +   unsigned int cpu;
> >>> +
> >>> +   drop_writer_signal(pcpu_rwlock, smp_processor_id());
> >>
> >> Why do we drop ourselves twice?  More to the point, why is it important to
> >> drop ourselves first?
> >
> > And don't we need mb() _before_ we clear ->writer_signal ?
> >
>
> Oh, right! Or, how about moving announce_writer_inactive() to _after_
> write_unlock()?

Not sure this will help... but, either way it seems we have another
problem...

percpu_rwlock tries to be "generic". This means we should "ignore" its
usage in hotplug, and _write_lock should not race with _write_unlock.

IOW. Suppose that _write_unlock clears ->writer_signal. We need to ensure
that this can't race with another write which wants to set this flag.
Perhaps it should be counter as well, and it should be protected by
the same ->global_rwlock, but _write_lock() should drop it before
sync_all_readers() and then take it again?

> >>> +static inline void sync_reader(struct percpu_rwlock *pcpu_rwlock,
> >>> +			       unsigned int cpu)
> >>> +{
> >>> +	smp_rmb(); /* Paired with smp_[w]mb() in percpu_read_[un]lock() */
> >>
> >> As I understand it, the purpose of this memory barrier is to ensure
> >> that the stores in drop_writer_signal() happen before the reads from
> >> ->reader_refcnt in reader_uses_percpu_refcnt(), thus preventing the
> >> race between a new reader attempting to use the fastpath and this writer
> >> acquiring the lock.  Unless I am confused, this must be smp_mb() rather
> >> than smp_rmb().
> >
> > And note that before sync_reader() we call announce_writer_active() which
> > already adds mb() before sync_all_readers/sync_reader, so this rmb() looks
> > unneeded.
> >
>
> My intention was to help the writer see the ->reader_refcnt drop to zero
> ASAP; hence I used smp_wmb() at reader and smp_rmb() here at the writer.

Hmm, interesting... Not sure, but can't really comment. However I can
answer your next question:

> Please correct me if my understanding of memory barriers is wrong here..

Who? Me??? No we have paulmck for that ;)

Oleg.

^ permalink raw reply

* Re: [PATCH v5 04/45] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks
From: Paul E. McKenney @ 2013-02-10 19:54 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-doc, peterz, fweisbec, mingo, linux-arch, linux,
	xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt, rjw,
	namhyung, tglx, linux-arm-kernel, netdev, linux-kernel, sbw,
	Srivatsa S. Bhat, tj, akpm, linuxppc-dev
In-Reply-To: <20130210180607.GA1375@redhat.com>

On Sun, Feb 10, 2013 at 07:06:07PM +0100, Oleg Nesterov wrote:
> On 02/08, Paul E. McKenney wrote:

[ . . . ]

> > > +static inline void sync_reader(struct percpu_rwlock *pcpu_rwlock,
> > > +			       unsigned int cpu)
> > > +{
> > > +	smp_rmb(); /* Paired with smp_[w]mb() in percpu_read_[un]lock() */
> >
> > As I understand it, the purpose of this memory barrier is to ensure
> > that the stores in drop_writer_signal() happen before the reads from
> > ->reader_refcnt in reader_uses_percpu_refcnt(), thus preventing the
> > race between a new reader attempting to use the fastpath and this writer
> > acquiring the lock.  Unless I am confused, this must be smp_mb() rather
> > than smp_rmb().
> 
> And note that before sync_reader() we call announce_writer_active() which
> already adds mb() before sync_all_readers/sync_reader, so this rmb() looks
> unneeded.
> 
> But, at the same time, could you confirm that we do not need another mb()
> after sync_all_readers() in percpu_write_lock() ? I mean, without mb(),
> can't this reader_uses_percpu_refcnt() LOAD leak into the critical section
> protected by ->global_rwlock? Then this LOAD can be re-ordered with other
> memory operations done by the writer.

As soon as I get the rest of the way through Thomas's patchset.  ;-)

							Thanx, Paul

^ permalink raw reply

* Re: [PATCH v5 09/45] smp, cpu hotplug: Fix smp_call_function_*() to prevent CPU offline properly
From: Paul E. McKenney @ 2013-02-10 19:56 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, nikunj, linux-pm, rusty, rostedt,
	rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw, tj,
	akpm, linuxppc-dev
In-Reply-To: <5117F7E9.7070906@linux.vnet.ibm.com>

On Mon, Feb 11, 2013 at 01:11:29AM +0530, Srivatsa S. Bhat wrote:
> On 02/09/2013 05:37 AM, Paul E. McKenney wrote:
> > On Tue, Jan 22, 2013 at 01:05:10PM +0530, Srivatsa S. Bhat wrote:
> >> Once stop_machine() is gone from the CPU offline path, we won't be able to
> >> depend on preempt_disable() to prevent CPUs from going offline from under us.
> >>
> >> Use the get/put_online_cpus_atomic() APIs to prevent CPUs from going offline,
> >> while invoking from atomic context.
> >>
> >> Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
> > 
> > Would it make sense for get_online_cpus_atomic() to return the current
> > CPU number?
> 
> Hmm, I'm not so sure. I tried to model it after get_online_cpus(), which doesn't
> return anything (for other reasons, of course..)
> 
> Moreover, a function name like *_cpu_* returning the CPU number would be intuitive.
> But a name such as *_cpus_* (plural) returning a CPU number might appear confusing..
> 
> And also I don't think it'll make a huge improvement in the callers.. (We might
> be better off avoiding an smp_processor_id() if possible, since this function could
> be called in very hot paths).. So I don't see a strong case for returning the
> CPU number. But let me know if you think it'll still be worth it for some reason...

I just noted a lot of two-line code sequences in your patch that would be
one line if the CPU number was returned.  But I don't feel strongly about
it, so if people are OK with the current version, no problem.

							Thanx, Paul

> >  Looks good otherwise.
> > 
> 
> Thank you very much for the detailed review, Paul!
> 
> Regards,
> Srivatsa S. Bhat
> 
> > 
> >> ---
> >>
> >>  kernel/smp.c |   40 ++++++++++++++++++++++++++--------------
> >>  1 file changed, 26 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/kernel/smp.c b/kernel/smp.c
> >> index 29dd40a..f421bcc 100644
> >> --- a/kernel/smp.c
> >> +++ b/kernel/smp.c
> >> @@ -310,7 +310,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
> >>  	 * prevent preemption and reschedule on another processor,
> >>  	 * as well as CPU removal
> >>  	 */
> >> -	this_cpu = get_cpu();
> >> +	get_online_cpus_atomic();
> >> +	this_cpu = smp_processor_id();
> >>
> >>  	/*
> >>  	 * Can deadlock when called with interrupts disabled.
> >> @@ -342,7 +343,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
> >>  		}
> >>  	}
> >>
> >> -	put_cpu();
> >> +	put_online_cpus_atomic();
> >>
> >>  	return err;
> >>  }
> >> @@ -371,8 +372,10 @@ int smp_call_function_any(const struct cpumask *mask,
> >>  	const struct cpumask *nodemask;
> >>  	int ret;
> >>
> >> +	get_online_cpus_atomic();
> >>  	/* Try for same CPU (cheapest) */
> >> -	cpu = get_cpu();
> >> +	cpu = smp_processor_id();
> >> +
> >>  	if (cpumask_test_cpu(cpu, mask))
> >>  		goto call;
> >>
> >> @@ -388,7 +391,7 @@ int smp_call_function_any(const struct cpumask *mask,
> >>  	cpu = cpumask_any_and(mask, cpu_online_mask);
> >>  call:
> >>  	ret = smp_call_function_single(cpu, func, info, wait);
> >> -	put_cpu();
> >> +	put_online_cpus_atomic();
> >>  	return ret;
> >>  }
> >>  EXPORT_SYMBOL_GPL(smp_call_function_any);
> >> @@ -409,25 +412,28 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
> >>  	unsigned int this_cpu;
> >>  	unsigned long flags;
> >>
> >> -	this_cpu = get_cpu();
> >> +	get_online_cpus_atomic();
> >> +
> >> +	this_cpu = smp_processor_id();
> >> +
> >>  	/*
> >>  	 * Can deadlock when called with interrupts disabled.
> >>  	 * We allow cpu's that are not yet online though, as no one else can
> >>  	 * send smp call function interrupt to this cpu and as such deadlocks
> >>  	 * can't happen.
> >>  	 */
> >> -	WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
> >> +	WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled()
> >>  		     && !oops_in_progress);
> >>
> >>  	if (cpu == this_cpu) {
> >>  		local_irq_save(flags);
> >>  		data->func(data->info);
> >>  		local_irq_restore(flags);
> >> -	} else {
> >> +	} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
> >>  		csd_lock(data);
> >>  		generic_exec_single(cpu, data, wait);
> >>  	}
> >> -	put_cpu();
> >> +	put_online_cpus_atomic();
> >>  }
> >>
> >>  /**
> >> @@ -451,6 +457,8 @@ void smp_call_function_many(const struct cpumask *mask,
> >>  	unsigned long flags;
> >>  	int refs, cpu, next_cpu, this_cpu = smp_processor_id();
> >>
> >> +	get_online_cpus_atomic();
> >> +
> >>  	/*
> >>  	 * Can deadlock when called with interrupts disabled.
> >>  	 * We allow cpu's that are not yet online though, as no one else can
> >> @@ -467,17 +475,18 @@ void smp_call_function_many(const struct cpumask *mask,
> >>
> >>  	/* No online cpus?  We're done. */
> >>  	if (cpu >= nr_cpu_ids)
> >> -		return;
> >> +		goto out_unlock;
> >>
> >>  	/* Do we have another CPU which isn't us? */
> >>  	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
> >>  	if (next_cpu == this_cpu)
> >> -		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
> >> +		next_cpu = cpumask_next_and(next_cpu, mask,
> >> +						cpu_online_mask);
> >>
> >>  	/* Fastpath: do that cpu by itself. */
> >>  	if (next_cpu >= nr_cpu_ids) {
> >>  		smp_call_function_single(cpu, func, info, wait);
> >> -		return;
> >> +		goto out_unlock;
> >>  	}
> >>
> >>  	data = &__get_cpu_var(cfd_data);
> >> @@ -523,7 +532,7 @@ void smp_call_function_many(const struct cpumask *mask,
> >>  	/* Some callers race with other cpus changing the passed mask */
> >>  	if (unlikely(!refs)) {
> >>  		csd_unlock(&data->csd);
> >> -		return;
> >> +		goto out_unlock;
> >>  	}
> >>
> >>  	raw_spin_lock_irqsave(&call_function.lock, flags);
> >> @@ -554,6 +563,9 @@ void smp_call_function_many(const struct cpumask *mask,
> >>  	/* Optionally wait for the CPUs to complete */
> >>  	if (wait)
> >>  		csd_lock_wait(&data->csd);
> >> +
> >> +out_unlock:
> >> +	put_online_cpus_atomic();
> >>  }
> >>  EXPORT_SYMBOL(smp_call_function_many);
> >>
> >> @@ -574,9 +586,9 @@ EXPORT_SYMBOL(smp_call_function_many);
> >>   */
> >>  int smp_call_function(smp_call_func_t func, void *info, int wait)
> >>  {
> >> -	preempt_disable();
> >> +	get_online_cpus_atomic();
> >>  	smp_call_function_many(cpu_online_mask, func, info, wait);
> >> -	preempt_enable();
> >> +	put_online_cpus_atomic();
> >>
> >>  	return 0;
> >>  }
> >>
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox