Netdev List
 help / color / mirror / Atom feed
* Re: KASAN: use-after-free Write in nr_insert_socket
From: Dmitry Vyukov @ 2019-07-23 16:24 UTC (permalink / raw)
  To: syzbot, Ralf Baechle, David Miller, linux-hams, netdev
  Cc: LKML, syzkaller-bugs
In-Reply-To: <0000000000006241fe058e5b9490@google.com>

On Tue, Jul 23, 2019 at 6:21 PM syzbot
<syzbot+5e54e8e637bc970bbd2b@syzkaller.appspotmail.com> wrote:
>
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:    c6dd78fc Merge branch 'x86-urgent-for-linus' of git://git...
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=178a5c10600000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=3c8985c08e1f9727
> dashboard link: https://syzkaller.appspot.com/bug?extid=5e54e8e637bc970bbd2b
> compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
>
> Unfortunately, I don't have any reproducer for this crash yet.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+5e54e8e637bc970bbd2b@syzkaller.appspotmail.com

+net/netrom/af_netrom.c maintainers

> ==================================================================
> BUG: KASAN: use-after-free in atomic_try_cmpxchg
> /./include/asm-generic/atomic-instrumented.h:693 [inline]
> BUG: KASAN: use-after-free in refcount_inc_not_zero_checked+0xef/0x200
> /lib/refcount.c:134
> Write of size 4 at addr ffff88805b0a2d00 by task syz-executor.1/29302
>
> CPU: 1 PID: 29302 Comm: syz-executor.1 Not tainted 5.2.0+ #64
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>   <IRQ>
>   __dump_stack /lib/dump_stack.c:77 [inline]
>   dump_stack+0x16f/0x1f0 /lib/dump_stack.c:113
>   print_address_description.cold+0xd4/0x306 /mm/kasan/report.c:351
>   __kasan_report.cold+0x1b/0x36 /mm/kasan/report.c:482
>   kasan_report+0x12/0x17 /mm/kasan/common.c:612
>   check_memory_region_inline /mm/kasan/generic.c:185 [inline]
>   check_memory_region+0x134/0x1a0 /mm/kasan/generic.c:192
>   __kasan_check_write+0x14/0x20 /mm/kasan/common.c:98
>   atomic_try_cmpxchg /./include/asm-generic/atomic-instrumented.h:693
> [inline]
>   refcount_inc_not_zero_checked+0xef/0x200 /lib/refcount.c:134
>   refcount_inc_checked+0x17/0x70 /lib/refcount.c:156
>   sock_hold /./include/net/sock.h:649 [inline]
>   sk_add_node /./include/net/sock.h:701 [inline]
>   nr_insert_socket+0x2d/0xe0 /net/netrom/af_netrom.c:137
>   nr_rx_frame+0x1605/0x1e73 /net/netrom/af_netrom.c:1023
>   nr_loopback_timer+0x7b/0x170 /net/netrom/nr_loopback.c:59
>   call_timer_fn+0x1ac/0x700 /kernel/time/timer.c:1322
>   expire_timers /kernel/time/timer.c:1366 [inline]
>   __run_timers /kernel/time/timer.c:1685 [inline]
>   __run_timers /kernel/time/timer.c:1653 [inline]
>   run_timer_softirq+0x66c/0x16a0 /kernel/time/timer.c:1698
>   __do_softirq+0x30d/0x970 /kernel/softirq.c:292
>   invoke_softirq /kernel/softirq.c:373 [inline]
>   irq_exit+0x1d0/0x200 /kernel/softirq.c:413
>   exiting_irq /./arch/x86/include/asm/apic.h:537 [inline]
>   smp_apic_timer_interrupt+0x14e/0x550 /arch/x86/kernel/apic/apic.c:1095
>   apic_timer_interrupt+0xf/0x20 /arch/x86/entry/entry_64.S:828
>   </IRQ>
> RIP: 0010:__preempt_count_sub /./arch/x86/include/asm/preempt.h:84 [inline]
> RIP: 0010:__raw_spin_unlock_irq /./include/linux/spinlock_api_smp.h:169
> [inline]
> RIP: 0010:_raw_spin_unlock_irq+0x54/0x70 /kernel/locking/spinlock.c:199
> Code: c0 a0 e3 d2 88 48 ba 00 00 00 00 00 fc ff df 48 c1 e8 03 80 3c 10 00
> 75 1e 48 83 3d d5 65 a0 01 00 74 12 fb 66 0f 1f 44 00 00 <65> ff 0d ad 7f
> cf 78 41 5c 5d c3 0f 0b 48 c7 c7 a0 e3 d2 88 e8 d3
> RSP: 0018:ffff88808fc87370 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
> RAX: 1ffffffff11a5c74 RBX: ffff888092ace300 RCX: 0000000000000006
> RDX: dffffc0000000000 RSI: 0000000000000008 RDI: ffff888092aceb4c
> RBP: ffff88808fc87378 R08: 1ffffffff14a6d50 R09: fffffbfff14a6d51
> R10: fffffbfff14a6d50 R11: ffffffff8a536a87 R12: ffff8880ae935380
> R13: ffff888053dde280 R14: 0000000000000000 R15: 0000000000000001
>   finish_lock_switch /kernel/sched/core.c:3004 [inline]
>   finish_task_switch+0x11d/0x690 /kernel/sched/core.c:3104
>   context_switch /kernel/sched/core.c:3257 [inline]
>   __schedule+0x77a/0x1530 /kernel/sched/core.c:3880
>   preempt_schedule_common+0x35/0x70 /kernel/sched/core.c:4025
>   _cond_resched+0x1d/0x30 /kernel/sched/core.c:5423
>   generic_perform_write+0x332/0x540 /mm/filemap.c:3344
>   __generic_file_write_iter+0x25e/0x630 /mm/filemap.c:3456
>   ext4_file_write_iter+0x373/0x1430 /fs/ext4/file.c:270
>   call_write_iter /./include/linux/fs.h:1870 [inline]
>   do_iter_readv_writev+0x5f8/0x8f0 /fs/read_write.c:693
>   do_iter_write /fs/read_write.c:970 [inline]
>   do_iter_write+0x184/0x610 /fs/read_write.c:951
>   vfs_iter_write+0x77/0xb0 /fs/read_write.c:983
>   iter_file_splice_write+0x66d/0xbe0 /fs/splice.c:746
>   do_splice_from /fs/splice.c:848 [inline]
>   direct_splice_actor+0x123/0x190 /fs/splice.c:1020
>   splice_direct_to_actor+0x366/0x970 /fs/splice.c:975
>   do_splice_direct+0x1da/0x2a0 /fs/splice.c:1063
>   do_sendfile+0x597/0xd00 /fs/read_write.c:1464
>   __do_sys_sendfile64 /fs/read_write.c:1519 [inline]
>   __se_sys_sendfile64 /fs/read_write.c:1511 [inline]
>   __x64_sys_sendfile64+0x15a/0x220 /fs/read_write.c:1511
>   do_syscall_64+0xfd/0x6a0 /arch/x86/entry/common.c:296
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x459829
> Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
> ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:00007fe4c4360c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
> RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 0000000000459829
> RDX: 0000000020000080 RSI: 0000000000000003 RDI: 0000000000000003
> RBP: 000000000075bfc8 R08: 0000000000000000 R09: 0000000000000000
> R10: 000000000000a198 R11: 0000000000000246 R12: 00007fe4c43616d4
> R13: 00000000004c6e87 R14: 00000000004dc238 R15: 00000000ffffffff
>
> Allocated by task 29302:
>   save_stack+0x23/0x90 /mm/kasan/common.c:69
>   set_track /mm/kasan/common.c:77 [inline]
>   __kasan_kmalloc /mm/kasan/common.c:487 [inline]
>   __kasan_kmalloc.constprop.0+0xcf/0xe0 /mm/kasan/common.c:460
>   kasan_kmalloc+0x9/0x10 /mm/kasan/common.c:501
>   __do_kmalloc /mm/slab.c:3655 [inline]
>   __kmalloc+0x163/0x760 /mm/slab.c:3664
>   kmalloc /./include/linux/slab.h:557 [inline]
>   sk_prot_alloc+0x23a/0x310 /net/core/sock.c:1603
>   sk_alloc+0x39/0xf60 /net/core/sock.c:1657
>   nr_make_new /net/netrom/af_netrom.c:476 [inline]
>   nr_rx_frame+0x733/0x1e73 /net/netrom/af_netrom.c:959
>   nr_loopback_timer+0x7b/0x170 /net/netrom/nr_loopback.c:59
>   call_timer_fn+0x1ac/0x700 /kernel/time/timer.c:1322
>   expire_timers /kernel/time/timer.c:1366 [inline]
>   __run_timers /kernel/time/timer.c:1685 [inline]
>   __run_timers /kernel/time/timer.c:1653 [inline]
>   run_timer_softirq+0x66c/0x16a0 /kernel/time/timer.c:1698
>   __do_softirq+0x30d/0x970 /kernel/softirq.c:292
>
> Freed by task 29300:
>   save_stack+0x23/0x90 /mm/kasan/common.c:69
>   set_track /mm/kasan/common.c:77 [inline]
>   __kasan_slab_free+0x102/0x150 /mm/kasan/common.c:449
>   kasan_slab_free+0xe/0x10 /mm/kasan/common.c:457
>   __cache_free /mm/slab.c:3425 [inline]
>   kfree+0x10a/0x2a0 /mm/slab.c:3756
>   sk_prot_free /net/core/sock.c:1640 [inline]
>   __sk_destruct+0x4f7/0x6e0 /net/core/sock.c:1726
>   sk_destruct+0x86/0xa0 /net/core/sock.c:1734
>   __sk_free+0xfb/0x360 /net/core/sock.c:1745
>   sk_free+0x42/0x50 /net/core/sock.c:1756
>   sock_put /./include/net/sock.h:1725 [inline]
>   sock_efree+0x61/0x80 /net/core/sock.c:2042
>   skb_release_head_state+0xeb/0x250 /net/core/skbuff.c:652
>   skb_release_all+0x16/0x60 /net/core/skbuff.c:663
>   __kfree_skb /net/core/skbuff.c:679 [inline]
>   kfree_skb /net/core/skbuff.c:697 [inline]
>   kfree_skb+0x101/0x380 /net/core/skbuff.c:691
>   nr_accept+0x56e/0x700 /net/netrom/af_netrom.c:819
>   __sys_accept4+0x34e/0x6a0 /net/socket.c:1754
>   __do_sys_accept4 /net/socket.c:1789 [inline]
>   __se_sys_accept4 /net/socket.c:1786 [inline]
>   __x64_sys_accept4+0x97/0xf0 /net/socket.c:1786
>   do_syscall_64+0xfd/0x6a0 /arch/x86/entry/common.c:296
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> The buggy address belongs to the object at ffff88805b0a2c80
>   which belongs to the cache kmalloc-2k of size 2048
> The buggy address is located 128 bytes inside of
>   2048-byte region [ffff88805b0a2c80, ffff88805b0a3480)
> The buggy address belongs to the page:
> page:ffffea00016c2880 refcount:1 mapcount:0 mapping:ffff8880aa400e00
> index:0x0 compound_mapcount: 0
> flags: 0x1fffc0000010200(slab|head)
> raw: 01fffc0000010200 ffffea000180bc08 ffffea00025aef08 ffff8880aa400e00
> raw: 0000000000000000 ffff88805b0a2400 0000000100000003 0000000000000000
> page dumped because: kasan: bad access detected
>
> Memory state around the buggy address:
>   ffff88805b0a2c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>   ffff88805b0a2c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > ffff88805b0a2d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>                     ^
>   ffff88805b0a2d80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>   ffff88805b0a2e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ==================================================================
> ------------[ cut here ]------------
> ODEBUG: activate not available (active state 0) object type: timer_list
> hint: nr_heartbeat_expiry+0x0/0x410 /net/netrom/nr_timer.c:52
> WARNING: CPU: 1 PID: 29302 at lib/debugobjects.c:481
> debug_print_object+0x168/0x250 /lib/debugobjects.c:481
> Modules linked in:
> CPU: 1 PID: 29302 Comm: syz-executor.1 Tainted: G    B             5.2.0+
> #64
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> RIP: 0010:debug_print_object+0x168/0x250 /lib/debugobjects.c:481
> Code: dd 80 02 c6 87 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 b5 00 00 00 48
> 8b 14 dd 80 02 c6 87 48 c7 c7 80 f7 c5 87 e8 60 0d 09 fe <0f> 0b 83 05 e3
> 68 6a 06 01 48 83 c4 20 5b 41 5c 41 5d 41 5e 5d c3
> RSP: 0018:ffff8880ae9099d0 EFLAGS: 00010082
> RAX: 0000000000000000 RBX: 0000000000000005 RCX: 0000000000000000
> RDX: 0000000000000100 RSI: ffffffff815b9de2 RDI: ffffed1015d2132c
> RBP: ffff8880ae909a10 R08: ffff888092ace300 R09: fffffbfff13494e8
> R10: fffffbfff13494e7 R11: ffffffff89a4a73f R12: 0000000000000001
> R13: ffffffff88db3f40 R14: ffffffff8160d430 R15: 1ffff11015d21348
> FS:  00007fe4c4361700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000001b2e620000 CR3: 000000007bf8a000 CR4: 00000000001406e0
> Call Trace:
>   <IRQ>
>   debug_object_activate+0x2e5/0x470 /lib/debugobjects.c:680
>   debug_timer_activate /kernel/time/timer.c:710 [inline]
>   __mod_timer /kernel/time/timer.c:1035 [inline]
>   mod_timer+0x415/0xb90 /kernel/time/timer.c:1096
>   sk_reset_timer+0x24/0x60 /net/core/sock.c:2821
>   nr_start_heartbeat+0x47/0x60 /net/netrom/nr_timer.c:79
>   nr_rx_frame+0x160d/0x1e73 /net/netrom/af_netrom.c:1025
>   nr_loopback_timer+0x7b/0x170 /net/netrom/nr_loopback.c:59
>   call_timer_fn+0x1ac/0x700 /kernel/time/timer.c:1322
>   expire_timers /kernel/time/timer.c:1366 [inline]
>   __run_timers /kernel/time/timer.c:1685 [inline]
>   __run_timers /kernel/time/timer.c:1653 [inline]
>   run_timer_softirq+0x66c/0x16a0 /kernel/time/timer.c:1698
>   __do_softirq+0x30d/0x970 /kernel/softirq.c:292
>   invoke_softirq /kernel/softirq.c:373 [inline]
>   irq_exit+0x1d0/0x200 /kernel/softirq.c:413
>   exiting_irq /./arch/x86/include/asm/apic.h:537 [inline]
>   smp_apic_timer_interrupt+0x14e/0x550 /arch/x86/kernel/apic/apic.c:1095
>   apic_timer_interrupt+0xf/0x20 /arch/x86/entry/entry_64.S:828
>   </IRQ>
> RIP: 0010:__preempt_count_sub /./arch/x86/include/asm/preempt.h:84 [inline]
> RIP: 0010:__raw_spin_unlock_irq /./include/linux/spinlock_api_smp.h:169
> [inline]
> RIP: 0010:_raw_spin_unlock_irq+0x54/0x70 /kernel/locking/spinlock.c:199
> Code: c0 a0 e3 d2 88 48 ba 00 00 00 00 00 fc ff df 48 c1 e8 03 80 3c 10 00
> 75 1e 48 83 3d d5 65 a0 01 00 74 12 fb 66 0f 1f 44 00 00 <65> ff 0d ad 7f
> cf 78 41 5c 5d c3 0f 0b 48 c7 c7 a0 e3 d2 88 e8 d3
> RSP: 0018:ffff88808fc87370 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
> RAX: 1ffffffff11a5c74 RBX: ffff888092ace300 RCX: 0000000000000006
> RDX: dffffc0000000000 RSI: 0000000000000008 RDI: ffff888092aceb4c
> RBP: ffff88808fc87378 R08: 1ffffffff14a6d50 R09: fffffbfff14a6d51
> R10: fffffbfff14a6d50 R11: ffffffff8a536a87 R12: ffff8880ae935380
> R13: ffff888053dde280 R14: 0000000000000000 R15: 0000000000000001
>   finish_lock_switch /kernel/sched/core.c:3004 [inline]
>   finish_task_switch+0x11d/0x690 /kernel/sched/core.c:3104
>   context_switch /kernel/sched/core.c:3257 [inline]
>   __schedule+0x77a/0x1530 /kernel/sched/core.c:3880
>   preempt_schedule_common+0x35/0x70 /kernel/sched/core.c:4025
>   _cond_resched+0x1d/0x30 /kernel/sched/core.c:5423
>   generic_perform_write+0x332/0x540 /mm/filemap.c:3344
>   __generic_file_write_iter+0x25e/0x630 /mm/filemap.c:3456
>   ext4_file_write_iter+0x373/0x1430 /fs/ext4/file.c:270
>   call_write_iter /./include/linux/fs.h:1870 [inline]
>   do_iter_readv_writev+0x5f8/0x8f0 /fs/read_write.c:693
>   do_iter_write /fs/read_write.c:970 [inline]
>   do_iter_write+0x184/0x610 /fs/read_write.c:951
>   vfs_iter_write+0x77/0xb0 /fs/read_write.c:983
>   iter_file_splice_write+0x66d/0xbe0 /fs/splice.c:746
>   do_splice_from /fs/splice.c:848 [inline]
>   direct_splice_actor+0x123/0x190 /fs/splice.c:1020
>   splice_direct_to_actor+0x366/0x970 /fs/splice.c:975
>   do_splice_direct+0x1da/0x2a0 /fs/splice.c:1063
>   do_sendfile+0x597/0xd00 /fs/read_write.c:1464
>   __do_sys_sendfile64 /fs/read_write.c:1519 [inline]
>   __se_sys_sendfile64 /fs/read_write.c:1511 [inline]
>   __x64_sys_sendfile64+0x15a/0x220 /fs/read_write.c:1511
>   do_syscall_64+0xfd/0x6a0 /arch/x86/entry/common.c:296
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x459829
> Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
> ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:00007fe4c4360c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
> RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 0000000000459829
> RDX: 0000000020000080 RSI: 0000000000000003 RDI: 0000000000000003
> RBP: 000000000075bfc8 R08: 0000000000000000 R09: 0000000000000000
> R10: 000000000000a198 R11: 0000000000000246 R12: 00007fe4c43616d4
> R13: 00000000004c6e87 R14: 00000000004dc238 R15: 00000000ffffffff
> irq event stamp: 9211
> hardirqs last  enabled at (9210): [<ffffffff8100651a>]
> trace_hardirqs_on_thunk+0x1a/0x20 /arch/x86/entry/thunk_64.S:41
> hardirqs last disabled at (9211): [<ffffffff8732831f>]
> __raw_spin_lock_irqsave /./include/linux/spinlock_api_smp.h:108 [inline]
> hardirqs last disabled at (9211): [<ffffffff8732831f>]
> _raw_spin_lock_irqsave+0x6f/0xd0 /kernel/locking/spinlock.c:159
> softirqs last  enabled at (8598): [<ffffffff8760069a>]
> __do_softirq+0x69a/0x970 /kernel/softirq.c:319
> softirqs last disabled at (8879): [<ffffffff814526b0>] invoke_softirq
> /kernel/softirq.c:373 [inline]
> softirqs last disabled at (8879): [<ffffffff814526b0>] irq_exit+0x1d0/0x200
> /kernel/softirq.c:413
> ---[ end trace 1f19b790eed0288b ]---
>
>
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report. See:
> https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller-bugs+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller-bugs/0000000000006241fe058e5b9490%40google.com.

^ permalink raw reply

* Re: memory leak in rds_send_probe
From: Dmitry Vyukov @ 2019-07-23 16:19 UTC (permalink / raw)
  To: syzbot, Santosh Shilimkar, David Miller, netdev, linux-rdma,
	rds-devel
  Cc: LKML, syzkaller-bugs
In-Reply-To: <000000000000ad1dfe058e5b89ab@google.com>

On Tue, Jul 23, 2019 at 6:18 PM syzbot
<syzbot+5134cdf021c4ed5aaa5f@syzkaller.appspotmail.com> wrote:
>
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:    c6dd78fc Merge branch 'x86-urgent-for-linus' of git://git...
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=14be98c8600000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=8de7d700ea5ac607
> dashboard link: https://syzkaller.appspot.com/bug?extid=5134cdf021c4ed5aaa5f
> compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=145df0c8600000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=170001f4600000

+net/rds/message.c maintainers

> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+5134cdf021c4ed5aaa5f@syzkaller.appspotmail.com
>
> BUG: memory leak
> unreferenced object 0xffff8881234e9c00 (size 512):
>    comm "kworker/u4:2", pid 286, jiffies 4294948041 (age 7.750s)
>    hex dump (first 32 bytes):
>      01 00 00 00 00 00 00 00 08 9c 4e 23 81 88 ff ff  ..........N#....
>      08 9c 4e 23 81 88 ff ff 18 9c 4e 23 81 88 ff ff  ..N#......N#....
>    backtrace:
>      [<0000000032e378fa>] kmemleak_alloc_recursive
> /./include/linux/kmemleak.h:43 [inline]
>      [<0000000032e378fa>] slab_post_alloc_hook /mm/slab.h:522 [inline]
>      [<0000000032e378fa>] slab_alloc /mm/slab.c:3319 [inline]
>      [<0000000032e378fa>] __do_kmalloc /mm/slab.c:3653 [inline]
>      [<0000000032e378fa>] __kmalloc+0x16d/0x2d0 /mm/slab.c:3664
>      [<0000000015bc9536>] kmalloc /./include/linux/slab.h:557 [inline]
>      [<0000000015bc9536>] kzalloc /./include/linux/slab.h:748 [inline]
>      [<0000000015bc9536>] rds_message_alloc+0x3e/0xc0 /net/rds/message.c:291
>      [<00000000a806d18d>] rds_send_probe.constprop.0+0x42/0x2f0
> /net/rds/send.c:1419
>      [<00000000794a00cc>] rds_send_pong+0x1e/0x23 /net/rds/send.c:1482
>      [<00000000b2a248d0>] rds_recv_incoming+0x27e/0x460 /net/rds/recv.c:343
>      [<00000000ea1503db>] rds_loop_xmit+0x86/0x100 /net/rds/loop.c:96
>      [<00000000a9857f5a>] rds_send_xmit+0x524/0x9a0 /net/rds/send.c:355
>      [<00000000557b0101>] rds_send_worker+0x3c/0xd0 /net/rds/threads.c:200
>      [<000000004ba94868>] process_one_work+0x23f/0x490
> /kernel/workqueue.c:2269
>      [<00000000e793f811>] worker_thread+0x195/0x580 /kernel/workqueue.c:2415
>      [<000000003ee8c1a1>] kthread+0x13e/0x160 /kernel/kthread.c:255
>      [<000000004cd53c81>] ret_from_fork+0x1f/0x30
> /arch/x86/entry/entry_64.S:352
>
>
>
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report. See:
> https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
> syzbot can test patches for this bug, for details see:
> https://goo.gl/tpsmEJ#testing-patches
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller-bugs+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller-bugs/000000000000ad1dfe058e5b89ab%40google.com.

^ permalink raw reply

* KASAN: slab-out-of-bounds Read in do_jit
From: syzbot @ 2019-07-23 16:18 UTC (permalink / raw)
  To: andriin, ast, bpf, daniel, kafai, linux-kernel, netdev,
	songliubraving, syzkaller-bugs, yhs

Hello,

syzbot found the following crash on:

HEAD commit:    89099d85 Merge branch 'flow_offload-fixes'
git tree:       net
console output: https://syzkaller.appspot.com/x/log.txt?x=17169bb7a00000
kernel config:  https://syzkaller.appspot.com/x/.config?x=4dba67bf8b8c9ad7
dashboard link: https://syzkaller.appspot.com/bug?extid=6b40f58c6d280fa23b40
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=117364f0600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=11fcbaafa00000

The bug was bisected to:

commit 2589726d12a1b12eaaa93c7f1ea64287e383c7a5
Author: Alexei Starovoitov <ast@kernel.org>
Date:   Sat Jun 15 19:12:20 2019 +0000

     bpf: introduce bounded loops

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=15d38fa4600000
final crash:    https://syzkaller.appspot.com/x/report.txt?x=17d38fa4600000
console output: https://syzkaller.appspot.com/x/log.txt?x=13d38fa4600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+6b40f58c6d280fa23b40@syzkaller.appspotmail.com
Fixes: 2589726d12a1 ("bpf: introduce bounded loops")

==================================================================
BUG: KASAN: slab-out-of-bounds in do_jit.isra.0+0x4c35/0x5630  
/arch/x86/net/bpf_jit_comp.c:966
Read of size 4 at addr ffff8880a654ec3c by task syz-executor906/8876

CPU: 0 PID: 8876 Comm: syz-executor906 Not tainted 5.2.0+ #95
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack /lib/dump_stack.c:77 [inline]
  dump_stack+0x172/0x1f0 /lib/dump_stack.c:113
  print_address_description.cold+0xd4/0x306 /mm/kasan/report.c:351
  __kasan_report.cold+0x1b/0x36 /mm/kasan/report.c:482
  kasan_report+0x12/0x17 /mm/kasan/common.c:612
  __asan_report_load4_noabort+0x14/0x20 /mm/kasan/generic_report.c:131
  do_jit.isra.0+0x4c35/0x5630 /arch/x86/net/bpf_jit_comp.c:966
  bpf_int_jit_compile+0x374/0xda0 /arch/x86/net/bpf_jit_comp.c:1132
  bpf_prog_select_runtime+0x4cd/0x7d0 /kernel/bpf/core.c:1726
  bpf_prog_load+0xe9b/0x1670 /kernel/bpf/syscall.c:1702
  __do_sys_bpf+0xa46/0x42f0 /kernel/bpf/syscall.c:2849
  __se_sys_bpf /kernel/bpf/syscall.c:2808 [inline]
  __x64_sys_bpf+0x73/0xb0 /kernel/bpf/syscall.c:2808
  do_syscall_64+0xfd/0x6a0 /arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4402c9
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffc7d7bb638 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 00000000004402c9
RDX: 0000000000000046 RSI: 0000000020000180 RDI: 0000000000000005
RBP: 00000000006ca018 R08: 0000000000000000 R09: 0000000000000000
R10: 00000000ffffffff R11: 0000000000000246 R12: 0000000000401b50
R13: 0000000000401be0 R14: 0000000000000000 R15: 0000000000000000

Allocated by task 8388:
  save_stack+0x23/0x90 /mm/kasan/common.c:69
  set_track /mm/kasan/common.c:77 [inline]
  __kasan_kmalloc /mm/kasan/common.c:487 [inline]
  __kasan_kmalloc.constprop.0+0xcf/0xe0 /mm/kasan/common.c:460
  kasan_kmalloc+0x9/0x10 /mm/kasan/common.c:501
  __do_kmalloc /mm/slab.c:3655 [inline]
  __kmalloc+0x163/0x770 /mm/slab.c:3664
  kmalloc /./include/linux/slab.h:557 [inline]
  kzalloc /./include/linux/slab.h:748 [inline]
  lsm_cred_alloc /security/security.c:466 [inline]
  security_prepare_creds+0x11d/0x190 /security/security.c:1514
  prepare_creds+0x2f5/0x3f0 /kernel/cred.c:281
  do_faccessat+0xa2/0x7f0 /fs/open.c:360
  __do_sys_access /fs/open.c:431 [inline]
  __se_sys_access /fs/open.c:429 [inline]
  __x64_sys_access+0x59/0x80 /fs/open.c:429
  do_syscall_64+0xfd/0x6a0 /arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 8372:
  save_stack+0x23/0x90 /mm/kasan/common.c:69
  set_track /mm/kasan/common.c:77 [inline]
  __kasan_slab_free+0x102/0x150 /mm/kasan/common.c:449
  kasan_slab_free+0xe/0x10 /mm/kasan/common.c:457
  __cache_free /mm/slab.c:3425 [inline]
  kfree+0x10a/0x2c0 /mm/slab.c:3756
  security_cred_free+0xa9/0x110 /security/security.c:1508
  put_cred_rcu+0x129/0x4b0 /kernel/cred.c:114
  __rcu_reclaim /kernel/rcu/rcu.h:222 [inline]
  rcu_do_batch /kernel/rcu/tree.c:2114 [inline]
  rcu_core+0x67f/0x1580 /kernel/rcu/tree.c:2314
  rcu_core_si+0x9/0x10 /kernel/rcu/tree.c:2323
  __do_softirq+0x262/0x98c /kernel/softirq.c:292

The buggy address belongs to the object at ffff8880a654ec00
  which belongs to the cache kmalloc-32 of size 32
The buggy address is located 28 bytes to the right of
  32-byte region [ffff8880a654ec00, ffff8880a654ec20)
The buggy address belongs to the page:
page:ffffea0002995380 refcount:1 mapcount:0 mapping:ffff8880aa4001c0  
index:0xffff8880a654efc1
flags: 0x1fffc0000000200(slab)
raw: 01fffc0000000200 ffffea000290f1c8 ffffea0002a154c8 ffff8880aa4001c0
raw: ffff8880a654efc1 ffff8880a654e000 0000000100000027 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff8880a654eb00: fb fb fb fb fc fc fc fc 00 04 fc fc fc fc fc fc
  ffff8880a654eb80: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
> ffff8880a654ec00: fb fb fb fb fc fc fc fc 00 00 fc fc fc fc fc fc
                                         ^
  ffff8880a654ec80: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
  ffff8880a654ed00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
==================================================================


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [RFC PATCH net-next 00/12] drop_monitor: Capture dropped packets and metadata
From: Toke Høiland-Jørgensen @ 2019-07-23 16:08 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: netdev, davem, nhorman, dsahern, roopa, nikolay, jakub.kicinski,
	andy, f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <20190723151423.GA10342@splinter>

Ido Schimmel <idosch@idosch.org> writes:

> On Tue, Jul 23, 2019 at 02:17:49PM +0200, Toke Høiland-Jørgensen wrote:
>> Ido Schimmel <idosch@idosch.org> writes:
>> 
>> > On Mon, Jul 22, 2019 at 09:43:15PM +0200, Toke Høiland-Jørgensen wrote:
>> >> Is there a mechanism for the user to filter the packets before they are
>> >> sent to userspace? A bpf filter would be the obvious choice I guess...
>> >
>> > Hi Toke,
>> >
>> > Yes, it's on my TODO list to write an eBPF program that only lets
>> > "unique" packets to be enqueued on the netlink socket. Where "unique" is
>> > defined as {5-tuple, PC}. The rest of the copies will be counted in an
>> > eBPF map, which is just a hash table keyed by {5-tuple, PC}.
>> 
>> Yeah, that's a good idea. Or even something simpler like tcpdump-style
>> filters for the packets returned by drop monitor (say if I'm just trying
>> to figure out what happens to my HTTP requests).
>
> Yep, that's a good idea. I guess different users will use different
> programs. Will look into both options.

Cool.

>> > I think it would be good to have the program as part of the bcc
>> > repository [1]. What do you think?
>> 
>> Sure. We could also add it to the XDP tutorial[2]; it could go into a
>> section on introspection and debugging (just added a TODO about that[3]).
>
> Great!
>
>> >> For integrating with XDP the trick would be to find a way to do it that
>> >> doesn't incur any overhead when it's not enabled. Are you envisioning
>> >> that this would be enabled separately for the different "modes" (kernel,
>> >> hardware, XDP, etc)?
>> >
>> > Yes. Drop monitor have commands to enable and disable tracing, but they
>> > don't carry any attributes at the moment. My plan is to add an attribute
>> > (e.g., 'NET_DM_ATTR_DROP_TYPE') that will specify the type of drops
>> > you're interested in - SW/HW/XDP. If the attribute is not specified,
>> > then current behavior is maintained and all the drop types are traced.
>> > But if you're only interested in SW drops, then overhead for the rest
>> > should be zero.
>> 
>> Makes sense (although "should be" is the key here ;)).
>> 
>> I'm also worried about the drop monitor getting overwhelmed; if you turn
>> it on for XDP and you're running a filtering program there, you'll
>> suddenly get *a lot* of drops.
>> 
>> As I read your patch, the current code can basically queue up an
>> unbounded number of packets waiting to go out over netlink, can't it?
>
> That's a very good point. Each CPU holds a drop list. It probably makes
> sense to limit it by default (to 1000?) and allow user to change it
> later, if needed. I can expose a counter that shows how many packets
> were dropped because of this limit. It can be used as an indication to
> adjust the queue length (or flip to 'summary' mode).

Yup, sounds reasonable. What we don't want to happen is that the system
falls over as soon as someone turns on debugging, because XDP dumps 20
million packets/s into the debug queue ;)

Also, presumably the queue will have to change from a struct
sk_buff_head to something that can hold XDP frames and whatever devlink
puts there as well, right?

-Toke

^ permalink raw reply

* RE: [RFC PATCH 1/2] gianfar: convert to phylink
From: Claudiu Manoil @ 2019-07-23 16:07 UTC (permalink / raw)
  To: Arseny Solokha, Ioana Ciornei, Russell King, Andrew Lunn
  Cc: netdev@vger.kernel.org
In-Reply-To: <20190723151702.14430-2-asolokha@kb.kras.ru>

>-----Original Message-----
>From: Arseny Solokha <asolokha@kb.kras.ru>
>Sent: Tuesday, July 23, 2019 6:17 PM
>To: Claudiu Manoil <claudiu.manoil@nxp.com>; Ioana Ciornei
><ioana.ciornei@nxp.com>; Russell King <linux@armlinux.org.uk>; Andrew Lunn
><andrew@lunn.ch>
>Cc: netdev@vger.kernel.org; Arseny Solokha <asolokha@kb.kras.ru>
>Subject: [RFC PATCH 1/2] gianfar: convert to phylink
>
>Convert gianfar to use the phylink API for better SFP modules support.
>
>The driver still uses phylib for serdes configuration over the TBI
>interface, as there seems to be no functionally equivalent API present
>in phylink (yet). phylib usage is basically confined in two functions.
>

Thanks for your patch.  Phylink in gianfar... that would be something!
At first glance a lot of code has changed with this patch or got relocated.
To make it easier to swallow, I think a few cleanup patches could be
separated before migrating to phylink.  Like for instance getting rid of the
old* link state variables, which I think are an artifact from early phylib usage.
Nonetheless good to see this implemented, I'll have a closer look asap.

Thanks
Claudiu

^ permalink raw reply

* Re: [PATCH net 2/2] lib/dim: Fix -Wunused-const-variable warnings
From: Bart Van Assche @ 2019-07-23 15:57 UTC (permalink / raw)
  To: Leon Romanovsky, David S . Miller
  Cc: Leon Romanovsky, Doug Ledford, Jason Gunthorpe, RDMA mailing list,
	Tal Gilboa, Yamin Friedman, Saeed Mahameed, linux-netdev
In-Reply-To: <20190723072248.6844-3-leon@kernel.org>

On 7/23/19 12:22 AM, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
> 
> DIM causes to the following warnings during kernel compilation
> which indicates that tx_profile and rx_profile are supposed to
> be declared in *.c and not in *.h files.

Thanks Leon for this fix.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>

^ permalink raw reply

* Re: [RFC PATCH net-next 00/12] drop_monitor: Capture dropped packets and metadata
From: David Ahern @ 2019-07-23 15:47 UTC (permalink / raw)
  To: Ido Schimmel, Toke Høiland-Jørgensen
  Cc: netdev, davem, nhorman, roopa, nikolay, jakub.kicinski, andy,
	f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <20190723151423.GA10342@splinter>

Hi Ido:

Thanks for taking this on. Looks like a good direction to me.

On 7/23/19 8:14 AM, Ido Schimmel wrote:
> On Tue, Jul 23, 2019 at 02:17:49PM +0200, Toke Høiland-Jørgensen wrote:
>> Ido Schimmel <idosch@idosch.org> writes:
>>
>>> On Mon, Jul 22, 2019 at 09:43:15PM +0200, Toke Høiland-Jørgensen wrote:
>>>> Is there a mechanism for the user to filter the packets before they are
>>>> sent to userspace? A bpf filter would be the obvious choice I guess...
>>>
>>> Hi Toke,
>>>
>>> Yes, it's on my TODO list to write an eBPF program that only lets
>>> "unique" packets to be enqueued on the netlink socket. Where "unique" is
>>> defined as {5-tuple, PC}. The rest of the copies will be counted in an
>>> eBPF map, which is just a hash table keyed by {5-tuple, PC}.
>>
>> Yeah, that's a good idea. Or even something simpler like tcpdump-style
>> filters for the packets returned by drop monitor (say if I'm just trying
>> to figure out what happens to my HTTP requests).
> 
> Yep, that's a good idea. I guess different users will use different
> programs. Will look into both options.

Perhaps I am missing something, but the dropmon code only allows a
single user at the moment (in my attempts to run 2 instances the second
one failed). If that part stays with the design it afford better options
for the design. e.g., attributes that control the enqueued packets when
the event occurs as opposed to bpf filters which run much later when the
message is enqueued to the socket.

> 
>>> I think it would be good to have the program as part of the bcc
>>> repository [1]. What do you think?
>>
>> Sure. We could also add it to the XDP tutorial[2]; it could go into a
>> section on introspection and debugging (just added a TODO about that[3]).
> 
> Great!
> 
>>>> For integrating with XDP the trick would be to find a way to do it that
>>>> doesn't incur any overhead when it's not enabled. Are you envisioning
>>>> that this would be enabled separately for the different "modes" (kernel,
>>>> hardware, XDP, etc)?
>>>
>>> Yes. Drop monitor have commands to enable and disable tracing, but they
>>> don't carry any attributes at the moment. My plan is to add an attribute
>>> (e.g., 'NET_DM_ATTR_DROP_TYPE') that will specify the type of drops
>>> you're interested in - SW/HW/XDP. If the attribute is not specified,
>>> then current behavior is maintained and all the drop types are traced.
>>> But if you're only interested in SW drops, then overhead for the rest
>>> should be zero.
>>
>> Makes sense (although "should be" is the key here ;)).

static_key is used in other parts of the packet fast path.

Toke/Jesper: Any reason to believe it is too much overhead for this path?

>>
>> I'm also worried about the drop monitor getting overwhelmed; if you turn
>> it on for XDP and you're running a filtering program there, you'll
>> suddenly get *a lot* of drops.
>>
>> As I read your patch, the current code can basically queue up an
>> unbounded number of packets waiting to go out over netlink, can't it?
> 
> That's a very good point. Each CPU holds a drop list. It probably makes
> sense to limit it by default (to 1000?) and allow user to change it
> later, if needed. I can expose a counter that shows how many packets
> were dropped because of this limit. It can be used as an indication to
> adjust the queue length (or flip to 'summary' mode).
> 

And then with a single user limit, you can have an attribute that
controls the backlog.


^ permalink raw reply

* Re: [PATCH 1/3] mm/gup: introduce __put_user_pages()
From: Christoph Hellwig @ 2019-07-23 15:36 UTC (permalink / raw)
  To: John Hubbard
  Cc: Christoph Hellwig, john.hubbard, Andrew Morton, Alexander Viro,
	Björn Töpel, Boaz Harrosh, Daniel Vetter, Dan Williams,
	Dave Chinner, David Airlie, David S . Miller, Ilya Dryomov,
	Jan Kara, Jason Gunthorpe, Jens Axboe, Jérôme Glisse,
	Johannes Thumshirn, Magnus Karlsson, Matthew Wilcox,
	Miklos Szeredi, Ming Lei, Sage Weil, Santosh Shilimkar, Yan Zheng,
	netdev, dri-devel, linux-mm, linux-rdma, bpf, LKML
In-Reply-To: <8ab4899c-ec12-a713-cac2-d951fff2a347@nvidia.com>

On Mon, Jul 22, 2019 at 11:33:32PM -0700, John Hubbard wrote:
> I'm seeing about 18 places where set_page_dirty() is used, in the call site
> conversions so far, and about 20 places where set_page_dirty_lock() is
> used. So without knowing how many of the former (if any) represent bugs,
> you can see why the proposal here supports both DIRTY and DIRTY_LOCK.

Well, it should be fairly easy to audit.  set_page_dirty() is only
safe if we are dealing with a file backed page where we have reference
on the inode it hangs off.  Which should basically be never or almost
never.

^ permalink raw reply

* Re: [PATCH net-next 1/4] sctp: check addr_size with sa_family_t size in __sctp_setsockopt_connectx
From: Neil Horman @ 2019-07-23 15:24 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, linux-sctp, Marcelo Ricardo Leitner, davem
In-Reply-To: <c875aa0a5b2965636dc3da83398856627310b280.1563817029.git.lucien.xin@gmail.com>

On Tue, Jul 23, 2019 at 01:37:57AM +0800, Xin Long wrote:
> Now __sctp_connect() is called by __sctp_setsockopt_connectx() and
> sctp_inet_connect(), the latter has done addr_size check with size
> of sa_family_t.
> 
> In the next patch to clean up __sctp_connect(), we will remove
> addr_size check with size of sa_family_t from __sctp_connect()
> for the 1st address.
> 
> So before doing that, __sctp_setsockopt_connectx() should do
> this check first, as sctp_inet_connect() does.
> 
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
>  net/sctp/socket.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index aa80cda..5f92e4a 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -1311,7 +1311,7 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
>  	pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
>  		 __func__, sk, addrs, addrs_size);
>  
> -	if (unlikely(addrs_size <= 0))
> +	if (unlikely(addrs_size < sizeof(sa_family_t)))
I don't think this is what you want to check for here.  sa_family_t is
an unsigned short, and addrs_size is the number of bytes in the addrs
array.  The addrs array should be at least the size of one struct
sockaddr (16 bytes iirc), and, if larger, should be a multiple of
sizeof(struct sockaddr)

Neil

>  		return -EINVAL;
>  
>  	kaddrs = memdup_user(addrs, addrs_size);
> -- 
> 2.1.0
> 
> 

^ permalink raw reply

* [RFC PATCH 1/2] gianfar: convert to phylink
From: Arseny Solokha @ 2019-07-23 15:17 UTC (permalink / raw)
  To: Claudiu Manoil, Ioana Ciornei, Russell King, Andrew Lunn
  Cc: netdev, Arseny Solokha
In-Reply-To: <20190723151702.14430-1-asolokha@kb.kras.ru>

Convert gianfar to use the phylink API for better SFP modules support.

The driver still uses phylib for serdes configuration over the TBI
interface, as there seems to be no functionally equivalent API present
in phylink (yet). phylib usage is basically confined in two functions.

One needs to change their Device Tree accordingly to get working SFP
support:

 enet1: ethernet@25000 {
 <...>
 	device_type = "network";
 	model = "eTSEC";
 	compatible = "gianfar";
 	tbi-handle = <&tbi0>;
+	phy-connection-type = "sgmii";
+	managed = "in-band-status";
+	sfp = <&sfp>;
 	max-speed = <1000>;

 	mdio@520 {
 		#address-cells = <1>;
 		#size-cells = <0>;
 		compatible = "fsl,gianfar-tbi";
 		reg = <0x520 0x20>;

 		tbi0: tbi-phy@1f {
 			reg = <0x1f>;
 			device_type = "tbi-phy";
 		};
 	};
+
+	sfp: sfp0 {
+		compatible = "sff,sfp";
+		i2c-bus = <&i2c1>;
+		mod-def0-gpios = <&gpio 4 GPIO_ACTIVE_LOW>;
+		rate-select0-gpios = <&gpio 13 GPIO_ACTIVE_HIGH>;
+		tx-disable-gpios = <&gpio 5 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios = <&gpio 12 GPIO_ACTIVE_HIGH>;
+		los-gpios = <&gpio 3 GPIO_ACTIVE_HIGH>;
+	};
 };

Signed-off-by: Arseny Solokha <asolokha@kb.kras.ru>
---
 drivers/net/ethernet/freescale/Kconfig        |   2 +-
 drivers/net/ethernet/freescale/gianfar.c      | 409 +++++++++---------
 drivers/net/ethernet/freescale/gianfar.h      |  26 +-
 .../net/ethernet/freescale/gianfar_ethtool.c  |  79 ++--
 4 files changed, 251 insertions(+), 265 deletions(-)

diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig
index 6a7e8993119f..8b51d423b61d 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -89,7 +89,7 @@ config GIANFAR
 	tristate "Gianfar Ethernet"
 	depends on HAS_DMA
 	select FSL_PQ_MDIO
-	select PHYLIB
+	select PHYLINK
 	select CRC32
 	---help---
 	  This driver supports the Gigabit TSEC on the MPC83xx, MPC85xx,
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 7ea19e173339..64c7b174e591 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -96,6 +96,7 @@
 #include <linux/mii.h>
 #include <linux/phy.h>
 #include <linux/phy_fixed.h>
+#include <linux/phylink.h>
 #include <linux/of.h>
 #include <linux/of_net.h>
 
@@ -117,8 +118,18 @@ static int gfar_change_mtu(struct net_device *dev, int new_mtu);
 static irqreturn_t gfar_error(int irq, void *dev_id);
 static irqreturn_t gfar_transmit(int irq, void *dev_id);
 static irqreturn_t gfar_interrupt(int irq, void *dev_id);
-static void adjust_link(struct net_device *dev);
-static noinline void gfar_update_link_state(struct gfar_private *priv);
+static void gfar_phylink_validate(struct phylink_config *config,
+				  unsigned long *supported,
+				  struct phylink_link_state *state);
+static int gfar_mac_link_state(struct phylink_config *config,
+			       struct phylink_link_state *state);
+static void gfar_mac_config(struct phylink_config *config, unsigned int mode,
+			    const struct phylink_link_state *state);
+static void gfar_mac_an_restart(struct phylink_config *config);
+static void gfar_mac_link_down(struct phylink_config *config, unsigned int mode,
+			       phy_interface_t interface);
+static void gfar_mac_link_up(struct phylink_config *config, unsigned int mode,
+			     phy_interface_t interface, struct phy_device *phy);
 static int init_phy(struct net_device *dev);
 static int gfar_probe(struct platform_device *ofdev);
 static int gfar_remove(struct platform_device *ofdev);
@@ -504,6 +515,15 @@ static const struct net_device_ops gfar_netdev_ops = {
 #endif
 };
 
+static const struct phylink_mac_ops gfar_phylink_ops = {
+	.validate = gfar_phylink_validate,
+	.mac_link_state = gfar_mac_link_state,
+	.mac_config = gfar_mac_config,
+	.mac_an_restart = gfar_mac_an_restart,
+	.mac_link_down = gfar_mac_link_down,
+	.mac_link_up = gfar_mac_link_up,
+};
+
 static void gfar_ints_disable(struct gfar_private *priv)
 {
 	int i;
@@ -733,6 +753,7 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 	struct gfar_private *priv = NULL;
 	struct device_node *np = ofdev->dev.of_node;
 	struct device_node *child = NULL;
+	struct phylink *phylink;
 	u32 stash_len = 0;
 	u32 stash_idx = 0;
 	unsigned int num_tx_qs, num_rx_qs;
@@ -891,11 +912,21 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 
 	err = of_property_read_string(np, "phy-connection-type", &ctype);
 
-	/* We only care about rgmii-id.  The rest are autodetected */
-	if (err == 0 && !strcmp(ctype, "rgmii-id"))
-		priv->interface = PHY_INTERFACE_MODE_RGMII_ID;
-	else
+	/* We only care about rgmii-id and sgmii - the former
+	 * is indistinguishable from rgmii in hardware, and phylink needs
+	 * the latter to be set appropriately for correct phy configuration.
+	 * The rest are autodetected
+	 */
+	if (err == 0) {
+		if (!strcmp(ctype, "rgmii-id"))
+			priv->interface = PHY_INTERFACE_MODE_RGMII_ID;
+		else if (!strcmp(ctype, "sgmii"))
+			priv->interface = PHY_INTERFACE_MODE_SGMII;
+		else
+			priv->interface = PHY_INTERFACE_MODE_MII;
+	} else {
 		priv->interface = PHY_INTERFACE_MODE_MII;
+	}
 
 	if (of_find_property(np, "fsl,magic-packet", NULL))
 		priv->device_flags |= FSL_GIANFAR_DEV_HAS_MAGIC_PACKET;
@@ -903,19 +934,21 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 	if (of_get_property(np, "fsl,wake-on-filer", NULL))
 		priv->device_flags |= FSL_GIANFAR_DEV_HAS_WAKE_ON_FILER;
 
-	priv->phy_node = of_parse_phandle(np, "phy-handle", 0);
+	priv->device_node = np;
+	priv->speed = SPEED_UNKNOWN;
 
-	/* In the case of a fixed PHY, the DT node associated
-	 * to the PHY is the Ethernet MAC DT node.
-	 */
-	if (!priv->phy_node && of_phy_is_fixed_link(np)) {
-		err = of_phy_register_fixed_link(np);
-		if (err)
-			goto err_grp_init;
+	priv->phylink_config.dev = &priv->ndev->dev;
+	priv->phylink_config.type = PHYLINK_NETDEV;
 
-		priv->phy_node = of_node_get(np);
+	phylink = phylink_create(&priv->phylink_config, of_fwnode_handle(np),
+				 priv->interface, &gfar_phylink_ops);
+	if (IS_ERR(phylink)) {
+		err = PTR_ERR(phylink);
+		goto err_grp_init;
 	}
 
+	priv->phylink = phylink;
+
 	/* Find the TBI PHY.  If it's not there, we don't support SGMII */
 	priv->tbi_node = of_parse_phandle(np, "tbi-handle", 0);
 
@@ -994,7 +1027,7 @@ static int gfar_hwtstamp_get(struct net_device *netdev, struct ifreq *ifr)
 
 static int gfar_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-	struct phy_device *phydev = dev->phydev;
+	struct gfar_private *priv = netdev_priv(dev);
 
 	if (!netif_running(dev))
 		return -EINVAL;
@@ -1004,10 +1037,7 @@ static int gfar_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (cmd == SIOCGHWTSTAMP)
 		return gfar_hwtstamp_get(dev, rq);
 
-	if (!phydev)
-		return -ENODEV;
-
-	return phy_mii_ioctl(phydev, rq, cmd);
+	return phylink_mii_ioctl(priv->phylink, rq, cmd);
 }
 
 static u32 cluster_entry_per_class(struct gfar_private *priv, u32 rqfar,
@@ -1307,7 +1337,6 @@ static void gfar_init_addr_hash_table(struct gfar_private *priv)
  */
 static int gfar_probe(struct platform_device *ofdev)
 {
-	struct device_node *np = ofdev->dev.of_node;
 	struct net_device *dev = NULL;
 	struct gfar_private *priv = NULL;
 	int err = 0, i;
@@ -1463,12 +1492,10 @@ static int gfar_probe(struct platform_device *ofdev)
 	return 0;
 
 register_fail:
-	if (of_phy_is_fixed_link(np))
-		of_phy_deregister_fixed_link(np);
 	unmap_group_regs(priv);
 	gfar_free_rx_queues(priv);
 	gfar_free_tx_queues(priv);
-	of_node_put(priv->phy_node);
+	phylink_destroy(priv->phylink);
 	of_node_put(priv->tbi_node);
 	free_gfar_dev(priv);
 	return err;
@@ -1477,19 +1504,15 @@ static int gfar_probe(struct platform_device *ofdev)
 static int gfar_remove(struct platform_device *ofdev)
 {
 	struct gfar_private *priv = platform_get_drvdata(ofdev);
-	struct device_node *np = ofdev->dev.of_node;
 
-	of_node_put(priv->phy_node);
 	of_node_put(priv->tbi_node);
 
 	unregister_netdev(priv->ndev);
 
-	if (of_phy_is_fixed_link(np))
-		of_phy_deregister_fixed_link(np);
-
 	unmap_group_regs(priv);
 	gfar_free_rx_queues(priv);
 	gfar_free_tx_queues(priv);
+	phylink_destroy(priv->phylink);
 	free_gfar_dev(priv);
 
 	return 0;
@@ -1643,9 +1666,11 @@ static int gfar_suspend(struct device *dev)
 		gfar_start_wol_filer(priv);
 
 	} else {
-		phy_stop(ndev->phydev);
+		phylink_stop(phy->phylink);
 	}
 
+	priv->speed = SPEED_UNKNOWN;
+
 	return 0;
 }
 
@@ -1672,7 +1697,7 @@ static int gfar_resume(struct device *dev)
 		gfar_filer_restore_table(priv);
 
 	} else {
-		phy_start(ndev->phydev);
+		phylink_start(priv->phylink);
 	}
 
 	gfar_start(priv);
@@ -1702,12 +1727,7 @@ static int gfar_restore(struct device *dev)
 
 	gfar_start(priv);
 
-	priv->oldlink = 0;
-	priv->oldspeed = 0;
-	priv->oldduplex = -1;
-
-	if (ndev->phydev)
-		phy_start(ndev->phydev);
+	phylink_start(priv->phylink);
 
 	netif_device_attach(ndev);
 	enable_napi(priv);
@@ -1781,46 +1801,26 @@ static phy_interface_t gfar_get_interface(struct net_device *dev)
  */
 static int init_phy(struct net_device *dev)
 {
-	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 	struct gfar_private *priv = netdev_priv(dev);
 	phy_interface_t interface;
-	struct phy_device *phydev;
 	struct ethtool_eee edata;
+	int flags = 0;
+	int err;
 
-	linkmode_set_bit_array(phy_10_100_features_array,
-			       ARRAY_SIZE(phy_10_100_features_array),
-			       mask);
-	linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, mask);
-	linkmode_set_bit(ETHTOOL_LINK_MODE_MII_BIT, mask);
-	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT)
-		linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, mask);
-
-	priv->oldlink = 0;
-	priv->oldspeed = 0;
-	priv->oldduplex = -1;
-
-	interface = gfar_get_interface(dev);
-
-	phydev = of_phy_connect(dev, priv->phy_node, &adjust_link, 0,
-				interface);
-	if (!phydev) {
-		dev_err(&dev->dev, "could not attach to PHY\n");
-		return -ENODEV;
+	err = phylink_of_phy_connect(priv->phylink, priv->device_node, flags);
+	if (err) {
+		netdev_err(dev, "could not attach to PHY: %d\n", err);
+		return err;
 	}
 
+	priv->tbi_phy = NULL;
+	interface = gfar_get_interface(dev);
 	if (interface == PHY_INTERFACE_MODE_SGMII)
 		gfar_configure_serdes(dev);
 
-	/* Remove any features not supported by the controller */
-	linkmode_and(phydev->supported, phydev->supported, mask);
-	linkmode_copy(phydev->advertising, phydev->supported);
-
-	/* Add support for flow control */
-	phy_support_asym_pause(phydev);
-
 	/* disable EEE autoneg, EEE not supported by eTSEC */
 	memset(&edata, 0, sizeof(struct ethtool_eee));
-	phy_ethtool_set_eee(phydev, &edata);
+	phylink_ethtool_set_eee(priv->phylink, &edata);
 
 	return 0;
 }
@@ -1850,6 +1850,8 @@ static void gfar_configure_serdes(struct net_device *dev)
 		return;
 	}
 
+	priv->tbi_phy = tbiphy;
+
 	/* If the link is already up, we must already be ok, and don't need to
 	 * configure and reset the TBI<->SerDes link.  Maybe U-Boot configured
 	 * everything for us?  Resetting it takes the link down and requires
@@ -1964,7 +1966,7 @@ void stop_gfar(struct net_device *dev)
 	/* disable ints and gracefully shut down Rx/Tx DMA */
 	gfar_halt(priv);
 
-	phy_stop(dev->phydev);
+	phylink_stop(priv->phylink);
 
 	free_skb_resources(priv);
 }
@@ -2219,12 +2221,7 @@ int startup_gfar(struct net_device *ndev)
 	/* Start Rx/Tx DMA and enable the interrupts */
 	gfar_start(priv);
 
-	/* force link state update after mac reset */
-	priv->oldlink = 0;
-	priv->oldspeed = 0;
-	priv->oldduplex = -1;
-
-	phy_start(ndev->phydev);
+	phylink_start(priv->phylink);
 
 	enable_napi(priv);
 
@@ -2593,7 +2590,7 @@ static int gfar_close(struct net_device *dev)
 	stop_gfar(dev);
 
 	/* Disconnect from the PHY */
-	phy_disconnect(dev->phydev);
+	phylink_disconnect_phy(priv->phylink);
 
 	gfar_free_irq(priv);
 
@@ -3387,23 +3384,6 @@ static irqreturn_t gfar_interrupt(int irq, void *grp_id)
 	return IRQ_HANDLED;
 }
 
-/* Called every time the controller might need to be made
- * aware of new link state.  The PHY code conveys this
- * information through variables in the phydev structure, and this
- * function converts those variables into the appropriate
- * register values, and can bring down the device if needed.
- */
-static void adjust_link(struct net_device *dev)
-{
-	struct gfar_private *priv = netdev_priv(dev);
-	struct phy_device *phydev = dev->phydev;
-
-	if (unlikely(phydev->link != priv->oldlink ||
-		     (phydev->link && (phydev->duplex != priv->oldduplex ||
-				       phydev->speed != priv->oldspeed))))
-		gfar_update_link_state(priv);
-}
-
 /* Update the hash table based on the current list of multicast
  * addresses we subscribe to.  Also, change the promiscuity of
  * the device based on the flags (this function is called
@@ -3635,132 +3615,169 @@ static irqreturn_t gfar_error(int irq, void *grp_id)
 	return IRQ_HANDLED;
 }
 
-static u32 gfar_get_flowctrl_cfg(struct gfar_private *priv)
+static void gfar_phylink_validate(struct phylink_config *config,
+				  unsigned long *supported,
+				  struct phylink_link_state *state)
+{
+	struct gfar_private *priv = netdev_priv(to_net_dev(config->dev));
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
+
+	if (state->interface != PHY_INTERFACE_MODE_NA &&
+	    state->interface != PHY_INTERFACE_MODE_SGMII &&
+	    state->interface != PHY_INTERFACE_MODE_1000BASEX &&
+	    !phy_interface_mode_is_rgmii(state->interface)) {
+		phylink_zero(supported);
+		return;
+	}
+
+	phylink_set(mask, Autoneg);
+	phylink_set(mask, Pause);
+	phylink_set(mask, Asym_Pause);
+	phylink_set_port_modes(mask);
+
+	phylink_set(mask, 10baseT_Half);
+	phylink_set(mask, 10baseT_Full);
+	phylink_set(mask, 100baseT_Half);
+	phylink_set(mask, 100baseT_Full);
+
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT) {
+		phylink_set(mask, 1000baseX_Full);
+		phylink_set(mask, 1000baseT_Full);
+	}
+
+	linkmode_and(supported, supported, mask);
+	linkmode_and(state->advertising, state->advertising, mask);
+}
+
+static int gfar_mac_link_state(struct phylink_config *config,
+			       struct phylink_link_state *state)
+{
+	if (state->interface == PHY_INTERFACE_MODE_SGMII ||
+	    state->interface == PHY_INTERFACE_MODE_1000BASEX) {
+		struct gfar_private *priv =
+			netdev_priv(to_net_dev(config->dev));
+		u16 tbi_cr;
+
+		if (!priv->tbi_phy)
+			return -ENODEV;
+
+		tbi_cr = phy_read(priv->tbi_phy, MII_TBI_CR);
+
+		state->duplex = !!(tbi_cr & TBI_CR_FULL_DUPLEX);
+		if ((tbi_cr & TBI_CR_SPEED_1000_MASK) == TBI_CR_SPEED_1000_MASK)
+			state->speed = SPEED_1000;
+	}
+
+	return 1;
+}
+
+static u32 gfar_get_flowctrl_cfg(const struct phylink_link_state *state)
 {
-	struct net_device *ndev = priv->ndev;
-	struct phy_device *phydev = ndev->phydev;
 	u32 val = 0;
 
-	if (!phydev->duplex)
+	if (!state->duplex)
 		return val;
 
-	if (!priv->pause_aneg_en) {
-		if (priv->tx_pause_en)
-			val |= MACCFG1_TX_FLOW;
-		if (priv->rx_pause_en)
-			val |= MACCFG1_RX_FLOW;
-	} else {
-		u16 lcl_adv, rmt_adv;
-		u8 flowctrl;
-		/* get link partner capabilities */
-		rmt_adv = 0;
-		if (phydev->pause)
-			rmt_adv = LPA_PAUSE_CAP;
-		if (phydev->asym_pause)
-			rmt_adv |= LPA_PAUSE_ASYM;
-
-		lcl_adv = linkmode_adv_to_lcl_adv_t(phydev->advertising);
-		flowctrl = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv);
-		if (flowctrl & FLOW_CTRL_TX)
-			val |= MACCFG1_TX_FLOW;
-		if (flowctrl & FLOW_CTRL_RX)
-			val |= MACCFG1_RX_FLOW;
-	}
+	if (state->pause & MLO_PAUSE_TX)
+		val |= MACCFG1_TX_FLOW;
+	if (state->pause & MLO_PAUSE_RX)
+		val |= MACCFG1_RX_FLOW;
 
 	return val;
 }
 
-static noinline void gfar_update_link_state(struct gfar_private *priv)
+static void gfar_mac_config(struct phylink_config *config, unsigned int mode,
+			    const struct phylink_link_state *state)
 {
+	struct gfar_private *priv = netdev_priv(to_net_dev(config->dev));
 	struct gfar __iomem *regs = priv->gfargrp[0].regs;
-	struct net_device *ndev = priv->ndev;
-	struct phy_device *phydev = ndev->phydev;
-	struct gfar_priv_rx_q *rx_queue = NULL;
-	int i;
+	u32 maccfg1, new_maccfg1;
+	u32 maccfg2, new_maccfg2;
+	u32 ecntrl, new_ecntrl;
+	u32 tx_flow, new_tx_flow;
 
 	if (unlikely(test_bit(GFAR_RESETTING, &priv->state)))
 		return;
 
-	if (phydev->link) {
-		u32 tempval1 = gfar_read(&regs->maccfg1);
-		u32 tempval = gfar_read(&regs->maccfg2);
-		u32 ecntrl = gfar_read(&regs->ecntrl);
-		u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);
+	if (unlikely(phylink_autoneg_inband(mode)))
+		return;
 
-		if (phydev->duplex != priv->oldduplex) {
-			if (!(phydev->duplex))
-				tempval &= ~(MACCFG2_FULL_DUPLEX);
-			else
-				tempval |= MACCFG2_FULL_DUPLEX;
+	maccfg1 = gfar_read(&regs->maccfg1);
+	maccfg2 = gfar_read(&regs->maccfg2);
+	ecntrl = gfar_read(&regs->ecntrl);
 
-			priv->oldduplex = phydev->duplex;
-		}
+	new_maccfg2 = maccfg2 & ~(MACCFG2_FULL_DUPLEX | MACCFG2_IF);
+	new_ecntrl = ecntrl & ~ECNTRL_R100;
 
-		if (phydev->speed != priv->oldspeed) {
-			switch (phydev->speed) {
-			case 1000:
-				tempval =
-				    ((tempval & ~(MACCFG2_IF)) | MACCFG2_GMII);
+	if (state->duplex)
+		new_maccfg2 |= MACCFG2_FULL_DUPLEX;
 
-				ecntrl &= ~(ECNTRL_R100);
-				break;
-			case 100:
-			case 10:
-				tempval =
-				    ((tempval & ~(MACCFG2_IF)) | MACCFG2_MII);
-
-				/* Reduced mode distinguishes
-				 * between 10 and 100
-				 */
-				if (phydev->speed == SPEED_100)
-					ecntrl |= ECNTRL_R100;
-				else
-					ecntrl &= ~(ECNTRL_R100);
-				break;
-			default:
-				netif_warn(priv, link, priv->ndev,
-					   "Ack!  Speed (%d) is not 10/100/1000!\n",
-					   phydev->speed);
-				break;
-			}
-
-			priv->oldspeed = phydev->speed;
-		}
-
-		tempval1 &= ~(MACCFG1_TX_FLOW | MACCFG1_RX_FLOW);
-		tempval1 |= gfar_get_flowctrl_cfg(priv);
-
-		/* Turn last free buffer recording on */
-		if ((tempval1 & MACCFG1_TX_FLOW) && !tx_flow_oldval) {
-			for (i = 0; i < priv->num_rx_queues; i++) {
-				u32 bdp_dma;
-
-				rx_queue = priv->rx_queue[i];
-				bdp_dma = gfar_rxbd_dma_lastfree(rx_queue);
-				gfar_write(rx_queue->rfbptr, bdp_dma);
-			}
-
-			priv->tx_actual_en = 1;
-		}
-
-		if (unlikely(!(tempval1 & MACCFG1_TX_FLOW) && tx_flow_oldval))
-			priv->tx_actual_en = 0;
-
-		gfar_write(&regs->maccfg1, tempval1);
-		gfar_write(&regs->maccfg2, tempval);
-		gfar_write(&regs->ecntrl, ecntrl);
-
-		if (!priv->oldlink)
-			priv->oldlink = 1;
-
-	} else if (priv->oldlink) {
-		priv->oldlink = 0;
-		priv->oldspeed = 0;
-		priv->oldduplex = -1;
+	switch (state->speed) {
+	case SPEED_1000:
+		new_maccfg2 |= MACCFG2_GMII;
+		break;
+	case SPEED_100:
+		new_maccfg2 |= MACCFG2_MII;
+		new_ecntrl = ecntrl | ECNTRL_R100;
+		break;
+	case SPEED_10:
+		new_maccfg2 |= MACCFG2_MII;
+		break;
+	default:
+		netif_warn(priv, link, priv->ndev,
+			   "Ack!  Speed (%d) is not 10/100/1000!\n",
+			   state->speed);
+		return;
 	}
 
-	if (netif_msg_link(priv))
-		phy_print_status(phydev);
+	priv->speed = state->speed;
+
+	new_maccfg1 = maccfg1 & ~(MACCFG1_TX_FLOW | MACCFG1_RX_FLOW);
+	new_maccfg1 |= gfar_get_flowctrl_cfg(state);
+
+	/* Turn last free buffer recording on */
+	tx_flow = maccfg1 & MACCFG1_TX_FLOW;
+	new_tx_flow = new_maccfg1 & MACCFG1_TX_FLOW;
+	if (new_tx_flow && !tx_flow) {
+		int i;
+
+		for (i = 0; i < priv->num_rx_queues; i++) {
+			struct gfar_priv_rx_q *rx_queue;
+			u32 bdp_dma;
+
+			rx_queue = priv->rx_queue[i];
+			bdp_dma = gfar_rxbd_dma_lastfree(rx_queue);
+			gfar_write(rx_queue->rfbptr, bdp_dma);
+		}
+
+		priv->tx_actual_en = 1;
+	} else if (unlikely(!new_tx_flow && tx_flow)) {
+		priv->tx_actual_en = 0;
+	}
+
+	if (new_maccfg1 != maccfg1)
+		gfar_write(&regs->maccfg1, new_maccfg1);
+	if (new_maccfg2 != maccfg2)
+		gfar_write(&regs->maccfg2, new_maccfg2);
+	if (new_ecntrl != ecntrl)
+		gfar_write(&regs->ecntrl, new_ecntrl);
+}
+
+static void gfar_mac_an_restart(struct phylink_config *config)
+{
+	/* Not supported */
+}
+
+static void gfar_mac_link_down(struct phylink_config *config, unsigned int mode,
+			       phy_interface_t interface)
+{
+	/* Not supported */
+}
+
+static void gfar_mac_link_up(struct phylink_config *config, unsigned int mode,
+			     phy_interface_t interface, struct phy_device *phy)
+{
+	/* Not supported */
 }
 
 static const struct of_device_id gfar_match[] =
diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h
index f2af96349c7b..0b28b1f60f2d 100644
--- a/drivers/net/ethernet/freescale/gianfar.h
+++ b/drivers/net/ethernet/freescale/gianfar.h
@@ -31,8 +31,7 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
-#include <linux/mii.h>
-#include <linux/phy.h>
+#include <linux/phylink.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -149,8 +148,13 @@ extern const char gfar_driver_version[];
 #define GFAR_SUPPORTED_GBIT SUPPORTED_1000baseT_Full
 
 /* TBI register addresses */
+#define MII_TBI_CR		0x00
 #define MII_TBICON		0x11
 
+/* TBI_CR register bit fields */
+#define TBI_CR_FULL_DUPLEX	0x0100
+#define TBI_CR_SPEED_1000_MASK	0x0040
+
 /* TBICON register bit fields */
 #define TBICON_CLK_SELECT	0x0020
 
@@ -1148,12 +1152,12 @@ struct gfar_private {
 
 	/* PHY stuff */
 	phy_interface_t interface;
-	struct device_node *phy_node;
+	struct device_node *device_node;
 	struct device_node *tbi_node;
-	struct mii_bus *mii_bus;
-	int oldspeed;
-	int oldduplex;
-	int oldlink;
+	struct phylink *phylink;
+	struct phylink_config phylink_config;
+	struct phy_device *tbi_phy;
+	int speed;
 
 	uint32_t msg_enable;
 
@@ -1165,11 +1169,7 @@ struct gfar_private {
 		bd_stash_en:1,
 		rx_filer_enable:1,
 		/* Enable priorty based Tx scheduling in Hw */
-		prio_sched_en:1,
-		/* Flow control flags */
-		pause_aneg_en:1,
-		tx_pause_en:1,
-		rx_pause_en:1;
+		prio_sched_en:1;
 
 	/* The total tx and rx ring size for the enabled queues */
 	unsigned int total_tx_ring_size;
@@ -1333,8 +1333,6 @@ void reset_gfar(struct net_device *dev);
 void gfar_mac_reset(struct gfar_private *priv);
 void gfar_halt(struct gfar_private *priv);
 void gfar_start(struct gfar_private *priv);
-void gfar_phy_test(struct mii_bus *bus, struct phy_device *phydev, int enable,
-		   u32 regnum, u32 read);
 void gfar_configure_coalescing_all(struct gfar_private *priv);
 int gfar_set_features(struct net_device *dev, netdev_features_t features);
 
diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index 3433b46b90c1..146b30d07789 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -35,7 +35,7 @@
 #include <asm/types.h>
 #include <linux/ethtool.h>
 #include <linux/mii.h>
-#include <linux/phy.h>
+#include <linux/phylink.h>
 #include <linux/sort.h>
 #include <linux/if_vlan.h>
 #include <linux/of_platform.h>
@@ -207,12 +207,10 @@ static void gfar_get_regs(struct net_device *dev, struct ethtool_regs *regs,
 static unsigned int gfar_usecs2ticks(struct gfar_private *priv,
 				     unsigned int usecs)
 {
-	struct net_device *ndev = priv->ndev;
-	struct phy_device *phydev = ndev->phydev;
 	unsigned int count;
 
 	/* The timer is different, depending on the interface speed */
-	switch (phydev->speed) {
+	switch (priv->speed) {
 	case SPEED_1000:
 		count = GFAR_GBIT_TIME;
 		break;
@@ -234,12 +232,10 @@ static unsigned int gfar_usecs2ticks(struct gfar_private *priv,
 static unsigned int gfar_ticks2usecs(struct gfar_private *priv,
 				     unsigned int ticks)
 {
-	struct net_device *ndev = priv->ndev;
-	struct phy_device *phydev = ndev->phydev;
 	unsigned int count;
 
 	/* The timer is different, depending on the interface speed */
-	switch (phydev->speed) {
+	switch (priv->speed) {
 	case SPEED_1000:
 		count = GFAR_GBIT_TIME;
 		break;
@@ -489,58 +485,15 @@ static void gfar_gpauseparam(struct net_device *dev,
 {
 	struct gfar_private *priv = netdev_priv(dev);
 
-	epause->autoneg = !!priv->pause_aneg_en;
-	epause->rx_pause = !!priv->rx_pause_en;
-	epause->tx_pause = !!priv->tx_pause_en;
+	phylink_ethtool_get_pauseparam(priv->phylink, epause);
 }
 
 static int gfar_spauseparam(struct net_device *dev,
 			    struct ethtool_pauseparam *epause)
 {
 	struct gfar_private *priv = netdev_priv(dev);
-	struct phy_device *phydev = dev->phydev;
-	struct gfar __iomem *regs = priv->gfargrp[0].regs;
 
-	if (!phydev)
-		return -ENODEV;
-
-	if (!phy_validate_pause(phydev, epause))
-		return -EINVAL;
-
-	priv->rx_pause_en = priv->tx_pause_en = 0;
-	phy_set_asym_pause(phydev, epause->rx_pause, epause->tx_pause);
-	if (epause->rx_pause) {
-		priv->rx_pause_en = 1;
-
-		if (epause->tx_pause) {
-			priv->tx_pause_en = 1;
-		}
-	} else if (epause->tx_pause) {
-		priv->tx_pause_en = 1;
-	}
-
-	if (epause->autoneg)
-		priv->pause_aneg_en = 1;
-	else
-		priv->pause_aneg_en = 0;
-
-	if (!epause->autoneg) {
-		u32 tempval = gfar_read(&regs->maccfg1);
-
-		tempval &= ~(MACCFG1_TX_FLOW | MACCFG1_RX_FLOW);
-
-		priv->tx_actual_en = 0;
-		if (priv->tx_pause_en) {
-			priv->tx_actual_en = 1;
-			tempval |= MACCFG1_TX_FLOW;
-		}
-
-		if (priv->rx_pause_en)
-			tempval |= MACCFG1_RX_FLOW;
-		gfar_write(&regs->maccfg1, tempval);
-	}
-
-	return 0;
+	return phylink_ethtool_set_pauseparam(priv->phylink, epause);
 }
 
 int gfar_set_features(struct net_device *dev, netdev_features_t features)
@@ -1519,6 +1472,24 @@ static int gfar_get_ts_info(struct net_device *dev,
 	return 0;
 }
 
+/* Set link ksettings (phy address, speed) for ethtools */
+static int gfar_get_link_ksettings(struct net_device *dev,
+				   struct ethtool_link_ksettings *cmd)
+{
+	struct gfar_private *priv = netdev_priv(dev);
+
+	return phylink_ethtool_ksettings_get(priv->phylink, cmd);
+}
+
+/* Get link ksettings for ethtools */
+static int gfar_set_link_ksettings(struct net_device *dev,
+				   const struct ethtool_link_ksettings *cmd)
+{
+	struct gfar_private *priv = netdev_priv(dev);
+
+	return phylink_ethtool_ksettings_set(priv->phylink, cmd);
+}
+
 const struct ethtool_ops gfar_ethtool_ops = {
 	.get_drvinfo = gfar_gdrvinfo,
 	.get_regs_len = gfar_reglen,
@@ -1542,6 +1513,6 @@ const struct ethtool_ops gfar_ethtool_ops = {
 	.set_rxnfc = gfar_set_nfc,
 	.get_rxnfc = gfar_get_nfc,
 	.get_ts_info = gfar_get_ts_info,
-	.get_link_ksettings = phy_ethtool_get_link_ksettings,
-	.set_link_ksettings = phy_ethtool_set_link_ksettings,
+	.get_link_ksettings = gfar_get_link_ksettings,
+	.set_link_ksettings = gfar_set_link_ksettings,
 };
-- 
2.22.0


^ permalink raw reply related

* [RFC PATCH 2/2] net: phylink: don't start and stop SGMII PHYs in SFP modules twice
From: Arseny Solokha @ 2019-07-23 15:17 UTC (permalink / raw)
  To: Claudiu Manoil, Ioana Ciornei, Russell King, Andrew Lunn
  Cc: netdev, Arseny Solokha
In-Reply-To: <20190723151702.14430-1-asolokha@kb.kras.ru>

SFP modules connected using the SGMII interface have their own PHYs which
are handled by the struct phylink's phydev field. After commit ce0aa27ff3f6
("sfp: add sfp-bus to bridge between network devices and sfp cages") an
sfp-bus attached to the same phylink also gets control over a PHY in an SFP
module which is actually the same PHY managed by phylink itself. This
results in WARNs during network interface bringup and shutdown when a
copper SFP module is connected, as phy_start() and phy_stop() are called
twice in a row for the same phy_device:

  % ip link set up dev eth0
  ------------[ cut here ]------------
  called from state UP
  WARNING: CPU: 1 PID: 155 at drivers/net/phy/phy.c:895 phy_start+0x74/0xc0
  Modules linked in:
  CPU: 1 PID: 155 Comm: backend Not tainted 5.2.0+ #1
  NIP:  c0227bf0 LR: c0227bf0 CTR: c004d224
  REGS: df547720 TRAP: 0700   Not tainted  (5.2.0+)
  MSR:  00029000 <CE,EE,ME>  CR: 24002822  XER: 00000000

  GPR00: c0227bf0 df5477d8 df5d7080 00000014 df9d2370 df9d5ac4 1f4eb000 00000001
  GPR08: c061fe58 00000000 00000000 df5477d8 0000003c 100c8768 00000000 00000000
  GPR16: df486a00 c046f1c8 c046eea0 00000000 c046e904 c0239604 db68449c 00000000
  GPR24: e9083204 00000000 00000001 db684460 e9083404 00000000 db6dce00 db6dcc00
  NIP [c0227bf0] phy_start+0x74/0xc0
  LR [c0227bf0] phy_start+0x74/0xc0
  Call Trace:
  [df5477d8] [c0227bf0] phy_start+0x74/0xc0 (unreliable)
  [df5477e8] [c023cad0] startup_gfar+0x398/0x3f4
  [df547828] [c023cf08] gfar_enet_open+0x364/0x374
  [df547898] [c029d870] __dev_open+0xe4/0x140
  [df5478c8] [c029db70] __dev_change_flags+0xf0/0x188
  [df5478f8] [c029dc28] dev_change_flags+0x20/0x54
  [df547918] [c02ae304] do_setlink+0x310/0x818
  [df547a08] [c02b1eb8] __rtnl_newlink+0x384/0x6b0
  [df547c28] [c02b222c] rtnl_newlink+0x48/0x68
  [df547c48] [c02ad7c8] rtnetlink_rcv_msg+0x240/0x27c
  [df547c98] [c02cc068] netlink_rcv_skb+0x8c/0xf0
  [df547cd8] [c02cba3c] netlink_unicast+0x114/0x19c
  [df547d08] [c02cbd74] netlink_sendmsg+0x2b0/0x2c0
  [df547d58] [c027b668] sock_sendmsg_nosec+0x20/0x40
  [df547d68] [c027d080] ___sys_sendmsg+0x17c/0x1dc
  [df547e98] [c027df7c] __sys_sendmsg+0x68/0x84
  [df547ef8] [c027e430] sys_socketcall+0x1a0/0x204
  [df547f38] [c000d1d8] ret_from_syscall+0x0/0x38
  --- interrupt: c01 at 0xfd4e030
      LR = 0xfd4e010
  Instruction dump:
  813f0188 38800000 2b890005 419d0014 3d40c046 5529103a 394aa208 7c8a482e
  3c60c046 3863a1b8 4cc63182 4be009a1 <0fe00000> 48000030 3c60c046 3863a1d0
  ---[ end trace d4c095aeaf6ea998 ]---

and

  % ip link set down dev eth0
  ------------[ cut here ]------------
  called from state HALTED
  WARNING: CPU: 1 PID: 184 at drivers/net/phy/phy.c:858 phy_stop+0x3c/0x88

  <...>

  Call Trace:
  [df581788] [c0228450] phy_stop+0x3c/0x88 (unreliable)
  [df581798] [c022d548] sfp_sm_phy_detach+0x1c/0x44
  [df5817a8] [c022e8cc] sfp_sm_event+0x4b0/0x87c
  [df581848] [c022f04c] sfp_upstream_stop+0x34/0x44
  [df581858] [c0225608] phylink_stop+0x7c/0xe4
  [df581868] [c023c57c] stop_gfar+0x7c/0x94
  [df581888] [c023c5b8] gfar_close+0x24/0x94
  [df5818a8] [c0298688] __dev_close_many+0xdc/0xf8
  [df5818c8] [c029db58] __dev_change_flags+0xd8/0x188
  [df5818f8] [c029dc28] dev_change_flags+0x20/0x54
  [df581918] [c02ae304] do_setlink+0x310/0x818
  [df581a08] [c02b1eb8] __rtnl_newlink+0x384/0x6b0
  [df581c28] [c02b222c] rtnl_newlink+0x48/0x68
  [df581c48] [c02ad7c8] rtnetlink_rcv_msg+0x240/0x27c
  [df581c98] [c02cc068] netlink_rcv_skb+0x8c/0xf0
  [df581cd8] [c02cba3c] netlink_unicast+0x114/0x19c
  [df581d08] [c02cbd74] netlink_sendmsg+0x2b0/0x2c0
  [df581d58] [c027b668] sock_sendmsg_nosec+0x20/0x40
  [df581d68] [c027d080] ___sys_sendmsg+0x17c/0x1dc
  [df581e98] [c027df7c] __sys_sendmsg+0x68/0x84
  [df581ef8] [c027e430] sys_socketcall+0x1a0/0x204
  [df581f38] [c000d1d8] ret_from_syscall+0x0/0x38

  <...>

  ---[ end trace d4c095aeaf6ea999 ]---

SFP modules with the 1000Base-X interface are not affected.

So, skip explicit calls to phy_start() and phy_stop() when phylink has just
enabled or disabled an attached SFP module.

Signed-off-by: Arseny Solokha <asolokha@kb.kras.ru>
---
 drivers/net/phy/phylink.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 5d0af041b8f9..4de7665876af 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -992,7 +992,7 @@ void phylink_start(struct phylink *pl)
 		mod_timer(&pl->link_poll, jiffies + HZ);
 	if (pl->sfp_bus)
 		sfp_upstream_start(pl->sfp_bus);
-	if (pl->phydev)
+	else if (pl->phydev)
 		phy_start(pl->phydev);
 }
 EXPORT_SYMBOL_GPL(phylink_start);
@@ -1010,10 +1010,10 @@ void phylink_stop(struct phylink *pl)
 {
 	ASSERT_RTNL();
 
-	if (pl->phydev)
-		phy_stop(pl->phydev);
 	if (pl->sfp_bus)
 		sfp_upstream_stop(pl->sfp_bus);
+	else if (pl->phydev)
+		phy_stop(pl->phydev);
 	del_timer_sync(&pl->link_poll);
 	if (pl->link_irq) {
 		free_irq(pl->link_irq, pl);
-- 
2.22.0


^ permalink raw reply related

* [RFC PATCH 0/2] convert gianfar to phylink
From: Arseny Solokha @ 2019-07-23 15:17 UTC (permalink / raw)
  To: Claudiu Manoil, Ioana Ciornei, Russell King, Andrew Lunn
  Cc: netdev, Arseny Solokha

The first patch in the series (almost) converts gianfar to phylink API. The
incentive behind this effort was to get proper support for 1000Base-X and
SGMII SFP modules.

There are some usages of the older phylib left, as serdes have to be
configured and its parameters queried via a TBI interface, and I've failed
to find a reasonably easy way to do it with phylink without much surgery.
It's the first reason for RFC here. However, usage of the older API only
covers two special cases of underlying hardware management and is not
involved in link and SFP management directly.

The conversion was tested with various 1000Base-X connected optical modules
and SGMII-connected copper ones.

The second patch deals with an issue in the phylink proper which only
manifests when bringing up or shutting down a network interface with SGMII
SFP module connected, which yields in calling phy_start() or phy_stop()
twice in a row for such modules. It doesn't look like a proper fix to me,
though, thus the second reason for RFC.

Arseny Solokha (2):
  gianfar: convert to phylink
  net: phylink: don't start and stop SGMII PHYs in SFP modules twice

 drivers/net/ethernet/freescale/Kconfig        |   2 +-
 drivers/net/ethernet/freescale/gianfar.c      | 409 +++++++++---------
 drivers/net/ethernet/freescale/gianfar.h      |  26 +-
 .../net/ethernet/freescale/gianfar_ethtool.c  |  79 ++--
 drivers/net/phy/phylink.c                     |   6 +-
 5 files changed, 254 insertions(+), 268 deletions(-)

-- 
2.22.0


^ permalink raw reply

* [PATCH net-next 0/3] enetc: Add mdio bus driver for the PCIe MDIO endpoint
From: Claudiu Manoil @ 2019-07-23 15:15 UTC (permalink / raw)
  To: David S . Miller
  Cc: Rob Herring, Li Yang, alexandru.marginean, netdev, devicetree,
	linux-arm-kernel, linux-kernel

First patch just registers the PCIe endpoint device containing
the MDIO registers as a standalone MDIO bus driver, to allow
an alternative way to control the MDIO bus.  The same code used
by the ENETC ports (eth controllers) to manage MDIO via local
registers applies and is reused.

Bindings are provided for the new MDIO node, similarly to ENETC
port nodes bindings.

Last patch enables the ENETC port 1 and its RGMII PHY on the
LS1028A QDS board, where the MDIO muxing configuration relies
on the MDIO support provided in the first patch.


Claudiu Manoil (3):
  enetc: Add mdio bus driver for the PCIe MDIO endpoint
  dt-bindings: net: fsl: enetc: Add bindings for the central MDIO PCIe
    endpoint
  arm64: dts: ls1028a: Enable eth port1 on the ls1028a QDS board

 .../devicetree/bindings/net/fsl-enetc.txt     | 42 ++++++++-
 .../boot/dts/freescale/fsl-ls1028a-qds.dts    | 40 +++++++++
 .../arm64/boot/dts/freescale/fsl-ls1028a.dtsi |  6 ++
 .../net/ethernet/freescale/enetc/enetc_mdio.c | 90 +++++++++++++++++++
 .../net/ethernet/freescale/enetc/enetc_pf.c   |  5 +-
 5 files changed, 179 insertions(+), 4 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH net-next 2/3] dt-bindings: net: fsl: enetc: Add bindings for the central MDIO PCIe endpoint
From: Claudiu Manoil @ 2019-07-23 15:15 UTC (permalink / raw)
  To: David S . Miller
  Cc: Rob Herring, Li Yang, alexandru.marginean, netdev, devicetree,
	linux-arm-kernel, linux-kernel
In-Reply-To: <1563894955-545-1-git-send-email-claudiu.manoil@nxp.com>

The on-chip PCIe root complex that integrates the ENETC ethernet
controllers also integrates a PCIe enpoint for the MDIO controller
provinding for cetralized control of the ENETC mdio bus.
Add bindings for this "central" MDIO Integrated PCIe Endpoit.

Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
---
 .../devicetree/bindings/net/fsl-enetc.txt     | 42 +++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/fsl-enetc.txt b/Documentation/devicetree/bindings/net/fsl-enetc.txt
index 25fc687419db..c090f6df7a39 100644
--- a/Documentation/devicetree/bindings/net/fsl-enetc.txt
+++ b/Documentation/devicetree/bindings/net/fsl-enetc.txt
@@ -11,7 +11,9 @@ Required properties:
 		  to parent node bindings.
 - compatible	: Should be "fsl,enetc".
 
-1) The ENETC external port is connected to a MDIO configurable phy:
+1. The ENETC external port is connected to a MDIO configurable phy
+
+1.1. Using the local ENETC Port MDIO interface
 
 In this case, the ENETC node should include a "mdio" sub-node
 that in turn should contain the "ethernet-phy" node describing the
@@ -47,8 +49,42 @@ Example:
 		};
 	};
 
-2) The ENETC port is an internal port or has a fixed-link external
-connection:
+1.2. Using the central MDIO PCIe enpoint device
+
+In this case, the mdio node should be defined as another PCIe
+endpoint node, at the same level with the ENETC port nodes.
+
+Required properties:
+
+- reg		: Specifies PCIe Device Number and Function
+		  Number of the ENETC endpoint device, according
+		  to parent node bindings.
+- compatible	: Should be "fsl,enetc-mdio".
+
+The remaining required mdio bus properties are standard, their bindings
+already defined in Documentation/devicetree/bindings/net/mdio.txt.
+
+Example:
+
+	ethernet@0,0 {
+		compatible = "fsl,enetc";
+		reg = <0x000000 0 0 0 0>;
+		phy-handle = <&sgmii_phy0>;
+		phy-connection-type = "sgmii";
+	};
+
+	mdio@0,3 {
+		compatible = "fsl,enetc-mdio";
+		reg = <0x000300 0 0 0 0>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+		sgmii_phy0: ethernet-phy@2 {
+			reg = <0x2>;
+		};
+	};
+
+2. The ENETC port is an internal port or has a fixed-link external
+connection
 
 In this case, the ENETC port node defines a fixed link connection,
 as specified by Documentation/devicetree/bindings/net/fixed-link.txt.
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 1/3] enetc: Add mdio bus driver for the PCIe MDIO endpoint
From: Claudiu Manoil @ 2019-07-23 15:15 UTC (permalink / raw)
  To: David S . Miller
  Cc: Rob Herring, Li Yang, alexandru.marginean, netdev, devicetree,
	linux-arm-kernel, linux-kernel
In-Reply-To: <1563894955-545-1-git-send-email-claudiu.manoil@nxp.com>

ENETC ports can manage the MDIO bus via local register
interface.  However there's also a centralized way
to manage the MDIO bus, via the MDIO PCIe endpoint
device integrated by the same root complex that also
integrates the ENETC ports (eth controllers).

Depending on board design and use case, centralized
access to MDIO may be better than using local ENETC
port registers.  For instance, on the LS1028A QDS board
where MDIO muxing is requiered.  Also, the LS1028A on-chip
switch doesn't have a local MDIO register interface.

The current patch registers the above PCIe enpoint as a
separate MDIO bus and provides a driver for it by re-using
the code used for local MDIO access.  It also allows the
ENETC port PHYs to be managed by this driver if the local
"mdio" node is missing from the ENETC port node.

Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
---
 .../net/ethernet/freescale/enetc/enetc_mdio.c | 90 +++++++++++++++++++
 .../net/ethernet/freescale/enetc/enetc_pf.c   |  5 +-
 2 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
index 77b9cd10ba2b..efa8a29f463d 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
@@ -197,3 +197,93 @@ void enetc_mdio_remove(struct enetc_pf *pf)
 		mdiobus_free(pf->mdio);
 	}
 }
+
+#define ENETC_MDIO_DEV_ID	0xee01
+#define ENETC_MDIO_DEV_NAME	"FSL PCIe IE Central MDIO"
+#define ENETC_MDIO_BUS_NAME	ENETC_MDIO_DEV_NAME " Bus"
+#define ENETC_MDIO_DRV_NAME	ENETC_MDIO_DEV_NAME " driver"
+#define ENETC_MDIO_DRV_ID	"fsl_enetc_mdio"
+
+static int enetc_pci_mdio_probe(struct pci_dev *pdev,
+				const struct pci_device_id *ent)
+{
+	struct device *dev = &pdev->dev;
+	struct mii_bus *bus;
+	int err;
+
+	bus = mdiobus_alloc_size(sizeof(u32 *));
+	if (!bus)
+		return -ENOMEM;
+
+	bus->name = ENETC_MDIO_BUS_NAME;
+	bus->read = enetc_mdio_read;
+	bus->write = enetc_mdio_write;
+	bus->parent = dev;
+	snprintf(bus->id, MII_BUS_ID_SIZE, "%s", dev_name(dev));
+
+	pcie_flr(pdev);
+	err = pci_enable_device_mem(pdev);
+	if (err) {
+		dev_err(dev, "device enable failed\n");
+		return err;
+	}
+
+	err = pci_request_mem_regions(pdev, ENETC_MDIO_DRV_ID);
+	if (err) {
+		dev_err(dev, "pci_request_regions failed\n");
+		goto err_pci_mem_reg;
+	}
+
+	bus->priv = pci_iomap_range(pdev, 0, ENETC_MDIO_REG_OFFSET, 0);
+	if (!bus->priv) {
+		err = -ENXIO;
+		dev_err(dev, "ioremap failed\n");
+		goto err_ioremap;
+	}
+
+	err = of_mdiobus_register(bus, dev->of_node);
+	if (err)
+		goto err_mdiobus_reg;
+
+	pci_set_drvdata(pdev, bus);
+
+	return 0;
+
+err_mdiobus_reg:
+	iounmap(bus->priv);
+err_ioremap:
+	pci_release_mem_regions(pdev);
+err_pci_mem_reg:
+	pci_disable_device(pdev);
+
+	return err;
+}
+
+static void enetc_pci_mdio_remove(struct pci_dev *pdev)
+{
+	struct mii_bus *bus = pci_get_drvdata(pdev);
+
+	mdiobus_unregister(bus);
+	iounmap(bus->priv);
+	mdiobus_free(bus);
+
+	pci_release_mem_regions(pdev);
+	pci_disable_device(pdev);
+}
+
+static const struct pci_device_id enetc_pci_mdio_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, ENETC_MDIO_DEV_ID) },
+	{ 0, } /* End of table. */
+};
+MODULE_DEVICE_TABLE(pci, enetc_mdio_id_table);
+
+static struct pci_driver enetc_pci_mdio_driver = {
+	.name = ENETC_MDIO_DRV_ID,
+	.id_table = enetc_pci_mdio_id_table,
+	.probe = enetc_pci_mdio_probe,
+	.remove = enetc_pci_mdio_remove,
+};
+module_pci_driver(enetc_pci_mdio_driver);
+
+MODULE_DESCRIPTION(ENETC_MDIO_DRV_NAME);
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index 258b3cb38a6f..7d6513ff8507 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -750,6 +750,7 @@ static int enetc_of_get_phy(struct enetc_ndev_priv *priv)
 {
 	struct enetc_pf *pf = enetc_si_priv(priv->si);
 	struct device_node *np = priv->dev->of_node;
+	struct device_node *mdio_np;
 	int err;
 
 	if (!np) {
@@ -773,7 +774,9 @@ static int enetc_of_get_phy(struct enetc_ndev_priv *priv)
 		priv->phy_node = of_node_get(np);
 	}
 
-	if (!of_phy_is_fixed_link(np)) {
+	mdio_np = of_get_child_by_name(np, "mdio");
+	if (mdio_np) {
+		of_node_put(mdio_np);
 		err = enetc_mdio_probe(pf);
 		if (err) {
 			of_node_put(priv->phy_node);
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 3/3] arm64: dts: ls1028a: Enable eth port1 on the ls1028a QDS board
From: Claudiu Manoil @ 2019-07-23 15:15 UTC (permalink / raw)
  To: David S . Miller
  Cc: Rob Herring, Li Yang, alexandru.marginean, netdev, devicetree,
	linux-arm-kernel, linux-kernel
In-Reply-To: <1563894955-545-1-git-send-email-claudiu.manoil@nxp.com>

LS1028a has one Ethernet management interface. On the QDS board, the
MDIO signals are multiplexed to either on-board AR8035 PHY device or
to 4 PCIe slots allowing for SGMII cards.
To enable the Ethernet ENETC Port 1, which can only be connected to a
RGMII PHY, the multiplexer needs to be configured to route the MDIO to
the AR8035 PHY.  The MDIO/MDC routing is controlled by bits 7:4 of FPGA
board config register 0x54, and value 0 selects the on-board RGMII PHY.
The FPGA board config registers are accessible on the i2c bus, at address
0x66.

The PF3 MDIO PCIe integrated endpoint device allows for centralized access
to the MDIO bus.  Add the corresponding devicetree node and set it to be
the MDIO bus parent.

Signed-off-by: Alex Marginean <alexandru.marginean@nxp.com>
Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
---
 .../boot/dts/freescale/fsl-ls1028a-qds.dts    | 40 +++++++++++++++++++
 .../arm64/boot/dts/freescale/fsl-ls1028a.dtsi |  6 +++
 2 files changed, 46 insertions(+)

diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts
index de6ef39f3118..663c4b728c07 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts
@@ -85,6 +85,26 @@
 			system-clock-frequency = <25000000>;
 		};
 	};
+
+	mdio-mux {
+		compatible = "mdio-mux-multiplexer";
+		mux-controls = <&mux 0>;
+		mdio-parent-bus = <&enetc_mdio_pf3>;
+		#address-cells=<1>;
+		#size-cells = <0>;
+
+		/* on-board RGMII PHY */
+		mdio@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+
+			qds_phy1: ethernet-phy@5 {
+				/* Atheros 8035 */
+				reg = <5>;
+			};
+		};
+	};
 };
 
 &duart0 {
@@ -164,6 +184,26 @@
 			};
 		};
 	};
+
+	fpga@66 {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		compatible = "fsl,ls1028aqds-fpga", "fsl,fpga-qixis-i2c",
+			     "simple-mfd";
+		reg = <0x66>;
+
+		mux: mux-controller {
+			compatible = "reg-mux";
+			#mux-control-cells = <1>;
+			mux-reg-masks = <0x54 0xf0>; /* 0: reg 0x54, bits 7:4 */
+		};
+	};
+
+};
+
+&enetc_port1 {
+	phy-handle = <&qds_phy1>;
+	phy-connection-type = "rgmii-id";
 };
 
 &sai1 {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
index 7975519b4f56..de71153fda00 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
@@ -536,6 +536,12 @@
 				compatible = "fsl,enetc";
 				reg = <0x000100 0 0 0 0>;
 			};
+			enetc_mdio_pf3: mdio@0,3 {
+				compatible = "fsl,enetc-mdio";
+				reg = <0x000300 0 0 0 0>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
 			ethernet@0,4 {
 				compatible = "fsl,enetc-ptp";
 				reg = <0x000400 0 0 0 0>;
-- 
2.17.1


^ permalink raw reply related

* [PATCH] sky2: Disable MSI on ASUS P6T
From: Takashi Iwai @ 2019-07-23 15:15 UTC (permalink / raw)
  To: netdev; +Cc: Mirko Lindner, Stephen Hemminger, Marcus Seyfarth,
	David S . Miller

The onboard sky2 NIC on ASUS P6T WS PRO doesn't work after PM resume
due to the infamous IRQ problem.  Disabling MSI works around it, so
let's add it to the blacklist.

Unfortunately the BIOS on the machine doesn't fill the standard
DMI_SYS_* entry, so we pick up DMI_BOARD_* entries instead.

BugLink: https://bugzilla.suse.com/show_bug.cgi?id=1142496
Reported-and-tested-by: Marcus Seyfarth <m.seyfarth@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 drivers/net/ethernet/marvell/sky2.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index f518312ffe69..a01c75ede871 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -4924,6 +4924,13 @@ static const struct dmi_system_id msi_blacklist[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "P5W DH Deluxe"),
 		},
 	},
+	{
+		.ident = "ASUS P6T",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+			DMI_MATCH(DMI_BOARD_NAME, "P6T"),
+		},
+	},
 	{}
 };
 
-- 
2.16.4


^ permalink raw reply related

* Re: [RFC PATCH net-next 10/12] drop_monitor: Add packet alert mode
From: Neil Horman @ 2019-07-23 15:14 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: netdev, davem, dsahern, roopa, nikolay, jakub.kicinski, toke,
	andy, f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <20190723141625.GA8972@splinter>

On Tue, Jul 23, 2019 at 05:16:25PM +0300, Ido Schimmel wrote:
> On Tue, Jul 23, 2019 at 08:43:40AM -0400, Neil Horman wrote:
> > On Mon, Jul 22, 2019 at 09:31:32PM +0300, Ido Schimmel wrote:
> > > +static void net_dm_packet_work(struct work_struct *work)
> > > +{
> > > +	struct per_cpu_dm_data *data;
> > > +	struct sk_buff_head list;
> > > +	struct sk_buff *skb;
> > > +	unsigned long flags;
> > > +
> > > +	data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
> > > +
> > > +	__skb_queue_head_init(&list);
> > > +
> > > +	spin_lock_irqsave(&data->drop_queue.lock, flags);
> > > +	skb_queue_splice_tail_init(&data->drop_queue, &list);
> > > +	spin_unlock_irqrestore(&data->drop_queue.lock, flags);
> > > +
> > These functions are all executed in a per-cpu context.  While theres nothing
> > wrong with using a spinlock here, I think you can get away with just doing
> > local_irqsave and local_irq_restore.
> 
> Hi Neil,
> 
> Thanks a lot for reviewing. I might be missing something, but please
> note that this function is executed from a workqueue and therefore the
> CPU it is running on does not have to be the same CPU to which 'data'
> belongs to. If so, I'm not sure how I can avoid taking the spinlock, as
> otherwise two different CPUs can modify the list concurrently.
> 
Ah, my bad, I was under the impression that the schedule_work call for
that particular work queue was actually a call to schedule_work_on,
which would have affined it to a specific cpu.  That said, looking at
it, I think using schedule_work_on was my initial intent, as the work
queue is registered per cpu.  And converting it to schedule_work_on
would allow you to reduce the spin_lock to a faster local_irqsave

Otherwise though, this looks really good to me
Neil

> > 
> > Neil
> > 
> > > +	while ((skb = __skb_dequeue(&list)))
> > > +		net_dm_packet_report(skb);
> > > +}
> 

^ permalink raw reply

* Re: [PATCH bpf] tools/bpf: fix bpftool build with OUTPUT set
From: Ilya Leoshkevich @ 2019-07-23 15:14 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: bpf, netdev, lmb, gor, heiko.carstens, Arnaldo Carvalho de Melo
In-Reply-To: <20190719111716.1cbf62d1@cakuba.netronome.com>

> Am 19.07.2019 um 20:17 schrieb Jakub Kicinski <jakub.kicinski@netronome.com>:
> 
> On Fri, 19 Jul 2019 15:12:24 +0200, Ilya Leoshkevich wrote:
>>> Am 18.07.2019 um 20:51 schrieb Jakub Kicinski <jakub.kicinski@netronome.com>:
>>> 
>>> We should probably make a script with all the ways of calling make
>>> should work. Otherwise we can lose track too easily.  
>> 
>> Thanks for the script!
>> 
>> I’m trying to make it all pass now, and hitting a weird issue in the
>> Kbuild case. The build prints "No rule to make target
>> 'scripts/Makefile.ubsan.o'" and proceeds with an empty BPFTOOL_VERSION,
>> which causes problems later on.
> 
> Does it only break with UBSAN enabled?

No, all the time. I think this is a coincidence - make happens to scan
scripts/Makefile.ubsan first.

> 
>> I've found that this is caused by sub_make_done=1 environment variable,
>> and unsetting it indeed fixes the problem, since the root Makefile no
>> longer uses the implicit %.o rule.
>> 
>> However, I wonder if that would be acceptable in the final version of
>> the patch, and whether there is a cleaner way to achieve the same
>> effect?
> 
> I'm not sure to be honest. Did you check how perf deals with that?

perf obtains the version using "git describe". However, if we are
building it from a tarball, it falls back to "make kernelversion" and
fails in a similar way:

linux-5.3-rc1$ make defconfig
linux-5.3-rc1$ make tools/perf
<snip>
make[6]: Circular scripts/Makefile.ubsan.mod <- scripts/Makefile.ubsan.o dependency dropped.
make[6]: m2c: Command not found
make[6]: *** [<builtin>: scripts/Makefile.ubsan.o] Error 127
make[5]: *** [Makefile:1765: scripts/Makefile.ubsan.o] Error 2
<snip>

The same trick helps:

--- tools/perf/util/PERF-VERSION-GEN.orig	2019-07-23 17:12:07.621123187 +0200
+++ tools/perf/util/PERF-VERSION-GEN	2019-07-23 17:12:33.441133619 +0200
@@ -26,7 +26,7 @@
 fi
 if test -z "$TAG"
 then
-	TAG=$(MAKEFLAGS= make -sC ../.. kernelversion)
+	TAG=$(MAKEFLAGS= sub_make_done= make -sC ../.. kernelversion)
 fi
 VN="$TAG$CID"
 if test -n "$CID"

^ permalink raw reply

* Re: [PATCH v12 1/5] can: m_can: Create a m_can platform framework
From: Dan Murphy @ 2019-07-23 15:14 UTC (permalink / raw)
  To: wg, mkl, davem, gregkh; +Cc: linux-can, netdev, linux-kernel
In-Reply-To: <f236a88a-485c-9002-1e4a-9a5ad0e1c81f@ti.com>

Hello

On 7/10/19 7:08 AM, Dan Murphy wrote:
> Hello
>
> On 6/17/19 10:09 AM, Dan Murphy wrote:
>> Marc
>>
>> On 6/10/19 11:35 AM, Dan Murphy wrote:
>>> Bump
>>>
>>> On 6/6/19 8:16 AM, Dan Murphy wrote:
>>>> Marc
>>>>
>>>> Bump
>>>>
>>>> On 5/31/19 6:51 AM, Dan Murphy wrote:
>>>>> Marc
>>>>>
>>>>> On 5/15/19 3:54 PM, Dan Murphy wrote:
>>>>>> Marc
>>>>>>
>>>>>> On 5/9/19 11:11 AM, Dan Murphy wrote:
>>>>>>> Create a m_can platform framework that peripheral
>>>>>>> devices can register to and use common code and register sets.
>>>>>>> The peripheral devices may provide read/write and configuration
>>>>>>> support of the IP.
>>>>>>>
>>>>>>> Acked-by: Wolfgang Grandegger <wg@grandegger.com>
>>>>>>> Signed-off-by: Dan Murphy <dmurphy@ti.com>
>>>>>>> ---
>>>>>>>
>>>>>>> v12 - Update the m_can_read/write functions to create a 
>>>>>>> backtrace if the callback
>>>>>>> pointer is NULL. - https://lore.kernel.org/patchwork/patch/1052302/
>>>>>>>
>>>>>> Is this able to be merged now?
>>>>>
>>>>> ping
>>
>> Wondering if there is anything else we need to do?
>>
>> The part has officially shipped and we had hoped to have driver 
>> support in Linux as part of the announcement.
>>
> Is this being sent in a PR for 5.3?
>
> Dan
>
Adding Greg to this thread as I have no idea what is going on with 
this.  This patch set has missed 2 merge windows and has

been ready since May.  Our customers are requesting status but we can 
only point to the mail thread

Here is the reference of the pinging I have done without reply

https://lore.kernel.org/patchwork/patch/1071894/

Dan


>
>> Dan
>>
>>
>>>>>
>>>>>
>>>>>> Dan
>>>>>>
>>>>>> <snip>

^ permalink raw reply

* Re: [RFC PATCH net-next 00/12] drop_monitor: Capture dropped packets and metadata
From: Ido Schimmel @ 2019-07-23 15:14 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: netdev, davem, nhorman, dsahern, roopa, nikolay, jakub.kicinski,
	andy, f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <875znt3pxu.fsf@toke.dk>

On Tue, Jul 23, 2019 at 02:17:49PM +0200, Toke Høiland-Jørgensen wrote:
> Ido Schimmel <idosch@idosch.org> writes:
> 
> > On Mon, Jul 22, 2019 at 09:43:15PM +0200, Toke Høiland-Jørgensen wrote:
> >> Is there a mechanism for the user to filter the packets before they are
> >> sent to userspace? A bpf filter would be the obvious choice I guess...
> >
> > Hi Toke,
> >
> > Yes, it's on my TODO list to write an eBPF program that only lets
> > "unique" packets to be enqueued on the netlink socket. Where "unique" is
> > defined as {5-tuple, PC}. The rest of the copies will be counted in an
> > eBPF map, which is just a hash table keyed by {5-tuple, PC}.
> 
> Yeah, that's a good idea. Or even something simpler like tcpdump-style
> filters for the packets returned by drop monitor (say if I'm just trying
> to figure out what happens to my HTTP requests).

Yep, that's a good idea. I guess different users will use different
programs. Will look into both options.

> > I think it would be good to have the program as part of the bcc
> > repository [1]. What do you think?
> 
> Sure. We could also add it to the XDP tutorial[2]; it could go into a
> section on introspection and debugging (just added a TODO about that[3]).

Great!

> >> For integrating with XDP the trick would be to find a way to do it that
> >> doesn't incur any overhead when it's not enabled. Are you envisioning
> >> that this would be enabled separately for the different "modes" (kernel,
> >> hardware, XDP, etc)?
> >
> > Yes. Drop monitor have commands to enable and disable tracing, but they
> > don't carry any attributes at the moment. My plan is to add an attribute
> > (e.g., 'NET_DM_ATTR_DROP_TYPE') that will specify the type of drops
> > you're interested in - SW/HW/XDP. If the attribute is not specified,
> > then current behavior is maintained and all the drop types are traced.
> > But if you're only interested in SW drops, then overhead for the rest
> > should be zero.
> 
> Makes sense (although "should be" is the key here ;)).
> 
> I'm also worried about the drop monitor getting overwhelmed; if you turn
> it on for XDP and you're running a filtering program there, you'll
> suddenly get *a lot* of drops.
> 
> As I read your patch, the current code can basically queue up an
> unbounded number of packets waiting to go out over netlink, can't it?

That's a very good point. Each CPU holds a drop list. It probably makes
sense to limit it by default (to 1000?) and allow user to change it
later, if needed. I can expose a counter that shows how many packets
were dropped because of this limit. It can be used as an indication to
adjust the queue length (or flip to 'summary' mode).

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Andy Lutomirski @ 2019-07-23 15:11 UTC (permalink / raw)
  To: Song Liu
  Cc: Andy Lutomirski, Kees Cook, linux-security@vger.kernel.org,
	Networking, bpf, Alexei Starovoitov, Daniel Borkmann, Kernel Team,
	Lorenz Bauer, Jann Horn, Greg KH, Linux API
In-Reply-To: <4A7A225A-6C23-4C0F-9A95-7C6C56B281ED@fb.com>

On Mon, Jul 22, 2019 at 1:54 PM Song Liu <songliubraving@fb.com> wrote:
>
> Hi Andy, Lorenz, and all,
>
> > On Jul 2, 2019, at 2:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
> >
> > On Tue, Jul 2, 2019 at 2:04 PM Kees Cook <keescook@chromium.org> wrote:
> >>
> >> On Mon, Jul 01, 2019 at 06:59:13PM -0700, Andy Lutomirski wrote:
> >>> I think I'm understanding your motivation.  You're not trying to make
> >>> bpf() generically usable without privilege -- you're trying to create
> >>> a way to allow certain users to access dangerous bpf functionality
> >>> within some limits.
> >>>
> >>> That's a perfectly fine goal, but I think you're reinventing the
> >>> wheel, and the wheel you're reinventing is quite complicated and
> >>> already exists.  I think you should teach bpftool to be secure when
> >>> installed setuid root or with fscaps enabled and put your policy in
> >>> bpftool.  If you want to harden this a little bit, it would seem
> >>> entirely reasonable to add a new CAP_BPF_ADMIN and change some, but
> >>> not all, of the capable() checks to check CAP_BPF_ADMIN instead of the
> >>> capabilities that they currently check.
> >>
> >> If finer grained controls are wanted, it does seem like the /dev/bpf
> >> path makes the most sense. open, request abilities, use fd. The open can
> >> be mediated by DAC and LSM. The request can be mediated by LSM. This
> >> provides a way to add policy at the LSM level and at the tool level.
> >> (i.e. For tool-level controls: leave LSM wide open, make /dev/bpf owned
> >> by "bpfadmin" and bpftool becomes setuid "bpfadmin". For fine-grained
> >> controls, leave /dev/bpf wide open and add policy to SELinux, etc.)
> >>
> >> With only a new CAP, you don't get the fine-grained controls. (The
> >> "request abilities" part is the key there.)
> >
> > Sure you do: the effective set.  It has somewhat bizarre defaults, but
> > I don't think that's a real problem.  Also, this wouldn't be like
> > CAP_DAC_READ_SEARCH -- you can't accidentally use your BPF caps.
> >
> > I think that a /dev capability-like object isn't totally nuts, but I
> > think we should do it well, and this patch doesn't really achieve
> > that.  But I don't think bpf wants fine-grained controls like this at
> > all -- as I pointed upthread, a fine-grained solution really wants
> > different treatment for the different capable() checks, and a bunch of
> > them won't resemble capabilities or /dev/bpf at all.
>
> With 5.3-rc1 out, I am back on this. :)
>
> How about we modify the set as:
>   1. Introduce sys_bpf_with_cap() that takes fd of /dev/bpf.

I'm fine with this in principle, but:

>   2. Better handling of capable() calls through bpf code. I guess the
>      biggest problem here is is_priv in verifier.c:bpf_check().

I think it would be good to understand exactly what /dev/bpf will
enable one to do.  Without some care, it would just become the next
CAP_SYS_ADMIN: if you can open it, sure, you're not root, but you can
intercept network traffic, modify cgroup behavior, and do plenty of
other things, any of which can probably be used to completely take
over the system.

It would also be nice to understand why you can't do what you need to
do entirely in user code using setuid or fscaps.

Finally, at risk of rehashing some old arguments, I'll point out that
the bpf() syscall is an unusual design to begin with.  As an example,
consider bpf_prog_attach().  Outside of bpf(), if I want to change the
behavior of a cgroup, I would write to a file in
/sys/kernel/cgroup/unified/whatever/, and normal DAC and MAC rules
apply.  With bpf(), however, I just call bpf() to attach a program to
the cgroup.  bpf() says "oh, you are capable(CAP_NET_ADMIN) -- go for
it!".  Unless I missed something major, and I just re-read the code,
there is no check that the caller has write or LSM permission to
anything at all in cgroupfs, and the existing API would make it very
awkward to impose any kind of DAC rules here.

So I think it might actually be time to repay some techincal debt and
come up with a real fix.  As a less intrusive approach, you could see
about requiring ownership of the cgroup directory instead of
CAP_NET_ADMIN.  As a more intrusive but perhaps better approach, you
could invert the logic to to make it work like everything outside of
cgroup: add pseudo-files like bpf.inet_ingress to the cgroup
directories, and require a writable fd to *that* to a new improved
attach API.  If a user could do:

int fd = open("/sys/fs/cgroup/.../bpf.inet_attach", O_RDWR);  /* usual
DAC and MAC policy applies */
int bpf_fd = setup the bpf stuff;  /* no privilege required, unless
the program is huge or needs is_priv */
bpf(BPF_IMPROVED_ATTACH, target = fd, program = bpf_fd);

there would be no capabilities or global privilege at all required for
this.  It would just work with cgroup delegation, containers, etc.

I think you could even pull off this type of API change with only
libbpf changes.  In particular, there's this code:

int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
                    unsigned int flags)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.target_fd     = target_fd;
        attr.attach_bpf_fd = prog_fd;
        attr.attach_type   = type;
        attr.attach_flags  = flags;

        return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
}

This would instead do something like:

int specific_target_fd = openat(target_fd, bpf_type_to_target[type], O_RDWR);
attr.target_fd = specific_target_fd;
...

return sys_bpf(BPF_PROG_IMPROVED_ATTACH, &attr, sizeof(attr));

Would this solve your problem without needing /dev/bpf at all?

--Andy

^ permalink raw reply

* [PATCH net-next 4/4] nfp: flower: offload MPLS set action
From: John Hurley @ 2019-07-23 14:34 UTC (permalink / raw)
  To: netdev; +Cc: davem, simon.horman, jakub.kicinski, oss-drivers, John Hurley
In-Reply-To: <1563892442-4654-1-git-send-email-john.hurley@netronome.com>

Recent additions to the kernel include a TC action module to manipulate
MPLS headers on packets. Such actions are available to offload via the
flow_offload intermediate representation API.

Modify the NFP driver to allow the offload of MPLS set actions to
firmware. Set actions update the outermost MPLS header. The offload
includes a mask to specify which fields should be set.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 45 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   |  8 ++++
 2 files changed, 53 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 7f288ae..ff2f419 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -70,6 +70,37 @@ nfp_fl_pop_mpls(struct nfp_fl_pop_mpls *pop_mpls,
 	pop_mpls->ethtype = act->mpls_pop.proto;
 }
 
+static void
+nfp_fl_set_mpls(struct nfp_fl_set_mpls *set_mpls,
+		const struct flow_action_entry *act)
+{
+	size_t act_size = sizeof(struct nfp_fl_set_mpls);
+	u32 mpls_lse = 0, mpls_mask = 0;
+
+	set_mpls->head.jump_id = NFP_FL_ACTION_OPCODE_SET_MPLS;
+	set_mpls->head.len_lw = act_size >> NFP_FL_LW_SIZ;
+
+	if (act->mpls_mangle.label != ACT_MPLS_LABEL_NOT_SET) {
+		mpls_lse |= act->mpls_mangle.label << MPLS_LS_LABEL_SHIFT;
+		mpls_mask |= MPLS_LS_LABEL_MASK;
+	}
+	if (act->mpls_mangle.tc != ACT_MPLS_TC_NOT_SET) {
+		mpls_lse |= act->mpls_mangle.tc << MPLS_LS_TC_SHIFT;
+		mpls_mask |= MPLS_LS_TC_MASK;
+	}
+	if (act->mpls_mangle.bos != ACT_MPLS_BOS_NOT_SET) {
+		mpls_lse |= act->mpls_mangle.bos << MPLS_LS_S_SHIFT;
+		mpls_mask |= MPLS_LS_S_MASK;
+	}
+	if (act->mpls_mangle.ttl) {
+		mpls_lse |= act->mpls_mangle.ttl << MPLS_LS_TTL_SHIFT;
+		mpls_mask |= MPLS_LS_TTL_MASK;
+	}
+
+	set_mpls->lse = cpu_to_be32(mpls_lse);
+	set_mpls->lse_mask = cpu_to_be32(mpls_mask);
+}
+
 static void nfp_fl_pop_vlan(struct nfp_fl_pop_vlan *pop_vlan)
 {
 	size_t act_size = sizeof(struct nfp_fl_pop_vlan);
@@ -917,6 +948,7 @@ nfp_flower_loop_action(struct nfp_app *app, const struct flow_action_entry *act,
 	struct nfp_fl_push_mpls *psh_m;
 	struct nfp_fl_pop_vlan *pop_v;
 	struct nfp_fl_pop_mpls *pop_m;
+	struct nfp_fl_set_mpls *set_m;
 	int err;
 
 	switch (act->id) {
@@ -1050,6 +1082,19 @@ nfp_flower_loop_action(struct nfp_app *app, const struct flow_action_entry *act,
 		nfp_fl_pop_mpls(pop_m, act);
 		*a_len += sizeof(struct nfp_fl_pop_mpls);
 		break;
+	case FLOW_ACTION_MPLS_MANGLE:
+		if (*a_len +
+		    sizeof(struct nfp_fl_set_mpls) > NFP_FL_MAX_A_SIZ) {
+			NL_SET_ERR_MSG_MOD(extack, "unsupported offload: maximum allowed action list size exceeded at set MPLS");
+			return -EOPNOTSUPP;
+		}
+
+		set_m = (struct nfp_fl_set_mpls *)&nfp_fl->action_data[*a_len];
+		nfp_fl->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_NULL);
+
+		nfp_fl_set_mpls(set_m, act);
+		*a_len += sizeof(struct nfp_fl_set_mpls);
+		break;
 	default:
 		/* Currently we do not handle any other actions. */
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: unsupported action in action list");
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 3198ad4..3324394 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -72,6 +72,7 @@
 #define NFP_FL_ACTION_OPCODE_POP_MPLS		4
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL	6
 #define NFP_FL_ACTION_OPCODE_SET_ETHERNET	7
+#define NFP_FL_ACTION_OPCODE_SET_MPLS		8
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS	9
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TTL_TOS	10
 #define NFP_FL_ACTION_OPCODE_SET_IPV6_SRC	11
@@ -245,6 +246,13 @@ struct nfp_fl_pop_mpls {
 	__be16 ethtype;
 };
 
+struct nfp_fl_set_mpls {
+	struct nfp_fl_act_head head;
+	__be16 reserved;
+	__be32 lse_mask;
+	__be32 lse;
+};
+
 /* Metadata with L2 (1W/4B)
  * ----------------------------------------------------------------
  *    3                   2                   1
-- 
2.7.4


^ permalink raw reply related

* [PATCH net-next 3/4] nfp: flower: offload MPLS pop action
From: John Hurley @ 2019-07-23 14:34 UTC (permalink / raw)
  To: netdev; +Cc: davem, simon.horman, jakub.kicinski, oss-drivers, John Hurley
In-Reply-To: <1563892442-4654-1-git-send-email-john.hurley@netronome.com>

Recent additions to the kernel include a TC action module to manipulate
MPLS headers on packets. Such actions are available to offload via the
flow_offload intermediate representation API.

Modify the NFP driver to allow the offload of MPLS pop actions to
firmware. The act_mpls TC module enforces that the next protocol is
supplied along with the pop action. Passing this to firmware allows it
to properly rebuild the underlying packet after the pop.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 25 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   |  6 ++++++
 2 files changed, 31 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 9e18bec..7f288ae 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -59,6 +59,17 @@ nfp_fl_push_mpls(struct nfp_fl_push_mpls *push_mpls,
 	return 0;
 }
 
+static void
+nfp_fl_pop_mpls(struct nfp_fl_pop_mpls *pop_mpls,
+		const struct flow_action_entry *act)
+{
+	size_t act_size = sizeof(struct nfp_fl_pop_mpls);
+
+	pop_mpls->head.jump_id = NFP_FL_ACTION_OPCODE_POP_MPLS;
+	pop_mpls->head.len_lw = act_size >> NFP_FL_LW_SIZ;
+	pop_mpls->ethtype = act->mpls_pop.proto;
+}
+
 static void nfp_fl_pop_vlan(struct nfp_fl_pop_vlan *pop_vlan)
 {
 	size_t act_size = sizeof(struct nfp_fl_pop_vlan);
@@ -905,6 +916,7 @@ nfp_flower_loop_action(struct nfp_app *app, const struct flow_action_entry *act,
 	struct nfp_fl_push_vlan *psh_v;
 	struct nfp_fl_push_mpls *psh_m;
 	struct nfp_fl_pop_vlan *pop_v;
+	struct nfp_fl_pop_mpls *pop_m;
 	int err;
 
 	switch (act->id) {
@@ -1025,6 +1037,19 @@ nfp_flower_loop_action(struct nfp_app *app, const struct flow_action_entry *act,
 			return err;
 		*a_len += sizeof(struct nfp_fl_push_mpls);
 		break;
+	case FLOW_ACTION_MPLS_POP:
+		if (*a_len +
+		    sizeof(struct nfp_fl_pop_mpls) > NFP_FL_MAX_A_SIZ) {
+			NL_SET_ERR_MSG_MOD(extack, "unsupported offload: maximum allowed action list size exceeded at pop MPLS");
+			return -EOPNOTSUPP;
+		}
+
+		pop_m = (struct nfp_fl_pop_mpls *)&nfp_fl->action_data[*a_len];
+		nfp_fl->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_NULL);
+
+		nfp_fl_pop_mpls(pop_m, act);
+		*a_len += sizeof(struct nfp_fl_pop_mpls);
+		break;
 	default:
 		/* Currently we do not handle any other actions. */
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: unsupported action in action list");
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 91af0fa..3198ad4 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -69,6 +69,7 @@
 #define NFP_FL_ACTION_OPCODE_PUSH_VLAN		1
 #define NFP_FL_ACTION_OPCODE_POP_VLAN		2
 #define NFP_FL_ACTION_OPCODE_PUSH_MPLS		3
+#define NFP_FL_ACTION_OPCODE_POP_MPLS		4
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL	6
 #define NFP_FL_ACTION_OPCODE_SET_ETHERNET	7
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS	9
@@ -239,6 +240,11 @@ struct nfp_fl_push_mpls {
 	__be32 lse;
 };
 
+struct nfp_fl_pop_mpls {
+	struct nfp_fl_act_head head;
+	__be16 ethtype;
+};
+
 /* Metadata with L2 (1W/4B)
  * ----------------------------------------------------------------
  *    3                   2                   1
-- 
2.7.4


^ permalink raw reply related

* [PATCH net-next 2/4] nfp: flower: offload MPLS push action
From: John Hurley @ 2019-07-23 14:34 UTC (permalink / raw)
  To: netdev; +Cc: davem, simon.horman, jakub.kicinski, oss-drivers, John Hurley
In-Reply-To: <1563892442-4654-1-git-send-email-john.hurley@netronome.com>

Recent additions to the kernel include a TC action module to manipulate
MPLS headers on packets. Such actions are available to offload via the
flow_offload intermediate representation API.

Modify the NFP driver to allow the offload of MPLS push actions to
firmware.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 50 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   |  7 +++
 2 files changed, 57 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 5a54fe8..9e18bec 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -2,10 +2,12 @@
 /* Copyright (C) 2017-2018 Netronome Systems, Inc. */
 
 #include <linux/bitfield.h>
+#include <linux/mpls.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_csum.h>
 #include <net/tc_act/tc_gact.h>
 #include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_mpls.h>
 #include <net/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_vlan.h>
 #include <net/tc_act/tc_tunnel_key.h>
@@ -25,6 +27,38 @@
 						 NFP_FL_TUNNEL_KEY | \
 						 NFP_FL_TUNNEL_GENEVE_OPT)
 
+static int
+nfp_fl_push_mpls(struct nfp_fl_push_mpls *push_mpls,
+		 const struct flow_action_entry *act,
+		 struct netlink_ext_ack *extack)
+{
+	size_t act_size = sizeof(struct nfp_fl_push_mpls);
+	u32 mpls_lse = 0;
+
+	push_mpls->head.jump_id = NFP_FL_ACTION_OPCODE_PUSH_MPLS;
+	push_mpls->head.len_lw = act_size >> NFP_FL_LW_SIZ;
+
+	/* BOS is optional in the TC action but required for offload. */
+	if (act->mpls_push.bos != ACT_MPLS_BOS_NOT_SET) {
+		mpls_lse |= act->mpls_push.bos << MPLS_LS_S_SHIFT;
+	} else {
+		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: BOS field must explicitly be set for MPLS push");
+		return -EOPNOTSUPP;
+	}
+
+	/* Leave MPLS TC as a default value of 0 if not explicitly set. */
+	if (act->mpls_push.tc != ACT_MPLS_TC_NOT_SET)
+		mpls_lse |= act->mpls_push.tc << MPLS_LS_TC_SHIFT;
+
+	/* Proto, label and TTL are enforced and verified for MPLS push. */
+	mpls_lse |= act->mpls_push.label << MPLS_LS_LABEL_SHIFT;
+	mpls_lse |= act->mpls_push.ttl << MPLS_LS_TTL_SHIFT;
+	push_mpls->ethtype = act->mpls_push.proto;
+	push_mpls->lse = cpu_to_be32(mpls_lse);
+
+	return 0;
+}
+
 static void nfp_fl_pop_vlan(struct nfp_fl_pop_vlan *pop_vlan)
 {
 	size_t act_size = sizeof(struct nfp_fl_pop_vlan);
@@ -869,6 +903,7 @@ nfp_flower_loop_action(struct nfp_app *app, const struct flow_action_entry *act,
 	struct nfp_fl_set_ipv4_tun *set_tun;
 	struct nfp_fl_pre_tunnel *pre_tun;
 	struct nfp_fl_push_vlan *psh_v;
+	struct nfp_fl_push_mpls *psh_m;
 	struct nfp_fl_pop_vlan *pop_v;
 	int err;
 
@@ -975,6 +1010,21 @@ nfp_flower_loop_action(struct nfp_app *app, const struct flow_action_entry *act,
 		 */
 		*csum_updated &= ~act->csum_flags;
 		break;
+	case FLOW_ACTION_MPLS_PUSH:
+		if (*a_len +
+		    sizeof(struct nfp_fl_push_mpls) > NFP_FL_MAX_A_SIZ) {
+			NL_SET_ERR_MSG_MOD(extack, "unsupported offload: maximum allowed action list size exceeded at push MPLS");
+			return -EOPNOTSUPP;
+		}
+
+		psh_m = (struct nfp_fl_push_mpls *)&nfp_fl->action_data[*a_len];
+		nfp_fl->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_NULL);
+
+		err = nfp_fl_push_mpls(psh_m, act, extack);
+		if (err)
+			return err;
+		*a_len += sizeof(struct nfp_fl_push_mpls);
+		break;
 	default:
 		/* Currently we do not handle any other actions. */
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: unsupported action in action list");
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 0f1706a..91af0fa 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -68,6 +68,7 @@
 #define NFP_FL_ACTION_OPCODE_OUTPUT		0
 #define NFP_FL_ACTION_OPCODE_PUSH_VLAN		1
 #define NFP_FL_ACTION_OPCODE_POP_VLAN		2
+#define NFP_FL_ACTION_OPCODE_PUSH_MPLS		3
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL	6
 #define NFP_FL_ACTION_OPCODE_SET_ETHERNET	7
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS	9
@@ -232,6 +233,12 @@ struct nfp_fl_push_geneve {
 	u8 opt_data[];
 };
 
+struct nfp_fl_push_mpls {
+	struct nfp_fl_act_head head;
+	__be16 ethtype;
+	__be32 lse;
+};
+
 /* Metadata with L2 (1W/4B)
  * ----------------------------------------------------------------
  *    3                   2                   1
-- 
2.7.4


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox