Netdev List
 help / color / mirror / Atom feed
* INFO: rcu detected stall in kmem_cache_alloc_node_trace
From: syzbot @ 2018-04-30 18:05 UTC (permalink / raw)
  To: davem, linux-kernel, linux-sctp, netdev, nhorman, syzkaller-bugs,
	vyasevich

Hello,

syzbot found the following crash on:

HEAD commit:    17dec0a94915 Merge branch 'userns-linus' of  
git://git.kerne...
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?id=6093051722203136
kernel config:   
https://syzkaller.appspot.com/x/.config?id=-2735707888269579554
dashboard link: https://syzkaller.appspot.com/bug?extid=deec965c578bb9b81613
compiler:       gcc (GCC) 8.0.1 20180301 (experimental)

Unfortunately, I don't have any reproducer for this crash yet.

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+deec965c578bb9b81613@syzkaller.appspotmail.com

sctp: [Deprecated]: syz-executor3 (pid 10218) Use of int in max_burst  
socket option.
Use struct sctp_assoc_value instead
sctp: [Deprecated]: syz-executor3 (pid 10218) Use of int in max_burst  
socket option.
Use struct sctp_assoc_value instead
random: crng init done
INFO: rcu_sched self-detected stall on CPU
	0-....: (120712 ticks this GP) idle=ac6/1/4611686018427387908  
softirq=31693/31693 fqs=31173
	 (t=125001 jiffies g=17039 c=17038 q=303419)
NMI backtrace for cpu 0
CPU: 0 PID: 10218 Comm: syz-executor3 Not tainted 4.16.0+ #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x1b9/0x29f lib/dump_stack.c:53
  nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
  nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
  arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
  trigger_single_cpu_backtrace include/linux/nmi.h:156 [inline]
  rcu_dump_cpu_stacks+0x175/0x1c2 kernel/rcu/tree.c:1376
  print_cpu_stall kernel/rcu/tree.c:1525 [inline]
  check_cpu_stall.isra.61.cold.80+0x36c/0x59a kernel/rcu/tree.c:1593
  __rcu_pending kernel/rcu/tree.c:3356 [inline]
  rcu_pending kernel/rcu/tree.c:3401 [inline]
  rcu_check_callbacks+0x21b/0xad0 kernel/rcu/tree.c:2763
  update_process_times+0x2d/0x70 kernel/time/timer.c:1636
  tick_sched_handle+0xa0/0x180 kernel/time/tick-sched.c:162
  tick_sched_timer+0x42/0x130 kernel/time/tick-sched.c:1170
  __run_hrtimer kernel/time/hrtimer.c:1349 [inline]
  __hrtimer_run_queues+0x3e3/0x10a0 kernel/time/hrtimer.c:1411
  hrtimer_interrupt+0x2f3/0x750 kernel/time/hrtimer.c:1469
  local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1025 [inline]
  smp_apic_timer_interrupt+0x15d/0x710 arch/x86/kernel/apic/apic.c:1050
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783  
[inline]
RIP: 0010:lock_is_held_type+0x18b/0x210 kernel/locking/lockdep.c:3960
RSP: 0018:ffff8801db006400 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff12
RAX: dffffc0000000000 RBX: 0000000000000282 RCX: 0000000000000000
RDX: 1ffffffff1162e55 RSI: ffffffff88b90c60 RDI: 0000000000000282
RBP: ffff8801db006420 R08: ffffed003b6046c3 R09: ffffed003b6046c2
R10: ffffed003b6046c2 R11: ffff8801db023613 R12: ffff8801b2f623c0
R13: 0000000000000000 R14: ffff88009932bb00 R15: 00000000ffffffff
  lock_is_held include/linux/lockdep.h:344 [inline]
  rcu_read_lock_sched_held+0x108/0x120 kernel/rcu/update.c:117
  trace_kmalloc_node include/trace/events/kmem.h:100 [inline]
  kmem_cache_alloc_node_trace+0x34e/0x770 mm/slab.c:3652
  __do_kmalloc_node mm/slab.c:3669 [inline]
  __kmalloc_node_track_caller+0x33/0x70 mm/slab.c:3684
  __kmalloc_reserve.isra.38+0x3a/0xe0 net/core/skbuff.c:137
  __alloc_skb+0x14d/0x780 net/core/skbuff.c:205
  alloc_skb include/linux/skbuff.h:987 [inline]
  sctp_packet_transmit+0x45e/0x3ba0 net/sctp/output.c:585
  sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
  expire_timers kernel/time/timer.c:1363 [inline]
  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
  invoke_softirq kernel/softirq.c:365 [inline]
  irq_exit+0x1d1/0x200 kernel/softirq.c:405
  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
  </IRQ>
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783  
[inline]
RIP: 0010:console_unlock+0xcdf/0x1100 kernel/printk/printk.c:2403
RSP: 0018:ffff8801946eec00 EFLAGS: 00000212 ORIG_RAX: ffffffffffffff12
RAX: 0000000000040000 RBX: 0000000000000200 RCX: ffffc90002ee8000
RDX: 0000000000004461 RSI: ffffffff815f3446 RDI: 0000000000000212
RBP: ffff8801946eed68 R08: ffff8801b2f62c38 R09: 0000000000000006
R10: ffff8801b2f623c0 R11: 0000000000000000 R12: 0000000000000000
R13: ffffffff84b84430 R14: 0000000000000001 R15: dffffc0000000000
  vprintk_emit+0x6ad/0xdd0 kernel/printk/printk.c:1907
  vprintk_default+0x28/0x30 kernel/printk/printk.c:1947
  vprintk_func+0x7a/0xe7 kernel/printk/printk_safe.c:379
  printk+0x9e/0xba kernel/printk/printk.c:1980
  sctp_getsockopt_maxburst net/sctp/socket.c:6265 [inline]
  sctp_getsockopt.cold.34+0x11d/0x14c net/sctp/socket.c:7240
  sock_common_getsockopt+0x9a/0xe0 net/core/sock.c:2998
  __sys_getsockopt+0x1a5/0x370 net/socket.c:1940
  SYSC_getsockopt net/socket.c:1951 [inline]
  SyS_getsockopt+0x34/0x50 net/socket.c:1948
  do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x455279
RSP: 002b:00007f5c0c0f2c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000037
RAX: ffffffffffffffda RBX: 00007f5c0c0f36d4 RCX: 0000000000455279
RDX: 0000000000000014 RSI: 0000000000000084 RDI: 0000000000000014
RBP: 000000000072bea0 R08: 0000000020000240 R09: 0000000000000000
R10: 0000000020000140 R11: 0000000000000246 R12: 00000000ffffffff
R13: 000000000000012b R14: 00000000006f4ca8 R15: 0000000000000000
INFO: rcu_sched detected stalls on CPUs/tasks:
	0-....: (120712 ticks this GP) idle=ac6/1/4611686018427387908  
softirq=31693/31693 fqs=31173
	(detected by 1, t=125002 jiffies, g=17039, c=17038, q=303419)
Sending NMI from CPU 1 to CPUs 0:
NMI backtrace for cpu 0
CPU: 0 PID: 10218 Comm: syz-executor3 Not tainted 4.16.0+ #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:__lock_acquire+0xa78/0x5130 kernel/locking/lockdep.c:3434
RSP: 0018:ffff8801db005b40 EFLAGS: 00000046
RAX: dffffc0000000000 RBX: 00000000858813a6 RCX: 0000000000000001
RDX: 1ffff100365ec585 RSI: ffff8801b2f62c38 RDI: ffff8801b2f62cf9
RBP: ffff8801db005ed0 R08: 0000000000000008 R09: 0000000000000004
R10: ffff8801b2f62cd8 R11: ffff8801b2f623c0 R12: 00000000be6c7baf
R13: 0000000000000000 R14: 2ca977e0cccfd81f R15: 0000000000000000
FS:  00007f5c0c0f3700(0000) GS:ffff8801db000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffe57df8fa8 CR3: 00000001d7124000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  <IRQ>
  lock_acquire+0x1dc/0x520 kernel/locking/lockdep.c:3920
  rcu_lock_acquire include/linux/rcupdate.h:246 [inline]
  rcu_read_lock include/linux/rcupdate.h:632 [inline]
  is_bpf_text_address+0x3b/0x170 kernel/bpf/core.c:478
  kernel_text_address+0x79/0xf0 kernel/extable.c:152
  __kernel_text_address+0xd/0x40 kernel/extable.c:107
  unwind_get_return_address+0x61/0xa0 arch/x86/kernel/unwind_frame.c:18
  __save_stack_trace+0x7e/0xd0 arch/x86/kernel/stacktrace.c:45
  save_stack_trace+0x1a/0x20 arch/x86/kernel/stacktrace.c:60
  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
  set_track mm/kasan/kasan.c:459 [inline]
  __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:520
  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:527
  __cache_free mm/slab.c:3486 [inline]
  kmem_cache_free+0x86/0x2d0 mm/slab.c:3744
  kfree_skbmem+0x13c/0x210 net/core/skbuff.c:582
  __kfree_skb net/core/skbuff.c:642 [inline]
  consume_skb+0x193/0x550 net/core/skbuff.c:701
  sctp_chunk_destroy net/sctp/sm_make_chunk.c:1477 [inline]
  sctp_chunk_put+0x2c0/0x440 net/sctp/sm_make_chunk.c:1504
  sctp_chunk_free+0x53/0x60 net/sctp/sm_make_chunk.c:1491
  sctp_packet_pack net/sctp/output.c:489 [inline]
  sctp_packet_transmit+0x142e/0x3ba0 net/sctp/output.c:610
  sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
  expire_timers kernel/time/timer.c:1363 [inline]
  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
  invoke_softirq kernel/softirq.c:365 [inline]
  irq_exit+0x1d1/0x200 kernel/softirq.c:405
  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
  </IRQ>
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783  
[inline]
RIP: 0010:console_unlock+0xcdf/0x1100 kernel/printk/printk.c:2403
RSP: 0018:ffff8801946eec00 EFLAGS: 00000212 ORIG_RAX: ffffffffffffff12
RAX: 0000000000040000 RBX: 0000000000000200 RCX: ffffc90002ee8000
RDX: 0000000000004461 RSI: ffffffff815f3446 RDI: 0000000000000212
RBP: ffff8801946eed68 R08: ffff8801b2f62c38 R09: 0000000000000006
R10: ffff8801b2f623c0 R11: 0000000000000000 R12: 0000000000000000
R13: ffffffff84b84430 R14: 0000000000000001 R15: dffffc0000000000
  vprintk_emit+0x6ad/0xdd0 kernel/printk/printk.c:1907
  vprintk_default+0x28/0x30 kernel/printk/printk.c:1947
  vprintk_func+0x7a/0xe7 kernel/printk/printk_safe.c:379
  printk+0x9e/0xba kernel/printk/printk.c:1980
  sctp_getsockopt_maxburst net/sctp/socket.c:6265 [inline]
  sctp_getsockopt.cold.34+0x11d/0x14c net/sctp/socket.c:7240
  sock_common_getsockopt+0x9a/0xe0 net/core/sock.c:2998
  __sys_getsockopt+0x1a5/0x370 net/socket.c:1940
  SYSC_getsockopt net/socket.c:1951 [inline]
  SyS_getsockopt+0x34/0x50 net/socket.c:1948
  do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x455279
RSP: 002b:00007f5c0c0f2c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000037
RAX: ffffffffffffffda RBX: 00007f5c0c0f36d4 RCX: 0000000000455279
RDX: 0000000000000014 RSI: 0000000000000084 RDI: 0000000000000014
RBP: 000000000072bea0 R08: 0000000020000240 R09: 0000000000000000
R10: 0000000020000140 R11: 0000000000000246 R12: 00000000ffffffff
R13: 000000000000012b R14: 00000000006f4ca8 R15: 0000000000000000
Code: 0f 85 dc 32 00 00 8b 0d b7 9e 8b 07 85 c9 0f 84 62 f7 ff ff 48 b8 00  
00 00 00 00 fc ff df 48 8b 54 24 78 48 c1 ea 03 80 3c 02 00 <0f> 85 0b 31  
00 00 48 8b 94 24 88 00 00 00 4d 89 b3 68 08 00 00
INFO: NMI handler (nmi_cpu_backtrace_handler) took too long to run: 3.007  
msecs
INFO: task kworker/u4:4:688 blocked for more than 120 seconds.
       Not tainted 4.16.0+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kworker/u4:4    D19560   688      2 0x80000000
Workqueue: events_unbound fsnotify_mark_destroy_workfn
Call Trace:
  context_switch kernel/sched/core.c:2848 [inline]
  __schedule+0x807/0x1e40 kernel/sched/core.c:3490
  schedule+0xef/0x430 kernel/sched/core.c:3549
  schedule_timeout+0x1b5/0x240 kernel/time/timer.c:1777
  do_wait_for_common kernel/sched/completion.c:83 [inline]
  __wait_for_common kernel/sched/completion.c:104 [inline]
  wait_for_common kernel/sched/completion.c:115 [inline]
  wait_for_completion+0x3e7/0x870 kernel/sched/completion.c:136
  __synchronize_srcu+0x189/0x240 kernel/rcu/srcutree.c:924
  synchronize_srcu+0x408/0x54f kernel/rcu/srcutree.c:1002
  fsnotify_mark_destroy_workfn+0x1aa/0x530 fs/notify/mark.c:759
  process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
  worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
  kthread+0x345/0x410 kernel/kthread.c:238
  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:411

Showing all locks held in the system:
2 locks held by kworker/u4:4/688:
  #0: 000000007b6e92d0 ((wq_completion)"events_unbound"){+.+.}, at:  
__write_once_size include/linux/compiler.h:215 [inline]
  #0: 000000007b6e92d0 ((wq_completion)"events_unbound"){+.+.}, at:  
arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline]
  #0: 000000007b6e92d0 ((wq_completion)"events_unbound"){+.+.}, at:  
atomic64_set include/asm-generic/atomic-instrumented.h:40 [inline]
  #0: 000000007b6e92d0 ((wq_completion)"events_unbound"){+.+.}, at:  
atomic_long_set include/asm-generic/atomic-long.h:57 [inline]
  #0: 000000007b6e92d0 ((wq_completion)"events_unbound"){+.+.}, at:  
set_work_data kernel/workqueue.c:617 [inline]
  #0: 000000007b6e92d0 ((wq_completion)"events_unbound"){+.+.}, at:  
set_work_pool_and_clear_pending kernel/workqueue.c:644 [inline]
  #0: 000000007b6e92d0 ((wq_completion)"events_unbound"){+.+.}, at:  
process_one_work+0xaef/0x1b50 kernel/workqueue.c:2116
  #1: 000000004c7e11cf ((reaper_work).work){+.+.}, at:  
process_one_work+0xb46/0x1b50 kernel/workqueue.c:2120
2 locks held by khungtaskd/878:
  #0: 000000001cc267e2 (rcu_read_lock){....}, at:  
check_hung_uninterruptible_tasks kernel/hung_task.c:175 [inline]
  #0: 000000001cc267e2 (rcu_read_lock){....}, at: watchdog+0x1ff/0xf60  
kernel/hung_task.c:249
  #1: 000000002f71223f (tasklist_lock){.+.+}, at:  
debug_show_all_locks+0xde/0x34a kernel/locking/lockdep.c:4470
2 locks held by kworker/1:2/1980:
  #0: 00000000dc6681fd ((wq_completion)"events"){+.+.}, at:  
__write_once_size include/linux/compiler.h:215 [inline]
  #0: 00000000dc6681fd ((wq_completion)"events"){+.+.}, at:  
arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline]
  #0: 00000000dc6681fd ((wq_completion)"events"){+.+.}, at: atomic64_set  
include/asm-generic/atomic-instrumented.h:40 [inline]
  #0: 00000000dc6681fd ((wq_completion)"events"){+.+.}, at: atomic_long_set  
include/asm-generic/atomic-long.h:57 [inline]
  #0: 00000000dc6681fd ((wq_completion)"events"){+.+.}, at: set_work_data  
kernel/workqueue.c:617 [inline]
  #0: 00000000dc6681fd ((wq_completion)"events"){+.+.}, at:  
set_work_pool_and_clear_pending kernel/workqueue.c:644 [inline]
  #0: 00000000dc6681fd ((wq_completion)"events"){+.+.}, at:  
process_one_work+0xaef/0x1b50 kernel/workqueue.c:2116
  #1: 000000008e69c2e7 (xfrm_state_gc_work){+.+.}, at:  
process_one_work+0xb46/0x1b50 kernel/workqueue.c:2120
1 lock held by rsyslogd/4365:
  #0: 00000000ef321630 (&f->f_pos_lock){+.+.}, at: __fdget_pos+0x1a9/0x1e0  
fs/file.c:766
2 locks held by getty/4456:
  #0: 0000000044e43f49 (&tty->ldisc_sem){++++}, at:  
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1: 0000000093b079e0 (&ldata->atomic_read_lock){+.+.}, at:  
n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131
2 locks held by getty/4457:
  #0: 000000005b25b55f (&tty->ldisc_sem){++++}, at:  
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1: 0000000099955ee5 (&ldata->atomic_read_lock){+.+.}, at:  
n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131
2 locks held by getty/4458:
  #0: 0000000050f5738d (&tty->ldisc_sem){++++}, at:  
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1: 00000000cccc0402 (&ldata->atomic_read_lock){+.+.}, at:  
n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131
2 locks held by getty/4459:
  #0: 00000000ddecfb5c (&tty->ldisc_sem){++++}, at:  
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1: 000000003c071f3a (&ldata->atomic_read_lock){+.+.}, at:  
n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131
2 locks held by getty/4460:
  #0: 000000003bec706e (&tty->ldisc_sem){++++}, at:  
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1: 00000000dd28453c (&ldata->atomic_read_lock){+.+.}, at:  
n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131
2 locks held by getty/4461:
  #0: 00000000313fc55a (&tty->ldisc_sem){++++}, at:  
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1: 00000000e9766156 (&ldata->atomic_read_lock){+.+.}, at:  
n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131
2 locks held by getty/4462:
  #0: 000000003212bb13 (&tty->ldisc_sem){++++}, at:  
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1: 0000000065fa2f73 (&ldata->atomic_read_lock){+.+.}, at:  
n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131

=============================================

NMI backtrace for cpu 1
CPU: 1 PID: 878 Comm: khungtaskd Not tainted 4.16.0+ #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x1b9/0x29f lib/dump_stack.c:53
  nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
  nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
  arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
  trigger_all_cpu_backtrace include/linux/nmi.h:138 [inline]
  check_hung_task kernel/hung_task.c:132 [inline]
  check_hung_uninterruptible_tasks kernel/hung_task.c:190 [inline]
  watchdog+0xc10/0xf60 kernel/hung_task.c:249
  kthread+0x345/0x410 kernel/kthread.c:238
  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:411
Sending NMI from CPU 1 to CPUs 0:
NMI backtrace for cpu 0
CPU: 0 PID: 10218 Comm: syz-executor3 Not tainted 4.16.0+ #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:debug_lockdep_rcu_enabled.part.1+0xb/0x60 kernel/rcu/update.c:298
RSP: 0018:ffff8801db0062e8 EFLAGS: 00000202
RAX: dffffc0000000000 RBX: 1ffff1003b600c61 RCX: ffffffff86583d15
RDX: 0000000000000004 RSI: ffffffff86583d65 RDI: ffffffff88e6cc40
RBP: ffff8801db0062f8 R08: ffff8801b2f623c0 R09: ffffed003b6046c2
R10: ffffed003b6046c2 R11: ffff8801db023613 R12: ffff8801c3d740c0
R13: 0000000000000001 R14: 0000000000010000 R15: ffff88019cc0a900
FS:  00007f5c0c0f3700(0000) GS:ffff8801db000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffe57df8fa8 CR3: 00000001d7124000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  <IRQ>
  rcu_read_unlock include/linux/rcupdate.h:684 [inline]
  ip6_mtu+0x36a/0x590 net/ipv6/route.c:2420
  dst_mtu include/net/dst.h:210 [inline]
  ip6_xmit+0xb42/0x23f0 net/ipv6/ip6_output.c:262
  sctp_v6_xmit+0x4a5/0x6b0 net/sctp/ipv6.c:225
  sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:642
  sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
  expire_timers kernel/time/timer.c:1363 [inline]
  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
  invoke_softirq kernel/softirq.c:365 [inline]
  irq_exit+0x1d1/0x200 kernel/softirq.c:405
  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
  </IRQ>
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783  
[inline]
RIP: 0010:console_unlock+0xcdf/0x1100 kernel/printk/printk.c:2403
RSP: 0018:ffff8801946eec00 EFLAGS: 00000212 ORIG_RAX: ffffffffffffff12
RAX: 0000000000040000 RBX: 0000000000000200 RCX: ffffc90002ee8000
RDX: 0000000000004461 RSI: ffffffff815f3446 RDI: 0000000000000212
RBP: ffff8801946eed68 R08: ffff8801b2f62c38 R09: 0000000000000006
R10: ffff8801b2f623c0 R11: 0000000000000000 R12: 0000000000000000
R13: ffffffff84b84430 R14: 0000000000000001 R15: dffffc0000000000
  vprintk_emit+0x6ad/0xdd0 kernel/printk/printk.c:1907
  vprintk_default+0x28/0x30 kernel/printk/printk.c:1947
  vprintk_func+0x7a/0xe7 kernel/printk/printk_safe.c:379
  printk+0x9e/0xba kernel/printk/printk.c:1980
  sctp_getsockopt_maxburst net/sctp/socket.c:6265 [inline]
  sctp_getsockopt.cold.34+0x11d/0x14c net/sctp/socket.c:7240
  sock_common_getsockopt+0x9a/0xe0 net/core/sock.c:2998
  __sys_getsockopt+0x1a5/0x370 net/socket.c:1940
  SYSC_getsockopt net/socket.c:1951 [inline]
  SyS_getsockopt+0x34/0x50 net/socket.c:1948
  do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x455279
RSP: 002b:00007f5c0c0f2c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000037
RAX: ffffffffffffffda RBX: 00007f5c0c0f36d4 RCX: 0000000000455279
RDX: 0000000000000014 RSI: 0000000000000084 RDI: 0000000000000014
RBP: 000000000072bea0 R08: 0000000020000240 R09: 0000000000000000
R10: 0000000020000140 R11: 0000000000000246 R12: 00000000ffffffff
R13: 000000000000012b R14: 00000000006f4ca8 R15: 0000000000000000
Code: e9 c3 fd ff ff 48 8b 7d c8 e8 52 6c 50 00 e9 9c fd ff ff 0f 1f 00 66  
2e 0f 1f 84 00 00 00 00 00 48 b8 00 00 00 00 00 fc ff df 55 <48> 89 e5 53  
65 48 8b 1c 25 c0 ed 01 00 48 8d bb 74 08 00 00 48


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug report.
Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* INFO: rcu detected stall in kfree_skbmem
From: syzbot @ 2018-04-30 18:09 UTC (permalink / raw)
  To: avagin, davem, ktkhai, linux-kernel, netdev, syzkaller-bugs

Hello,

syzbot found the following crash on:

HEAD commit:    5d1365940a68 Merge  
git://git.kernel.org/pub/scm/linux/kerne...
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?id=5667997129637888
kernel config:   
https://syzkaller.appspot.com/x/.config?id=-5947642240294114534
dashboard link: https://syzkaller.appspot.com/bug?extid=fc78715ba3b3257caf6a
compiler:       gcc (GCC) 8.0.1 20180413 (experimental)

Unfortunately, I don't have any reproducer for this crash yet.

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+fc78715ba3b3257caf6a@syzkaller.appspotmail.com

INFO: rcu_sched self-detected stall on CPU
	1-...!: (1 GPs behind) idle=a3e/1/4611686018427387908 softirq=71980/71983  
fqs=33
	 (t=125000 jiffies g=39438 c=39437 q=958)
rcu_sched kthread starved for 124829 jiffies! g39438 c39437 f0x0  
RCU_GP_WAIT_FQS(3) ->state=0x0 ->cpu=0
RCU grace-period kthread stack dump:
rcu_sched       R  running task    23768     9      2 0x80000000
Call Trace:
  context_switch kernel/sched/core.c:2848 [inline]
  __schedule+0x801/0x1e30 kernel/sched/core.c:3490
  schedule+0xef/0x430 kernel/sched/core.c:3549
  schedule_timeout+0x138/0x240 kernel/time/timer.c:1801
  rcu_gp_kthread+0x6b5/0x1940 kernel/rcu/tree.c:2231
  kthread+0x345/0x410 kernel/kthread.c:238
  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:411
NMI backtrace for cpu 1
CPU: 1 PID: 20560 Comm: syz-executor4 Not tainted 4.16.0+ #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
  nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
  nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
  arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
  trigger_single_cpu_backtrace include/linux/nmi.h:156 [inline]
  rcu_dump_cpu_stacks+0x175/0x1c2 kernel/rcu/tree.c:1376
  print_cpu_stall kernel/rcu/tree.c:1525 [inline]
  check_cpu_stall.isra.61.cold.80+0x36c/0x59a kernel/rcu/tree.c:1593
  __rcu_pending kernel/rcu/tree.c:3356 [inline]
  rcu_pending kernel/rcu/tree.c:3401 [inline]
  rcu_check_callbacks+0x21b/0xad0 kernel/rcu/tree.c:2763
  update_process_times+0x2d/0x70 kernel/time/timer.c:1636
  tick_sched_handle+0x9f/0x180 kernel/time/tick-sched.c:173
  tick_sched_timer+0x45/0x130 kernel/time/tick-sched.c:1283
  __run_hrtimer kernel/time/hrtimer.c:1386 [inline]
  __hrtimer_run_queues+0x3e3/0x10a0 kernel/time/hrtimer.c:1448
  hrtimer_interrupt+0x286/0x650 kernel/time/hrtimer.c:1506
  local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1025 [inline]
  smp_apic_timer_interrupt+0x15d/0x710 arch/x86/kernel/apic/apic.c:1050
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783  
[inline]
RIP: 0010:kmem_cache_free+0xb3/0x2d0 mm/slab.c:3757
RSP: 0018:ffff8801db105228 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
RAX: 0000000000000007 RBX: ffff8800b055c940 RCX: 1ffff1003b2345a5
RDX: 0000000000000000 RSI: ffff8801d91a2d80 RDI: 0000000000000282
RBP: ffff8801db105248 R08: ffff8801d91a2cb8 R09: 0000000000000002
R10: ffff8801d91a2480 R11: 0000000000000000 R12: ffff8801d9848e40
R13: 0000000000000282 R14: ffffffff85b7f27c R15: 0000000000000000
  kfree_skbmem+0x13c/0x210 net/core/skbuff.c:582
  __kfree_skb net/core/skbuff.c:642 [inline]
  kfree_skb+0x19d/0x560 net/core/skbuff.c:659
  enqueue_to_backlog+0x2fc/0xc90 net/core/dev.c:3968
  netif_rx_internal+0x14d/0xae0 net/core/dev.c:4181
  netif_rx+0xba/0x400 net/core/dev.c:4206
  loopback_xmit+0x283/0x741 drivers/net/loopback.c:91
  __netdev_start_xmit include/linux/netdevice.h:4087 [inline]
  netdev_start_xmit include/linux/netdevice.h:4096 [inline]
  xmit_one net/core/dev.c:3053 [inline]
  dev_hard_start_xmit+0x264/0xc10 net/core/dev.c:3069
  __dev_queue_xmit+0x2724/0x34c0 net/core/dev.c:3584
  dev_queue_xmit+0x17/0x20 net/core/dev.c:3617
  neigh_hh_output include/net/neighbour.h:472 [inline]
  neigh_output include/net/neighbour.h:480 [inline]
  ip6_finish_output2+0x134e/0x2810 net/ipv6/ip6_output.c:120
  ip6_finish_output+0x5fe/0xbc0 net/ipv6/ip6_output.c:154
  NF_HOOK_COND include/linux/netfilter.h:277 [inline]
  ip6_output+0x227/0x9b0 net/ipv6/ip6_output.c:171
  dst_output include/net/dst.h:444 [inline]
  NF_HOOK include/linux/netfilter.h:288 [inline]
  ip6_xmit+0xf51/0x23f0 net/ipv6/ip6_output.c:277
  sctp_v6_xmit+0x4a5/0x6b0 net/sctp/ipv6.c:225
  sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:650
  sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
  expire_timers kernel/time/timer.c:1363 [inline]
  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
  invoke_softirq kernel/softirq.c:365 [inline]
  irq_exit+0x1d1/0x200 kernel/softirq.c:405
  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
  </IRQ>
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783  
[inline]
RIP: 0010:lock_release+0x4d4/0xa10 kernel/locking/lockdep.c:3942
RSP: 0018:ffff8801971ce7b0 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
RAX: dffffc0000000000 RBX: 1ffff10032e39cfb RCX: 1ffff1003b234595
RDX: 1ffffffff11630ed RSI: 0000000000000002 RDI: 0000000000000282
RBP: ffff8801971ce8e0 R08: 1ffff10032e39cff R09: ffffed003b6246c2
R10: 0000000000000003 R11: 0000000000000001 R12: ffff8801d91a2480
R13: ffffffff88b8df60 R14: ffff8801d91a2480 R15: ffff8801971ce7f8
  rcu_lock_release include/linux/rcupdate.h:251 [inline]
  rcu_read_unlock include/linux/rcupdate.h:688 [inline]
  __unlock_page_memcg+0x72/0x100 mm/memcontrol.c:1654
  unlock_page_memcg+0x2c/0x40 mm/memcontrol.c:1663
  page_remove_file_rmap mm/rmap.c:1248 [inline]
  page_remove_rmap+0x6f2/0x1250 mm/rmap.c:1299
  zap_pte_range mm/memory.c:1337 [inline]
  zap_pmd_range mm/memory.c:1441 [inline]
  zap_pud_range mm/memory.c:1470 [inline]
  zap_p4d_range mm/memory.c:1491 [inline]
  unmap_page_range+0xeb4/0x2200 mm/memory.c:1512
  unmap_single_vma+0x1a0/0x310 mm/memory.c:1557
  unmap_vmas+0x120/0x1f0 mm/memory.c:1587
  exit_mmap+0x265/0x570 mm/mmap.c:3038
  __mmput kernel/fork.c:962 [inline]
  mmput+0x251/0x610 kernel/fork.c:983
  exit_mm kernel/exit.c:544 [inline]
  do_exit+0xe98/0x2730 kernel/exit.c:852
  do_group_exit+0x16f/0x430 kernel/exit.c:968
  get_signal+0x886/0x1960 kernel/signal.c:2469
  do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810
  exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162
  prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
  syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
  do_syscall_64+0x792/0x9d0 arch/x86/entry/common.c:292
  entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x455319
RSP: 002b:00007fa346e81ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
RAX: fffffffffffffe00 RBX: 000000000072bf80 RCX: 0000000000455319
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000072bf80
RBP: 000000000072bf80 R08: 0000000000000000 R09: 000000000072bf58
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000a3e81f R14: 00007fa346e829c0 R15: 0000000000000001


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug  
report.
Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* Re: Request for stable 4.14.x inclusion: net: don't call update_pmtu unconditionally
From: Greg KH @ 2018-04-30 18:22 UTC (permalink / raw)
  To: Eddie Chapman; +Cc: Thomas Deutschmann, stable, davem, nicolas.dichtel, netdev
In-Reply-To: <7485fe73-0c33-e5b4-6c9b-c8c7a359be4a@ehuk.net>

On Fri, Apr 27, 2018 at 07:43:52PM +0100, Eddie Chapman wrote:
> On 27/04/18 19:07, Thomas Deutschmann wrote:
> > Hi Greg,
> > 
> > first, we need to cherry-pick another patch first:
> > >  From 52a589d51f1008f62569bf89e95b26221ee76690 Mon Sep 17 00:00:00 2001
> > > From: Xin Long <lucien.xin@gmail.com>
> > > Date: Mon, 25 Dec 2017 14:43:58 +0800
> > > Subject: [PATCH] geneve: update skb dst pmtu on tx path
> > > 
> > > Commit a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") has fixed
> > > a performance issue caused by the change of lower dev's mtu for vxlan.
> > > 
> > > The same thing needs to be done for geneve as well.
> > > 
> > > Note that geneve cannot adjust its mtu according to lower dev's mtu
> > > when creating it. The performance is very low later when netperfing
> > > over it without fixing the mtu manually. This patch could also avoid
> > > this issue.
> > > 
> > > Signed-off-by: Xin Long <lucien.xin@gmail.com>
> > > Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> Oops, I completely missed that the coreos patch doesn't have the geneve hunk
> that is in the original 4.15 patch. I don't load the geneve module on my box
> hence why no problems surfaced on my machine.

The geneve hunk doesn't apply at all to the 4.14.y tree, so I think
someone has a messed up tree somewhere...

I'll go look into this now.

greg k-h

^ permalink raw reply

* Re: [dm-devel] [PATCH v5] fault-injection: introduce kvmalloc fallback options
From: John Stoffel @ 2018-04-30 18:27 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: John Stoffel, Andrew, eric.dumazet, mst, edumazet, netdev,
	jasowang, Randy Dunlap, linux-kernel, Matthew Wilcox, Hocko,
	James Bottomley, Michal, dm-devel, David Miller, David Rientjes,
	Morton, virtualization, linux-mm, Vlastimil Babka
In-Reply-To: <alpine.LRH.2.02.1804261726540.13401@file01.intranet.prod.int.rdu2.redhat.com>

>>>>> "Mikulas" == Mikulas Patocka <mpatocka@redhat.com> writes:

Mikulas> On Thu, 26 Apr 2018, John Stoffel wrote:

>> >>>>> "James" == James Bottomley <James.Bottomley@HansenPartnership.com> writes:
>> 
James> I may be an atypical developer but I'd rather have a root canal
James> than browse through menuconfig options.  The way to get people
James> to learn about new debugging options is to blog about it (or
James> write an lwn.net article) which google will find the next time
James> I ask it how I debug XXX.  Google (probably as a service to
James> humanity) rarely turns up Kconfig options in response to a
James> query.
>> 
>> I agree with James here.  Looking at the SLAB vs SLUB Kconfig entries
>> tells me *nothing* about why I should pick one or the other, as an
>> example.
>> 
>> John

Mikulas> I see your point - and I think the misunderstanding is this.

Thanks.

Mikulas> This patch is not really helping people to debug existing crashes. It is 
Mikulas> not like "you get a crash" - "you google for some keywords" - "you get a 
Mikulas> page that suggests to turn this option on" - "you turn it on and solve the 
Mikulas> crash".

Mikulas> What this patch really does is that - it makes the kernel deliberately 
Mikulas> crash in a situation when the code violates the specification, but it 
Mikulas> would not crash otherwise or it would crash very rarely. It helps to 
Mikulas> detect specification violations.

Mikulas> If the kernel developer (or tester) doesn't use this option, his buggy 
Mikulas> code won't crash - and if it won't crash, he won't fix the bug or report 
Mikulas> it. How is the user or developer supposed to learn about this option, if 
Mikulas> he gets no crash at all?

So why do we make this a KConfig option at all?  Just turn it on and
let it rip.  Now I also think that Linus has the right idea to not
just sprinkle BUG_ONs into the code, just dump an oops and keep going
if you can.  If it's a filesystem or a device, turn it read only so
that people notice right away.

^ permalink raw reply

* Re: [PATCH net-next v9 1/4] virtio_net: Introduce VIRTIO_NET_F_STANDBY feature bit
From: Samudrala, Sridhar @ 2018-04-30 19:14 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh, aaron.f.brown
In-Reply-To: <20180430070340.GF23854@nanopsycho.orion>


On 4/30/2018 12:03 AM, Jiri Pirko wrote:
> Mon, Apr 30, 2018 at 04:47:03AM CEST, sridhar.samudrala@intel.com wrote:
>> On 4/28/2018 12:50 AM, Jiri Pirko wrote:
>>> Fri, Apr 27, 2018 at 07:06:57PM CEST,sridhar.samudrala@intel.com  wrote:
>>>> This feature bit can be used by hypervisor to indicate virtio_net device to
>>>> act as a standby for another device with the same MAC address.
>>>>
>>>> VIRTIO_NET_F_STANDBY is defined as bit 62 as it is a device feature bit.
>>>>
>>>> Signed-off-by: Sridhar Samudrala<sridhar.samudrala@intel.com>
>>>> ---
>>>> drivers/net/virtio_net.c        | 2 +-
>>>> include/uapi/linux/virtio_net.h | 3 +++
>>>> 2 files changed, 4 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>>> index 3b5991734118..51a085b1a242 100644
>>>> --- a/drivers/net/virtio_net.c
>>>> +++ b/drivers/net/virtio_net.c
>>>> @@ -2999,7 +2999,7 @@ static struct virtio_device_id id_table[] = {
>>>> 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
>>>> 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
>>>> 	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
>>>> -	VIRTIO_NET_F_SPEED_DUPLEX
>>>> +	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY
>>> This is not part of current qemu master (head 6f0c4706b35dead265509115ddbd2a8d1af516c1)
>>> Where can I find the qemu code?
>>>
>>> Also, I think it makes sense to push HW (qemu HW in this case) first
>>> and only then the driver.
>> I had sent qemu patch with a couple of earlier versions of this patchset.
>> Will include it when i send out v10.
> The point was, don't you want to push it to qemu first? Did you at least
> send RFC to qemu?

Yes. Here is the link to the RFC patch.
https://patchwork.ozlabs.org/patch/859521/

^ permalink raw reply

* Re: [RFC net-next 0/5] Support for PHY test modes
From: Florian Fainelli @ 2018-04-30 19:23 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: David Miller, netdev, rmk, linux-kernel, cphealy, nikita.yoush,
	vivien.didelot, Nisar.Sayed, UNGLinuxDriver
In-Reply-To: <20180430164012.GB16854@lunn.ch>

On 04/30/2018 09:40 AM, Andrew Lunn wrote:
>> Turning these tests on will typically result in the link partner
>> dropping the link with us, and the interface will be non-functional as
>> far as the data path is concerned (similar to an isolation mode). This
>> might warrant properly reporting that to user-space through e.g: a
>> private IFF_* value maybe?
> 
> Hi Florian
> 
> I've not looked at the code yet....
> 
> Is it also necessary to kick off auto-neg again after the test has
> finished, in order to reestablish the link?

It would, yes. Right now there is a test mode exposed named "normal"
which really means: bring the PHY back to an operational state. This
state likely does not belong here and we would have to introduce flags
such as start and stop the test instead.

Please review the patches when you get a chance, because I suspect this
is not the only issue they have ;)
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next v9 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-04-30 19:26 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh, aaron.f.brown
In-Reply-To: <20180430071208.GG23854@nanopsycho.orion>

On 4/30/2018 12:12 AM, Jiri Pirko wrote:
> Mon, Apr 30, 2018 at 05:00:33AM CEST, sridhar.samudrala@intel.com wrote:
>> On 4/28/2018 1:24 AM, Jiri Pirko wrote:
>>> Fri, Apr 27, 2018 at 07:06:59PM CEST, sridhar.samudrala@intel.com wrote:
>>>> This patch enables virtio_net to switch over to a VF datapath when a VF
>>>> netdev is present with the same MAC address. It allows live migration
>>>> of a VM with a direct attached VF without the need to setup a bond/team
>>>> between a VF and virtio net device in the guest.
>>>>
>>>> The hypervisor needs to enable only one datapath at any time so that
>>>> packets don't get looped back to the VM over the other datapath. When a VF
>>> Why? Both datapaths could be enabled at a time. Why the loop on
>>> hypervisor side would be a problem. This is not an issue for
>>> bonding/team as well.
>> Somehow the hypervisor needs to make sure that the broadcasts/multicasts from the VM
>> sent over the VF datapath don't get looped back to the VM via the virtio-net datapath.
> Why? Please see below.
>
>
>> This can happen if both datapaths are enabled at the same time.
>>
>> I would think this is an issue even with bonding/team as well when virtio-net and
>> VF are backed by the same PF.
>>
>>
> I believe that the scenario is the same as on an ordinary nic/switch
> network:
>
> ...................
>
>    host
>   
>         bond0
>        /     \
>      eth0   eth1
>       |       |
> ...................
>       |       |
>       p1      p2
>
>    switch
>
> ...................
>
> It is perfectly valid to p1 and p2 be up and "bridged" together. Bond
> has to cope with loop-backed frames. "Failover driver" should too,
> it's the same scenario.

OK. So looks like we should be able to handle this by returning RX_HANDLER_EXACT
for frames received on standby device when primary is present.

^ permalink raw reply

* Re: Representing cpu-port of a switch
From: Florian Fainelli @ 2018-04-30 19:45 UTC (permalink / raw)
  To: sk.syed2; +Cc: netdev, jayaram, syeds, andrew, vivien.didelot
In-Reply-To: <CAE6JOJ909uWjzr1gOic59=cvMfJbGWB28-b4CnjMr-hXHVNP+w@mail.gmail.com>

On 04/30/2018 06:06 AM, sk.syed2 wrote:
>> Again, pretty standard. What you probably meant is that for host destined or initiated traffic you must use that internal port to egress/ingress frames towards the other external ports.
>>
> Yes.
> 
>>
>>> Without tagging, we cant really use DSA, and hide the cpu/dsa port. So
>>> if we expose this cpu port as a interface with fixed-phy
>>> infrastructure does it create any problems?
>>
>> Well the fact that you don't have a tagging protocol does not really mean you cannot use DSA. You could create your own tagging format which in your case could be as simple as keeping the prepended DMA packet descriptor and have the parsing of that descriptor be done in a DSA tagger driver.
> This would need HW change wont it?
> 
>  Not that I would necessarily recommend that though, see below.
>>
> 
>>  DSA documentation says one
>>> cannot open a socket on cpu/dsa port and send/receive traffic. Is it
>>> fairly common to use internal/cpu port as a network interface- i.e,
>>> creating a socket and send/receive traffic?
>>
>> In the context of DSA, all external ports are represented by a network device. This means that the CPU/management port is only used to ingress/egress frames that include the tag which either the switch hardware inserts on its way to the host or conversely that the host must insert to have the switch do the appropriate switching operation. If you do not use tags and you still have a way to target specific external ports the same representation should happen and you do not want to expose the internal port because it will only be used to send/receive traffic from the external ports and it will not be used to send or receive traffic to itself (so to speak).
> Just like with DSA you should have the ability to create network
> devices that are per-port and you can use the HW provided information
> to deliver packets to the appropriate destination port, conversely
> send from the appropriate source port.
>>
>>
> The switch HW doesn't have any provision to target specific external
> port. I think I didnt make this clear. We would like to integrate a
> switch along with an endpoint into a single HW. Meaning we would like
> to use internal cpu port as any normal network port to send and
> receive traffic. The external network ports will not be used to
> send/receive normal traffic(except for control/mgmt frames). So, the
> two external front panel ports tied to phys are lets say eth1, eth2.

The only problem with that approach is that eth1 and eth2 are only
control interfaces, they cannot transfer any data, eth0 does that, see
below.

> The cpu port tied to DMAs is represented using fixed-link/always_on
> interface say ep0(endpoint). Now applications can open a socket and
> send/receive traffic from ep0. The switch does forwarding based on
> programmed CAM. Do you see any problem with this approach?

No, this is entirely similar to what we do in DSA with DSA_PROTO_TAG_NONE.

In premise, if you created a VLAN identifier for each of your front
panel port, you could emulate in SW what you do not have in HW which is
to target specific front panel ports.

> 
> 
> 
>>> One problem is how to report back when network errors(like if both
>>> front panel ports are disconnected, the expectation is to bring this
>>> cpu port down?).
>>
>> The CPU port should be considered always UP and the external ports must have proper link notifications in place through PHYLIB/PHYLINK preferably. With link management in place the carrier state is properly managed and the network stack won't send traffic from a port that is not UP.
>>
>>> We also need to offload all the switch configuration to switch-dev. So
>>> the question is using switch-dev without DSA and representing a cpu
>>> port as a normal network interface would be ok?
>>
>> Using switchdev without DSA is absolutely okay, see rocker and mlxsw, but neither of those do represent their CPU/management port for the reasons outlined above that it is only used to convey traffic to/from other ports that have a proper network device representation.
>>
> May be this is something elementary, but what do you mean by proper
> network device representation that cpu port lacks?

CPU ports in these cases are not created as a separate net_device
instance, they exist in the HW and at the SW level because we need to
use them to direct traffic to/from specific front-panel port using a
specific DMA capability (e.g: a special set of bits in a DMA
descriptor). In your case, you don't have that ability in your HW, so
you must actually create a CPU net_device for applications to be able to
send traffic.
-- 
Florian

^ permalink raw reply

* [PATCH net-next] udp: disable gso with no_check_tx
From: Willem de Bruijn @ 2018-04-30 19:58 UTC (permalink / raw)
  To: netdev; +Cc: davem, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Syzbot managed to send a udp gso packet without checksum offload into
the gso stack by disabling tx checksum (UDP_NO_CHECK6_TX). This
triggered the skb_warn_bad_offload.

  RIP: 0010:skb_warn_bad_offload+0x2bc/0x600 net/core/dev.c:2658
   skb_gso_segment include/linux/netdevice.h:4038 [inline]
   validate_xmit_skb+0x54d/0xd90 net/core/dev.c:3120
   __dev_queue_xmit+0xbf8/0x34c0 net/core/dev.c:3577
   dev_queue_xmit+0x17/0x20 net/core/dev.c:3618

UDP_NO_CHECK6_TX sets skb->ip_summed to CHECKSUM_NONE just after the
udp gso integrity checks in udp_(v6_)send_skb. Extend those checks to
catch and fail in this case.

After the integrity checks jump directly to the CHECKSUM_PARTIAL case
to avoid reading the no_check_tx flags again (a TOCTTOU race).

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/ipv4/udp.c | 4 ++++
 net/ipv6/udp.c | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 794aeafeb782..dd3102a37ef9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -786,11 +786,14 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 			return -EINVAL;
 		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
 			return -EINVAL;
+		if (sk->sk_no_check_tx)
+			return -EINVAL;
 		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		goto csum_partial;
 	}
 
 	if (is_udplite)  				 /*     UDP-Lite      */
@@ -802,6 +805,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 		goto send;
 
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
 
 		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
 		goto send;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6acfdd3e442b..a34e28ac03a7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1051,11 +1051,14 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 			return -EINVAL;
 		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
 			return -EINVAL;
+		if (udp_sk(sk)->no_check6_tx)
+			return -EINVAL;
 		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		goto csum_partial;
 	}
 
 	if (is_udplite)
@@ -1064,6 +1067,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 		skb->ip_summed = CHECKSUM_NONE;
 		goto send;
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
 		udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
 		goto send;
 	} else
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* Re: [PATCH net-next v2 4/5] ipv6: sr: Add seg6local action End.BPF
From: Mathieu Xhonneux @ 2018-04-30 20:06 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: David Miller, netdev, David Lebrun, Daniel Borkmann
In-Reply-To: <20180428000141.f63iw36nfr3bbz2r@ast-mbp>

2018-04-28 2:01 GMT+02:00 Alexei Starovoitov <alexei.starovoitov@gmail.com>:
>
> On Fri, Apr 27, 2018 at 10:59:19AM -0400, David Miller wrote:
> > From: Mathieu Xhonneux <m.xhonneux@gmail.com>
> > Date: Tue, 24 Apr 2018 18:44:15 +0100
> >
> > > This patch adds the End.BPF action to the LWT seg6local infrastructure.
> > > This action works like any other seg6local End action, meaning that an IPv6
> > > header with SRH is needed, whose DA has to be equal to the SID of the
> > > action. It will also advance the SRH to the next segment, the BPF program
> > > does not have to take care of this.
> >
> > I'd like to see some BPF developers review this change.
> >
> > But on my side I wonder if, instead of validating the whole thing afterwards,
> > we should make the helpers accessible by the eBPF program validate the changes
> > as they are made.
>
> Looking at the code I don't think it's possible to keep it valid all the time
> while building, so seg6_validate_srh() after the program run seems necessary.


Indeed, e.g. to add a TLV in the SRH one needs to call
bpf_lwt_seg6_adjust_srh (to add some room for the TLV), then
bpf_lwt_seg6_store_bytes() (to fill the space with the TLV). Between
those two calls, the SRH is in an invalid state.

>
>
> I think the whole set should be targeting bpf-next tree.
> Please fix kbuild errors, rebase and document new helper in man-page style.
> Things like:
> +       test_btf_haskv.o test_btf_nokv.o test_lwt_seg6local.o
> +>>>>>>> selftests/bpf: test for seg6local End.BPF action
> should be fixed properly.


Oops, I didn't catch this one, thanks. I'll send a v3 towards bpf-next.

^ permalink raw reply

* Re: [PATCH net-next v6 0/3] kernel: add support to collect hardware logs in crash recovery kernel
From: Eric W. Biederman @ 2018-04-30 20:09 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: netdev, kexec, linux-fsdevel, linux-kernel, davem, viro, stephen,
	akpm, torvalds, ganeshgr, nirranjan, indranil
In-Reply-To: <cover.1525082669.git.rahul.lakkireddy@chelsio.com>

Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:

> v6:
> - Reworked device dump elf note name to contain vendor identifier.
> - Added vmcoredd_header that precedes actual dump in the Elf Note.
> - Device dump's name is moved inside vmcoredd_header.
> - Added "CHELSIO" string as vendor identifier in the Elf Note name
>   for cxgb4 device dumps.

Yep you did and that is not correct.  My apologies if I was unclear.

An elf note looks like this:

/* Note header in a PT_NOTE section */
typedef struct elf32_note {
  Elf32_Word	n_namesz;	/* Name size */
  Elf32_Word	n_descsz;	/* Content size */
  Elf32_Word	n_type;		/* Content type */
} Elf32_Nhdr;

n_descsz is the length of the body.

n_namesz is the length of the ``vendor'' but not the vendor of your
driver.  It is the ``vendor'' that defines n_type.  So "LINUX" in our
case.

The pair "LINUX", NT_VMCOREDD go together.  That pair is what a
subsequent program must look at to decide how to understand the note.

Please don't use CRASH_CORE_NOTE_HEAD_BYTES that obscures things
unnecessarily, and really is not applicable to this use case.
ALIGN(sizeof(struct elf_note), 4) is almost as short and it makes
it clear what you are talking about.


Also please don't look too closely at the other note stuff for Elf core
dumps.  That stuff was not well done from an Elf standards and ABI
perspective.  Unfortunately by the time I had noticed it was years later
so not something that is worth the breakage in tools to change now.

Looking at struct vmcoredd_header.

That seems a reasonable structure.  However it is a uapi structure so
we need to carefully define it that way.

Perhaps:

#define VMCOREDD_MAX_NAME_BYTES  32

struct vmcoredd_header {
	__u32 header_len;	/* Length of this header */
        __u32 reserved;
        __u64 data_len;		/* Length of this device dump */
        __u8 dump_name[VMCOREDD_MAX_NAME_BYTES];
        __u8 reserved2[16];
};


Looking at that I see another significant issue.  We can't let the
device dump be more than 32bits long.  The length of an elf
note is defined by an elf word which is universally a __u32.

So perhaps vmcoredd_header should look like:

#define VMCOREDD_MAX_NAME_BYTES  44

struct vmcoredd_header {
	__u32	n_namesz;	/* Name size */
        __u32	n_descsz;	/* Content size */
        __u32	n_type;		/* NT_VMCOREDD */
	__u8	name[8];	/* LINUX\0\0\0 */
        __u8	dump_name[VMCOREDD_MAX_NAME_BYTES];
};

The total length of the data dump would be descsz - sizeof(struct
vmcoredd_header).  The header winds up being a constant 64 bytes this
way, and includes the elf note header.

Eric

^ permalink raw reply

* Good News
From: Mrs Julie Leach @ 2018-04-30 20:27 UTC (permalink / raw)


You are a recipient to Mrs Julie Leach Donation of $2 million USD. Contact 
(julieleach104@gmail.com) for claims.

^ permalink raw reply

* [PATCH 0/6] Fix XSA-155-like bugs in frontend drivers
From: Marek Marczykowski-Górecki @ 2018-04-30 21:01 UTC (permalink / raw)
  To: xen-devel
  Cc: Marek Marczykowski-Górecki, Roger Pau Monné,
	Boris Ostrovsky, Greg Kroah-Hartman, Jens Axboe, Juergen Gross,
	Konrad Rzeszutek Wilk, Stefano Stabellini, open list:BLOCK LAYER,
	open list, open list:NETWORKING DRIVERS, stable

Patches in original Xen Security Advisory 155 cared only about backend drivers
while leaving frontend patches to be "developed and released (publicly) after
the embargo date". This is said series.

Marek Marczykowski-Górecki (6):
  xen: Add RING_COPY_RESPONSE()
  xen-netfront: copy response out of shared buffer before accessing it
  xen-netfront: do not use data already exposed to backend
  xen-netfront: add range check for Tx response id
  xen-blkfront: make local copy of response before using it
  xen-blkfront: prepare request locally, only then put it on the shared ring

 drivers/block/xen-blkfront.c    | 110 ++++++++++++++++++---------------
 drivers/net/xen-netfront.c      |  61 +++++++++---------
 include/xen/interface/io/ring.h |  14 ++++-
 3 files changed, 106 insertions(+), 79 deletions(-)

base-commit: 6d08b06e67cd117f6992c46611dfb4ce267cd71e
-- 
git-series 0.9.1

^ permalink raw reply

* [PATCH 2/6] xen-netfront: copy response out of shared buffer before accessing it
From: Marek Marczykowski-Górecki @ 2018-04-30 21:01 UTC (permalink / raw)
  To: xen-devel
  Cc: Juergen Gross, open list:NETWORKING DRIVERS,
	Marek Marczykowski-Górecki, stable, open list,
	Boris Ostrovsky
In-Reply-To: <cover.7ee732ab822b728ec486a3118ec12e9c06f0f325.1525122026.git-series.marmarek@invisiblethingslab.com>

Make local copy of the response, otherwise backend might modify it while
frontend is already processing it - leading to time of check / time of
use issue.

This is complementary to XSA155.

Cc: stable@vger.kernel.org
Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
---
 drivers/net/xen-netfront.c | 51 +++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 4dd0668..dc99763 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -387,13 +387,13 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue)
 		rmb(); /* Ensure we see responses up to 'rp'. */
 
 		for (cons = queue->tx.rsp_cons; cons != prod; cons++) {
-			struct xen_netif_tx_response *txrsp;
+			struct xen_netif_tx_response txrsp;
 
-			txrsp = RING_GET_RESPONSE(&queue->tx, cons);
-			if (txrsp->status == XEN_NETIF_RSP_NULL)
+			RING_COPY_RESPONSE(&queue->tx, cons, &txrsp);
+			if (txrsp.status == XEN_NETIF_RSP_NULL)
 				continue;
 
-			id  = txrsp->id;
+			id  = txrsp.id;
 			skb = queue->tx_skbs[id].skb;
 			if (unlikely(gnttab_query_foreign_access(
 				queue->grant_tx_ref[id]) != 0)) {
@@ -741,7 +741,7 @@ static int xennet_get_extras(struct netfront_queue *queue,
 			     RING_IDX rp)
 
 {
-	struct xen_netif_extra_info *extra;
+	struct xen_netif_extra_info extra;
 	struct device *dev = &queue->info->netdev->dev;
 	RING_IDX cons = queue->rx.rsp_cons;
 	int err = 0;
@@ -757,24 +757,23 @@ static int xennet_get_extras(struct netfront_queue *queue,
 			break;
 		}
 
-		extra = (struct xen_netif_extra_info *)
-			RING_GET_RESPONSE(&queue->rx, ++cons);
+		RING_COPY_RESPONSE(&queue->rx, ++cons, &extra);
 
-		if (unlikely(!extra->type ||
-			     extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+		if (unlikely(!extra.type ||
+			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
 			if (net_ratelimit())
 				dev_warn(dev, "Invalid extra type: %d\n",
-					extra->type);
+					extra.type);
 			err = -EINVAL;
 		} else {
-			memcpy(&extras[extra->type - 1], extra,
-			       sizeof(*extra));
+			memcpy(&extras[extra.type - 1], &extra,
+			       sizeof(extra));
 		}
 
 		skb = xennet_get_rx_skb(queue, cons);
 		ref = xennet_get_rx_ref(queue, cons);
 		xennet_move_rx_slot(queue, skb, ref);
-	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
+	} while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
 
 	queue->rx.rsp_cons = cons;
 	return err;
@@ -784,28 +783,28 @@ static int xennet_get_responses(struct netfront_queue *queue,
 				struct netfront_rx_info *rinfo, RING_IDX rp,
 				struct sk_buff_head *list)
 {
-	struct xen_netif_rx_response *rx = &rinfo->rx;
+	struct xen_netif_rx_response rx = rinfo->rx;
 	struct xen_netif_extra_info *extras = rinfo->extras;
 	struct device *dev = &queue->info->netdev->dev;
 	RING_IDX cons = queue->rx.rsp_cons;
 	struct sk_buff *skb = xennet_get_rx_skb(queue, cons);
 	grant_ref_t ref = xennet_get_rx_ref(queue, cons);
-	int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
+	int max = MAX_SKB_FRAGS + (rx.status <= RX_COPY_THRESHOLD);
 	int slots = 1;
 	int err = 0;
 	unsigned long ret;
 
-	if (rx->flags & XEN_NETRXF_extra_info) {
+	if (rx.flags & XEN_NETRXF_extra_info) {
 		err = xennet_get_extras(queue, extras, rp);
 		cons = queue->rx.rsp_cons;
 	}
 
 	for (;;) {
-		if (unlikely(rx->status < 0 ||
-			     rx->offset + rx->status > XEN_PAGE_SIZE)) {
+		if (unlikely(rx.status < 0 ||
+			     rx.offset + rx.status > XEN_PAGE_SIZE)) {
 			if (net_ratelimit())
 				dev_warn(dev, "rx->offset: %u, size: %d\n",
-					 rx->offset, rx->status);
+					 rx.offset, rx.status);
 			xennet_move_rx_slot(queue, skb, ref);
 			err = -EINVAL;
 			goto next;
@@ -819,7 +818,7 @@ static int xennet_get_responses(struct netfront_queue *queue,
 		if (ref == GRANT_INVALID_REF) {
 			if (net_ratelimit())
 				dev_warn(dev, "Bad rx response id %d.\n",
-					 rx->id);
+					 rx.id);
 			err = -EINVAL;
 			goto next;
 		}
@@ -832,7 +831,7 @@ static int xennet_get_responses(struct netfront_queue *queue,
 		__skb_queue_tail(list, skb);
 
 next:
-		if (!(rx->flags & XEN_NETRXF_more_data))
+		if (!(rx.flags & XEN_NETRXF_more_data))
 			break;
 
 		if (cons + slots == rp) {
@@ -842,7 +841,7 @@ static int xennet_get_responses(struct netfront_queue *queue,
 			break;
 		}
 
-		rx = RING_GET_RESPONSE(&queue->rx, cons + slots);
+		RING_COPY_RESPONSE(&queue->rx, cons + slots, &rx);
 		skb = xennet_get_rx_skb(queue, cons + slots);
 		ref = xennet_get_rx_ref(queue, cons + slots);
 		slots++;
@@ -898,9 +897,9 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
 	struct sk_buff *nskb;
 
 	while ((nskb = __skb_dequeue(list))) {
-		struct xen_netif_rx_response *rx =
-			RING_GET_RESPONSE(&queue->rx, ++cons);
+		struct xen_netif_rx_response rx;
 		skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0];
+		RING_COPY_RESPONSE(&queue->rx, ++cons, &rx);
 
 		if (shinfo->nr_frags == MAX_SKB_FRAGS) {
 			unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
@@ -911,7 +910,7 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
 		BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS);
 
 		skb_add_rx_frag(skb, shinfo->nr_frags, skb_frag_page(nfrag),
-				rx->offset, rx->status, PAGE_SIZE);
+				rx.offset, rx.status, PAGE_SIZE);
 
 		skb_shinfo(nskb)->nr_frags = 0;
 		kfree_skb(nskb);
@@ -1007,7 +1006,7 @@ static int xennet_poll(struct napi_struct *napi, int budget)
 	i = queue->rx.rsp_cons;
 	work_done = 0;
 	while ((i != rp) && (work_done < budget)) {
-		memcpy(rx, RING_GET_RESPONSE(&queue->rx, i), sizeof(*rx));
+		RING_COPY_RESPONSE(&queue->rx, i, rx);
 		memset(extras, 0, sizeof(rinfo.extras));
 
 		err = xennet_get_responses(queue, &rinfo, rp, &tmpq);
-- 
git-series 0.9.1

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply related

* [PATCH 3/6] xen-netfront: do not use data already exposed to backend
From: Marek Marczykowski-Górecki @ 2018-04-30 21:01 UTC (permalink / raw)
  To: xen-devel
  Cc: Juergen Gross, open list:NETWORKING DRIVERS,
	Marek Marczykowski-Górecki, stable, open list,
	Boris Ostrovsky
In-Reply-To: <cover.7ee732ab822b728ec486a3118ec12e9c06f0f325.1525122026.git-series.marmarek@invisiblethingslab.com>

Backend may freely modify anything on shared page, so use data which was
supposed to be written there, instead of reading it back from the shared
page.

This is complementary to XSA155.

CC: stable@vger.kernel.org
Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
---
 drivers/net/xen-netfront.c |  9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index dc99763..934b8a4 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -458,7 +458,7 @@ static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset,
 	tx->flags = 0;
 
 	info->tx = tx;
-	info->size += tx->size;
+	info->size += len;
 }
 
 static struct xen_netif_tx_request *xennet_make_first_txreq(
@@ -574,7 +574,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	int slots;
 	struct page *page;
 	unsigned int offset;
-	unsigned int len;
+	unsigned int len, this_len;
 	unsigned long flags;
 	struct netfront_queue *queue = NULL;
 	unsigned int num_queues = dev->real_num_tx_queues;
@@ -634,14 +634,15 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	/* First request for the linear area. */
+	this_len = min_t(unsigned int, XEN_PAGE_SIZE - offset, len);
 	first_tx = tx = xennet_make_first_txreq(queue, skb,
 						page, offset, len);
-	offset += tx->size;
+	offset += this_len;
 	if (offset == PAGE_SIZE) {
 		page++;
 		offset = 0;
 	}
-	len -= tx->size;
+	len -= this_len;
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		/* local packet? */
-- 
git-series 0.9.1

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply related

* [PATCH 4/6] xen-netfront: add range check for Tx response id
From: Marek Marczykowski-Górecki @ 2018-04-30 21:01 UTC (permalink / raw)
  To: xen-devel
  Cc: Marek Marczykowski-Górecki, stable, Boris Ostrovsky,
	Juergen Gross, open list:NETWORKING DRIVERS, open list
In-Reply-To: <cover.7ee732ab822b728ec486a3118ec12e9c06f0f325.1525122026.git-series.marmarek@invisiblethingslab.com>

Tx response ID is fetched from shared page, so make sure it is sane
before using it as an array index.

CC: stable@vger.kernel.org
Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
---
 drivers/net/xen-netfront.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 934b8a4..55c9b25 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -394,6 +394,7 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue)
 				continue;
 
 			id  = txrsp.id;
+			BUG_ON(id >= NET_TX_RING_SIZE);
 			skb = queue->tx_skbs[id].skb;
 			if (unlikely(gnttab_query_foreign_access(
 				queue->grant_tx_ref[id]) != 0)) {
-- 
git-series 0.9.1

^ permalink raw reply related

* Re: [dm-devel] [PATCH v5] fault-injection: introduce kvmalloc fallback options
From: Mikulas Patocka @ 2018-04-30 21:07 UTC (permalink / raw)
  To: John Stoffel
  Cc: Andrew, eric.dumazet, mst, edumazet, netdev, jasowang,
	Randy Dunlap, linux-kernel, Matthew Wilcox, Hocko,
	James Bottomley, Michal, dm-devel, David Miller, David Rientjes,
	Morton, virtualization, linux-mm, Vlastimil Babka
In-Reply-To: <23271.24580.695738.853532@quad.stoffel.home>



On Mon, 30 Apr 2018, John Stoffel wrote:

> >>>>> "Mikulas" == Mikulas Patocka <mpatocka@redhat.com> writes:
> 
> Mikulas> On Thu, 26 Apr 2018, John Stoffel wrote:
> 
> Mikulas> I see your point - and I think the misunderstanding is this.
> 
> Thanks.
> 
> Mikulas> This patch is not really helping people to debug existing crashes. It is 
> Mikulas> not like "you get a crash" - "you google for some keywords" - "you get a 
> Mikulas> page that suggests to turn this option on" - "you turn it on and solve the 
> Mikulas> crash".
> 
> Mikulas> What this patch really does is that - it makes the kernel deliberately 
> Mikulas> crash in a situation when the code violates the specification, but it 
> Mikulas> would not crash otherwise or it would crash very rarely. It helps to 
> Mikulas> detect specification violations.
> 
> Mikulas> If the kernel developer (or tester) doesn't use this option, his buggy 
> Mikulas> code won't crash - and if it won't crash, he won't fix the bug or report 
> Mikulas> it. How is the user or developer supposed to learn about this option, if 
> Mikulas> he gets no crash at all?
> 
> So why do we make this a KConfig option at all?

Because other people see the KConfig option (so, they may enable it) and 
they don't see the kernel parameter (so, they won't enable it).

Close your eyes and say how many kernel parameters do you remember :-)

> Just turn it on and let it rip.

I can't test if all the networking drivers use kvmalloc properly, because 
I don't have the hardware. You can't test it either. No one has all the 
hardware that is supported by Linux.

Driver issues can only be tested by a mass of users. And if the users 
don't know about the debugging option, they won't enable it.

> >> I agree with James here.  Looking at the SLAB vs SLUB Kconfig entries
> >> tells me *nothing* about why I should pick one or the other, as an
> >> example.

BTW. You can enable slub debugging either with CONFIG_SLUB_DEBUG_ON or 
with the kernel parameter "slub_debug" - and most users who compile their 
own kernel use CONFIG_SLUB_DEBUG_ON - just because it is visible.

> Now I also think that Linus has the right idea to not just sprinkle 
> BUG_ONs into the code, just dump and oops and keep going if you can.  
> If it's a filesystem or a device, turn it read only so that people 
> notice right away.

This vmalloc fallback is similar to CONFIG_DEBUG_KOBJECT_RELEASE. 
CONFIG_DEBUG_KOBJECT_RELEASE changes the behavior of kobject_put in order 
to cause deliberate crashes (that wouldn't happen otherwise) in drivers 
that misuse kobject_put. In the same sense, we want to cause deliberate 
crashes (that wouldn't happen otherwise) in drivers that misuse kvmalloc.

The crashes will only happen in debugging kernels, not in production 
kernels.

Mikulas

^ permalink raw reply

* Proposal
From: Miss Zeliha Omer Faruk @ 2018-04-30 21:13 UTC (permalink / raw)





Hello

   Greetings to you today i asked before but i did't get a response please
i know this might come to you as a surprise because you do not know me
personally i have a business proposal for our mutual benefit please let
me know if you are interested.



Best Regards,

Esentepe Mahallesi Büyükdere
Caddesi Kristal Kule Binasi
No:215
 Sisli - Istanbul, Turkey

^ permalink raw reply

* [PATCH] ipv6: Allow non-gateway ECMP for IPv6
From: Thomas Winter @ 2018-04-30 21:15 UTC (permalink / raw)
  To: netdev
  Cc: Thomas Winter, David Ahern, David S. Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI

It is valid to have static routes where the nexthop
is an interface not an address such as tunnels.
For IPv4 it was possible to use ECMP on these routes
but not for IPv6.

Signed-off-by: Thomas Winter <Thomas.Winter@alliedtelesis.co.nz>
Cc: David Ahern <dsahern@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
---
 include/net/ip6_route.h | 3 +--
 net/ipv6/ip6_fib.c      | 3 ---
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 08b132381984..abceb5864d99 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,8 +68,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
 {
-	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-	       RTF_GATEWAY;
+	return (rt->rt6i_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0;
 }
 
 void ip6_route_input(struct sk_buff *skb);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index deab2db6692e..3c97c29d4401 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -934,9 +934,6 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			 * list.
 			 * Only static routes (which don't have flag
 			 * RTF_EXPIRES) are used for ECMPv6.
-			 *
-			 * To avoid long list, we only had siblings if the
-			 * route have a gateway.
 			 */
 			if (rt_can_ecmp &&
 			    rt6_qualify_for_ecmp(iter))
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next] net: core: Inline netdev_features_size_check()
From: Florian Fainelli @ 2018-04-30 21:20 UTC (permalink / raw)
  To: netdev; +Cc: davem, stephen, Florian Fainelli

We do not require this inline function to be used in multiple different
locations, just inline it where it gets used in register_netdevice().

Suggested-by: David Miller <davem@davemloft.net>
Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 include/linux/netdevice.h | 6 ------
 net/core/dev.c            | 3 ++-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9e09dd897b74..82f5a9aba578 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4108,12 +4108,6 @@ const char *netdev_drivername(const struct net_device *dev);
 
 void linkwatch_run_queue(void);
 
-static inline void netdev_features_size_check(void)
-{
-	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
-		     NETDEV_FEATURE_COUNT);
-}
-
 static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
 							  netdev_features_t f2)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index e01c21a88cae..3263c14c607f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7879,7 +7879,8 @@ int register_netdevice(struct net_device *dev)
 	int ret;
 	struct net *net = dev_net(dev);
 
-	netdev_features_size_check();
+	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+		     NETDEV_FEATURE_COUNT);
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
 
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH net-next v6] Add Common Applications Kept Enhanced (cake) qdisc
From: Cong Wang @ 2018-04-30 21:21 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Linux Kernel Network Developers, cake, Dave Taht
In-Reply-To: <20180429213439.7389-1-toke@toke.dk>

On Sun, Apr 29, 2018 at 2:34 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> sch_cake targets the home router use case and is intended to squeeze the
> most bandwidth and latency out of even the slowest ISP links and routers,
> while presenting an API simple enough that even an ISP can configure it.
>
> Example of use on a cable ISP uplink:
>
> tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter
>
> To shape a cable download link (ifb and tc-mirred setup elided)
>
> tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash
>
> CAKE is filled with:
>
> * A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel
>   derived Flow Queuing system, which autoconfigures based on the bandwidth.
> * A novel "triple-isolate" mode (the default) which balances per-host
>   and per-flow FQ even through NAT.
> * An deficit based shaper, that can also be used in an unlimited mode.
> * 8 way set associative hashing to reduce flow collisions to a minimum.
> * A reasonable interpretation of various diffserv latency/loss tradeoffs.
> * Support for zeroing diffserv markings for entering and exiting traffic.
> * Support for interacting well with Docsis 3.0 shaper framing.
> * Extensive support for DSL framing types.
> * Support for ack filtering.

Why this TCP ACK filtering has to be built into CAKE qdisc rather than
an independent TC filter? Why other qdisc's can't use it?


> * Extensive statistics for measuring, loss, ecn markings, latency
>   variation.
>
> A paper describing the design of CAKE is available at
> https://arxiv.org/abs/1804.07617
>

Thanks.

^ permalink raw reply

* Re: [PATCH net-next v6] Add Common Applications Kept Enhanced (cake) qdisc
From: Dave Taht @ 2018-04-30 21:27 UTC (permalink / raw)
  To: Cong Wang
  Cc: Toke Høiland-Jørgensen, Linux Kernel Network Developers,
	Cake List
In-Reply-To: <CAM_iQpWVrhoo+PjguBLYqwyGyX98Cv3-M=G+CbHrwBom13fwZQ@mail.gmail.com>

On Mon, Apr 30, 2018 at 2:21 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Sun, Apr 29, 2018 at 2:34 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> sch_cake targets the home router use case and is intended to squeeze the
>> most bandwidth and latency out of even the slowest ISP links and routers,
>> while presenting an API simple enough that even an ISP can configure it.
>>
>> Example of use on a cable ISP uplink:
>>
>> tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter
>>
>> To shape a cable download link (ifb and tc-mirred setup elided)
>>
>> tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash
>>
>> CAKE is filled with:
>>
>> * A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel
>>   derived Flow Queuing system, which autoconfigures based on the bandwidth.
>> * A novel "triple-isolate" mode (the default) which balances per-host
>>   and per-flow FQ even through NAT.
>> * An deficit based shaper, that can also be used in an unlimited mode.
>> * 8 way set associative hashing to reduce flow collisions to a minimum.
>> * A reasonable interpretation of various diffserv latency/loss tradeoffs.
>> * Support for zeroing diffserv markings for entering and exiting traffic.
>> * Support for interacting well with Docsis 3.0 shaper framing.
>> * Extensive support for DSL framing types.
>> * Support for ack filtering.
>
> Why this TCP ACK filtering has to be built into CAKE qdisc rather than
> an independent TC filter? Why other qdisc's can't use it?

I actually have a tc - bpf based ack filter, during the development of
cake's ack-thinner, that I should submit one of these days. It
proved to be of limited use.

Probably the biggest mistake we made is by calling this cake feature a
filter. It isn't.

Maybe we should have called it a "thinner" or something like that? In
order to properly "thin" or "reduce" an ack stream
you have to have a queue to look at and some related state. TC filters
do not operate on queues, qdiscs do. Thus the "ack-filter" here is
deeply embedded into cake's flow isolation and queue structures.

>
>
>> * Extensive statistics for measuring, loss, ecn markings, latency
>>   variation.
>>
>> A paper describing the design of CAKE is available at
>> https://arxiv.org/abs/1804.07617
>>
>
> Thanks.



-- 

Dave Täht
CEO, TekLibre, LLC
http://www.teklibre.com
Tel: 1-669-226-2619

^ permalink raw reply

* [PATCH net-next v3 0/2] openvswitch: Support conntrack zone limit
From: Yi-Hung Wei @ 2018-04-30 21:28 UTC (permalink / raw)
  To: netdev, pshelar; +Cc: Yi-Hung Wei

Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max.  In this
case, if one of the VM/container abuses the usage of the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table.  Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.

To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone.  For example, we can designate different zone to different VM,
and set conntrack limit to each zone.  By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs.  Moreover, the users can
set various conntrack limit to different zone based on their preference.

The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone.  If the number of
connection is above a configured limitation, OVS will return ENOMEM to the
userspace.  If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.

The first patch defines the conntrack limit netlink definition, and the
second patch provides the implementation.

v2->v3:
  - Addresses comments from Pravin that include using static keys to check
    if ovs_ct_limit features is used, only check ct_limit when a ct entry
    is unconfirmed, and reports rate limited warning messages when the ct
    limit is reached.
  - Rebases to master.

v1->v2:
  - Fixes commit log typos suggested by Greg.
  - Fixes memory free issue that Julia found.


Yi-Hung Wei (2):
  openvswitch: Add conntrack limit netlink definition
  openvswitch: Support conntrack zone limit

 include/uapi/linux/openvswitch.h |  62 +++++
 net/openvswitch/Kconfig          |   3 +-
 net/openvswitch/conntrack.c      | 508 ++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/conntrack.h      |   9 +-
 net/openvswitch/datapath.c       |   7 +-
 net/openvswitch/datapath.h       |   1 +
 6 files changed, 584 insertions(+), 6 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH net-next v3 1/2] openvswitch: Add conntrack limit netlink definition
From: Yi-Hung Wei @ 2018-04-30 21:28 UTC (permalink / raw)
  To: netdev, pshelar; +Cc: Yi-Hung Wei
In-Reply-To: <1525123713-38891-1-git-send-email-yihung.wei@gmail.com>

Define netlink messages and attributes to support user kernel
communication that uses the conntrack limit feature.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
 include/uapi/linux/openvswitch.h | 62 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 713e56ce681f..ca63c16375ce 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -937,4 +937,66 @@ enum ovs_meter_band_type {
 
 #define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
 
+/* Conntrack limit */
+#define OVS_CT_LIMIT_FAMILY  "ovs_ct_limit"
+#define OVS_CT_LIMIT_MCGROUP "ovs_ct_limit"
+#define OVS_CT_LIMIT_VERSION 0x1
+
+enum ovs_ct_limit_cmd {
+	OVS_CT_LIMIT_CMD_UNSPEC,
+	OVS_CT_LIMIT_CMD_SET,		/* Add or modify ct limit. */
+	OVS_CT_LIMIT_CMD_DEL,		/* Delete ct limit. */
+	OVS_CT_LIMIT_CMD_GET		/* Get ct limit. */
+};
+
+enum ovs_ct_limit_attr {
+	OVS_CT_LIMIT_ATTR_UNSPEC,
+	OVS_CT_LIMIT_ATTR_OPTION,	/* Nested OVS_CT_LIMIT_ATTR_* */
+	__OVS_CT_LIMIT_ATTR_MAX
+};
+
+#define OVS_CT_LIMIT_ATTR_MAX (__OVS_CT_LIMIT_ATTR_MAX - 1)
+
+/**
+ * @OVS_CT_ZONE_LIMIT_ATTR_SET_REQ: Contains either
+ * OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT or a pair of
+ * OVS_CT_ZONE_LIMIT_ATTR_ZONE and OVS_CT_ZONE_LIMIT_ATTR_LIMIT.
+ * @OVS_CT_ZONE_LIMIT_ATTR_DEL_REQ: Contains OVS_CT_ZONE_LIMIT_ATTR_ZONE.
+ * @OVS_CT_ZONE_LIMIT_ATTR_GET_REQ: Contains OVS_CT_ZONE_LIMIT_ATTR_ZONE.
+ * @OVS_CT_ZONE_LIMIT_ATTR_GET_RLY: Contains either
+ * OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT or a triple of
+ * OVS_CT_ZONE_LIMIT_ATTR_ZONE, OVS_CT_ZONE_LIMIT_ATTR_LIMIT and
+ * OVS_CT_ZONE_LIMIT_ATTR_COUNT.
+ */
+enum ovs_ct_limit_option_attr {
+	OVS_CT_LIMIT_OPTION_ATTR_UNSPEC,
+	OVS_CT_ZONE_LIMIT_ATTR_SET_REQ,	/* Nested OVS_CT_ZONE_LIMIT_ATTR_*
+					 * attributes. */
+	OVS_CT_ZONE_LIMIT_ATTR_DEL_REQ,	/* Nested OVS_CT_ZONE_LIMIT_ATTR_*
+					 * attributes. */
+	OVS_CT_ZONE_LIMIT_ATTR_GET_REQ,	/* Nested OVS_CT_ZONE_LIMIT_ATTR_*
+					 * attributes. */
+	OVS_CT_ZONE_LIMIT_ATTR_GET_RLY,	/* Nested OVS_CT_ZONE_LIMIT_ATTR_*
+					 * attributes. */
+	__OVS_CT_LIMIT_OPTION_ATTR_MAX
+};
+
+#define OVS_CT_LIMIT_OPTION_ATTR_MAX (__OVS_CT_LIMIT_OPTION_ATTR_MAX - 1)
+
+enum ovs_ct_zone_limit_attr {
+	OVS_CT_ZONE_LIMIT_ATTR_UNSPEC,
+	OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT,	/* u32 default conntrack limit
+						 * for all zones. */
+	OVS_CT_ZONE_LIMIT_ATTR_ZONE,		/* u16 conntrack zone id. */
+	OVS_CT_ZONE_LIMIT_ATTR_LIMIT,		/* u32 max number of conntrack
+						 * entries allowed in the
+						 * corresponding zone. */
+	OVS_CT_ZONE_LIMIT_ATTR_COUNT,		/* u32 number of conntrack
+						 * entries in the corresponding
+						 * zone. */
+	__OVS_CT_ZONE_LIMIT_ATTR_MAX
+};
+
+#define OVS_CT_ZONE_LIMIT_ATTR_MAX (__OVS_CT_ZONE_LIMIT_ATTR_MAX - 1)
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v3 2/2] openvswitch: Support conntrack zone limit
From: Yi-Hung Wei @ 2018-04-30 21:28 UTC (permalink / raw)
  To: netdev, pshelar; +Cc: Yi-Hung Wei
In-Reply-To: <1525123713-38891-1-git-send-email-yihung.wei@gmail.com>

Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max.  In this
case, if one of the VM/container abuses the usage of the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table.  Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.

To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone.  For example, we can designate different zone to different VM,
and set conntrack limit to each zone.  By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs.  Moreover, the users can
set various conntrack limit to different zone based on their preference.

The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone.  If the number of
connection is above a configured limitation, ovs will return ENOMEM to the
userspace.  If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.

The following high level APIs are provided to the userspace:
  - OVS_CT_LIMIT_CMD_SET:
    * set default connection limit for all zones
    * set the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_DEL:
    * remove the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_GET:
    * get the default connection limit for all zones
    * get the connection limit for a particular zone

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
---
 net/openvswitch/Kconfig     |   3 +-
 net/openvswitch/conntrack.c | 508 +++++++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/conntrack.h |   9 +-
 net/openvswitch/datapath.c  |   7 +-
 net/openvswitch/datapath.h  |   1 +
 5 files changed, 522 insertions(+), 6 deletions(-)

diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2650205cdaf9..89da9512ec1e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -9,7 +9,8 @@ config OPENVSWITCH
 		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
 				     (!NF_NAT || NF_NAT) && \
 				     (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
-				     (!NF_NAT_IPV6 || NF_NAT_IPV6)))
+				     (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
+				     (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c5904f629091..8234964889d9 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -16,8 +16,11 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/sctp.h>
+#include <linux/static_key.h>
 #include <net/ip.h>
+#include <net/genetlink.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_count.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
@@ -76,6 +79,39 @@ struct ovs_conntrack_info {
 #endif
 };
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+#define OVS_CT_LIMIT_UNLIMITED	0
+#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
+#define CT_LIMIT_HASH_BUCKETS 512
+DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
+
+struct ovs_ct_limit {
+	/* Elements in ovs_ct_limit_info->limits hash table */
+	struct hlist_node hlist_node;
+	struct rcu_head rcu;
+	u16 zone;
+	u32 limit;
+};
+
+struct ovs_ct_limit_info {
+	u32 default_limit;
+	struct hlist_head *limits;
+	struct nf_conncount_data *data __aligned(8);
+};
+
+static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
+	[OVS_CT_LIMIT_ATTR_OPTION] = { .type = NLA_NESTED, },
+};
+
+static const struct nla_policy
+	ct_zone_limit_policy[OVS_CT_ZONE_LIMIT_ATTR_MAX + 1] = {
+		[OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT] = { .type = NLA_U32, },
+		[OVS_CT_ZONE_LIMIT_ATTR_ZONE] = { .type = NLA_U16, },
+		[OVS_CT_ZONE_LIMIT_ATTR_LIMIT] = { .type = NLA_U32, },
+		[OVS_CT_ZONE_LIMIT_ATTR_COUNT] = { .type = NLA_U32, },
+};
+#endif
+
 static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
 
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -1036,6 +1072,94 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
 	return false;
 }
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static struct hlist_head *ct_limit_hash_bucket(
+	const struct ovs_ct_limit_info *info, u16 zone)
+{
+	return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)];
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_set(const struct ovs_ct_limit_info *info,
+			 struct ovs_ct_limit *new_ct_limit)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, new_ct_limit->zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == new_ct_limit->zone) {
+			hlist_replace_rcu(&ct_limit->hlist_node,
+					  &new_ct_limit->hlist_node);
+			kfree_rcu(ct_limit, rcu);
+			return;
+		}
+	}
+
+	hlist_add_head_rcu(&new_ct_limit->hlist_node, head);
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == zone) {
+			hlist_del_rcu(&ct_limit->hlist_node);
+			kfree_rcu(ct_limit, rcu);
+			return;
+		}
+	}
+}
+
+/* Call with RCU read lock */
+static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == zone)
+			return ct_limit->limit;
+	}
+
+	return info->default_limit;
+}
+
+static int ovs_ct_check_limit(struct net *net,
+			      const struct ovs_conntrack_info *info,
+			      const struct nf_conntrack_tuple *tuple)
+{
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	u32 per_zone_limit, connections;
+	u32 conncount_key[5];
+
+	conncount_key[0] = info->zone.id;
+
+	rcu_read_lock();
+	per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
+	if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	connections = nf_conncount_count(net, ct_limit_info->data,
+					 conncount_key, tuple, &info->zone);
+	if (connections > per_zone_limit) {
+		rcu_read_unlock();
+		return -ENOMEM;
+	}
+
+	rcu_read_unlock();
+	return 0;
+}
+#endif
+
 /* Lookup connection and confirm if unconfirmed. */
 static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 			 const struct ovs_conntrack_info *info,
@@ -1054,6 +1178,20 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 	if (!ct)
 		return 0;
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
+		if (!nf_ct_is_confirmed(ct)) {
+			err = ovs_ct_check_limit(net, info,
+				&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+			if (err) {
+				net_warn_ratelimited("openvswitch: zone: %u "
+					"execeeds conntrack limit\n", info->zone.id);
+				return err;
+			}
+		}
+	}
+#endif
+
 	/* Set the conntrack event mask if given.  NEW and DELETE events have
 	 * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
 	 * typically would receive many kinds of updates.  Setting the event
@@ -1655,7 +1793,365 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
 		nf_ct_tmpl_free(ct_info->ct);
 }
 
-void ovs_ct_init(struct net *net)
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+/* Allocate and initialise the per-netns conntrack limit state.
+ *
+ * Sets up the ovs_ct_limit_info container, its array of
+ * CT_LIMIT_HASH_BUCKETS empty hash heads and the nf_conncount instance.
+ * Returns 0 on success, -ENOMEM when an allocation fails, or the error
+ * encoded in the pointer returned by nf_conncount_init().  Everything
+ * allocated here is freed again before an error is returned.
+ */
+static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
+{
+	struct ovs_ct_limit_info *info;
+	int bucket, err;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	ovs_net->ct_limit_info = info;
+	if (!info)
+		return -ENOMEM;
+
+	info->default_limit = OVS_CT_LIMIT_DEFAULT;
+	info->limits = kmalloc_array(CT_LIMIT_HASH_BUCKETS,
+				     sizeof(struct hlist_head), GFP_KERNEL);
+	if (!info->limits) {
+		err = -ENOMEM;
+		goto free_info;
+	}
+
+	for (bucket = 0; bucket < CT_LIMIT_HASH_BUCKETS; bucket++)
+		INIT_HLIST_HEAD(&info->limits[bucket]);
+
+	info->data = nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+	if (IS_ERR(info->data)) {
+		err = PTR_ERR(info->data);
+		goto free_limits;
+	}
+
+	return 0;
+
+free_limits:
+	kfree(info->limits);
+free_info:
+	kfree(info);
+	return err;
+}
+
+/* Tear down the per-netns conntrack limit state: destroy the nf_conncount
+ * instance, hand every configured per-zone limit to kfree_rcu(), then free
+ * the bucket array and the container itself.
+ */
+static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
+{
+	const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
+	struct ovs_ct_limit *entry;
+	int bucket;
+
+	nf_conncount_destroy(net, NFPROTO_INET, info->data);
+
+	for (bucket = 0; bucket < CT_LIMIT_HASH_BUCKETS; bucket++) {
+		hlist_for_each_entry_rcu(entry, &info->limits[bucket],
+					 hlist_node)
+			kfree_rcu(entry, rcu);
+	}
+
+	kfree(info->limits);
+	kfree(info);
+}
+
+/* Allocate a reply skb for a ct-limit command and write its generic netlink
+ * and OVS headers.
+ *
+ * On success the new skb is returned and *@ovs_reply_header points at the
+ * OVS header inside it, with dp_ifindex copied from the request header.  On
+ * failure an ERR_PTR is returned: -ENOMEM if the skb cannot be allocated,
+ * -EMSGSIZE if genlmsg_put() fails (the skb is freed in that case).
+ */
+static struct sk_buff *
+ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
+			     struct ovs_header **ovs_reply_header)
+{
+	struct ovs_header *request = info->userhdr;
+	struct ovs_header *hdr;
+	struct sk_buff *reply;
+
+	reply = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return ERR_PTR(-ENOMEM);
+
+	hdr = genlmsg_put(reply, info->snd_portid, info->snd_seq,
+			  &dp_ct_limit_genl_family, 0, cmd);
+	*ovs_reply_header = hdr;
+	if (!hdr) {
+		nlmsg_free(reply);
+		return ERR_PTR(-EMSGSIZE);
+	}
+
+	hdr->dp_ifindex = request->dp_ifindex;
+	return reply;
+}
+
+/* Apply every set request nested inside @nla_zone_limit.
+ *
+ * Each nested attribute must be of type OVS_CT_ZONE_LIMIT_ATTR_SET_REQ.  A
+ * request carrying a default-limit attribute replaces info->default_limit;
+ * any other request must carry both a zone and a limit, which are passed to
+ * ct_limit_set() in a newly allocated entry.  Both kinds of update are made
+ * under ovs_lock().  Processing stops at the first bad request; requests
+ * handled before it stay applied.
+ *
+ * Returns 0 on success, -EINVAL for a malformed request, -ENOMEM if an
+ * entry cannot be allocated, or the error from nla_parse().
+ */
+static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct nlattr *req;
+	int rem, err;
+
+	nla_for_each_nested(req, nla_zone_limit, rem) {
+		struct nlattr *attr[OVS_CT_ZONE_LIMIT_ATTR_MAX + 1];
+		struct nlattr *default_attr, *zone_attr, *limit_attr;
+		struct ovs_ct_limit *ct_limit;
+
+		if (nla_type(req) != OVS_CT_ZONE_LIMIT_ATTR_SET_REQ)
+			return -EINVAL;
+
+		err = nla_parse(attr, OVS_CT_ZONE_LIMIT_ATTR_MAX,
+				nla_data(req), nla_len(req),
+				ct_zone_limit_policy, NULL);
+		if (err)
+			return err;
+
+		default_attr = attr[OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT];
+		if (default_attr) {
+			u32 default_limit = nla_get_u32(default_attr);
+
+			ovs_lock();
+			info->default_limit = default_limit;
+			ovs_unlock();
+			continue;
+		}
+
+		zone_attr = attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE];
+		limit_attr = attr[OVS_CT_ZONE_LIMIT_ATTR_LIMIT];
+		if (!zone_attr || !limit_attr)
+			return -EINVAL;
+
+		ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
+		if (!ct_limit)
+			return -ENOMEM;
+
+		ct_limit->zone = nla_get_u16(zone_attr);
+		ct_limit->limit = nla_get_u32(limit_attr);
+
+		ovs_lock();
+		ct_limit_set(info, ct_limit);
+		ovs_unlock();
+	}
+
+	return 0;
+}
+
+/* Apply every delete request nested inside @nla_zone_limit.
+ *
+ * Each nested attribute must be of type OVS_CT_ZONE_LIMIT_ATTR_DEL_REQ and
+ * carry a zone attribute; that zone is passed to ct_limit_del() under
+ * ovs_lock().  Processing stops at the first bad request; requests handled
+ * before it stay applied.
+ *
+ * Returns 0 on success, -EINVAL for a malformed request, or the error from
+ * nla_parse().
+ */
+static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct nlattr *req;
+	int rem, err;
+
+	nla_for_each_nested(req, nla_zone_limit, rem) {
+		struct nlattr *attr[OVS_CT_ZONE_LIMIT_ATTR_MAX + 1];
+		struct nlattr *zone_attr;
+		u16 zone;
+
+		if (nla_type(req) != OVS_CT_ZONE_LIMIT_ATTR_DEL_REQ)
+			return -EINVAL;
+
+		err = nla_parse(attr, OVS_CT_ZONE_LIMIT_ATTR_MAX,
+				nla_data(req), nla_len(req),
+				ct_zone_limit_policy, NULL);
+		if (err)
+			return err;
+
+		zone_attr = attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE];
+		if (!zone_attr)
+			return -EINVAL;
+
+		zone = nla_get_u16(zone_attr);
+
+		ovs_lock();
+		ct_limit_del(info, zone);
+		ovs_unlock();
+	}
+
+	return 0;
+}
+
+/* Append an OVS_CT_ZONE_LIMIT_ATTR_GET_RLY nest holding the default limit
+ * to @reply.
+ *
+ * Returns 0 on success.  Returns -EMSGSIZE when the nest cannot be opened,
+ * or the error from nla_put_u32() when the value does not fit; in the latter
+ * case the partially written nest is removed from @reply again.
+ */
+static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
+					  struct sk_buff *reply)
+{
+	int err;
+	struct nlattr *nla_nested;
+
+	/* nla_nest_start() returns NULL when the skb has no room left;
+	 * handing that NULL to nla_nest_end() would dereference it.
+	 */
+	nla_nested = nla_nest_start(reply, OVS_CT_ZONE_LIMIT_ATTR_GET_RLY);
+	if (!nla_nested)
+		return -EMSGSIZE;
+
+	err = nla_put_u32(reply, OVS_CT_ZONE_LIMIT_ATTR_DEFAULT_LIMIT,
+			  info->default_limit);
+	if (err) {
+		nla_nest_cancel(reply, nla_nested);
+		return err;
+	}
+
+	nla_nest_end(reply, nla_nested);
+	return 0;
+}
+
+/* Answer every get request nested inside @nla_zone_limit.
+ *
+ * Each nested attribute must be of type OVS_CT_ZONE_LIMIT_ATTR_GET_REQ and
+ * carry a zone attribute.  For each one an OVS_CT_ZONE_LIMIT_ATTR_GET_RLY
+ * nest is appended to @reply holding the zone, the limit returned by
+ * ct_limit_get() and the count returned by nf_conncount_count().
+ *
+ * Returns 0 on success, -EINVAL for a malformed request, the error from
+ * nla_parse(), or -EMSGSIZE when @reply runs out of room (the unfinished
+ * nest is removed again in that case).
+ */
+static int ovs_ct_limit_get_zone_limit(struct net *net,
+				       struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info,
+				       struct sk_buff *reply)
+{
+	struct nlattr *nla, *nla_nested;
+	int rem, err;
+	u16 zone;
+	u32 limit, count, conncount_key[5];
+	struct nf_conntrack_zone ct_zone;
+
+	nla_for_each_nested(nla, nla_zone_limit, rem) {
+		struct nlattr *attr[OVS_CT_ZONE_LIMIT_ATTR_MAX + 1];
+
+		if (nla_type(nla) != OVS_CT_ZONE_LIMIT_ATTR_GET_REQ)
+			return -EINVAL;
+
+		err = nla_parse(attr, OVS_CT_ZONE_LIMIT_ATTR_MAX,
+				nla_data(nla), nla_len(nla),
+				ct_zone_limit_policy, NULL);
+		if (err)
+			return err;
+
+		if (!attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE])
+			return -EINVAL;
+
+		zone = nla_get_u16(attr[OVS_CT_ZONE_LIMIT_ATTR_ZONE]);
+		nf_ct_zone_init(&ct_zone, zone, NF_CT_DEFAULT_ZONE_DIR, 0);
+		rcu_read_lock();
+		limit = ct_limit_get(info, zone);
+		rcu_read_unlock();
+
+		/* Only the first key word is filled in: the zone id. */
+		conncount_key[0] = zone;
+		count = nf_conncount_count(net, info->data, conncount_key,
+					   NULL, &ct_zone);
+
+		/* nla_nest_start() returns NULL when the skb is full; do not
+		 * let that NULL reach nla_nest_end().
+		 */
+		nla_nested = nla_nest_start(reply,
+					    OVS_CT_ZONE_LIMIT_ATTR_GET_RLY);
+		if (!nla_nested)
+			return -EMSGSIZE;
+
+		if (nla_put_u16(reply, OVS_CT_ZONE_LIMIT_ATTR_ZONE, zone) ||
+		    nla_put_u32(reply, OVS_CT_ZONE_LIMIT_ATTR_LIMIT, limit) ||
+		    nla_put_u32(reply, OVS_CT_ZONE_LIMIT_ATTR_COUNT, count)) {
+			nla_nest_cancel(reply, nla_nested);
+			return -EMSGSIZE;
+		}
+		nla_nest_end(reply, nla_nested);
+	}
+
+	return 0;
+}
+
+/* Handler for OVS_CT_LIMIT_CMD_SET.
+ *
+ * Applies the set requests nested in OVS_CT_LIMIT_ATTR_OPTION, enables the
+ * ovs_ct_limit_enabled static branch and sends back a reply that carries
+ * only the headers.  Returns the result of genlmsg_reply() on success,
+ * -EINVAL when the option attribute is missing, or the error from
+ * ovs_ct_limit_set_zone_limit(); the reply skb is freed on every error path.
+ */
+static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	/* The reply is already allocated here, so leave through exit_err
+	 * rather than returning directly, which would leak it.
+	 */
+	if (!a[OVS_CT_LIMIT_ATTR_OPTION]) {
+		err = -EINVAL;
+		goto exit_err;
+	}
+
+	err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_OPTION],
+					  ct_limit_info);
+	if (err)
+		goto exit_err;
+	static_branch_enable(&ovs_ct_limit_enabled);
+
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+/* Handler for OVS_CT_LIMIT_CMD_DEL.
+ *
+ * Applies the delete requests nested in OVS_CT_LIMIT_ATTR_OPTION and sends
+ * back a reply that carries only the headers.  Returns the result of
+ * genlmsg_reply() on success, -EINVAL when the option attribute is missing,
+ * or the error from ovs_ct_limit_del_zone_limit(); the reply skb is freed on
+ * every error path.
+ */
+static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	/* The reply is already allocated here, so leave through exit_err
+	 * rather than returning directly, which would leak it.
+	 */
+	if (!a[OVS_CT_LIMIT_ATTR_OPTION]) {
+		err = -EINVAL;
+		goto exit_err;
+	}
+
+	err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_OPTION],
+					  ct_limit_info);
+	if (err)
+		goto exit_err;
+
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+/* Handler for OVS_CT_LIMIT_CMD_GET.
+ *
+ * Builds a reply whose OVS_CT_LIMIT_ATTR_OPTION nest holds the default limit
+ * followed, when the request carries its own OVS_CT_LIMIT_ATTR_OPTION, by one
+ * entry per requested zone.  Returns the result of genlmsg_reply() on
+ * success; on any error the reply skb is freed and the error is returned.
+ */
+static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct nlattr *nla_reply;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct net *net = sock_net(skb->sk);
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	/* nla_nest_start() returns NULL when the skb is full; bail out
+	 * instead of handing that NULL to nla_nest_end() below.
+	 */
+	nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_OPTION);
+	if (!nla_reply) {
+		err = -EMSGSIZE;
+		goto exit_err;
+	}
+
+	err = ovs_ct_limit_get_default_limit(ct_limit_info, reply);
+	if (err)
+		goto exit_err;
+
+	if (a[OVS_CT_LIMIT_ATTR_OPTION]) {
+		err = ovs_ct_limit_get_zone_limit(
+			net, a[OVS_CT_LIMIT_ATTR_OPTION], ct_limit_info,
+			reply);
+		if (err)
+			goto exit_err;
+	}
+
+	nla_nest_end(reply, nla_reply);
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+/* Generic netlink operations of the ct-limit family.  SET and DEL carry
+ * GENL_ADMIN_PERM; GET carries no flags.  All three use ct_limit_policy.
+ */
+static struct genl_ops ct_limit_genl_ops[] = {
+	{
+		.cmd	= OVS_CT_LIMIT_CMD_SET,
+		/* Requires CAP_NET_ADMIN privilege. */
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ct_limit_policy,
+		.doit	= ovs_ct_limit_cmd_set,
+	},
+	{
+		.cmd	= OVS_CT_LIMIT_CMD_DEL,
+		/* Requires CAP_NET_ADMIN privilege. */
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ct_limit_policy,
+		.doit	= ovs_ct_limit_cmd_del,
+	},
+	{
+		.cmd	= OVS_CT_LIMIT_CMD_GET,
+		/* OK for unprivileged users. */
+		.flags	= 0,
+		.policy	= ct_limit_policy,
+		.doit	= ovs_ct_limit_cmd_get,
+	},
+};
+
+/* Multicast group of the ct-limit family, named OVS_CT_LIMIT_MCGROUP. */
+static const struct genl_multicast_group ovs_ct_limit_multicast_group = {
+	.name = OVS_CT_LIMIT_MCGROUP,
+};
+
+/* Generic netlink family for the conntrack zone limit commands (SET, DEL
+ * and GET, see ct_limit_genl_ops).  Not static: it is declared extern in
+ * conntrack.h.
+ */
+struct genl_family dp_ct_limit_genl_family __ro_after_init = {
+	/* Identity. */
+	.name		= OVS_CT_LIMIT_FAMILY,
+	.version	= OVS_CT_LIMIT_VERSION,
+	.module		= THIS_MODULE,
+
+	/* Message layout: the family header is a struct ovs_header. */
+	.hdrsize	= sizeof(struct ovs_header),
+	.maxattr	= OVS_CT_LIMIT_ATTR_MAX,
+
+	/* Behaviour flags. */
+	.netnsok	= true,
+	.parallel_ops	= true,
+
+	/* Operations and multicast groups. */
+	.ops		= ct_limit_genl_ops,
+	.n_ops		= ARRAY_SIZE(ct_limit_genl_ops),
+	.mcgrps		= &ovs_ct_limit_multicast_group,
+	.n_mcgrps	= 1,
+};
+#endif
+
+int ovs_ct_init(struct net *net)
 {
 	unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -1666,12 +2162,22 @@ void ovs_ct_init(struct net *net)
 	} else {
 		ovs_net->xt_label = true;
 	}
+
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	return ovs_ct_limit_init(net, ovs_net);
+#else
+	return 0;
+#endif
 }
 
+/* Per-netns conntrack teardown: releases the ct-limit state when
+ * CONFIG_NETFILTER_CONNCOUNT is enabled, then calls nf_connlabels_put() if
+ * xt_label is set.
+ */
 void ovs_ct_exit(struct net *net)
 {
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	ovs_ct_limit_exit(net, ovs_net);
+#endif
+
 	if (ovs_net->xt_label)
 		nf_connlabels_put(net);
 }
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 399dfdd2c4f9..900dadd70974 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -17,10 +17,11 @@
 #include "flow.h"
 
 struct ovs_conntrack_info;
+struct ovs_ct_limit_info;
 enum ovs_key_attr;
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-void ovs_ct_init(struct net *);
+int ovs_ct_init(struct net *);
 void ovs_ct_exit(struct net *);
 bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
 int ovs_ct_copy_action(struct net *, const struct nlattr *,
@@ -44,7 +45,7 @@ void ovs_ct_free_action(const struct nlattr *a);
 #else
 #include <linux/errno.h>
 
-static inline void ovs_ct_init(struct net *net) { }
+static inline int ovs_ct_init(struct net *net) { return 0; }
 
 static inline void ovs_ct_exit(struct net *net) { }
 
@@ -104,4 +105,8 @@ static inline void ovs_ct_free_action(const struct nlattr *a) { }
 
 #define CT_SUPPORTED_MASK 0
 #endif /* CONFIG_NF_CONNTRACK */
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+extern struct genl_family dp_ct_limit_genl_family;
+#endif
 #endif /* ovs_conntrack.h */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 015e24e08909..a61818e94396 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2288,6 +2288,9 @@ static struct genl_family * const dp_genl_families[] = {
 	&dp_flow_genl_family,
 	&dp_packet_genl_family,
 	&dp_meter_genl_family,
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	&dp_ct_limit_genl_family,
+#endif
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2323,8 +2326,7 @@ static int __net_init ovs_init_net(struct net *net)
 
 	INIT_LIST_HEAD(&ovs_net->dps);
 	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
-	ovs_ct_init(net);
-	return 0;
+	return ovs_ct_init(net);
 }
 
 static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2469,3 +2471,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 523d65526766..51bd4dcb6c8b 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,6 +144,7 @@ struct dp_upcall_info {
 struct ovs_net {
 	struct list_head dps;
 	struct work_struct dp_notify_work;
+	struct ovs_ct_limit_info *ct_limit_info;
 
 	/* Module reference for configuring conntrack. */
 	bool xt_label;
-- 
2.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox