Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next] net: sched: make tcf_action_dump_1() static
From: Jamal Hadi Salim @ 2022-08-15 11:53 UTC (permalink / raw)
  To: Zhengchao Shao
  Cc: netdev, linux-kernel, davem, edumazet, kuba, pabeni,
	xiyou.wangcong, jiri, weiyongjun1, yuehaibing
In-Reply-To: <20220815070122.113871-1-shaozhengchao@huawei.com>

You shouldn't have so many line changes to remove the EXPORT and change
"int" to "static int".
What am I missing?
Unnecessary line changes add extra effort to git archeology.

cheers,
jamal

On Mon, Aug 15, 2022 at 2:58 AM Zhengchao Shao <shaozhengchao@huawei.com> wrote:
>
> Function tcf_action_dump_1() is not used outside of act_api.c, so remove
> the superfluous EXPORT_SYMBOL() and mark it static.
>
> Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
> ---
>  include/net/act_api.h |   1 -
>  net/sched/act_api.c   | 100 +++++++++++++++++++++---------------------
>  2 files changed, 49 insertions(+), 52 deletions(-)
>
> diff --git a/include/net/act_api.h b/include/net/act_api.h
> index 9cf6870b526e..d51b3f931771 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -215,7 +215,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
>  int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind,
>                     int ref, bool terse);
>  int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
> -int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
>
>  static inline void tcf_action_update_bstats(struct tc_action *a,
>                                             struct sk_buff *skb)
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index b69fcde546ba..9fd98bf5c724 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -510,6 +510,55 @@ tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act)
>         return -1;
>  }
>
> +int
> +tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
> +{
> +       return a->ops->dump(skb, a, bind, ref);
> +}
> +
> +static int
> +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
> +{
> +       int err = -EINVAL;
> +       unsigned char *b = skb_tail_pointer(skb);
> +       struct nlattr *nest;
> +       u32 flags;
> +
> +       if (tcf_action_dump_terse(skb, a, false))
> +               goto nla_put_failure;
> +
> +       if (a->hw_stats != TCA_ACT_HW_STATS_ANY &&
> +           nla_put_bitfield32(skb, TCA_ACT_HW_STATS,
> +                              a->hw_stats, TCA_ACT_HW_STATS_ANY))
> +               goto nla_put_failure;
> +
> +       if (a->used_hw_stats_valid &&
> +           nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS,
> +                              a->used_hw_stats, TCA_ACT_HW_STATS_ANY))
> +               goto nla_put_failure;
> +
> +       flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK;
> +       if (flags &&
> +           nla_put_bitfield32(skb, TCA_ACT_FLAGS,
> +                              flags, flags))
> +               goto nla_put_failure;
> +
> +       if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count))
> +               goto nla_put_failure;
> +
> +       nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
> +       if (!nest)
> +               goto nla_put_failure;
> +       err = tcf_action_dump_old(skb, a, bind, ref);
> +       if (err > 0) {
> +               nla_nest_end(skb, nest);
> +               return err;
> +       }
> +
> +nla_put_failure:
> +       nlmsg_trim(skb, b);
> +       return -1;
> +}
>  static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
>                            struct netlink_callback *cb)
>  {
> @@ -1132,57 +1181,6 @@ static void tcf_action_put_many(struct tc_action *actions[])
>         }
>  }
>
> -int
> -tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
> -{
> -       return a->ops->dump(skb, a, bind, ref);
> -}
> -
> -int
> -tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
> -{
> -       int err = -EINVAL;
> -       unsigned char *b = skb_tail_pointer(skb);
> -       struct nlattr *nest;
> -       u32 flags;
> -
> -       if (tcf_action_dump_terse(skb, a, false))
> -               goto nla_put_failure;
> -
> -       if (a->hw_stats != TCA_ACT_HW_STATS_ANY &&
> -           nla_put_bitfield32(skb, TCA_ACT_HW_STATS,
> -                              a->hw_stats, TCA_ACT_HW_STATS_ANY))
> -               goto nla_put_failure;
> -
> -       if (a->used_hw_stats_valid &&
> -           nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS,
> -                              a->used_hw_stats, TCA_ACT_HW_STATS_ANY))
> -               goto nla_put_failure;
> -
> -       flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK;
> -       if (flags &&
> -           nla_put_bitfield32(skb, TCA_ACT_FLAGS,
> -                              flags, flags))
> -               goto nla_put_failure;
> -
> -       if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count))
> -               goto nla_put_failure;
> -
> -       nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
> -       if (nest == NULL)
> -               goto nla_put_failure;
> -       err = tcf_action_dump_old(skb, a, bind, ref);
> -       if (err > 0) {
> -               nla_nest_end(skb, nest);
> -               return err;
> -       }
> -
> -nla_put_failure:
> -       nlmsg_trim(skb, b);
> -       return -1;
> -}
> -EXPORT_SYMBOL(tcf_action_dump_1);
> -
>  int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
>                     int bind, int ref, bool terse)
>  {
> --
> 2.17.1
>

^ permalink raw reply

* Re: [PATCH net-next v3] net: skb: prevent the split of kfree_skb_reason() by gcc
From: Miguel Ojeda @ 2022-08-15 11:52 UTC (permalink / raw)
  To: menglong8.dong
  Cc: kuba, ojeda, ndesaulniers, davem, edumazet, pabeni, asml.silence,
	imagedong, luiz.von.dentz, vasily.averin, jk, linux-kernel,
	netdev, kernel test robot
In-Reply-To: <20220815062727.1203589-1-imagedong@tencent.com>

On Mon, Aug 15, 2022 at 8:27 AM <menglong8.dong@gmail.com> wrote:
>
>  include/linux/compiler-gcc.h   | 12 ++++++++++++
>  include/linux/compiler_types.h |  4 ++++

No, this should be in `compiler_attributes.h` like you had it before.

To be clear, what you did here would be fine, but it is the "old way"
(we added `compiler_attributes.h` to reduce the complexity of
`compiler-*` and `compiler_types.h` and make it a bit more
normalized).

Please take a moment and read how other attributes do it in
`compiler_attributes.h` with `__has_attribute`. Check, for instance,
`__copy`, which is very similar to your case (not supported by Clang
and ICC, except in your case GCC always supports at least since 5.1).

Cheers,
Miguel

^ permalink raw reply

* Re: [PATCH v2 1/2] Revert "mlxsw: core: Use different get_trend() callbacks for different thermal zones"
From: Ido Schimmel @ 2022-08-15 11:47 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: rafael, vadimp, davem, netdev, linux-kernel, Vadim Pasternak,
	Petr Machata, Eric Dumazet, Jakub Kicinski, Paolo Abeni
In-Reply-To: <20220815091032.1731268-1-daniel.lezcano@linaro.org>

On Mon, Aug 15, 2022 at 11:10:31AM +0200, Daniel Lezcano wrote:
> This reverts commit 2dc2f760052da4925482ecdcdc5c94d4a599153c.
> 
> As discussed in the thread:
> 
> https://lore.kernel.org/all/f3c62ebe-7d59-c537-a010-bff366c8aeba@linaro.org/
> 
> the feature provided by commits 2dc2f760052da and 6f73862fabd93 is
> actually already handled by the thermal framework via the cooling
> device state aggregation, thus all this code is pointless.
> 
> No conflict happened when reverting the patch.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> Tested-by: Vadim Pasternak <vadimp@nvidia.com>

Daniel, is the intention to send these patches to mainline as part of
your 6.1 pull request?

I discussed it with Vadim yesterday and we do not expect changes in the
file during the current cycle so this is OK as far as we are concerned,
but I believe this will also need an ack from one of the netdev
maintainers.

Thanks

^ permalink raw reply

* Re: [PATCH net-next] net: wwan: iosm: Enable M.2 7360 WWAN card support
From: Florian Klink @ 2022-08-15 11:30 UTC (permalink / raw)
  To: Kumar, M Chetan
  Cc: Jan Kiszka, netdev, kuba, davem, johannes, ryazanov.s.a,
	loic.poulain, krishna.c.sudi, linuxwwan
In-Reply-To: <fb700c62-eca4-879b-1b1a-966d9232fd4d@linux.intel.com>

Hey,

On 22-02-10 21:46:21, Kumar, M Chetan wrote:
>On 2/10/2022 9:08 PM, Jan Kiszka wrote:
>>On 10.02.22 16:34, M Chetan Kumar wrote:
>>>This patch enables Intel M.2 7360 WWAN card support on
>>>IOSM Driver.
>>>[…]
>>
>>Hey, cool! I'll be happy to try that out soon. Any special userland
>>changes required, or will it "just work" with sufficiently recent
>>ModemManager or whatever?
>
>It needs some changes at ModemManager side.

There are some people trying out this patchset in
https://github.com/xmm7360/xmm7360-pci/issues/31.

With the changes merged in, apparently the modem still reports a "SIM
not inserted" error.

https://github.com/xmm7360/xmm7360-pci/issues/31#issuecomment-1181936111
suggests it might be the "FCC lock" feature, but even then, it doesn't
seem to work.

There's now a ModemManager issue at
https://gitlab.freedesktop.org/mobile-broadband/ModemManager/-/issues/612,
which is probably the more appropriate way to discuss this, rather than
another out-of-tree kernel driver.

If you have any more insights on what's missing to get this to work in
NetworkManager/ModemManager, any comment would be appreciated.

Thanks!
Florian

^ permalink raw reply

* Re: [RFC net-next v3 23/29] io_uring: allow to pass addr into sendzc
From: Stefan Metzmacher @ 2022-08-15 11:40 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring, netdev, linux-kernel
  Cc: David S . Miller, Jakub Kicinski, Jonathan Lemon,
	Willem de Bruijn, Jens Axboe, kernel-team
In-Reply-To: <db7bbfcd-fdd0-ed8e-3d8e-78d76f278af8@gmail.com>

Hi Pavel,

> Thanks for giving a thought about the API, are you trying
> to use it in samba?

Yes, but I'd need SENDMSGZC and then I'd like to test,
which variant gives the best performance. It also depends
on the configured samba vfs module stack.

My current prototype uses IO_SENDMSG for the header < 250 bytes
followed by up to 8MBytes via IO_SPLICE if the storage backend also
supports splice, otherwise I'd try to use IO_SENDMSGZC for header + 8 MBytes payload
together. If there's encryption turned active on the connection we would
most likely always use a bounce buffer and hit the IO_SENDMSGZC case.
So all in all I'd say we'll use it.

I guess it would be useful for userspace to notice if zero copy was possible or not.

__msg_zerocopy_callback() sets SO_EE_CODE_ZEROCOPY_COPIED, maybe
io_uring_tx_zerocopy_callback() should have something like:

if (!success)
     notif->cqe.res = SO_EE_CODE_ZEROCOPY_COPIED;

This would make it a bit easier to judge if SENDZC is useful for the
application or not. Or at least have a debug message, which would be
able to explain degraded performance to the admin/developer.

>>>> Given that this fills in msg almost completely can we also have
>>>> a version of SENDMSGZC, it would be very useful to also allow
>>>> msg_control to be passed and as well as an iovec.
>>>>
>>>> Would that be possible?
>>>
>>> Right, I left it to follow ups as the series is already too long.
>>>
>>> fwiw, I'm going to also add addr to IORING_OP_SEND.
>>
>>
>> Given the minimal differences, which were left between
>> IORING_OP_SENDZC and IORING_OP_SEND, wouldn't it be better
>> to merge things to IORING_OP_SEND using a IORING_RECVSEND_ZC_NOTIF
>> as indication to use the notif slot.
> 
> And will be even more similar in for-next, but with notifications
> I'd still prefer different opcodes to get a little bit more
> flexibility and not making the normal io_uring send path messier.

Ok, we should just remember the opcode is only u8
and we already have ~ 50 out of ~250 allocated in ~3 years
time.

>> It would mean we don't need to waste two opcodes for
>> IORING_OP_SENDZC and IORING_OP_SENDMSGZC (and maybe more)
>>
>>
>> I also noticed a problem in io_notif_update()
>>
>>          for (; idx < idx_end; idx++) {
>>                  struct io_notif_slot *slot = &ctx->notif_slots[idx];
>>
>>                  if (!slot->notif)
>>                          continue;
>>                  if (up->arg)
>>                          slot->tag = up->arg;
>>                  io_notif_slot_flush_submit(slot, issue_flags);
>>          }
>>
>>   slot->tag = up->arg is skipped if there is no notif already.
>>
>> So you can't just use two linked SQEs with
>>
>> IORING_RSRC_UPDATE_NOTIF followed by IORING_OP_SENDZC(with IORING_RECVSEND_NOTIF_FLUSH)
> 
> slot->notif is lazily initialised with the first send attached to it,
> so in your example IORING_OP_SENDZC will first create a notification
> to execute the send and then will flush it.
> 
> This "if" is there only to have a more reliable API. We can
> go over the range and allocate all empty slots and then flush
> all of them, but allocation failures should be propagated to the
> userspace, whereas currently the function can't fail.
> 
>> I think the if (!slot->notif) should be moved down a bit.
> 
> Not sure what you mean

I think it should be:

                   if (up->arg)
                           slot->tag = up->arg;
                   if (!slot->notif)
                           continue;
                   io_notif_slot_flush_submit(slot, issue_flags);

or even:

                   slot->tag = up->arg;
                   if (!slot->notif)
                           continue;
                   io_notif_slot_flush_submit(slot, issue_flags);

otherwise IORING_RSRC_UPDATE_NOTIF would not be able to reset the tag,
if notif was never created or has already been flushed.

>> It would somehow be nice to avoid the notif slots at all and somehow
>> use some kind of multishot request in order to generate two CQEs.
> 
> It is there first to amortise overhead of zerocopy infra and bits
> for second CQE posting. But more importantly, without it for TCP
> the send payload size would need to be large enough or performance
> would suffer, but all depends on the use case. TL;DR; it would be
> forced to create a new SKB for each new send.
> 
> For something simpler, I'll push another zc variant that doesn't
> have notifiers and posts only one CQE and only after the buffers
> are no more in use by the kernel. This works well for UDP and for
> some TCP scenarios, but doesn't cover all cases.

I think (at least for stream sockets) it would be more useful to
get two CQEs:
1. The first signals userspace that it can
    issue the next send-like operation (SEND,SENDZC,SENDMSG,SPLICE)
    on the stream without the risk of byte ordering problem within the stream
    and avoid too high latency (which would happen, if we wait for a send to
    leave the hardware nic, before sending the next PDU).
2. The 2nd signals userspace that the buffer can be reused or released.

In that case it would be useful to also provide a separate 'user_data' element
for the 2nd CQE.

>> I'm also wondering what will happen if a notif will be referenced by the net layer
>> but the io_uring instance is already closed, wouldn't
>> io_uring_tx_zerocopy_callback() or __io_notif_complete_tw() crash
>> because notif->ctx is a stale pointer, or notif itself is already gone...
> 
> io_uring will flush all slots and wait for all notifications
> to fire, i.e. io_uring_tx_zerocopy_callback(), so it's not a
> problem.

I can't follow :-(

What I see is that io_notif_unregister():

                 nd = io_notif_to_data(notif);
                 slot->notif = NULL;
                 if (!refcount_dec_and_test(&nd->uarg.refcnt))
                         continue;

So if the net layer still has a reference we just go on.

Only a wild guess, is it something of:

io_alloc_notif():
         ...
         notif->task = current;
         io_get_task_refs(1);
         notif->rsrc_node = NULL;
         io_req_set_rsrc_node(notif, ctx, 0);
         ...

and

__io_req_complete_put():
                 ...
                 io_req_put_rsrc(req);
                 /*
                  * Selected buffer deallocation in io_clean_op() assumes that
                  * we don't hold ->completion_lock. Clean them here to avoid
                  * deadlocks.
                  */
                 io_put_kbuf_comp(req);
                 io_dismantle_req(req);
                 io_put_task(req->task, 1);
                 ...

that causes io_ring_exit_work() to wait for it.
It would be great if you or someone else could explain this in detail
and maybe add some comments into the code.

metze


^ permalink raw reply

* Re: [PATCH] virtio_net: fix endian-ness for RSS
From: Andrew Melnichenko @ 2022-08-15 11:12 UTC (permalink / raw)
  To: patchwork-bot+netdevbpf
  Cc: Michael S. Tsirkin, linux-kernel, jasowang, davem, edumazet, kuba,
	pabeni, virtualization, netdev, Yan Vugenfirer, Yuri Benditovich
In-Reply-To: <166030021657.10916.8438944707929097441.git-patchwork-notify@kernel.org>

Reviewed-by: Andrew Melnychenko <andrew@daynix.com>

On Fri, Aug 12, 2022 at 1:30 PM <patchwork-bot+netdevbpf@kernel.org> wrote:
>
> Hello:
>
> This patch was applied to netdev/net.git (master)
> by David S. Miller <davem@davemloft.net>:
>
> On Thu, 11 Aug 2022 08:51:58 -0400 you wrote:
> > Using native endian-ness for device supplied fields is wrong
> > on BE platforms. Sparse warns about this.
> >
> > Fixes: 91f41f01d219 ("drivers/net/virtio_net: Added RSS hash report.")
> > Cc: "Andrew Melnychenko" <andrew@daynix.com>
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >
> > [...]
>
> Here is the summary with links:
>   - virtio_net: fix endian-ness for RSS
>     https://git.kernel.org/netdev/net/c/95bb633048fa
>
> You are awesome, thank you!
> --
> Deet-doot-dot, I am a bot.
> https://korg.docs.kernel.org/patchwork/pwbot.html
>
>

^ permalink raw reply

* Re: [PATCH v1 0/3] Bring back driver_deferred_probe_check_state() for now
From: Tony Lindgren @ 2022-08-15 11:01 UTC (permalink / raw)
  To: Saravana Kannan
  Cc: Greg Kroah-Hartman, Rafael J. Wysocki, Kevin Hilman, Ulf Hansson,
	Pavel Machek, Len Brown, Andrew Lunn, Heiner Kallweit,
	Russell King, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, naresh.kamboju, kernel-team, linux-kernel, linux-pm,
	netdev
In-Reply-To: <20220727185012.3255200-1-saravanak@google.com>

* Saravana Kannan <saravanak@google.com> [700101 02:00]:
> More fixes/changes are needed before driver_deferred_probe_check_state()
> can be deleted. So, bring it back for now.
> 
> Greg,
> 
> Can we get this into 5.19? If not, it might not be worth picking up this
> series. I could just do the other/more fixes in time for 5.20.

Yes please pick this as fixes for v6.0-rc series, it fixes booting for
me. I've replied with fixes tags for the two patches that were causing
regressions for me.

Regards,

Tony

^ permalink raw reply

* Re: [PATCH v1 3/3] Revert "PM: domains: Delete usage of driver_deferred_probe_check_state()"
From: Tony Lindgren @ 2022-08-15 11:00 UTC (permalink / raw)
  To: Saravana Kannan
  Cc: Greg Kroah-Hartman, Rafael J. Wysocki, Kevin Hilman, Ulf Hansson,
	Pavel Machek, Len Brown, Andrew Lunn, Heiner Kallweit,
	Russell King, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, naresh.kamboju, kernel-team, linux-kernel, linux-pm,
	netdev
In-Reply-To: <20220727185012.3255200-4-saravanak@google.com>

* Saravana Kannan <saravanak@google.com> [700101 02:00]:
> This reverts commit 5a46079a96451cfb15e4f5f01f73f7ba24ef851a.
> 
> There are a few more issues to fix that have been reported in the thread
> for the original series [1]. We'll need to fix those before this will
> work. So, revert it for now.

This fixes booting for several TI 32-bit ARM SoCs such as am335x and dra7.

Please add a proper fixes tag for this patch though:

Fixes: 5a46079a9645 ("PM: domains: Delete usage of driver_deferred_probe_check_state()")

Reviewed-by: Tony Lindgren <tony@atomide.com>
Tested-by: Tony Lindgren <tony@atomide.com>

^ permalink raw reply

* Re: [PATCH v1 2/3] Revert "net: mdio: Delete usage of driver_deferred_probe_check_state()"
From: Tony Lindgren @ 2022-08-15 11:02 UTC (permalink / raw)
  To: Saravana Kannan
  Cc: Greg Kroah-Hartman, Rafael J. Wysocki, Kevin Hilman, Ulf Hansson,
	Pavel Machek, Len Brown, Andrew Lunn, Heiner Kallweit,
	Russell King, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, naresh.kamboju, kernel-team, linux-kernel, linux-pm,
	netdev
In-Reply-To: <20220727185012.3255200-3-saravanak@google.com>

* Saravana Kannan <saravanak@google.com> [700101 02:00]:
> This reverts commit f8217275b57aa48d98cc42051c2aac34152718d6.
> 
> There are a few more issues to fix that have been reported in the thread
> for the original series [1]. We'll need to fix those before this will
> work. So, revert it for now.

Reviewed-by: Tony Lindgren <tony@atomide.com>

^ permalink raw reply

* Re: [PATCH net 1/1] net_sched: cls_route: disallow handle of 0
From: patchwork-bot+netdevbpf @ 2022-08-15 11:00 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, edumazet, kuba, pabeni, netdev, xiyou.wangcong, jiri,
	kuznet, cascardo, linux-distros, security, stephen, dsahern,
	gregkh
In-Reply-To: <20220814112758.3088655-1-jhs@mojatatu.com>

Hello:

This patch was applied to netdev/net.git (master)
by David S. Miller <davem@davemloft.net>:

On Sun, 14 Aug 2022 11:27:58 +0000 you wrote:
> Follows up on:
> https://lore.kernel.org/all/20220809170518.164662-1-cascardo@canonical.com/
> 
> handle of 0 implies from/to of universe realm which is not very
> sensible.
> 
> Let's see what this patch will do:
> $sudo tc qdisc add dev $DEV root handle 1:0 prio
> 
> [...]

Here is the summary with links:
  - [net,1/1] net_sched: cls_route: disallow handle of 0
    https://git.kernel.org/netdev/net/c/02799571714d

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net 0/4] mlxsw: Fixes for PTP support
From: patchwork-bot+netdevbpf @ 2022-08-15 11:00 UTC (permalink / raw)
  To: Petr Machata
  Cc: davem, edumazet, kuba, pabeni, netdev, idosch, danieller, amcohen,
	richardcochran, mlxsw
In-Reply-To: <cover.1660315448.git.petrm@nvidia.com>

Hello:

This series was applied to netdev/net.git (master)
by David S. Miller <davem@davemloft.net>:

On Fri, 12 Aug 2022 17:31:59 +0200 you wrote:
> This set fixes several issues in mlxsw PTP code.
> 
> - Patch #1 fixes compilation warnings.
> 
> - Patch #2 adjusts the order of operation during cleanup, thereby
>   closing the window after PTP state was already cleaned in the ASIC
>   for the given port, but before the port is removed, when the user
>   could still in theory make changes to the configuration.
> 
> [...]

Here is the summary with links:
  - [net,1/4] mlxsw: spectrum_ptp: Fix compilation warnings
    https://git.kernel.org/netdev/net/c/12e091389b29
  - [net,2/4] mlxsw: spectrum: Clear PTP configuration after unregistering the netdevice
    https://git.kernel.org/netdev/net/c/a159e986ad26
  - [net,3/4] mlxsw: spectrum_ptp: Protect PTP configuration with a mutex
    https://git.kernel.org/netdev/net/c/d72fdef21f07
  - [net,4/4] mlxsw: spectrum_ptp: Forbid PTP enablement only in RX or in TX
    https://git.kernel.org/netdev/net/c/e01885c31bef

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* [syzbot] WARNING: suspicious RCU usage in bpf_sk_reuseport_detach
From: syzbot @ 2022-08-15 10:59 UTC (permalink / raw)
  To: bpf, davem, ecree.xilinx, edumazet, habetsm.xilinx, kuba,
	linux-kernel, netdev, pabeni, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    94ce3b64c62d net/tls: Use RCU API to access tls_ctx->netdev
git tree:       net
console+strace: https://syzkaller.appspot.com/x/log.txt?x=14641e15080000
kernel config:  https://syzkaller.appspot.com/x/.config?x=53da55f2bdeb0d4c
dashboard link: https://syzkaller.appspot.com/bug?extid=24bcff6e82ce253f23ec
compiler:       gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=106c89fd080000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=17ead885080000

The issue was bisected to:

commit f72c38fad234759fe943cb2e40bf3d0f7de1d4d9
Author: Edward Cree <ecree.xilinx@gmail.com>
Date:   Wed Jul 20 18:33:48 2022 +0000

    sfc: hook up ef100 representor TX

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=125bf9fd080000
final oops:     https://syzkaller.appspot.com/x/report.txt?x=115bf9fd080000
console output: https://syzkaller.appspot.com/x/log.txt?x=165bf9fd080000

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+24bcff6e82ce253f23ec@syzkaller.appspotmail.com
Fixes: f72c38fad234 ("sfc: hook up ef100 representor TX")

=============================
WARNING: suspicious RCU usage
5.19.0-syzkaller-05408-g94ce3b64c62d #0 Not tainted
-----------------------------
include/net/sock.h:592 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
4 locks held by syz-executor334/3611:
 #0: ffff888073b7be10 (&sb->s_type->i_mutex_key#10){+.+.}-{3:3}, at: inode_lock include/linux/fs.h:760 [inline]
 #0: ffff888073b7be10 (&sb->s_type->i_mutex_key#10){+.+.}-{3:3}, at: __sock_release+0x86/0x280 net/socket.c:649
 #1: ffffc900014e5c28 (&table->hash[i].lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:354 [inline]
 #1: ffffc900014e5c28 (&table->hash[i].lock){+...}-{2:2}, at: udp_lib_unhash net/ipv4/udp.c:2014 [inline]
 #1: ffffc900014e5c28 (&table->hash[i].lock){+...}-{2:2}, at: udp_lib_unhash+0x1d5/0x730 net/ipv4/udp.c:2004
 #2: ffffffff8d7a9a78 (reuseport_lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:354 [inline]
 #2: ffffffff8d7a9a78 (reuseport_lock){+...}-{2:2}, at: reuseport_detach_sock+0x22/0x4a0 net/core/sock_reuseport.c:346
 #3: ffff888145f9a0b8 (clock-AF_INET){++..}-{2:2}, at: bpf_sk_reuseport_detach+0x26/0x190 kernel/bpf/reuseport_array.c:26

stack backtrace:
CPU: 1 PID: 3611 Comm: syz-executor334 Not tainted 5.19.0-syzkaller-05408-g94ce3b64c62d #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
 __rcu_dereference_sk_user_data_with_flags include/net/sock.h:592 [inline]
 bpf_sk_reuseport_detach+0x156/0x190 kernel/bpf/reuseport_array.c:27
 reuseport_detach_sock+0x8c/0x4a0 net/core/sock_reuseport.c:362
 udp_lib_unhash net/ipv4/udp.c:2016 [inline]
 udp_lib_unhash+0x210/0x730 net/ipv4/udp.c:2004
 sk_common_release+0xba/0x390 net/core/sock.c:3600
 inet_release+0x12e/0x280 net/ipv4/af_inet.c:428
 __sock_release+0xcd/0x280 net/socket.c:650
 sock_close+0x18/0x20 net/socket.c:1365
 __fput+0x277/0x9d0 fs/file_table.c:320
 task_work_run+0xdd/0x1a0 kernel/task_work.c:177
 exit_task_work include/linux/task_work.h:38 [inline]
 do_exit+0xade/0x29d0 kernel/exit.c:795
 do_group_exit+0xd2/0x2f0 kernel/exit.c:925
 __do_sys_exit_group kernel/exit.c:936 [inline]
 __se_sys_exit_group kernel/exit.c:934 [inline]
 __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:934
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fe407d09699
Code: Unable to access opcode bytes at RIP 0x7fe407d0966f.
RSP: 002b:00007ffc0ff152a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7

---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
syzbot can test patches for this issue, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH v1 1/3] Revert "driver core: Delete driver_deferred_probe_check_state()"
From: Tony Lindgren @ 2022-08-15 10:58 UTC (permalink / raw)
  To: Saravana Kannan
  Cc: Greg Kroah-Hartman, Rafael J. Wysocki, Kevin Hilman, Ulf Hansson,
	Pavel Machek, Len Brown, Andrew Lunn, Heiner Kallweit,
	Russell King, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, naresh.kamboju, kernel-team, linux-kernel, linux-pm,
	netdev
In-Reply-To: <20220727185012.3255200-2-saravanak@google.com>

* Saravana Kannan <saravanak@google.com> [700101 02:00]:
> This reverts commit 9cbffc7a59561be950ecc675d19a3d2b45202b2b.
> 
> There are a few more issues to fix that have been reported in the thread
> for the original series [1]. We'll need to fix those before this will
> work. So, revert it for now.

This fixes booting for several TI 32-bit ARM SoCs such as am335x and dra7.

Please add a proper fixes tag for this patch though:

Fixes: 9cbffc7a5956 ("driver core: Delete driver_deferred_probe_check_state()")

Reviewed-by: Tony Lindgren <tony@atomide.com>
Tested-by: Tony Lindgren <tony@atomide.com>

^ permalink raw reply

* Re: [PATCH] net: fix potential refcount leak in ndisc_router_discovery()
From: patchwork-bot+netdevbpf @ 2022-08-15 10:50 UTC (permalink / raw)
  To: Xin Xiong
  Cc: davem, yoshfuji, dsahern, edumazet, kuba, pabeni, praveen5582,
	zxu, netdev, linux-kernel, yuanxzhang, tanxin.ctf
In-Reply-To: <20220813124907.3396-1-xiongx18@fudan.edu.cn>

Hello:

This patch was applied to netdev/net.git (master)
by David S. Miller <davem@davemloft.net>:

On Sat, 13 Aug 2022 20:49:08 +0800 you wrote:
> The issue happens on specific paths in the function. After both the
> object `rt` and `neigh` are grabbed successfully, when `lifetime` is
> nonzero but the metric needs change, the function just deletes the
> route and set `rt` to NULL. Then, it may try grabbing `rt` and `neigh`
> again if above conditions hold. The function simply overwrite `neigh`
> if succeeds or returns if fails, without decreasing the reference
> count of previous `neigh`. This may result in memory leaks.
> 
> [...]

Here is the summary with links:
  - net: fix potential refcount leak in ndisc_router_discovery()
    https://git.kernel.org/netdev/net/c/7396ba87f1ed

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH v2 1/2] neigh: fix possible DoS due to net iface start/stop loop
From: Denis V. Lunev @ 2022-08-15 10:47 UTC (permalink / raw)
  To: Christian Brauner, Alexander Mikhalitsyn
  Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Daniel Borkmann, David Ahern, Yajun Deng,
	Roopa Prabhu, linux-kernel, Alexey Kuznetsov, Konstantin Khorenko,
	kernel, devel
In-Reply-To: <20220815094432.tdqdfh3pwcfekegg@wittgenstein>

On 15.08.2022 11:44, Christian Brauner wrote:
> On Wed, Aug 10, 2022 at 07:08:39PM +0300, Alexander Mikhalitsyn wrote:
>> From: "Denis V. Lunev" <den@openvz.org>
>>
>> Normal processing of ARP request (usually this is Ethernet broadcast
>> packet) coming to the host is looking like the following:
>> * the packet comes to arp_process() call and is passed through routing
>>    procedure
>> * the request is put into the queue using pneigh_enqueue() if
>>    corresponding ARP record is not local (common case for container
>>    records on the host)
>> * the request is processed by timer (within 80 jiffies by default) and
>>    ARP reply is sent from the same arp_process() using
>>    NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED condition (flag is set inside
>>    pneigh_enqueue())
>>
>> And here the problem comes. Linux kernel calls pneigh_queue_purge()
>> which destroys the whole queue of ARP requests on ANY network interface
>> start/stop event through __neigh_ifdown().
>>
>> This is actually not a problem within the original world as network
>> interface start/stop was accessible to the host 'root' only, which
>> could do more destructive things. But the world is changed and there
>> are Linux containers available. Here container 'root' has an access
>> to this API and could be considered as untrusted user in the hosting
>> (container's) world.
>>
>> Thus there is an attack vector to other containers on node when
>> container's root will endlessly start/stop interfaces. We have observed
>> similar situation on a real production node when docker container was
>> doing such activity and thus other containers on the node become not
>> accessible.
>>
>> The patch proposed doing very simple thing. It drops only packets from
>> the same namespace in the pneigh_queue_purge() where network interface
>> state change is detected. This is enough to prevent the problem for the
>> whole node preserving original semantics of the code.
> This is how I'd do it as well.
>
>> v2:
>> 	- do del_timer_sync() if queue is empty after pneigh_queue_purge()
>>
>> Cc: "David S. Miller" <davem@davemloft.net>
>> Cc: Eric Dumazet <edumazet@google.com>
>> Cc: Jakub Kicinski <kuba@kernel.org>
>> Cc: Paolo Abeni <pabeni@redhat.com>
>> Cc: Daniel Borkmann <daniel@iogearbox.net>
>> Cc: David Ahern <dsahern@kernel.org>
>> Cc: Yajun Deng <yajun.deng@linux.dev>
>> Cc: Roopa Prabhu <roopa@nvidia.com>
>> Cc: Christian Brauner <brauner@kernel.org>
>> Cc: netdev@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org
>> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
>> Cc: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
>> Cc: Konstantin Khorenko <khorenko@virtuozzo.com>
>> Cc: kernel@openvz.org
>> Cc: devel@openvz.org
>> Investigated-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>> ---
>>   net/core/neighbour.c | 25 +++++++++++++++++--------
>>   1 file changed, 17 insertions(+), 8 deletions(-)
>>
>> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
>> index 54625287ee5b..19d99d1eff53 100644
>> --- a/net/core/neighbour.c
>> +++ b/net/core/neighbour.c
>> @@ -307,14 +307,23 @@ static int neigh_del_timer(struct neighbour *n)
>>   	return 0;
>>   }
>>   
>> -static void pneigh_queue_purge(struct sk_buff_head *list)
>> +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net)
>>   {
>> +	unsigned long flags;
>>   	struct sk_buff *skb;
>>   
>> -	while ((skb = skb_dequeue(list)) != NULL) {
>> -		dev_put(skb->dev);
>> -		kfree_skb(skb);
>> +	spin_lock_irqsave(&list->lock, flags);
> I'm a bit surprised to see a spinlock held around a while loop walking a
> linked list but that seems to be quite common in this file. I take it
> the lists are guaranteed to be short.
Within the current code the size of the list is 64 packets at most
(same spinlock is held during packets processing).

Though this semantics is changed in the next patch from
Alexander, where we will get 64 packets/interface.

Den

>> +	skb = skb_peek(list);
>> +	while (skb != NULL) {
>> +		struct sk_buff *skb_next = skb_peek_next(skb, list);
>> +		if (net == NULL || net_eq(dev_net(skb->dev), net)) {
>> +			__skb_unlink(skb, list);
>> +			dev_put(skb->dev);
>> +			kfree_skb(skb);
>> +		}
>> +		skb = skb_next;
>>   	}
>> +	spin_unlock_irqrestore(&list->lock, flags);
>>   }
>>   
>>   static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
>> @@ -385,9 +394,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
>>   	write_lock_bh(&tbl->lock);
>>   	neigh_flush_dev(tbl, dev, skip_perm);
>>   	pneigh_ifdown_and_unlock(tbl, dev);
>> -
>> -	del_timer_sync(&tbl->proxy_timer);
>> -	pneigh_queue_purge(&tbl->proxy_queue);
>> +	pneigh_queue_purge(&tbl->proxy_queue, dev_net(dev));
>> +	if (skb_queue_empty_lockless(&tbl->proxy_queue))
>> +		del_timer_sync(&tbl->proxy_timer);
>>   	return 0;
>>   }
>>   
>> @@ -1787,7 +1796,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl)
>>   	cancel_delayed_work_sync(&tbl->managed_work);
>>   	cancel_delayed_work_sync(&tbl->gc_work);
>>   	del_timer_sync(&tbl->proxy_timer);
>> -	pneigh_queue_purge(&tbl->proxy_queue);
>> +	pneigh_queue_purge(&tbl->proxy_queue, NULL);
>>   	neigh_ifdown(tbl, NULL);
>>   	if (atomic_read(&tbl->entries))
>>   		pr_crit("neighbour leakage\n");
>> -- 
>> 2.36.1
>>


^ permalink raw reply

* Re: [PATCH v2 1/1] net: qrtr: start MHI channel after endpoit creation
From: patchwork-bot+netdevbpf @ 2022-08-15 10:40 UTC (permalink / raw)
  To: Maxim Kochetkov
  Cc: netdev, davem, edumazet, kuba, pabeni, linux-arm-msm,
	quic_hemantk, mani
In-Reply-To: <20220811094840.1654088-1-fido_max@inbox.ru>

Hello:

This patch was applied to netdev/net.git (master)
by David S. Miller <davem@davemloft.net>:

On Thu, 11 Aug 2022 12:48:40 +0300 you wrote:
> MHI channel may generates event/interrupt right after enabling.
> It may leads to 2 race conditions issues.
> 
> 1)
> Such event may be dropped by qcom_mhi_qrtr_dl_callback() at check:
> 
> 	if (!qdev || mhi_res->transaction_status)
> 		return;
> 
> [...]

Here is the summary with links:
  - [v2,1/1] net: qrtr: start MHI channel after endpoit creation
    https://git.kernel.org/netdev/net/c/68a838b84eff

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH v3 0/2] neighbour: fix possible DoS due to net iface start/stop loop
From: patchwork-bot+netdevbpf @ 2022-08-15 10:40 UTC (permalink / raw)
  To: Alexander Mikhalitsyn
  Cc: netdev, davem, edumazet, kuba, pabeni, daniel, dsahern,
	yajun.deng, roopa, brauner, linux-kernel, den, kuznet, khorenko,
	ptikhomirov, andrey.zhadchenko, alexander, kernel, devel
In-Reply-To: <20220811152012.319641-1-alexander.mikhalitsyn@virtuozzo.com>

Hello:

This series was applied to netdev/net.git (master)
by David S. Miller <davem@davemloft.net>:

On Thu, 11 Aug 2022 18:20:10 +0300 you wrote:
> Dear friends,
> 
> Recently one of OpenVZ users reported that they have issues with network
> availability of some containers. It was discovered that the reason is absence
> of ARP replies from the Host Node on the requests about container IPs.
> 
> Of course, we started from tcpdump analysis and noticed that ARP requests
> successfully come to the problematic node external interface. So, something
> was wrong from the kernel side.
> 
> [...]

Here is the summary with links:
  - [v3,1/2] neigh: fix possible DoS due to net iface start/stop loop
    https://git.kernel.org/netdev/net/c/66ba215cb513
  - [v3,2/2] neighbour: make proxy_queue.qlen limit per-device
    https://git.kernel.org/netdev/net/c/0ff4eb3d5ebb

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net 0/2][pull request] Intel Wired LAN Driver Updates 2022-08-11 (ice)
From: patchwork-bot+netdevbpf @ 2022-08-15 10:40 UTC (permalink / raw)
  To: Tony Nguyen; +Cc: davem, kuba, pabeni, edumazet, netdev
In-Reply-To: <20220811161714.305094-1-anthony.l.nguyen@intel.com>

Hello:

This series was applied to netdev/net.git (master)
by Tony Nguyen <anthony.l.nguyen@intel.com>:

On Thu, 11 Aug 2022 09:17:12 -0700 you wrote:
> This series contains updates to ice driver only.
> 
> Benjamin corrects a misplaced parenthesis for a WARN_ON check.
> 
> Michal removes WARN_ON from a check as its recoverable and not
> warranting of a call trace.
> 
> [...]

Here is the summary with links:
  - [net,1/2] ice: Fix VSI rebuild WARN_ON check for VF
    https://git.kernel.org/netdev/net/c/7fe05e125d5f
  - [net,2/2] ice: Fix call trace with null VSI during VF reset
    https://git.kernel.org/netdev/net/c/cf90b74341ee

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* [PATCH] netfilter: conntrack: NF_CONNTRACK_PROCFS should no longer default to y
From: Geert Uytterhoeven @ 2022-08-15 10:39 UTC (permalink / raw)
  To: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jan Engelhardt
  Cc: netfilter-devel, coreteam, netdev, linux-kernel,
	Geert Uytterhoeven

NF_CONNTRACK_PROCFS was marked obsolete in commit 54b07dca68557b09
("netfilter: provide config option to disable ancient procfs parts") in
v3.3.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 net/netfilter/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 22f15ebf6045b3a9..4b8d04640ff32274 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -144,7 +144,6 @@ config NF_CONNTRACK_ZONES
 
 config NF_CONNTRACK_PROCFS
 	bool "Supply CT list in procfs (OBSOLETE)"
-	default y
 	depends on PROC_FS
 	help
 	This option enables for the list of known conntrack entries
-- 
2.25.1


^ permalink raw reply related

* Re: [RFC net-next v3 23/29] io_uring: allow to pass addr into sendzc
From: Pavel Begunkov @ 2022-08-15  9:46 UTC (permalink / raw)
  To: Stefan Metzmacher, io-uring, netdev, linux-kernel
  Cc: David S . Miller, Jakub Kicinski, Jonathan Lemon,
	Willem de Bruijn, Jens Axboe, kernel-team
In-Reply-To: <4eb0adae-660a-3582-df27-d6c254b97adb@samba.org>

On 8/13/22 09:45, Stefan Metzmacher wrote:
> Hi Pavel,

Hi Stefan,

Thanks for giving a thought about the API, are you trying
to use it in samba?

>>> Given that this fills in msg almost completely can we also have
>>> a version of SENDMSGZC, it would be very useful to also allow
>>> msg_control to be passed and as well as an iovec.
>>>
>>> Would that be possible?
>>
>> Right, I left it to follow ups as the series is already too long.
>>
>> fwiw, I'm going to also add addr to IORING_OP_SEND.
> 
> 
> Given the minimal differences, which were left between
> IORING_OP_SENDZC and IORING_OP_SEND, wouldn't it be better
> to merge things to IORING_OP_SEND using a IORING_RECVSEND_ZC_NOTIF
> as indication to use the notif slot.

And will be even more similar in for-next, but with notifications
I'd still prefer different opcodes to get a little bit more
flexibility and not making the normal io_uring send path messier.

> It would means we don't need to waste two opcodes for
> IORING_OP_SENDZC and IORING_OP_SENDMSGZC (and maybe more)
> 
> 
> I also noticed a problem in io_notif_update()
> 
>          for (; idx < idx_end; idx++) {
>                  struct io_notif_slot *slot = &ctx->notif_slots[idx];
> 
>                  if (!slot->notif)
>                          continue;
>                  if (up->arg)
>                          slot->tag = up->arg;
>                  io_notif_slot_flush_submit(slot, issue_flags);
>          }
> 
>   slot->tag = up->arg is skipped if there is no notif already.
> 
> So you can't just use a 2 linked sqe's with
> 
> IORING_RSRC_UPDATE_NOTIF followed by IORING_OP_SENDZC(with IORING_RECVSEND_NOTIF_FLUSH)

slot->notif is lazily initialised with the first send attached to it,
so in your example IORING_OP_SENDZC will first create a notification
to execute the send and then will flush it.

This "if" is there only to have a more reliable API. We could
go over the range, allocate all empty slots and then flush
all of them, but then allocation failures would have to be propagated
to userspace, whereas currently the function can't fail.

> I think the if (!slot->notif) should be moved down a bit.

Not sure what you mean

> It would somehow be nice to avoid the notif slots at all and somehow
> use some kind of multishot request in order to generate two qces.

It is there first to amortise the overhead of zerocopy infra and bits
for second CQE posting. But more importantly, without it for TCP
the send payload size would need to be large enough or performance
would suffer, but all depends on the use case. TL;DR; it would be
forced to create a new SKB for each new send.

For something simpler, I'll push another zc variant that doesn't
have notifiers and posts only one CQE and only after the buffers
are no more in use by the kernel. This works well for UDP and for
some TCP scenarios, but doesn't cover all cases.
> I'm also wondering what will happen if a notif will be referenced by the net layer
> but the io_uring instance is already closed, wouldn't
> io_uring_tx_zerocopy_callback() or __io_notif_complete_tw() crash
> because notif->ctx is a stale pointer, or notif itself is already gone...

io_uring will flush all slots and wait for all notifications
to fire, i.e. io_uring_tx_zerocopy_callback(), so it's not a
problem.

-- 
Pavel Begunkov

^ permalink raw reply

* Re: [PATCH v2 1/2] neigh: fix possible DoS due to net iface start/stop loop
From: Christian Brauner @ 2022-08-15  9:44 UTC (permalink / raw)
  To: Alexander Mikhalitsyn
  Cc: netdev, Denis V. Lunev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Daniel Borkmann, David Ahern,
	Yajun Deng, Roopa Prabhu, linux-kernel, Alexey Kuznetsov,
	Konstantin Khorenko, kernel, devel
In-Reply-To: <20220810160840.311628-2-alexander.mikhalitsyn@virtuozzo.com>

On Wed, Aug 10, 2022 at 07:08:39PM +0300, Alexander Mikhalitsyn wrote:
> From: "Denis V. Lunev" <den@openvz.org>
> 
> Normal processing of ARP request (usually this is Ethernet broadcast
> packet) coming to the host is looking like the following:
> * the packet comes to arp_process() call and is passed through routing
>   procedure
> * the request is put into the queue using pneigh_enqueue() if
>   corresponding ARP record is not local (common case for container
>   records on the host)
> * the request is processed by timer (within 80 jiffies by default) and
>   ARP reply is sent from the same arp_process() using
>   NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED condition (flag is set inside
>   pneigh_enqueue())
> 
> And here the problem comes. Linux kernel calls pneigh_queue_purge()
> which destroys the whole queue of ARP requests on ANY network interface
> start/stop event through __neigh_ifdown().
> 
> This is actually not a problem within the original world as network
> interface start/stop was accessible to the host 'root' only, which
> could do more destructive things. But the world is changed and there
> are Linux containers available. Here container 'root' has an access
> to this API and could be considered as untrusted user in the hosting
> (container's) world.
> 
> Thus there is an attack vector to other containers on node when
> container's root will endlessly start/stop interfaces. We have observed
> similar situation on a real production node when docker container was
> doing such activity and thus other containers on the node become not
> accessible.
> 
> The patch proposed doing very simple thing. It drops only packets from
> the same namespace in the pneigh_queue_purge() where network interface
> state change is detected. This is enough to prevent the problem for the
> whole node preserving original semantics of the code.

This is how I'd do it as well.

> 
> v2:
> 	- do del_timer_sync() if queue is empty after pneigh_queue_purge()
> 
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Jakub Kicinski <kuba@kernel.org>
> Cc: Paolo Abeni <pabeni@redhat.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: David Ahern <dsahern@kernel.org>
> Cc: Yajun Deng <yajun.deng@linux.dev>
> Cc: Roopa Prabhu <roopa@nvidia.com>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: netdev@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
> Cc: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
> Cc: Konstantin Khorenko <khorenko@virtuozzo.com>
> Cc: kernel@openvz.org
> Cc: devel@openvz.org
> Investigated-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
> Signed-off-by: Denis V. Lunev <den@openvz.org>
> ---
>  net/core/neighbour.c | 25 +++++++++++++++++--------
>  1 file changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
> index 54625287ee5b..19d99d1eff53 100644
> --- a/net/core/neighbour.c
> +++ b/net/core/neighbour.c
> @@ -307,14 +307,23 @@ static int neigh_del_timer(struct neighbour *n)
>  	return 0;
>  }
>  
> -static void pneigh_queue_purge(struct sk_buff_head *list)
> +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net)
>  {
> +	unsigned long flags;
>  	struct sk_buff *skb;
>  
> -	while ((skb = skb_dequeue(list)) != NULL) {
> -		dev_put(skb->dev);
> -		kfree_skb(skb);
> +	spin_lock_irqsave(&list->lock, flags);

I'm a bit surprised to see a spinlock held around a while loop walking a
linked list but that seems to be quite common in this file. I take it
the lists are guaranteed to be short.

> +	skb = skb_peek(list);
> +	while (skb != NULL) {
> +		struct sk_buff *skb_next = skb_peek_next(skb, list);
> +		if (net == NULL || net_eq(dev_net(skb->dev), net)) {
> +			__skb_unlink(skb, list);
> +			dev_put(skb->dev);
> +			kfree_skb(skb);
> +		}
> +		skb = skb_next;
>  	}
> +	spin_unlock_irqrestore(&list->lock, flags);
>  }
>  
>  static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
> @@ -385,9 +394,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
>  	write_lock_bh(&tbl->lock);
>  	neigh_flush_dev(tbl, dev, skip_perm);
>  	pneigh_ifdown_and_unlock(tbl, dev);
> -
> -	del_timer_sync(&tbl->proxy_timer);
> -	pneigh_queue_purge(&tbl->proxy_queue);
> +	pneigh_queue_purge(&tbl->proxy_queue, dev_net(dev));
> +	if (skb_queue_empty_lockless(&tbl->proxy_queue))
> +		del_timer_sync(&tbl->proxy_timer);
>  	return 0;
>  }
>  
> @@ -1787,7 +1796,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl)
>  	cancel_delayed_work_sync(&tbl->managed_work);
>  	cancel_delayed_work_sync(&tbl->gc_work);
>  	del_timer_sync(&tbl->proxy_timer);
> -	pneigh_queue_purge(&tbl->proxy_queue);
> +	pneigh_queue_purge(&tbl->proxy_queue, NULL);
>  	neigh_ifdown(tbl, NULL);
>  	if (atomic_read(&tbl->entries))
>  		pr_crit("neighbour leakage\n");
> -- 
> 2.36.1
> 

^ permalink raw reply

* Re: [PATCH V5 0/6] ifcvf/vDPA: support query device config space through netlink
From: Zhu, Lingshan @ 2022-08-15  9:36 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, virtualization, netdev, kvm, parav, xieyongji,
	gautam.dawar
In-Reply-To: <d07dc70e-e97b-9b9e-3ef2-c3f648c57a05@intel.com>



On 8/12/2022 7:41 PM, Zhu, Lingshan wrote:
>
>
> On 8/12/2022 7:17 PM, Michael S. Tsirkin wrote:
>> On Fri, Aug 12, 2022 at 07:14:39AM -0400, Michael S. Tsirkin wrote:
>>> On Fri, Aug 12, 2022 at 06:44:54PM +0800, Zhu Lingshan wrote:
>>>> This series allows userspace to query device config space of vDPA
>>>> devices and the management devices through netlink,
>>>> to get multi-queue, feature bits and etc.
>>>>
>>>> This series has introduced a new netlink attr
>>>> VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES, this should be used to query
>>>> features of vDPA  devices than the management device.
>>>>
>>>> Please help review.
>>> I can't merge this for this merge window.
>>> Am I right when I say that the new thing here is patch 5/6 + new
>>> comments?
>>> If yes I can queue it out of the window, on top.
>> So at this point, can you please send patches on top of the vhost
>> tree? I think these are just patches 3 and 5 but please confirm.
> I will rebase them on vhost tree and resend them soon, main changes 
> are in patch 5,
> we have made MTU, MAC, MQ conditional there. And there are some new 
> comments as
> you suggested.
Hi Michael,

I have rebased patch 3/6 and 5/6, they can apply on both vhost tree
and Linus tree, the new series including these two patches are sent out.

Thanks,
Zhu Lingshan
>
>
> Thanks,
> Zhu Lingshan
>>
>>
>>>> Thanks!
>>>> Zhu Lingshan
>>>>
>>>> Changes from V4:
>>>> (1) Read MAC, MTU, MQ conditionally (Michael)
>>>> (2) If VIRTIO_NET_F_MAC not set, don't report MAC to userspace
>>>> (3) If VIRTIO_NET_F_MTU not set, report 1500 to userspace
>>>> (4) Add comments to the new attr
>>>> VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES(Michael)
>>>> (5) Add comments for reporting the device status as LE(Michael)
>>>>
>>>> Changes from V3:
>>>> (1)drop the fixes tags(Parva)
>>>> (2)better commit log for patch 1/6(Michael)
>>>> (3)assign num_queues to max_supported_vqs than max_vq_pairs(Jason)
>>>> (4)initialize virtio pci capabilities in the probe() function.
>>>>
>>>> Changes from V2:
>>>> Add fixes tags(Parva)
>>>>
>>>> Changes from V1:
>>>> (1) Use __virtio16_to_cpu(true, xxx) for the le16 casting(Jason)
>>>> (2) Add a comment in ifcvf_get_config_size(), to explain
>>>> why we should return the minimum value of
>>>> sizeof(struct virtio_net_config) and the onboard
>>>> cap size(Jason)
>>>> (3) Introduced a new attr VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES
>>>> (4) Show the changes of iproute2 output before and after 5/6 
>>>> patch(Jason)
>>>> (5) Fix cast warning in vdpa_fill_stats_rec()
>>>>
>>>> Zhu Lingshan (6):
>>>>    vDPA/ifcvf: get_config_size should return a value no greater 
>>>> than dev
>>>>      implementation
>>>>    vDPA/ifcvf: support userspace to query features and MQ of a 
>>>> management
>>>>      device
>>>>    vDPA: allow userspace to query features of a vDPA device
>>>>    vDPA: !FEATURES_OK should not block querying device config space
>>>>    vDPA: Conditionally read fields in virtio-net dev config space
>>>>    fix 'cast to restricted le16' warnings in vdpa.c
>>>>
>>>>   drivers/vdpa/ifcvf/ifcvf_base.c |  13 ++-
>>>>   drivers/vdpa/ifcvf/ifcvf_base.h |   2 +
>>>>   drivers/vdpa/ifcvf/ifcvf_main.c | 142 
>>>> +++++++++++++++++---------------
>>>>   drivers/vdpa/vdpa.c             |  82 ++++++++++++------
>>>>   include/uapi/linux/vdpa.h       |   3 +
>>>>   5 files changed, 149 insertions(+), 93 deletions(-)
>>>>
>>>> -- 
>>>> 2.31.1
>


^ permalink raw reply

* [PATCH 2/2] vDPA: conditionally read fields in virtio-net dev
From: Zhu Lingshan @ 2022-08-15  9:26 UTC (permalink / raw)
  To: jasowang, mst
  Cc: virtualization, netdev, kvm, parav, xieyongji, gautam.dawar,
	Zhu Lingshan
In-Reply-To: <20220815092638.504528-1-lingshan.zhu@intel.com>

Some fields of virtio-net device config space are
conditional on the feature bits, the spec says:

"The mac address field always exists
(though is only valid if VIRTIO_NET_F_MAC is set)"

"max_virtqueue_pairs only exists if VIRTIO_NET_F_MQ
or VIRTIO_NET_F_RSS is set"

"mtu only exists if VIRTIO_NET_F_MTU is set"

so we should read MTU, MAC and MQ in the device config
space only when these feature bits are offered.

For MQ, if both VIRTIO_NET_F_MQ and VIRTIO_NET_F_RSS are
not set, the virtio device should have
one queue pair as default value, so when userspace queries the number of
queue pairs, it should return mq=1 rather than zero.

For MTU, if VIRTIO_NET_F_MTU is not set, we should not read
MTU from the device config space.
RFC894 <A Standard for the Transmission of IP Datagrams over Ethernet Networks>
says:"The maximum length of the data field of a packet sent over an
Ethernet is 1500 octets, thus the maximum length of an IP datagram
sent over an Ethernet is 1500 octets.  Implementations are encouraged
to support full-length packets"

virtio spec says:"The virtio network device is a virtual ethernet card",
so the default MTU value should be 1500 for virtio-net.

For MAC, the spec says:"If the VIRTIO_NET_F_MAC feature bit is set,
the configuration space mac entry indicates the “physical” address
of the network card, otherwise the driver would typically
generate a random local MAC address." So there is no
default MAC address if VIRTIO_NET_F_MAC not set.

This commit introduces functions vdpa_dev_net_mtu_config_fill()
and vdpa_dev_net_mac_config_fill() to fill MTU and MAC.
It also fixes vdpa_dev_net_mq_config_fill() to report correct
MQ when _F_MQ is not present.

These functions should check device features rather than driver
features, and struct vdpa_device is not needed as a parameter.

The test & userspace tool output:

Feature bits VIRTIO_NET_F_MTU, VIRTIO_NET_F_RSS, VIRTIO_NET_F_MQ
and VIRTIO_NET_F_MAC can be masked out by hardcoding.

However, it is challenging to "disable" the related fields
in the HW device config space, so let's just assume the values
are meaningless if the feature bits are not set.

Before this change, when feature bits for RSS, MQ, MTU and MAC
are not set, iproute2 output:
$vdpa vdpa0: mac 00:e8:ca:11:be:05 link up link_announce false mtu 1500
  negotiated_features

without this commit, function vdpa_dev_net_config_fill()
reads all config space fields unconditionally, so let's
assume the MAC and MTU are meaningless, and it checks
MQ with driver_features, so we don't see max_vq_pairs.

After applying this commit, when feature bits for
MQ, RSS, MAC and MTU are not set, iproute2 output:
$vdpa dev config show vdpa0
vdpa0: link up link_announce false max_vq_pairs 1 mtu 1500
  negotiated_features

As explained above:
There is no MAC, because VIRTIO_NET_F_MAC is not set,
and there is no default value for MAC. It shows
max_vq_pairs = 1 because even without MQ feature,
a functional virtio-net must have one queue pair.
mtu = 1500 is the default value as Ethernet
requires.

This commit also adds supplementary comments for
__virtio16_to_cpu(true, xxx) operations in
vdpa_dev_net_config_fill() and vdpa_fill_stats_rec()

Signed-off-by: Zhu Lingshan <lingshan.zhu@intel.com>
---
 drivers/vdpa/vdpa.c | 60 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 13 deletions(-)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index efb55a06e961..a74660b98979 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -801,19 +801,44 @@ static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callba
 	return msg->len;
 }

-static int vdpa_dev_net_mq_config_fill(struct vdpa_device *vdev,
-				       struct sk_buff *msg, u64 features,
+static int vdpa_dev_net_mq_config_fill(struct sk_buff *msg, u64 features,
 				       const struct virtio_net_config *config)
 {
 	u16 val_u16;

-	if ((features & BIT_ULL(VIRTIO_NET_F_MQ)) == 0)
-		return 0;
+	if ((features & BIT_ULL(VIRTIO_NET_F_MQ)) == 0 &&
+	    (features & BIT_ULL(VIRTIO_NET_F_RSS)) == 0)
+		val_u16 = 1;
+	else
+		val_u16 = __virtio16_to_cpu(true, config->max_virtqueue_pairs);

-	val_u16 = le16_to_cpu(config->max_virtqueue_pairs);
 	return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, val_u16);
 }

+static int vdpa_dev_net_mtu_config_fill(struct sk_buff *msg, u64 features,
+					const struct virtio_net_config *config)
+{
+	u16 val_u16;
+
+	if ((features & BIT_ULL(VIRTIO_NET_F_MTU)) == 0)
+		val_u16 = 1500;
+	else
+		val_u16 = __virtio16_to_cpu(true, config->mtu);
+
+	return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16);
+}
+
+static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features,
+					const struct virtio_net_config *config)
+{
+	if ((features & BIT_ULL(VIRTIO_NET_F_MAC)) == 0)
+		return 0;
+	else
+		return  nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR,
+				sizeof(config->mac), config->mac);
+}
+
+
 static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg)
 {
 	struct virtio_net_config config = {};
@@ -822,18 +847,16 @@ static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *ms

 	vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config));

-	if (nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, sizeof(config.mac),
-		    config.mac))
-		return -EMSGSIZE;
+	/*
+	 * Assume little endian for now, userspace can tweak this for
+	 * legacy guest support.
+	 */
+	val_u16 = __virtio16_to_cpu(true, config.status);

 	val_u16 = __virtio16_to_cpu(true, config.status);
 	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16))
 		return -EMSGSIZE;

-	val_u16 = __virtio16_to_cpu(true, config.mtu);
-	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16))
-		return -EMSGSIZE;
-
 	features_driver = vdev->config->get_driver_features(vdev);
 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver,
 			      VDPA_ATTR_PAD))
@@ -846,7 +869,13 @@ static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *ms
 			      VDPA_ATTR_PAD))
 		return -EMSGSIZE;

-	return vdpa_dev_net_mq_config_fill(vdev, msg, features_driver, &config);
+	if (vdpa_dev_net_mac_config_fill(msg, features_device, &config))
+		return -EMSGSIZE;
+
+	if (vdpa_dev_net_mtu_config_fill(msg, features_device, &config))
+		return -EMSGSIZE;
+
+	return vdpa_dev_net_mq_config_fill(msg, features_device, &config);
 }

 static int
@@ -914,6 +943,11 @@ static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg,
 	}
 	vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config));

+	/*
+	 * Assume little endian for now, userspace can tweak this for
+	 * legacy guest support.
+	 */
+
 	max_vqp = __virtio16_to_cpu(true, config.max_virtqueue_pairs);
 	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, max_vqp))
 		return -EMSGSIZE;
-- 
2.31.1

^ permalink raw reply related

* [PATCH 1/2] vDPA: allow userspace to query features of a vDPA device
From: Zhu Lingshan @ 2022-08-15  9:26 UTC (permalink / raw)
  To: jasowang, mst
  Cc: virtualization, netdev, kvm, parav, xieyongji, gautam.dawar,
	Zhu Lingshan
In-Reply-To: <20220815092638.504528-1-lingshan.zhu@intel.com>

This commit adds a new vDPA netlink attribute
VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES. Userspace can query
features of vDPA devices through this new attr.

Signed-off-by: Zhu Lingshan <lingshan.zhu@intel.com>
---
 drivers/vdpa/vdpa.c       | 17 +++++++++++++----
 include/uapi/linux/vdpa.h |  3 +++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index c06c02704461..efb55a06e961 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -491,6 +491,8 @@ static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *m
 		err = -EMSGSIZE;
 		goto msg_err;
 	}
+
+	/* report features of a vDPA management device through VDPA_ATTR_DEV_SUPPORTED_FEATURES */
 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_SUPPORTED_FEATURES,
 			      mdev->supported_features, VDPA_ATTR_PAD)) {
 		err = -EMSGSIZE;
@@ -815,7 +817,7 @@ static int vdpa_dev_net_mq_config_fill(struct vdpa_device *vdev,
 static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg)
 {
 	struct virtio_net_config config = {};
-	u64 features;
+	u64 features_device, features_driver;
 	u16 val_u16;
 
 	vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config));
@@ -832,12 +834,19 @@ static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *ms
 	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16))
 		return -EMSGSIZE;
 
-	features = vdev->config->get_driver_features(vdev);
-	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features,
+	features_driver = vdev->config->get_driver_features(vdev);
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver,
+			      VDPA_ATTR_PAD))
+		return -EMSGSIZE;
+
+	features_device = vdev->config->get_device_features(vdev);
+
+	/* report features of a vDPA device through VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES */
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES, features_device,
 			      VDPA_ATTR_PAD))
 		return -EMSGSIZE;
 
-	return vdpa_dev_net_mq_config_fill(vdev, msg, features, &config);
+	return vdpa_dev_net_mq_config_fill(vdev, msg, features_driver, &config);
 }
 
 static int
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index 25c55cab3d7c..d171b92ef522 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -46,7 +46,10 @@ enum vdpa_attr {
 
 	VDPA_ATTR_DEV_NEGOTIATED_FEATURES,	/* u64 */
 	VDPA_ATTR_DEV_MGMTDEV_MAX_VQS,		/* u32 */
+	/* features of a vDPA management device */
 	VDPA_ATTR_DEV_SUPPORTED_FEATURES,	/* u64 */
+	/* features of a vDPA device, e.g., /dev/vhost-vdpa0 */
+	VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES,	/* u64 */
 
 	VDPA_ATTR_DEV_QUEUE_INDEX,              /* u32 */
 	VDPA_ATTR_DEV_VENDOR_ATTR_NAME,		/* string */
-- 
2.31.1


^ permalink raw reply related

* [PATCH 0/2] allow userspace to query device features
From: Zhu Lingshan @ 2022-08-15  9:26 UTC (permalink / raw)
  To: jasowang, mst
  Cc: virtualization, netdev, kvm, parav, xieyongji, gautam.dawar,
	Zhu Lingshan

This series allows userspace to query device features of
a vDPA device through a new netlink attr
VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES

This series also makes some fields of virtio-net
device config space conditional on the feature bits,
this means:

MTU should be conditional on VIRTIO_NET_F_MTU
MAC should be conditional on VIRTIO_NET_F_MAC
MQ should be conditional on VIRTIO_NET_F_MQ

For details, please refer to commit message
of patch 2/2

Thanks!

Zhu Lingshan (2):
  vDPA: allow userspace to query features of a vDPA device
  vDPA: conditionally read fields in virtio-net dev

 drivers/vdpa/vdpa.c       | 71 +++++++++++++++++++++++++++++++--------
 include/uapi/linux/vdpa.h |  3 ++
 2 files changed, 60 insertions(+), 14 deletions(-)

-- 
2.31.1


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox