Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next 04/15] net: page_pool: id the page pools
From: Jakub Kicinski @ 2023-11-09 16:22 UTC (permalink / raw)
  To: Ilias Apalodimas; +Cc: davem, netdev, edumazet, pabeni, almasrymina, hawk
In-Reply-To: <CAC_iWj+gdrsyumk77mR60o6rw=pUmnXgrkmwJXK_04KPJCMhAw@mail.gmail.com>

On Thu, 9 Nov 2023 11:21:32 +0200 Ilias Apalodimas wrote:
> > +       mutex_lock(&page_pools_lock);
> > +       err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b,
> > +                             &id_alloc_next, GFP_KERNEL);
> > +       if (err < 0)
> > +               goto err_unlock;  
> 
> A nit really, but get rid of the if/goto and just let this return err; ?

There's more stuff added here by a subsequent patch. It ends up like
this:

int page_pool_list(struct page_pool *pool)
{
	static u32 id_alloc_next;
	int err;

	mutex_lock(&page_pools_lock);
	err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b,
			      &id_alloc_next, GFP_KERNEL);
	if (err < 0)
		goto err_unlock;

	if (pool->slow.netdev) {
		hlist_add_head(&pool->user.list,
			       &pool->slow.netdev->page_pools);
		pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0;

		netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF);
	}

	mutex_unlock(&page_pools_lock);
	return 0;

err_unlock:
	mutex_unlock(&page_pools_lock);
	return err;
}

Do you want me to combine the error and non-error paths?
I have a weak preference for not mixing, sometimes err gets set 
to a positive value and that starts to propagate, unlikely to
happen here tho.

^ permalink raw reply

* RE: [PATCH net v2 1/2] r8169: add handling DASH when DASH is disabled
From: Hau @ 2023-11-09 16:20 UTC (permalink / raw)
  To: Paolo Abeni, hkallweit1@gmail.com
  Cc: nic_swsd, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, stable@vger.kernel.org
In-Reply-To: <5783a6f8819a741f0f299602ff615e6a03368246.camel@redhat.com>

> You should include the fixes tag you already added in v1 and your Sob should
> come as the last tag
> 
> The same applies to the next patch
> 
I forget to add Fixes tag. I will add it back.

> > Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
> 
> It's not clear where/when Heiner provided the above tag for this patch.
> I hope that was off-list.
I may misunderstanding what he means. I will not add this tag in my next patch.

> 
> > Cc: stable@vger.kernel.org
> > ---
> >  drivers/net/ethernet/realtek/r8169_main.c | 35
> > ++++++++++++++++-------
> >  1 file changed, 25 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/realtek/r8169_main.c
> > b/drivers/net/ethernet/realtek/r8169_main.c
> > index 0c76c162b8a9..108dc75050ba 100644
> > --- a/drivers/net/ethernet/realtek/r8169_main.c
> > +++ b/drivers/net/ethernet/realtek/r8169_main.c
> > @@ -624,6 +624,7 @@ struct rtl8169_private {
> >
> >       unsigned supports_gmii:1;
> >       unsigned aspm_manageable:1;
> > +     unsigned dash_enabled:1;
> >       dma_addr_t counters_phys_addr;
> >       struct rtl8169_counters *counters;
> >       struct rtl8169_tc_offsets tc_offset; @@ -1253,14 +1254,26 @@
> > static bool r8168ep_check_dash(struct rtl8169_private *tp)
> >       return r8168ep_ocp_read(tp, 0x128) & BIT(0);  }
> >
> > -static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp)
> > +static bool rtl_dash_is_enabled(struct rtl8169_private *tp) {
> > +     switch (tp->dash_type) {
> > +     case RTL_DASH_DP:
> > +             return r8168dp_check_dash(tp);
> > +     case RTL_DASH_EP:
> > +             return r8168ep_check_dash(tp);
> > +     default:
> > +             return false;
> > +     }
> > +}
> > +
> > +static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private
> > +*tp)
> >  {
> >       switch (tp->mac_version) {
> >       case RTL_GIGA_MAC_VER_28:
> >       case RTL_GIGA_MAC_VER_31:
> > -             return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE;
> > +             return RTL_DASH_DP;
> >       case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53:
> > -             return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE;
> > +             return RTL_DASH_EP;
> >       default:
> >               return RTL_DASH_NONE;
> >       }
> > @@ -1453,7 +1466,7 @@ static void __rtl8169_set_wol(struct
> > rtl8169_private *tp, u32 wolopts)
> >
> >       device_set_wakeup_enable(tp_to_dev(tp), wolopts);
> >
> > -     if (tp->dash_type == RTL_DASH_NONE) {
> > +     if (!tp->dash_enabled) {
> >               rtl_set_d3_pll_down(tp, !wolopts);
> >               tp->dev->wol_enabled = wolopts ? 1 : 0;
> >       }
> > @@ -2512,7 +2525,7 @@ static void rtl_wol_enable_rx(struct
> > rtl8169_private *tp)
> >
> >  static void rtl_prepare_power_down(struct rtl8169_private *tp)  {
> > -     if (tp->dash_type != RTL_DASH_NONE)
> > +     if (tp->dash_enabled)
> >               return;
> >
> >       if (tp->mac_version == RTL_GIGA_MAC_VER_32 || @@ -4869,7 +4882,7
> > @@ static int rtl8169_runtime_idle(struct device *device)  {
> >       struct rtl8169_private *tp = dev_get_drvdata(device);
> >
> > -     if (tp->dash_type != RTL_DASH_NONE)
> > +     if (tp->dash_enabled)
> >               return -EBUSY;
> >
> >       if (!netif_running(tp->dev) || !netif_carrier_ok(tp->dev)) @@
> > -4896,7 +4909,7 @@ static void rtl_shutdown(struct pci_dev *pdev)
> >       rtl_rar_set(tp, tp->dev->perm_addr);
> >
> >       if (system_state == SYSTEM_POWER_OFF &&
> > -         tp->dash_type == RTL_DASH_NONE) {
> > +             !tp->dash_enabled) {
> 
> Since you have to repost, please maintain the correct indentation
> above:
> 
>         if (system_state == SYSTEM_POWER_OFF &&
>             !tp->dash_enabled) {
> 
>         ^^^^
> spaces here.
I will correct the indentation. Thanks.

^ permalink raw reply

* Re: [PATCH net-next 1/1] net: stmmac: Add support for HW-accelerated VLAN stripping
From: Andrew Lunn @ 2023-11-09 16:17 UTC (permalink / raw)
  To: Gan Yi Fang
  Cc: Alexandre Torgue, Jose Abreu, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Maxime Coquelin, Russell King,
	Andrew Halaney, Simon Horman, Bartosz Golaszewski, Shenwei Wang,
	Russell King, Johannes Zink, Jochen Henneberg, netdev,
	linux-stm32, linux-arm-kernel, linux-kernel, Looi Hong Aun,
	Voon Weifeng, Song Yoong Siang
In-Reply-To: <20231109053831.2572699-1-yi.fang.gan@intel.com>

> +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
> @@ -198,6 +198,17 @@ static int dwmac4_get_tx_ls(struct dma_desc *p)
>  		>> TDES3_LAST_DESCRIPTOR_SHIFT;
>  }
>  
> +static inline int dwmac4_wrback_get_rx_vlan_tci(struct dma_desc *p)
> +{
> +	return (le32_to_cpu(p->des0) & RDES0_VLAN_TAG_MASK);
> +}
> +
> +static inline bool dwmac4_wrback_get_rx_vlan_valid(struct dma_desc *p)
> +{
> +	return ((le32_to_cpu(p->des3) & RDES3_LAST_DESCRIPTOR) &&
> +		(le32_to_cpu(p->des3) & RDES3_RDES0_VALID));
> +}

No inline functions in C files please. Let the compiler decide.

You are submitting a number of patches for this driver. Do they all
cleanly apply independent of each other? Or are there dependencies?

	Andrew

^ permalink raw reply

* Re: [PATCH net-next 00/15] net: page_pool: add netlink-based introspection
From: Jakub Kicinski @ 2023-11-09 16:14 UTC (permalink / raw)
  To: Ilias Apalodimas; +Cc: davem, netdev, edumazet, pabeni, almasrymina, hawk
In-Reply-To: <CAC_iWjKi0V6wUmutmpjYyjZGkwXef4bxQwcx6o5rytT+-Pj5Eg@mail.gmail.com>

On Thu, 9 Nov 2023 10:11:47 +0200 Ilias Apalodimas wrote:
> > We immediately run into page pool leaks both real and false positive
> > warnings. As Eric pointed out/predicted there's no guarantee that
> > applications will read / close their sockets so a page pool page
> > may be stuck in a socket (but not leaked) forever. This happens
> > a lot in our fleet. Most of these are obviously due to application
> > bugs but we should not be printing kernel warnings due to minor
> > application resource leaks.  
> 
> Fair enough, I guess you mean 'continuous warnings'?

Yes, in this case but I'm making a general statement.
Or do you mean that there's a typo / grammar issue?

> > Conversely the page pool memory may get leaked at runtime, and
> > we have no way to detect / track that, unless someone reconfigures
> > the NIC and destroys the page pools which leaked the pages.
> >
> > The solution presented here is to expose the memory use of page
> > pools via netlink. This allows for continuous monitoring of memory
> > used by page pools, regardless if they were destroyed or not.
> > Sample in patch 15 can print the memory use and recycling
> > efficiency:
> >
> > $ ./page-pool
> >     eth0[2]     page pools: 10 (zombies: 0)
> >                 refs: 41984 bytes: 171966464 (refs: 0 bytes: 0)
> >                 recycling: 90.3% (alloc: 656:397681 recycle: 89652:270201)
> 
> That's reasonable, and the recycling rate is pretty impressive. 

This is just from a test machine, fresh boot, maybe a short iperf run,
I don't remember now :) In any case not real workload.

> Any idea how that translated to enhancements overall? mem/cpu pressure etc

I haven't collected much prod data at this stage, I'm hoping to add
this to the internal kernel and then do a more thorough investigation.

^ permalink raw reply

* Re: [GIT PULL v2] Networking for 6.7
From: Kirill A. Shutemov @ 2023-11-09 16:14 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Hou Tao, Jakub Kicinski, Yonghong Song, Alexei Starovoitov,
	Linus Torvalds, David S. Miller, Network Development, LKML,
	Paolo Abeni
In-Reply-To: <CAADnVQJnMQaFoWxj165GZ+CwJbVtPQBss80o7zYVQwg5MVij3g@mail.gmail.com>

On Thu, Nov 09, 2023 at 08:01:39AM -0800, Alexei Starovoitov wrote:
> On Thu, Nov 9, 2023 at 7:49 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >
> > On Tue, Oct 31, 2023 at 02:09:48PM -0700, Jakub Kicinski wrote:
> > >       bpf: Add support for non-fix-size percpu mem allocation
> >
> > Recent changes in BPF increased per-CPU memory consumption a lot.
> >
> > On virtual machine with 288 CPUs, per-CPU consumtion increased from 111 MB
> > to 969 MB, or 8.7x.
> >
> > I've bisected it to the commit 41a5db8d8161 ("bpf: Add support for
> > non-fix-size percpu mem allocation"), which part of the pull request.
> 
> Hmm. This is unexpected. Thank you for reporting.
> 
> How did you measure this 111 MB vs 969 MB ?
> Pls share the steps to reproduce.

Boot VMM with 288 (qemu-system-x86_64 -smp 288) and check Percpu: field of
/proc/meminfo.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

^ permalink raw reply

* RE: [PATCH net 3/3] dpll: fix register pin with unregistered parent pin
From: Kubalewski, Arkadiusz @ 2023-11-09 16:13 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev@vger.kernel.org, vadim.fedorenko@linux.dev,
	Michalik, Michal, Olech, Milena, pabeni@redhat.com,
	kuba@kernel.org
In-Reply-To: <ZUzcjUqoL6gcxW6f@nanopsycho>

>From: Jiri Pirko <jiri@resnulli.us>
>Sent: Thursday, November 9, 2023 2:20 PM
>
>Thu, Nov 09, 2023 at 10:59:04AM CET, arkadiusz.kubalewski@intel.com wrote:
>>>From: Jiri Pirko <jiri@resnulli.us>
>>>Sent: Wednesday, November 8, 2023 4:08 PM
>>>
>>>Wed, Nov 08, 2023 at 11:32:26AM CET, arkadiusz.kubalewski@intel.com
>>>wrote:
>>>>In case of multiple kernel module instances using the same dpll device:
>>>>if only one registers dpll device, then only that one can register
>>>
>>>They why you don't register in multiple instances? See mlx5 for a
>>>reference.
>>>
>>
>>Every registration requires ops, but for our case only PF0 is able to
>
>What makes PF0 so special? Smell like broken FW design... Care to fix
>it?
>

Well, from my perspective FW design it is.
AFAIR this single point of control is somehow related to HW design and
security requirements back when it was designed.. Don't think this would
be doable anytime soon (if doable at all).

Thank you!
Arkadiusz

>
>>control dpll pins and device, thus only this can provide ops.
>>Basically without PF0, dpll is not able to be controlled, as well
>>as directly connected pins.
>>
>>>
>>>>directly connected pins with a dpll device. If unregistered parent
>>>>determines if the muxed pin can be register with it or not, it forces
>>>>serialized driver load order - first the driver instance which
>>>>registers the direct pins needs to be loaded, then the other instances
>>>>could register muxed type pins.
>>>>
>>>>Allow registration of a pin with a parent even if the parent was not
>>>>yet registered, thus allow ability for unserialized driver instance
>>>
>>>Weird.
>>>
>>
>>Yeah, this is issue only for MUX/parent pin part, couldn't find better
>>way, but it doesn't seem to break things around..
>>
>>Thank you!
>>Arkadiusz
>>
>>>
>>>>load order.
>>>>Do not WARN_ON notification for unregistered pin, which can be invoked
>>>>for described case, instead just return error.
>>>>
>>>>Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
>>>>Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base
>>>>functions")
>>>>Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>>>>---
>>>> drivers/dpll/dpll_core.c    | 4 ----
>>>> drivers/dpll/dpll_netlink.c | 2 +-
>>>> 2 files changed, 1 insertion(+), 5 deletions(-)
>>>>
>>>>diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index
>>>>4077b562ba3b..ae884b92d68c 100644
>>>>--- a/drivers/dpll/dpll_core.c
>>>>+++ b/drivers/dpll/dpll_core.c
>>>>@@ -28,8 +28,6 @@ static u32 dpll_xa_id;
>>>> 	WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
>>>> #define ASSERT_DPLL_NOT_REGISTERED(d)	\
>>>> 	WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
>>>>-#define ASSERT_PIN_REGISTERED(p)	\
>>>>-	WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED))
>>>>
>>>> struct dpll_device_registration {
>>>> 	struct list_head list;
>>>>@@ -641,8 +639,6 @@ int dpll_pin_on_pin_register(struct dpll_pin
>>>>*parent,
>>>struct dpll_pin *pin,
>>>> 	    WARN_ON(!ops->state_on_pin_get) ||
>>>> 	    WARN_ON(!ops->direction_get))
>>>> 		return -EINVAL;
>>>>-	if (ASSERT_PIN_REGISTERED(parent))
>>>>-		return -EINVAL;
>>>>
>>>> 	mutex_lock(&dpll_lock);
>>>> 	ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv); diff
>>>>--git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index
>>>>963bbbbe6660..ff430f43304f 100644
>>>>--- a/drivers/dpll/dpll_netlink.c
>>>>+++ b/drivers/dpll/dpll_netlink.c
>>>>@@ -558,7 +558,7 @@ dpll_pin_event_send(enum dpll_cmd event, struct
>>>>dpll_pin *pin)
>>>> 	int ret = -ENOMEM;
>>>> 	void *hdr;
>>>>
>>>>-	if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)))
>>>>+	if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))
>>>> 		return -ENODEV;
>>>>
>>>> 	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
>>>>--
>>>>2.38.1
>>>>

^ permalink raw reply

* Re: [RFC PATCH v3 10/12] tcp: RX path for devmem TCP
From: Edward Cree @ 2023-11-09 16:07 UTC (permalink / raw)
  To: Mina Almasry, David Ahern
  Cc: Willem de Bruijn, Stanislav Fomichev, netdev, linux-kernel,
	linux-arch, linux-kselftest, linux-media, dri-devel,
	linaro-mm-sig, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Jesper Dangaard Brouer, Ilias Apalodimas,
	Arnd Bergmann, Shuah Khan, Sumit Semwal, Christian König,
	Shakeel Butt, Jeroen de Borst, Praveen Kaligineedi,
	Willem de Bruijn, Kaiyuan Zhang
In-Reply-To: <CAHS8izM_qrEs37F=kPzT_kmqCBV_wSiTf72PtHfJYxks9R9--Q@mail.gmail.com>

On 09/11/2023 02:39, Mina Almasry wrote:
> On Wed, Nov 8, 2023 at 7:36 AM Edward Cree <ecree.xilinx@gmail.com> wrote:
>>  If not then surely the way to return a memory area
>>  in an io_uring idiom is just to post a new read sqe ('RX descriptor')
>>  pointing into it, rather than explicitly returning it with setsockopt.
> 
> We're interested in using this with regular TCP sockets, not
> necessarily io_uring.
Fair.  I just wanted to push against the suggestion upthread that "oh,
 since io_uring supports setsockopt() we can just ignore it and it'll
 all magically work later" (paraphrased).
If you can keep the "allocate buffers out of a devmem region" and "post
 RX descriptors built on those buffers" APIs separate (inside the
 kernel; obviously both triggered by a single call to the setsockopt()
 uAPI) that'll likely make things simpler for the io_uring interface I
 describe, which will only want the latter.

-ed

PS: Here's a crazy idea that I haven't thought through at all: what if
 you allow device memory to be mmap()ed into process address space
 (obviously with none of r/w/x because it's unreachable), so that your
 various uAPIs can just operate on pointers (e.g. the setsockopt
 becomes the madvise it's named after; recvmsg just uses or populates
 the iovec rather than needing a cmsg).  Then if future devices have
 their memory CXL accessible that can potentially be enabled with no
 change to the uAPI (userland just starts being able to access the
 region without faulting).
And you can maybe add a semantic flag to recvmsg saying "if you don't
 use all the buffers in my iovec, keep hold of the rest of them for
 future incoming traffic, and if I post new buffers with my next
 recvmsg, add those to the tail of the RXQ rather than replacing the
 ones you've got".  That way you can still have the "userland
 directly fills the RX ring" behaviour even with TCP sockets.

^ permalink raw reply

* RE: [PATCH net 3/3] dpll: fix register pin with unregistered parent pin
From: Kubalewski, Arkadiusz @ 2023-11-09 16:02 UTC (permalink / raw)
  To: Vadim Fedorenko, Jiri Pirko
  Cc: netdev@vger.kernel.org, Michalik, Michal, Olech, Milena,
	pabeni@redhat.com, kuba@kernel.org
In-Reply-To: <8b6e3211-3f03-4c17-b0cb-26175bf42213@linux.dev>

>From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
>Sent: Thursday, November 9, 2023 11:56 AM
>To: Kubalewski, Arkadiusz <arkadiusz.kubalewski@intel.com>; Jiri Pirko
>
>On 09/11/2023 09:59, Kubalewski, Arkadiusz wrote:
>>> From: Jiri Pirko <jiri@resnulli.us>
>>> Sent: Wednesday, November 8, 2023 4:08 PM
>>>
>>> Wed, Nov 08, 2023 at 11:32:26AM CET, arkadiusz.kubalewski@intel.com
>>> wrote:
>>>> In case of multiple kernel module instances using the same dpll device:
>>>> if only one registers dpll device, then only that one can register
>>>
>>> They why you don't register in multiple instances? See mlx5 for a
>>> reference.
>>>
>>
>> Every registration requires ops, but for our case only PF0 is able to
>> control dpll pins and device, thus only this can provide ops.
>> Basically without PF0, dpll is not able to be controlled, as well
>> as directly connected pins.
>>
>But why do you need other pins then, if FP0 doesn't exist?
>

In general we don't need them at that point, but this is a corner case,
where users for some reason decided to unbind PF 0, and I treat this state
as temporary, where dpll/pins controllability is temporarily broken.

The dpll at that point is not registered, all the direct pins are also
not registered, thus not available to the users.

When I do dump at that point there are still 3 pins present, one for each
PF, although they are all zombies - no parents as their parent pins are not
registered (as the other patch [1/3] prevents dump of pin parent if the
parent is not registered). Maybe we can remove the REGISTERED mark for all
the muxed pins, if all their parents have been unregistered, so they won't
be visible to the user at all. Will try to POC that.

>>>
>>>> directly connected pins with a dpll device. If unregistered parent
>>>> determines if the muxed pin can be register with it or not, it forces
>>>> serialized driver load order - first the driver instance which
>>>> registers the direct pins needs to be loaded, then the other instances
>>>> could register muxed type pins.
>>>>
>>>> Allow registration of a pin with a parent even if the parent was not
>>>> yet registered, thus allow ability for unserialized driver instance
>>>
>>> Weird.
>>>
>>
>> Yeah, this is issue only for MUX/parent pin part, couldn't find better
>> way, but it doesn't seem to break things around..
>>
>
>I just wonder how do you see the registration procedure? How can parent
>pin exist if it's not registered? I believe you cannot get it through
>DPLL API, then the only possible way is to create it within the same
>driver code, which can be simply re-arranged. Am I wrong here?
>

By "parent exist" I mean the parent pin exist in the dpll subsystem
(allocated on pins xa), but it doesn't mean it is available to the users,
as it might not be registered with a dpll device.

We have this 2 step init approach:
1. dpll_pin_get(..) -> allocate new pin or increase reference if exist
2.1. dpll_pin_register(..) -> register with a dpll device
2.2. dpll_pin_on_pin_register -> register with a parent pin

Basically:
- PF 0 does 1 & 2.1 for all the direct inputs, and steps: 1 & 2.2 for its
  recovery clock pin,
- other PF's only do step 1 for the direct input pins (as they must get
  reference to those in order to register recovery clock pin with them),
  and steps: 1 & 2.2 for their recovery clock pin.


Thank you!
Arkadiusz

>> Thank you!
>> Arkadiusz
>>
>>>
>>>> load order.
>>>> Do not WARN_ON notification for unregistered pin, which can be invoked
>>>> for described case, instead just return error.
>>>>
>>>> Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
>>>> Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base
>>>> functions")
>>>> Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>>>> ---
>>>> drivers/dpll/dpll_core.c    | 4 ----
>>>> drivers/dpll/dpll_netlink.c | 2 +-
>>>> 2 files changed, 1 insertion(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index
>>>> 4077b562ba3b..ae884b92d68c 100644
>>>> --- a/drivers/dpll/dpll_core.c
>>>> +++ b/drivers/dpll/dpll_core.c
>>>> @@ -28,8 +28,6 @@ static u32 dpll_xa_id;
>>>> 	WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
>>>> #define ASSERT_DPLL_NOT_REGISTERED(d)	\
>>>> 	WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
>>>> -#define ASSERT_PIN_REGISTERED(p)	\
>>>> -	WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED))
>>>>
>>>> struct dpll_device_registration {
>>>> 	struct list_head list;
>>>> @@ -641,8 +639,6 @@ int dpll_pin_on_pin_register(struct dpll_pin
>>>>*parent,
>>> struct dpll_pin *pin,
>>>> 	    WARN_ON(!ops->state_on_pin_get) ||
>>>> 	    WARN_ON(!ops->direction_get))
>>>> 		return -EINVAL;
>>>> -	if (ASSERT_PIN_REGISTERED(parent))
>>>> -		return -EINVAL;
>>>>
>>>> 	mutex_lock(&dpll_lock);
>>>> 	ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv); diff
>>>> --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index
>>>> 963bbbbe6660..ff430f43304f 100644
>>>> --- a/drivers/dpll/dpll_netlink.c
>>>> +++ b/drivers/dpll/dpll_netlink.c
>>>> @@ -558,7 +558,7 @@ dpll_pin_event_send(enum dpll_cmd event, struct
>>> dpll_pin *pin)
>>>> 	int ret = -ENOMEM;
>>>> 	void *hdr;
>>>>
>>>> -	if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)))
>>>> +	if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))
>>>> 		return -ENODEV;
>>>>
>>>> 	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
>>>> --
>>>> 2.38.1
>>>>
>


^ permalink raw reply

* Re: [GIT PULL v2] Networking for 6.7
From: Alexei Starovoitov @ 2023-11-09 16:01 UTC (permalink / raw)
  To: Kirill A. Shutemov, Hou Tao
  Cc: Jakub Kicinski, Yonghong Song, Alexei Starovoitov, Linus Torvalds,
	David S. Miller, Network Development, LKML, Paolo Abeni
In-Reply-To: <20231109154934.4saimljtqx625l3v@box.shutemov.name>

On Thu, Nov 9, 2023 at 7:49 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Tue, Oct 31, 2023 at 02:09:48PM -0700, Jakub Kicinski wrote:
> >       bpf: Add support for non-fix-size percpu mem allocation
>
> Recent changes in BPF increased per-CPU memory consumption a lot.
>
> On virtual machine with 288 CPUs, per-CPU consumtion increased from 111 MB
> to 969 MB, or 8.7x.
>
> I've bisected it to the commit 41a5db8d8161 ("bpf: Add support for
> non-fix-size percpu mem allocation"), which part of the pull request.

Hmm. This is unexpected. Thank you for reporting.

How did you measure this 111 MB vs 969 MB ?
Pls share the steps to reproduce.

Yonghong, Hou,

please take a look as well.

^ permalink raw reply

* Re: [GIT PULL v2] Networking for 6.7
From: Kirill A. Shutemov @ 2023-11-09 15:49 UTC (permalink / raw)
  To: Jakub Kicinski, Yonghong Song, Alexei Starovoitov
  Cc: torvalds, davem, netdev, linux-kernel, pabeni
In-Reply-To: <20231031210948.2651866-1-kuba@kernel.org>

On Tue, Oct 31, 2023 at 02:09:48PM -0700, Jakub Kicinski wrote:
>       bpf: Add support for non-fix-size percpu mem allocation

Recent changes in BPF increased per-CPU memory consumption a lot.

On virtual machine with 288 CPUs, per-CPU consumtion increased from 111 MB
to 969 MB, or 8.7x.

I've bisected it to the commit 41a5db8d8161 ("bpf: Add support for
non-fix-size percpu mem allocation"), which part of the pull request.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

^ permalink raw reply

* Re: [PATCH] boning: use a read-write lock in bonding_show_bonds()
From: Jiri Pirko @ 2023-11-09 15:47 UTC (permalink / raw)
  To: Haifeng Xu
  Cc: j.vosburgh, andy, davem, edumazet, kuba, pabeni, netdev,
	linux-kernel
In-Reply-To: <20231108064641.65209-1-haifeng.xu@shopee.com>


s/boning/bonding/
?

^ permalink raw reply

* [PATCH 02/41] rxrpc: Fix two connection reaping bugs
From: David Howells @ 2023-11-09 15:39 UTC (permalink / raw)
  To: Marc Dionne
  Cc: David Howells, linux-afs, linux-fsdevel, linux-kernel,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev
In-Reply-To: <20231109154004.3317227-1-dhowells@redhat.com>

Fix two connection reaping bugs:

 (1) rxrpc_connection_expiry is in units of seconds, so
     rxrpc_disconnect_call() needs to multiply it by HZ when adding it to
     jiffies.

 (2) rxrpc_client_conn_reap_timeout() should set RXRPC_CLIENT_REAP_TIMER if
     local->kill_all_client_conns is clear, not if it is set (in which case
     we don't need the timer).  Without this, old client connections don't
     get cleaned up until the local endpoint is cleaned up.

Fixes: 5040011d073d ("rxrpc: Make the local endpoint hold a ref on a connected call")
Fixes: 0d6bf319bc5a ("rxrpc: Move the client conn cache management to the I/O thread")
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: netdev@vger.kernel.org
cc: linux-afs@lists.infradead.org
---
 net/rxrpc/conn_object.c  | 2 +-
 net/rxrpc/local_object.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index ac85d4644a3c..df8a271948a1 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -212,7 +212,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
 		conn->idle_timestamp = jiffies;
 		if (atomic_dec_and_test(&conn->active))
 			rxrpc_set_service_reap_timer(conn->rxnet,
-						     jiffies + rxrpc_connection_expiry);
+						     jiffies + rxrpc_connection_expiry * HZ);
 	}
 
 	rxrpc_put_call(call, rxrpc_call_put_io_thread);
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 7d910aee4f8c..c553a30e9c83 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -87,7 +87,7 @@ static void rxrpc_client_conn_reap_timeout(struct timer_list *timer)
 	struct rxrpc_local *local =
 		container_of(timer, struct rxrpc_local, client_conn_reap_timer);
 
-	if (local->kill_all_client_conns &&
+	if (!local->kill_all_client_conns &&
 	    test_and_set_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags))
 		rxrpc_wake_up_io_thread(local);
 }


^ permalink raw reply related

* Re: [PATCH net v3] net: ti: icssg-prueth: Add missing icss_iep_put to error path
From: Jakub Kicinski @ 2023-11-09 15:29 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: David S. Miller, Eric Dumazet, Paolo Abeni, MD Danish Anwar,
	netdev, linux-kernel, Lopes Ivo, Diogo Miguel (T CED IFD-PT),
	Nishanth Menon, Su, Bao Cheng (RC-CN DF FA R&D),
	Wojciech Drewek, Roger Quadros, Grygorii Strashko
In-Reply-To: <502a27b6-e555-42d2-bb0f-964a58f81dbe@siemens.com>

On Thu, 9 Nov 2023 12:08:21 +0100 Jan Kiszka wrote:
> > Is there a reason you're not CCing authors of these changes?
> > Please make sure you run get_maintainer on the patch, and CC
> > folks appropriately.  
> 
> I was only interacting (directly) with Danish in the past years on this
> driver, and he also upstreamed it. So I assumed "ownership" moved on.
> Adding both, Roger with updated email (where get_maintainer does not help).

You'll need to repost the patch, it's been dropped from patchwork.

Roger, if your old address doesn't work any more please add an entry
to .mailmap.

^ permalink raw reply

* [PATCH net] ipvlan: add ipvlan_route_v6_outbound() helper
From: Eric Dumazet @ 2023-11-09 15:22 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, eric.dumazet, Eric Dumazet, syzbot, Mahesh Bandewar,
	Willem de Bruijn

Inspired by syzbot reports using a stack of multiple ipvlan devices.

Reduce stack size needed in ipvlan_process_v6_outbound() by moving
the flowi6 struct used for the route lookup in an non inlined
helper. ipvlan_route_v6_outbound() needs 120 bytes on the stack,
immediately reclaimed.

Also make sure ipvlan_process_v4_outbound() is not inlined.

We might also have to lower MAX_NEST_DEV, because only syzbot uses
setups with more than four stacked devices.

BUG: TASK stack guard page was hit at ffffc9000e803ff8 (stack is ffffc9000e804000..ffffc9000e808000)
stack guard page: 0000 [#1] SMP KASAN
CPU: 0 PID: 13442 Comm: syz-executor.4 Not tainted 6.1.52-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023
RIP: 0010:kasan_check_range+0x4/0x2a0 mm/kasan/generic.c:188
Code: 48 01 c6 48 89 c7 e8 db 4e c1 03 31 c0 5d c3 cc 0f 0b eb 02 0f 0b b8 ea ff ff ff 5d c3 cc 00 00 cc cc 00 00 cc cc 55 48 89 e5 <41> 57 41 56 41 55 41 54 53 b0 01 48 85 f6 0f 84 a4 01 00 00 48 89
RSP: 0018:ffffc9000e804000 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff817e5bf2
RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffff887c6568
RBP: ffffc9000e804000 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: dffffc0000000001 R12: 1ffff92001d0080c
R13: dffffc0000000000 R14: ffffffff87e6b100 R15: 0000000000000000
FS: 00007fd0c55826c0(0000) GS:ffff8881f6800000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffc9000e803ff8 CR3: 0000000170ef7000 CR4: 00000000003506f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<#DF>
</#DF>
<TASK>
[<ffffffff81f281d1>] __kasan_check_read+0x11/0x20 mm/kasan/shadow.c:31
[<ffffffff817e5bf2>] instrument_atomic_read include/linux/instrumented.h:72 [inline]
[<ffffffff817e5bf2>] _test_bit include/asm-generic/bitops/instrumented-non-atomic.h:141 [inline]
[<ffffffff817e5bf2>] cpumask_test_cpu include/linux/cpumask.h:506 [inline]
[<ffffffff817e5bf2>] cpu_online include/linux/cpumask.h:1092 [inline]
[<ffffffff817e5bf2>] trace_lock_acquire include/trace/events/lock.h:24 [inline]
[<ffffffff817e5bf2>] lock_acquire+0xe2/0x590 kernel/locking/lockdep.c:5632
[<ffffffff8563221e>] rcu_lock_acquire+0x2e/0x40 include/linux/rcupdate.h:306
[<ffffffff8561464d>] rcu_read_lock include/linux/rcupdate.h:747 [inline]
[<ffffffff8561464d>] ip6_pol_route+0x15d/0x1440 net/ipv6/route.c:2221
[<ffffffff85618120>] ip6_pol_route_output+0x50/0x80 net/ipv6/route.c:2606
[<ffffffff856f65b5>] pol_lookup_func include/net/ip6_fib.h:584 [inline]
[<ffffffff856f65b5>] fib6_rule_lookup+0x265/0x620 net/ipv6/fib6_rules.c:116
[<ffffffff85618009>] ip6_route_output_flags_noref+0x2d9/0x3a0 net/ipv6/route.c:2638
[<ffffffff8561821a>] ip6_route_output_flags+0xca/0x340 net/ipv6/route.c:2651
[<ffffffff838bd5a3>] ip6_route_output include/net/ip6_route.h:100 [inline]
[<ffffffff838bd5a3>] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:473 [inline]
[<ffffffff838bd5a3>] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline]
[<ffffffff838bd5a3>] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline]
[<ffffffff838bd5a3>] ipvlan_queue_xmit+0xc33/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677
[<ffffffff838c2909>] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229
[<ffffffff84d03900>] netdev_start_xmit include/linux/netdevice.h:4966 [inline]
[<ffffffff84d03900>] xmit_one net/core/dev.c:3644 [inline]
[<ffffffff84d03900>] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660
[<ffffffff84d080e2>] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324
[<ffffffff855ce4cd>] dev_queue_xmit include/linux/netdevice.h:3067 [inline]
[<ffffffff855ce4cd>] neigh_hh_output include/net/neighbour.h:529 [inline]
[<ffffffff855ce4cd>] neigh_output include/net/neighbour.h:543 [inline]
[<ffffffff855ce4cd>] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139
[<ffffffff855b8616>] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline]
[<ffffffff855b8616>] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211
[<ffffffff855b7e3c>] NF_HOOK_COND include/linux/netfilter.h:298 [inline]
[<ffffffff855b7e3c>] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232
[<ffffffff8575d27f>] dst_output include/net/dst.h:444 [inline]
[<ffffffff8575d27f>] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161
[<ffffffff838bdae4>] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline]
[<ffffffff838bdae4>] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline]
[<ffffffff838bdae4>] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline]
[<ffffffff838bdae4>] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677
[<ffffffff838c2909>] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229
[<ffffffff84d03900>] netdev_start_xmit include/linux/netdevice.h:4966 [inline]
[<ffffffff84d03900>] xmit_one net/core/dev.c:3644 [inline]
[<ffffffff84d03900>] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660
[<ffffffff84d080e2>] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324
[<ffffffff855ce4cd>] dev_queue_xmit include/linux/netdevice.h:3067 [inline]
[<ffffffff855ce4cd>] neigh_hh_output include/net/neighbour.h:529 [inline]
[<ffffffff855ce4cd>] neigh_output include/net/neighbour.h:543 [inline]
[<ffffffff855ce4cd>] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139
[<ffffffff855b8616>] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline]
[<ffffffff855b8616>] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211
[<ffffffff855b7e3c>] NF_HOOK_COND include/linux/netfilter.h:298 [inline]
[<ffffffff855b7e3c>] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232
[<ffffffff8575d27f>] dst_output include/net/dst.h:444 [inline]
[<ffffffff8575d27f>] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161
[<ffffffff838bdae4>] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline]
[<ffffffff838bdae4>] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline]
[<ffffffff838bdae4>] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline]
[<ffffffff838bdae4>] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677
[<ffffffff838c2909>] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229
[<ffffffff84d03900>] netdev_start_xmit include/linux/netdevice.h:4966 [inline]
[<ffffffff84d03900>] xmit_one net/core/dev.c:3644 [inline]
[<ffffffff84d03900>] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660
[<ffffffff84d080e2>] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324
[<ffffffff855ce4cd>] dev_queue_xmit include/linux/netdevice.h:3067 [inline]
[<ffffffff855ce4cd>] neigh_hh_output include/net/neighbour.h:529 [inline]
[<ffffffff855ce4cd>] neigh_output include/net/neighbour.h:543 [inline]
[<ffffffff855ce4cd>] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139
[<ffffffff855b8616>] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline]
[<ffffffff855b8616>] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211
[<ffffffff855b7e3c>] NF_HOOK_COND include/linux/netfilter.h:298 [inline]
[<ffffffff855b7e3c>] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232
[<ffffffff8575d27f>] dst_output include/net/dst.h:444 [inline]
[<ffffffff8575d27f>] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161
[<ffffffff838bdae4>] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline]
[<ffffffff838bdae4>] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline]
[<ffffffff838bdae4>] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline]
[<ffffffff838bdae4>] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677
[<ffffffff838c2909>] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229
[<ffffffff84d03900>] netdev_start_xmit include/linux/netdevice.h:4966 [inline]
[<ffffffff84d03900>] xmit_one net/core/dev.c:3644 [inline]
[<ffffffff84d03900>] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660
[<ffffffff84d080e2>] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324
[<ffffffff855ce4cd>] dev_queue_xmit include/linux/netdevice.h:3067 [inline]
[<ffffffff855ce4cd>] neigh_hh_output include/net/neighbour.h:529 [inline]
[<ffffffff855ce4cd>] neigh_output include/net/neighbour.h:543 [inline]
[<ffffffff855ce4cd>] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139
[<ffffffff855b8616>] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline]
[<ffffffff855b8616>] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211
[<ffffffff855b7e3c>] NF_HOOK_COND include/linux/netfilter.h:298 [inline]
[<ffffffff855b7e3c>] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232
[<ffffffff8575d27f>] dst_output include/net/dst.h:444 [inline]
[<ffffffff8575d27f>] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161
[<ffffffff838bdae4>] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline]
[<ffffffff838bdae4>] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline]
[<ffffffff838bdae4>] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline]
[<ffffffff838bdae4>] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677
[<ffffffff838c2909>] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229
[<ffffffff84d03900>] netdev_start_xmit include/linux/netdevice.h:4966 [inline]
[<ffffffff84d03900>] xmit_one net/core/dev.c:3644 [inline]
[<ffffffff84d03900>] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660
[<ffffffff84d080e2>] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324
[<ffffffff84d4a65e>] dev_queue_xmit include/linux/netdevice.h:3067 [inline]
[<ffffffff84d4a65e>] neigh_resolve_output+0x64e/0x750 net/core/neighbour.c:1560
[<ffffffff855ce503>] neigh_output include/net/neighbour.h:545 [inline]
[<ffffffff855ce503>] ip6_finish_output2+0x1643/0x1ae0 net/ipv6/ip6_output.c:139
[<ffffffff855b8616>] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline]
[<ffffffff855b8616>] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211
[<ffffffff855b7e3c>] NF_HOOK_COND include/linux/netfilter.h:298 [inline]
[<ffffffff855b7e3c>] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232
[<ffffffff855b9ce4>] dst_output include/net/dst.h:444 [inline]
[<ffffffff855b9ce4>] NF_HOOK include/linux/netfilter.h:309 [inline]
[<ffffffff855b9ce4>] ip6_xmit+0x11a4/0x1b20 net/ipv6/ip6_output.c:352
[<ffffffff8597984e>] sctp_v6_xmit+0x9ae/0x1230 net/sctp/ipv6.c:250
[<ffffffff8594623e>] sctp_packet_transmit+0x25de/0x2bc0 net/sctp/output.c:653
[<ffffffff858f5142>] sctp_packet_singleton+0x202/0x310 net/sctp/outqueue.c:783
[<ffffffff858ea411>] sctp_outq_flush_ctrl net/sctp/outqueue.c:914 [inline]
[<ffffffff858ea411>] sctp_outq_flush+0x661/0x3d40 net/sctp/outqueue.c:1212
[<ffffffff858f02f9>] sctp_outq_uncork+0x79/0xb0 net/sctp/outqueue.c:764
[<ffffffff8589f060>] sctp_side_effects net/sctp/sm_sideeffect.c:1199 [inline]
[<ffffffff8589f060>] sctp_do_sm+0x55c0/0x5c30 net/sctp/sm_sideeffect.c:1170
[<ffffffff85941567>] sctp_primitive_ASSOCIATE+0x97/0xc0 net/sctp/primitive.c:73
[<ffffffff859408b2>] sctp_sendmsg_to_asoc+0xf62/0x17b0 net/sctp/socket.c:1839
[<ffffffff85910b5e>] sctp_sendmsg+0x212e/0x33b0 net/sctp/socket.c:2029
[<ffffffff8544d559>] inet_sendmsg+0x149/0x310 net/ipv4/af_inet.c:849
[<ffffffff84c6c4d2>] sock_sendmsg_nosec net/socket.c:716 [inline]
[<ffffffff84c6c4d2>] sock_sendmsg net/socket.c:736 [inline]
[<ffffffff84c6c4d2>] ____sys_sendmsg+0x572/0x8c0 net/socket.c:2504
[<ffffffff84c6ca91>] ___sys_sendmsg net/socket.c:2558 [inline]
[<ffffffff84c6ca91>] __sys_sendmsg+0x271/0x360 net/socket.c:2587
[<ffffffff84c6cbff>] __do_sys_sendmsg net/socket.c:2596 [inline]
[<ffffffff84c6cbff>] __se_sys_sendmsg net/socket.c:2594 [inline]
[<ffffffff84c6cbff>] __x64_sys_sendmsg+0x7f/0x90 net/socket.c:2594
[<ffffffff85b32553>] do_syscall_x64 arch/x86/entry/common.c:51 [inline]
[<ffffffff85b32553>] do_syscall_64+0x53/0x80 arch/x86/entry/common.c:84
[<ffffffff85c00087>] entry_SYSCALL_64_after_hwframe+0x63/0xcd

Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
---
 drivers/net/ipvlan/ipvlan_core.c | 41 +++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 21e9cac7312186380fa60de11f0a9178080b74b0..2d5b021b4ea6053eeb055a76fa4c7d9380cd2a53 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -411,7 +411,7 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h,
 	return addr;
 }
 
-static int ipvlan_process_v4_outbound(struct sk_buff *skb)
+static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb)
 {
 	const struct iphdr *ip4h = ip_hdr(skb);
 	struct net_device *dev = skb->dev;
@@ -453,13 +453,11 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb)
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static int ipvlan_process_v6_outbound(struct sk_buff *skb)
+
+static noinline_for_stack int
+ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb)
 {
 	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
-	struct net_device *dev = skb->dev;
-	struct net *net = dev_net(dev);
-	struct dst_entry *dst;
-	int err, ret = NET_XMIT_DROP;
 	struct flowi6 fl6 = {
 		.flowi6_oif = dev->ifindex,
 		.daddr = ip6h->daddr,
@@ -469,27 +467,38 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
 		.flowi6_mark = skb->mark,
 		.flowi6_proto = ip6h->nexthdr,
 	};
+	struct dst_entry *dst;
+	int err;
 
-	dst = ip6_route_output(net, NULL, &fl6);
-	if (dst->error) {
-		ret = dst->error;
+	dst = ip6_route_output(dev_net(dev), NULL, &fl6);
+	err = dst->error;
+	if (err) {
 		dst_release(dst);
-		goto err;
+		return err;
 	}
 	skb_dst_set(skb, dst);
+	return 0;
+}
+
+static int ipvlan_process_v6_outbound(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	int err, ret = NET_XMIT_DROP;
+
+	err = ipvlan_route_v6_outbound(dev, skb);
+	if (unlikely(err)) {
+		DEV_STATS_INC(dev, tx_errors);
+		kfree_skb(skb);
+		return err;
+	}
 
 	memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
 
-	err = ip6_local_out(net, skb->sk, skb);
+	err = ip6_local_out(dev_net(dev), skb->sk, skb);
 	if (unlikely(net_xmit_eval(err)))
 		DEV_STATS_INC(dev, tx_errors);
 	else
 		ret = NET_XMIT_SUCCESS;
-	goto out;
-err:
-	DEV_STATS_INC(dev, tx_errors);
-	kfree_skb(skb);
-out:
 	return ret;
 }
 #else
-- 
2.42.0.869.gea05f2083d-goog


^ permalink raw reply related

* Re: [net-next v4 0/5] net: wwan: t7xx: fw flashing & coredump support
From: Loic Poulain @ 2023-11-09 15:21 UTC (permalink / raw)
  To: SongJinJian@hotmail.com
  Cc: Jiri Pirko, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, corbet@lwn.net,
	ryazanov.s.a@gmail.com, johannes@sipsolutions.net,
	chandrashekar.devegowda@intel.com, linuxwwan@intel.com,
	chiranjeevi.rapolu@linux.intel.com, haijun.liu@mediatek.com,
	m.chetan.kumar@linux.intel.com, ricardo.martinez@linux.intel.com,
	netdev@vger.kernel.org, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, nmarupaka@google.com,
	vsankar@lenovo.com, danielwinkler@google.com
In-Reply-To: <ZUTOX0oSCPpdtjJV@nanopsycho>

Hi JinJian,

On Fri, 3 Nov 2023 at 11:41, Jiri Pirko <jiri@resnulli.us> wrote:
>
> Mon, Sep 18, 2023 at 08:56:26AM CEST, SongJinJian@hotmail.com wrote:
> >Tue, Sep 12, 2023 at 11:48:40AM CEST, songjinjian@hotmail.com wrote:
> >>>Adds support for t7xx wwan device firmware flashing & coredump
> >>>collection using devlink.
> >
> >>I don't believe that use of devlink is correct here. It seems like a misfit. IIUC, what you need is to communicate with the modem. Basically a communication channel to modem. The other wwan drivers implement these channels in _ctrl.c files, using multiple protocols. Why can't you do something similar and let devlink out of this please?
> >
> >>Until you put in arguments why you really need devlink and why is it a good fit, I'm against this. Please don't send any other versions of this patchset that use devlink.
> >
> > Yes, t7xx driver need communicate with modem with a communication channel to modem.
> > I took a look at the _ctrl.c files under wwan directory, it seemed the implementation can be well integrated with QualCommon's modem, if we do like this, I think we need modem firmware change, maybe not be suitable for current MTK modem directly.
> > Except for Qualcomm modem driver, there is also an Intel wwan driver 'iosm' and it use devlink to implement firmware flash(https://www.kernel.org/doc/html/latest/networking/devlink/iosm.html), Intel and MTK design and use devlink to do this work on
>
> If that exists, I made a mistake as a gatekeeper. That usage looks
> wrong.
>
>
> > 'mtk_t7xx' driver and I continue to do this work.
> >
> > I think devlink framework can support this scene and if we use devlink we don't need to develop other flash tools or other user space applications, use upstream devlink commands directly.
>
> Please don't.

So this is clear that devlink should not be used for this wwan
firmware upgrade, if you still want to abstract the fastboot protocol
part, maybe the easier would be to move on the generic firmware
framework, and especially the firmware upload API which seems to be a
good fit here? https://docs.kernel.org/driver-api/firmware/fw_upload.html#firmware-upload-api

Regards,
Loic

^ permalink raw reply

* Re: [RFC net-next] net: don't dump stack on queue timeout
From: Eric Dumazet @ 2023-11-09 15:21 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem, netdev, pabeni, syzbot+d55372214aff0faa1f1f, jhs,
	xiyou.wangcong, jiri
In-Reply-To: <20231109000901.949152-1-kuba@kernel.org>

On Thu, Nov 9, 2023 at 1:09 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> The top syzbot report for networking (#14 for the entire kernel)
> is the queue timeout splat. We kept it around for a long time,
> because in real life it provides pretty strong signal that
> something is wrong with the driver or the device.
>
> Removing it is also likely to break monitoring for those who
> track it as a kernel warning.
>
> Nevertheless, WARN()ings are best suited for catching kernel
> programming bugs. If a Tx queue gets starved due to a pause
> storm, priority configuration, or other weirdness - that's
> obviously a problem, but not a problem we can fix at
> the kernel level.
>
> Bite the bullet and convert the WARN() to a print.
>
> Before:
>
>   NETDEV WATCHDOG: eni1np1 (netdevsim): transmit queue 0 timed out 1975 ms
>   WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:525 dev_watchdog+0x39e/0x3b0
>   [... completely pointless stack trace of a timer follows ...]
>
> Now:
>
>   netdevsim netdevsim1 eni1np1: NETDEV WATCHDOG: CPU: 0: transmit queue 0 timed out 1769 ms
>
> Alternatively we could mark the drivers which syzbot has
> learned to abuse as "print-instead-of-WARN" selectively.
>
> Reported-by: syzbot+d55372214aff0faa1f1f@syzkaller.appspotmail.com
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>

SGTM !
Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [PATCH] Documentation: Document the Netlink spec
From: Breno Leitao @ 2023-11-09 15:20 UTC (permalink / raw)
  To: Jonathan Corbet; +Cc: Donald Hunter, linux-doc, netdev, kuba, pabeni, edumazet
In-Reply-To: <87r0kzuiax.fsf@meer.lwn.net>

On Thu, Nov 09, 2023 at 07:12:38AM -0700, Jonathan Corbet wrote:
> Donald Hunter <donald.hunter@gmail.com> writes:
> 
> > Jonathan Corbet <corbet@lwn.net> writes:
> >> I do have to wonder, though, whether a sphinx extension is the right way
> >> to solve this problem.  You're essentially implementing a filter that
> >> turns one YAML file into one RST file; might it be better to keep that
> >> outside of sphinx as a standalone script, invoked by the Makefile?
> >>
> >> Note that I'm asking because I wonder, I'm not saying I would block an
> >> extension-based implementation.
> >
> > +1 to this. The .rst generation can then be easily tested independently
> > of the doc build and the stub files could be avoided.
> >
> > Just a note that last year you offered the opposite guidance:
> >
> > https://lore.kernel.org/linux-doc/87tu4zsfse.fsf@meer.lwn.net/
> 
> Heh ... I totally forgot about that whole discussion ...
> 
> > If the preference now is for standalone scripts invoked by the Makefile
> > then this previous patch might be useful:
> >
> > https://lore.kernel.org/linux-doc/20220922115257.99815-2-donald.hunter@gmail.com/
> >
> > It would be good to document the preferred approach to this kind of doc
> > extension and I'd be happy to contribute an 'Extensions' section for
> > contributing.rst in the doc-guide.
> 
> I think it will vary depending on what we're trying to do, and I think
> we're still working it out - part of why I expressed some uncertainty
> this time around.
> 
> For something like the kernel-doc or automarkup, where we are modifying
> existing documents, an extension is the only way to go.  In this case,
> where we are creating new RST files from whole cloth, it's not so clear
> to me.  My feeling (this week at least ;) is that doing it as an
> extension makes things more complicated without a lot of benefit.

One way or another works for me. Given my experience with both ways, let
me share the advantages of boths so we can understand the trade-offs
better:

Sphinx extension advantages:
===========================

 1) Keep "extensions" uniform and organized, written in the same
 framework/language (python), using the same enry point, output, etc.

 2) Easy to cross reference objects in the whole documentation (not done
 in this patchset)

 3) Same dependencies for all "documentation". I.e, you don't need a
 dependency (as in python pip requirements.txt) for every "parser"
 script.

 4) Already being used in our infrastructure.


One-off parser advantages:
=========================

 1) Total flexibility. You can write the parser in any language.

 2) Easier and faster to interate and debug.

 3) No need to have a rst stub for every file (as in this patchset).
 This is a sphinx limitation right now, and *might* go away in the
 future.

 4) Less dependent of sphinx project.

> FWIW, if something like this is done as a makefile change, I'd do it a
> bit differently than your linked patch above.  Rather than replicate the
> command through the file, I'd just add a new target:
> 
>   netlink_specs:
>   	.../scripts/gen-netlink-rst
> 
>   htmldocs: netlink_specs
>   	existing stuff here
> 
> But that's a detail.

For that, we can't use a sphinx extension, since there is no way (as I
understood) from one rst to generate multiple rst.

^ permalink raw reply

* Re: [RFC net-next] net: don't dump stack on queue timeout
From: Jakub Kicinski @ 2023-11-09 15:18 UTC (permalink / raw)
  To: Daniele Palmas
  Cc: davem, netdev, edumazet, pabeni, syzbot+d55372214aff0faa1f1f, jhs,
	xiyou.wangcong, jiri
In-Reply-To: <CAGRyCJHiPcKnBkkCDxbannmJYLwZevvz8cnx88PcvnCeYULDaA@mail.gmail.com>

On Thu, 9 Nov 2023 08:40:00 +0100 Daniele Palmas wrote:
> For example, I can see the splat with MBIM modems when radio link
> failure happens, something for which the host can't really do
> anything. So, the main result of using WARN is to scare the users who
> are not aware of the reasons behind it and create unneeded support
> requests...

Is it not possible to clear the carrier on downstream devices?
Radio link failure sounds like carrier loss.

^ permalink raw reply

* Re: [PATCH] Documentation: Document the Netlink spec
From: Jakub Kicinski @ 2023-11-09 15:16 UTC (permalink / raw)
  To: Jonathan Corbet, Breno Leitao
  Cc: Donald Hunter, linux-doc, netdev, pabeni, edumazet
In-Reply-To: <87r0kzuiax.fsf@meer.lwn.net>

On Thu, 09 Nov 2023 07:12:38 -0700 Jonathan Corbet wrote:
>   netlink_specs:
>   	.../scripts/gen-netlink-rst

FWIW if we go down that route we probably want to put the script
under tools/net/ynl/ and reuse tools/net/ynl/lib/nlspec.py ?
It "abstracts away" some basic parsing of the spec, fills in implied
attributes etc.

^ permalink raw reply

* RE: [PATCH net,v3, 2/2] hv_netvsc: Fix race of register_netdevice_notifier and VF register
From: Haiyang Zhang @ 2023-11-09 14:39 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	KY Srinivasan, wei.liu@kernel.org, Dexuan Cui,
	edumazet@google.com, pabeni@redhat.com, davem@davemloft.net,
	linux-kernel@vger.kernel.org, stable@vger.kernel.org
In-Reply-To: <20231108182618.09ef4dfe@kernel.org>



> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Wednesday, November 8, 2023 9:26 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> <decui@microsoft.com>; edumazet@google.com; pabeni@redhat.com;
> davem@davemloft.net; linux-kernel@vger.kernel.org; stable@vger.kernel.org
> Subject: Re: [PATCH net,v3, 2/2] hv_netvsc: Fix race of
> register_netdevice_notifier and VF register
> 
> On Tue,  7 Nov 2023 13:05:32 -0800 Haiyang Zhang wrote:
> > If VF NIC is registered earlier, NETDEV_REGISTER event is replayed,
> > but NETDEV_POST_INIT is not.
> 
> But Long Li sent the patch which starts to use POST_INIT against
> the net-next tree. If we apply this to net and Long Li's patch to
> net-next one release will have half of the fixes.
> 
> I think that you should add Long Li's patch to this series. That'd
> limit the confusion and git preserves authorship of the changes, so
> neither of you will loose the credit.

Will do.

Thanks,
- Haiyang

^ permalink raw reply

* Re: [PATCH] Documentation: Document the Netlink spec
From: Jonathan Corbet @ 2023-11-09 14:12 UTC (permalink / raw)
  To: Donald Hunter; +Cc: Breno Leitao, linux-doc, netdev, kuba, pabeni, edumazet
In-Reply-To: <m2h6lvmasi.fsf@gmail.com>

Donald Hunter <donald.hunter@gmail.com> writes:

> Jonathan Corbet <corbet@lwn.net> writes:
>> I do have to wonder, though, whether a sphinx extension is the right way
>> to solve this problem.  You're essentially implementing a filter that
>> turns one YAML file into one RST file; might it be better to keep that
>> outside of sphinx as a standalone script, invoked by the Makefile?
>>
>> Note that I'm asking because I wonder, I'm not saying I would block an
>> extension-based implementation.
>
> +1 to this. The .rst generation can then be easily tested independently
> of the doc build and the stub files could be avoided.
>
> Just a note that last year you offered the opposite guidance:
>
> https://lore.kernel.org/linux-doc/87tu4zsfse.fsf@meer.lwn.net/

Heh ... I totally forgot about that whole discussion ...

> If the preference now is for standalone scripts invoked by the Makefile
> then this previous patch might be useful:
>
> https://lore.kernel.org/linux-doc/20220922115257.99815-2-donald.hunter@gmail.com/
>
> It would be good to document the preferred approach to this kind of doc
> extension and I'd be happy to contribute an 'Extensions' section for
> contributing.rst in the doc-guide.

I think it will vary depending on what we're trying to do, and I think
we're still working it out - part of why I expressed some uncertainty
this time around.

For something like the kernel-doc or automarkup, where we are modifying
existing documents, an extension is the only way to go.  In this case,
where we are creating new RST files from whole cloth, it's not so clear
to me.  My feeling (this week at least ;) is that doing it as an
extension makes things more complicated without a lot of benefit.

FWIW, if something like this is done as a makefile change, I'd do it a
bit differently than your linked patch above.  Rather than replicate the
command through the file, I'd just add a new target:

  netlink_specs:
  	.../scripts/gen-netlink-rst

  htmldocs: netlink_specs
  	existing stuff here

But that's a detail.

Thanks,

jon

^ permalink raw reply

* Re: [RFC PATCH v3 07/12] page-pool: device memory support
From: Yunsheng Lin @ 2023-11-09 13:23 UTC (permalink / raw)
  To: Mina Almasry
  Cc: netdev, linux-kernel, linux-arch, linux-kselftest, linux-media,
	dri-devel, linaro-mm-sig, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Jesper Dangaard Brouer,
	Ilias Apalodimas, Arnd Bergmann, David Ahern, Willem de Bruijn,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi
In-Reply-To: <CAHS8izOh8yC7q9yJN+RAKGs=AgsEf13MnFDmG46=EU05ynnLKw@mail.gmail.com>

On 2023/11/9 20:20, Mina Almasry wrote:
> On Thu, Nov 9, 2023 at 1:30 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2023/11/9 11:20, Mina Almasry wrote:
>>> On Wed, Nov 8, 2023 at 2:56 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>>>
>>> Agreed everything above is undoable.
>>>
>>>> But we might be able to do something as folio is doing now, mm subsystem
>>>> is still seeing 'struct folio/page', but other subsystem like slab is using
>>>> 'struct slab', and there is still some common fields shared between
>>>> 'struct folio' and 'struct slab'.
>>>>
>>>
>>> In my eyes this is almost exactly what I suggested in RFC v1 and got
>>> immediately nacked with no room to negotiate. What we did for v1 is to
>>> allocate struct pages for dma-buf to make dma-bufs look like struct
>>> page to mm subsystem. Almost exactly what you're describing above.
>>
>> Maybe the above is where we have disagreement:
>> Do we still need make dma-bufs look like struct page to mm subsystem?
>> IMHO, the answer is no. We might only need to make dma-bufs look like
>> struct page to net stack and page pool subsystem. I think that is already
>> what this pacthset is trying to do, what I am suggesting is just make
>> it more like 'struct page' to net stack and page pool subsystem, in order
>> to try to avoid most of the 'if' checking in net stack and page pool
>> subsystem.
>>
> 
> First, most of the checking in the net stack is
> skb_frag_not_readable(). dma-buf are fundamentally not kmap()able and
> not readable. So we can't remove those, no matter what we do I think.
> Can we agree on that? If so, lets discuss removing most of the ifs in
> the page pool, only.

Agreed on the 'not kmap()able and not readable' checking part.

> 
>>> It's a no-go. I don't think renaming struct page to netmem is going to
>>> move the needle (it also re-introduces code-churn). What I feel like I
>>> learnt is that dma-bufs are not struct pages and can't be made to look
>>> like one, I think.
>>>
>>>> As the netmem patchset, is devmem able to reuse the below 'struct netmem'
>>>> and rename it to 'struct page_pool_iov'?
>>>
>>> I don't think so. For the reasons above, but also practically it
>>> immediately falls apart. Consider this field in netmem:
>>>
>>> + * @flags: The same as the page flags.  Do not use directly.
>>>
>>> dma-buf don't have or support page-flags, and making dma-buf looks
>>> like they support page flags or any page-like features (other than
>>> dma_addr) seems extremely unacceptable to mm folks.
>>
>> As far as I tell, as we limit the devmem usage in netstack, the below
>> is the related mm function call for 'struct page' for devmem:
>> page_ref_*(): page->_refcount does not need changing
> 
> Sorry, I don't understand. Are you suggesting we call page_ref_add() &
> page_ref_sub() on page_pool_iov? That is basically making
> page_pool_iov look like struct page to the mm stack, since page_ref_*
> are mm calls, which you say above we don't need to do. We will still
> need to special case this, no?

As we are reusing 'struct page' for devmem, page->_refcount for
devmem and page->_refcount for normal memory should be the same, right?
We may need to ensure 'struct page' for devmem to always look like a head
page for compound page or base page for net stack, as we use get_page()
in __skb_frag_ref().

We can choose to not call page_ref_sub() for page from devmem, we can
call napi_pp_put_page(), and we may be able to special handle the page
from devmem in devmem provider's 'release_page' ops in napi_pp_put_page().

> 
>> page_is_pfmemalloc(): which is corresponding to page->pp_magic, and
>>                       devmem provider can set/unset it in it's 'alloc_pages'
>>                       ops.
> 
> page_is_pfmemalloc() has nothing to do with page->pp_magic. It checks
> page->lru.next to figure out if this is a pfmemalloc. page_pool_iov
> has no page->lru.next. Still need to special case this?

See the comment in napi_pp_put_page():

	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
	 * in order to preserve any existing bits, such as bit 0 for the
	 * head page of compound page and bit 1 for pfmemalloc page, so
	 * mask those bits for freeing side when doing below checking,
	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
	 * to avoid recycling the pfmemalloc page.
	 */

There is some union in struct page, page->lru.next and page->pp_magic is
actually pointing to the same thing as my understanding.


> 
>> page_to_nid(): we may need to handle it differently somewhat like this
>>                patch does as page_to_nid() may has different implementation
>>                based on different configuration.
> 
> So you're saying we need to handle page_to_nid() differently for
> devmem? So we're not going to be able to avoid the if statement.

Yes, it seems to be the only place that might need special handling I
see so far.

> 
>> page_pool_iov_put_many(): as mentioned in other thread, if net stack is not
>>                           calling page_pool_page_put_many() directly, we
>>                           can reuse napi_pp_put_page() for devmem too, and
>>                           handle the special case for devmem in 'release_page'
>>                           ops.
>>
> 
> page_pool_iov_put_many()/page_pool_iov_get_many() are called to do

Can we remove the page_pool_iov_put_many()/page_pool_iov_get_many()
calling?

> refcounting before the page is released back to the provider. I'm not
> seeing how we can handle the special case inside of 'release_page' -
> that's too late, as far as I can tell.

And handle the special case in page_pool_return_page() to mainly
replace put_page() with 'release_page' for devmem page?
https://elixir.free-electrons.com/linux/v6.6-rc1/source/net/core/page_pool.c#L537

> 
> The only way to remove the if statements in the page pool is to
> implement what you said was not feasible in an earlier email. We would
> define this struct:
> 
> struct netmem {
>         /* common fields */
>         refcount_t refcount;
>         bool is_pfmemalloc;
>         int nid;
>         ......
>         union {
>                 struct devmem{
>                         struct dmabuf_genpool_chunk_owner *owner;
>                 };
> 
>                 struct page * page;
>         };
> };
> 
> Then, we would require all memory providers to allocate struct netmem
> for the memory and set the common fields, including ones that have
> struct pages. For devmem, netmem->page will be NULL, because netmem
> has no page.

That is not what I have in mind.

> 
> If we do that, the page pool can ignore whether the underlying memory
> is page or devmem, because it can use the common fields, example:
> 
> /* page_ref_count replacement */
> netmem_ref_count(struct netmem* netmem) {
>     return netmem->refcount;
> }
> 
> /* page_ref_add replacement */
> netmem_ref_add(struct netmem* netmem) {
>    atomic_inc(netmem->refcount);
> }
> 
> /* page_to_nid replacement */
> netmem_nid(struct netmem* netmem) {
>     return netmem->nid;
> }
> 
> /* page_is_pfmemalloc() replacement */
> netmem_is_pfmemalloc(struct netmem* netmem) {
>     return netmem->is_pfmemalloc;
> }
> 
> /* page_ref_sub replacement */
> netmem_ref_sub(struct netmem* netmem) {
>     atomic_sub(netmet->refcount);
>     if (netmem->refcount == 0) {
>                   /* release page to the memory provider.
>                    * struct page memory provider will do put_page(),
>                    * devmem will do something else */
>            }
>      }
> }
> 
> 
> I think this MAY BE technically feasible, but I'm not sure it's better:
> 
> 1. It is a huge refactor to the page pool, lots of code churn. While
> the page pool currently uses page*, it needs to be completely
> refactored to use netmem*.
> 2. It causes extra memory usage. struct netmem needs to be allocated
> for every struct page.
> 3. It has minimal perf upside. The page_is_page_pool_iov() checks
> currently have minimal perf impact, and I demonstrated that to Jesper
> in RFC v2.
> 4. It also may not be technically feasible. I'm not sure how netmem
> interacts with skb_frag_t. I guess we replace struct page* bv_page
> with struct netmem* bv_page, and add changes there.
> 5. Drivers need to be refactored to use netmem* instead of page*,
> unless we cast netmem* to page* before returning to the driver.
> 
> Possibly other downsides, these are what I could immediately think of.
> 
> If I'm still misunderstanding your suggestion, it may be time to send
> me a concrete code snippet of what you have in mind. I'm a bit
> confused at the moment because the only avenue I see to remove the if
> statements in the page pool is to define the struct that we agreed is
> not feasible in earlier emails.
> 

I might be able to do it at the weekend if it is still not making any
sense to you.

> 
> --
> Thanks,
> Mina
> 
> .
> 

^ permalink raw reply

* Re: [PATCH net 3/3] dpll: fix register pin with unregistered parent pin
From: Jiri Pirko @ 2023-11-09 13:20 UTC (permalink / raw)
  To: Kubalewski, Arkadiusz
  Cc: netdev@vger.kernel.org, vadim.fedorenko@linux.dev,
	Michalik, Michal, Olech, Milena, pabeni@redhat.com,
	kuba@kernel.org
In-Reply-To: <DM6PR11MB46572BD8C43DACA0FF15C2CF9BAFA@DM6PR11MB4657.namprd11.prod.outlook.com>

Thu, Nov 09, 2023 at 10:59:04AM CET, arkadiusz.kubalewski@intel.com wrote:
>>From: Jiri Pirko <jiri@resnulli.us>
>>Sent: Wednesday, November 8, 2023 4:08 PM
>>
>>Wed, Nov 08, 2023 at 11:32:26AM CET, arkadiusz.kubalewski@intel.com wrote:
>>>In case of multiple kernel module instances using the same dpll device:
>>>if only one registers dpll device, then only that one can register
>>
>>They why you don't register in multiple instances? See mlx5 for a
>>reference.
>>
>
>Every registration requires ops, but for our case only PF0 is able to

What makes PF0 so special? Smell like broken FW design... Care to fix
it?


>control dpll pins and device, thus only this can provide ops.
>Basically without PF0, dpll is not able to be controlled, as well
>as directly connected pins.
>
>>
>>>directly connected pins with a dpll device. If unregistered parent
>>>determines if the muxed pin can be register with it or not, it forces
>>>serialized driver load order - first the driver instance which
>>>registers the direct pins needs to be loaded, then the other instances
>>>could register muxed type pins.
>>>
>>>Allow registration of a pin with a parent even if the parent was not
>>>yet registered, thus allow ability for unserialized driver instance
>>
>>Weird.
>>
>
>Yeah, this is issue only for MUX/parent pin part, couldn't find better
>way, but it doesn't seem to break things around..
>
>Thank you!
>Arkadiusz
>
>>
>>>load order.
>>>Do not WARN_ON notification for unregistered pin, which can be invoked
>>>for described case, instead just return error.
>>>
>>>Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
>>>Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base
>>>functions")
>>>Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>>>---
>>> drivers/dpll/dpll_core.c    | 4 ----
>>> drivers/dpll/dpll_netlink.c | 2 +-
>>> 2 files changed, 1 insertion(+), 5 deletions(-)
>>>
>>>diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index
>>>4077b562ba3b..ae884b92d68c 100644
>>>--- a/drivers/dpll/dpll_core.c
>>>+++ b/drivers/dpll/dpll_core.c
>>>@@ -28,8 +28,6 @@ static u32 dpll_xa_id;
>>> 	WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
>>> #define ASSERT_DPLL_NOT_REGISTERED(d)	\
>>> 	WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
>>>-#define ASSERT_PIN_REGISTERED(p)	\
>>>-	WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED))
>>>
>>> struct dpll_device_registration {
>>> 	struct list_head list;
>>>@@ -641,8 +639,6 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent,
>>struct dpll_pin *pin,
>>> 	    WARN_ON(!ops->state_on_pin_get) ||
>>> 	    WARN_ON(!ops->direction_get))
>>> 		return -EINVAL;
>>>-	if (ASSERT_PIN_REGISTERED(parent))
>>>-		return -EINVAL;
>>>
>>> 	mutex_lock(&dpll_lock);
>>> 	ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv); diff
>>>--git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index
>>>963bbbbe6660..ff430f43304f 100644
>>>--- a/drivers/dpll/dpll_netlink.c
>>>+++ b/drivers/dpll/dpll_netlink.c
>>>@@ -558,7 +558,7 @@ dpll_pin_event_send(enum dpll_cmd event, struct
>>dpll_pin *pin)
>>> 	int ret = -ENOMEM;
>>> 	void *hdr;
>>>
>>>-	if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)))
>>>+	if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))
>>> 		return -ENODEV;
>>>
>>> 	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
>>>--
>>>2.38.1
>>>

^ permalink raw reply

* Re: [PATCH net 2/3] dpll: fix pin dump crash for rebound module
From: Jiri Pirko @ 2023-11-09 13:19 UTC (permalink / raw)
  To: Kubalewski, Arkadiusz
  Cc: netdev@vger.kernel.org, vadim.fedorenko@linux.dev,
	Michalik, Michal, Olech, Milena, pabeni@redhat.com,
	kuba@kernel.org
In-Reply-To: <DM6PR11MB465752FE337EB962B147EB579BAFA@DM6PR11MB4657.namprd11.prod.outlook.com>

Thu, Nov 09, 2023 at 01:20:48PM CET, arkadiusz.kubalewski@intel.com wrote:
>>From: Jiri Pirko <jiri@resnulli.us>
>>Sent: Wednesday, November 8, 2023 3:30 PM
>>
>>Wed, Nov 08, 2023 at 11:32:25AM CET, arkadiusz.kubalewski@intel.com wrote:
>>>When a kernel module is unbound but the pin resources were not entirely
>>>freed (other kernel module instance have had kept the reference to that
>>>pin), and kernel module is again bound, the pin properties would not be
>>>updated (the properties are only assigned when memory for the pin is
>>>allocated), prop pointer still points to the kernel module memory of
>>>the kernel module which was deallocated on the unbind.
>>>
>>>If the pin dump is invoked in this state, the result is a kernel crash.
>>>Prevent the crash by storing persistent pin properties in dpll subsystem,
>>>copy the content from the kernel module when pin is allocated, instead of
>>>using memory of the kernel module.
>>>
>>>Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
>>>Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
>>>Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>>>---
>>> drivers/dpll/dpll_core.c    |  4 ++--
>>> drivers/dpll/dpll_core.h    |  4 ++--
>>> drivers/dpll/dpll_netlink.c | 28 ++++++++++++++--------------
>>> 3 files changed, 18 insertions(+), 18 deletions(-)
>>>
>>>diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
>>>index 3568149b9562..4077b562ba3b 100644
>>>--- a/drivers/dpll/dpll_core.c
>>>+++ b/drivers/dpll/dpll_core.c
>>>@@ -442,7 +442,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct
>>>module *module,
>>> 		ret = -EINVAL;
>>> 		goto err;
>>> 	}
>>>-	pin->prop = prop;
>>>+	memcpy(&pin->prop, prop, sizeof(pin->prop));
>>
>>Odd, you don't care about the pointer within this structure?
>>
>
>Well, true. Need a fix.
>Wondering if copying idea is better than just assigning prop pointer on
>each call to dpll_pin_get(..) function (when pin already exists)?

Not sure what do you mean. Examples please.


>
>Thank you!
>Arkadiusz
>
>>
>>> 	refcount_set(&pin->refcount, 1);
>>> 	xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC);
>>> 	xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
>>>@@ -634,7 +634,7 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent,
>>>struct dpll_pin *pin,
>>> 	unsigned long i, stop;
>>> 	int ret;
>>>
>>>-	if (WARN_ON(parent->prop->type != DPLL_PIN_TYPE_MUX))
>>>+	if (WARN_ON(parent->prop.type != DPLL_PIN_TYPE_MUX))
>>> 		return -EINVAL;
>>>
>>> 	if (WARN_ON(!ops) ||
>>>diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h
>>>index 5585873c5c1b..717f715015c7 100644
>>>--- a/drivers/dpll/dpll_core.h
>>>+++ b/drivers/dpll/dpll_core.h
>>>@@ -44,7 +44,7 @@ struct dpll_device {
>>>  * @module:		module of creator
>>>  * @dpll_refs:		hold referencees to dplls pin was registered with
>>>  * @parent_refs:	hold references to parent pins pin was registered
>>>with
>>>- * @prop:		pointer to pin properties given by registerer
>>>+ * @prop:		pin properties copied from the registerer
>>>  * @rclk_dev_name:	holds name of device when pin can recover clock
>>>from it
>>>  * @refcount:		refcount
>>>  **/
>>>@@ -55,7 +55,7 @@ struct dpll_pin {
>>> 	struct module *module;
>>> 	struct xarray dpll_refs;
>>> 	struct xarray parent_refs;
>>>-	const struct dpll_pin_properties *prop;
>>>+	struct dpll_pin_properties prop;
>>> 	refcount_t refcount;
>>> };
>>>
>>>diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
>>>index 93fc6c4b8a78..963bbbbe6660 100644
>>>--- a/drivers/dpll/dpll_netlink.c
>>>+++ b/drivers/dpll/dpll_netlink.c
>>>@@ -278,17 +278,17 @@ dpll_msg_add_pin_freq(struct sk_buff *msg, struct
>>>dpll_pin *pin,
>>> 	if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY, sizeof(freq), &freq,
>>> 			  DPLL_A_PIN_PAD))
>>> 		return -EMSGSIZE;
>>>-	for (fs = 0; fs < pin->prop->freq_supported_num; fs++) {
>>>+	for (fs = 0; fs < pin->prop.freq_supported_num; fs++) {
>>> 		nest = nla_nest_start(msg, DPLL_A_PIN_FREQUENCY_SUPPORTED);
>>> 		if (!nest)
>>> 			return -EMSGSIZE;
>>>-		freq = pin->prop->freq_supported[fs].min;
>>>+		freq = pin->prop.freq_supported[fs].min;
>>> 		if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MIN, sizeof(freq),
>>> 				  &freq, DPLL_A_PIN_PAD)) {
>>> 			nla_nest_cancel(msg, nest);
>>> 			return -EMSGSIZE;
>>> 		}
>>>-		freq = pin->prop->freq_supported[fs].max;
>>>+		freq = pin->prop.freq_supported[fs].max;
>>> 		if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MAX, sizeof(freq),
>>> 				  &freq, DPLL_A_PIN_PAD)) {
>>> 			nla_nest_cancel(msg, nest);
>>>@@ -304,9 +304,9 @@ static bool dpll_pin_is_freq_supported(struct dpll_pin
>>>*pin, u32 freq)
>>> {
>>> 	int fs;
>>>
>>>-	for (fs = 0; fs < pin->prop->freq_supported_num; fs++)
>>>-		if (freq >= pin->prop->freq_supported[fs].min &&
>>>-		    freq <= pin->prop->freq_supported[fs].max)
>>>+	for (fs = 0; fs < pin->prop.freq_supported_num; fs++)
>>>+		if (freq >= pin->prop.freq_supported[fs].min &&
>>>+		    freq <= pin->prop.freq_supported[fs].max)
>>> 			return true;
>>> 	return false;
>>> }
>>>@@ -403,7 +403,7 @@ static int
>>> dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
>>> 		     struct netlink_ext_ack *extack)
>>> {
>>>-	const struct dpll_pin_properties *prop = pin->prop;
>>>+	const struct dpll_pin_properties *prop = &pin->prop;
>>> 	struct dpll_pin_ref *ref;
>>> 	int ret;
>>>
>>>@@ -696,7 +696,7 @@ dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32
>>>parent_idx,
>>> 	int ret;
>>>
>>> 	if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
>>>-	      pin->prop->capabilities)) {
>>>+	      pin->prop.capabilities)) {
>>> 		NL_SET_ERR_MSG(extack, "state changing is not allowed");
>>> 		return -EOPNOTSUPP;
>>> 	}
>>>@@ -732,7 +732,7 @@ dpll_pin_state_set(struct dpll_device *dpll, struct
>>>dpll_pin *pin,
>>> 	int ret;
>>>
>>> 	if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
>>>-	      pin->prop->capabilities)) {
>>>+	      pin->prop.capabilities)) {
>>> 		NL_SET_ERR_MSG(extack, "state changing is not allowed");
>>> 		return -EOPNOTSUPP;
>>> 	}
>>>@@ -759,7 +759,7 @@ dpll_pin_prio_set(struct dpll_device *dpll, struct
>>>dpll_pin *pin,
>>> 	int ret;
>>>
>>> 	if (!(DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE &
>>>-	      pin->prop->capabilities)) {
>>>+	      pin->prop.capabilities)) {
>>> 		NL_SET_ERR_MSG(extack, "prio changing is not allowed");
>>> 		return -EOPNOTSUPP;
>>> 	}
>>>@@ -787,7 +787,7 @@ dpll_pin_direction_set(struct dpll_pin *pin, struct
>>>dpll_device *dpll,
>>> 	int ret;
>>>
>>> 	if (!(DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE &
>>>-	      pin->prop->capabilities)) {
>>>+	      pin->prop.capabilities)) {
>>> 		NL_SET_ERR_MSG(extack, "direction changing is not allowed");
>>> 		return -EOPNOTSUPP;
>>> 	}
>>>@@ -817,8 +817,8 @@ dpll_pin_phase_adj_set(struct dpll_pin *pin, struct
>>>nlattr *phase_adj_attr,
>>> 	int ret;
>>>
>>> 	phase_adj = nla_get_s32(phase_adj_attr);
>>>-	if (phase_adj > pin->prop->phase_range.max ||
>>>-	    phase_adj < pin->prop->phase_range.min) {
>>>+	if (phase_adj > pin->prop.phase_range.max ||
>>>+	    phase_adj < pin->prop.phase_range.min) {
>>> 		NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr,
>>> 				    "phase adjust value not supported");
>>> 		return -EINVAL;
>>>@@ -999,7 +999,7 @@ dpll_pin_find(u64 clock_id, struct nlattr
>>>*mod_name_attr,
>>> 	unsigned long i;
>>>
>>> 	xa_for_each_marked(&dpll_pin_xa, i, pin, DPLL_REGISTERED) {
>>>-		prop = pin->prop;
>>>+		prop = &pin->prop;
>>> 		cid_match = clock_id ? pin->clock_id == clock_id : true;
>>> 		mod_match = mod_name_attr && module_name(pin->module) ?
>>> 			!nla_strcmp(mod_name_attr,
>>>--
>>>2.38.1
>>>
>

^ permalink raw reply

* Re: [PATCH v2 net 7/7] net/sched: taprio: enable cycle time adjustment for current entry
From: Vladimir Oltean @ 2023-11-09 13:18 UTC (permalink / raw)
  To: Faizal Rahim
  Cc: Vinicius Costa Gomes, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev, linux-kernel
In-Reply-To: <20231107112023.676016-8-faizal.abdul.rahim@linux.intel.com>

On Tue, Nov 07, 2023 at 06:20:23AM -0500, Faizal Rahim wrote:
> Handles cycle time adjustments for the current active entry

Use the imperative mood for commit messages, i.e. "handle".

> when new admin base time occurs quickly, either within the
> current entry or the next one.
> 
> Changes covers:
> 1. Negative cycle correction or truncation
> Occurs when the new admin base time falls before the expiry of the
> current running entry.
> 
> 2. Positive cycle correction or extension
> Occurs when the new admin base time falls within the next entry,
> and the current entry is the cycle's last entry. In this case, the
> changes in taprio_start_sched() extends the schedule, preventing
> old oper schedule from resuming and getting truncated in the next
> advance_sched() call.
> 
> 3. A new API, update_gate_close_time(), has been created to update
> the gate_close_time of the current entry in the event of cycle
> correction.
> 
> Signed-off-by: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
> ---
>  net/sched/sch_taprio.c | 72 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 53 insertions(+), 19 deletions(-)
> 
> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
> index c60e9e7ac193..56743754d42e 100644
> --- a/net/sched/sch_taprio.c
> +++ b/net/sched/sch_taprio.c
> @@ -1379,41 +1379,75 @@ static void setup_first_end_time(struct taprio_sched *q,
>  		rcu_assign_pointer(q->current_entry, NULL);
>  }
>  
> +static void update_gate_close_time(struct sched_entry *current_entry,
> +				   ktime_t new_end_time,
> +				   int num_tc)
> +{
> +	int tc;
> +
> +	for (tc = 0; tc < num_tc; tc++) {
> +		if (current_entry->gate_mask & BIT(tc))
> +			current_entry->gate_close_time[tc] = new_end_time;
> +	}
> +}
> +
>  static void taprio_start_sched(struct Qdisc *sch,
>  			       ktime_t new_base_time,
> -			       struct sched_gate_list *new)
> +			       struct sched_gate_list *admin)
>  {
>  	struct taprio_sched *q = qdisc_priv(sch);
> +	ktime_t expires = hrtimer_get_expires(&q->advance_timer);
> +	struct net_device *dev = qdisc_dev(q->root);
> +	struct sched_entry *curr_entry = NULL;
>  	struct sched_gate_list *oper = NULL;
> -	ktime_t expires, start;
>  
>  	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
>  		return;
>  
>  	oper = rcu_dereference_protected(q->oper_sched,
>  					 lockdep_is_held(&q->current_entry_lock));
> +	curr_entry = rcu_dereference_protected(q->current_entry,
> +					       lockdep_is_held(&q->current_entry_lock));
>  
> -	expires = hrtimer_get_expires(&q->advance_timer);
> -	if (expires == 0)
> -		expires = KTIME_MAX;
> +	if (hrtimer_active(&q->advance_timer)) {
> +		oper->cycle_time_correction =
> +			get_cycle_time_correction(oper, new_base_time,
> +						  curr_entry->end_time,
> +						  curr_entry);
>  
> -	/* If the new schedule starts before the next expiration, we
> -	 * reprogram it to the earliest one, so we change the admin
> -	 * schedule to the operational one at the right time.
> -	 */
> -	start = min_t(ktime_t, new_base_time, expires);
> -
> -	if (expires != KTIME_MAX &&
> -	    ktime_compare(start, new_base_time) == 0) {
> -		/* Since timer was changed to align to the new admin schedule,
> -		 * setting the variable below to a non-initialized value will
> -		 * indicate to advance_sched() to call switch_schedules() after
> -		 * this timer expires.

I would appreciate not changing things that you've established in
earlier changes. Try to keep stuff introduced earlier in a form that is
as close as possible to the final form.

> +		if (cycle_corr_active(oper->cycle_time_correction)) {
> +			/* This is the last entry we are running from oper,
> +			 * subsequent entry will take from the new admin.
> +			 */
> +			ktime_t	now = taprio_get_time(q);
> +			u64 gate_duration_left = ktime_sub(new_base_time, now);

What is special about "now" as a moment in time? Gate durations are
calculated relative to the moment when the sched_entry begins.

> +			struct qdisc_size_table *stab =
> +				rtnl_dereference(q->root->stab);

"q->root" is "sch".

> +			int num_tc = netdev_get_num_tc(dev);

It would be nice if you could pay some attention to the preferred
variable declaration style, i.e. longer lines come first. If you cannot
easily respect that, you could split the variable declarations from
their initialization.

> +
> +			oper->cycle_end_time = new_base_time;
> +			curr_entry->end_time = new_base_time;
> +			curr_entry->correction_active = true;
> +
> +			update_open_gate_duration(curr_entry, oper, num_tc,
> +						  gate_duration_left);

Recalculating open gate durations with a cycle time correction seems
very complicated, at least from this code path. What depends on this?
The data path only looks at the gate_close_time. Can we get away with
updating only the gate_close_time?

> +			update_gate_close_time(curr_entry, new_base_time, num_tc);
> +			taprio_update_queue_max_sdu(q, oper, stab);
> +			taprio_set_budgets(q, oper, curr_entry);

There's a lot of duplication between the correction management from
advance_sched() and the one from taprio_start_sched(). I wonder if some
of it can go into a common function.

> +		}
> +	}
> +
> +	if (!hrtimer_active(&q->advance_timer) ||
> +	    cycle_corr_active(oper->cycle_time_correction)) {
> +		/* Use new admin base time if :
> +		 * 1. there's no active oper
> +		 * 2. there's active oper and we will change to the new admin
> +		 * schedule after the current entry from oper ends
>  		 */
> -		oper->cycle_time_correction = 0;
> +		expires = new_base_time;
>  	}
>  
> -	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
> +	hrtimer_start(&q->advance_timer, expires, HRTIMER_MODE_ABS);
>  }
>  
>  static void taprio_set_picos_per_byte(struct net_device *dev,
> -- 
> 2.25.1
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox