Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH v2 net 7/7] net/sched: taprio: enable cycle time adjustment for current entry
From: Vladimir Oltean @ 2023-11-09 13:18 UTC (permalink / raw)
  To: Faizal Rahim
  Cc: Vinicius Costa Gomes, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev, linux-kernel
In-Reply-To: <20231107112023.676016-8-faizal.abdul.rahim@linux.intel.com>

On Tue, Nov 07, 2023 at 06:20:23AM -0500, Faizal Rahim wrote:
> Handles cycle time adjustments for the current active entry

Use the imperative mood for commit messages, i.e. "handle".

> when new admin base time occurs quickly, either within the
> current entry or the next one.
> 
> Changes covers:
> 1. Negative cycle correction or truncation
> Occurs when the new admin base time falls before the expiry of the
> current running entry.
> 
> 2. Positive cycle correction or extension
> Occurs when the new admin base time falls within the next entry,
> and the current entry is the cycle's last entry. In this case, the
> changes in taprio_start_sched() extends the schedule, preventing
> old oper schedule from resuming and getting truncated in the next
> advance_sched() call.
> 
> 3. A new API, update_gate_close_time(), has been created to update
> the gate_close_time of the current entry in the event of cycle
> correction.
> 
> Signed-off-by: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
> ---
>  net/sched/sch_taprio.c | 72 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 53 insertions(+), 19 deletions(-)
> 
> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
> index c60e9e7ac193..56743754d42e 100644
> --- a/net/sched/sch_taprio.c
> +++ b/net/sched/sch_taprio.c
> @@ -1379,41 +1379,75 @@ static void setup_first_end_time(struct taprio_sched *q,
>  		rcu_assign_pointer(q->current_entry, NULL);
>  }
>  
> +static void update_gate_close_time(struct sched_entry *current_entry,
> +				   ktime_t new_end_time,
> +				   int num_tc)
> +{
> +	int tc;
> +
> +	for (tc = 0; tc < num_tc; tc++) {
> +		if (current_entry->gate_mask & BIT(tc))
> +			current_entry->gate_close_time[tc] = new_end_time;
> +	}
> +}
> +
>  static void taprio_start_sched(struct Qdisc *sch,
>  			       ktime_t new_base_time,
> -			       struct sched_gate_list *new)
> +			       struct sched_gate_list *admin)
>  {
>  	struct taprio_sched *q = qdisc_priv(sch);
> +	ktime_t expires = hrtimer_get_expires(&q->advance_timer);
> +	struct net_device *dev = qdisc_dev(q->root);
> +	struct sched_entry *curr_entry = NULL;
>  	struct sched_gate_list *oper = NULL;
> -	ktime_t expires, start;
>  
>  	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
>  		return;
>  
>  	oper = rcu_dereference_protected(q->oper_sched,
>  					 lockdep_is_held(&q->current_entry_lock));
> +	curr_entry = rcu_dereference_protected(q->current_entry,
> +					       lockdep_is_held(&q->current_entry_lock));
>  
> -	expires = hrtimer_get_expires(&q->advance_timer);
> -	if (expires == 0)
> -		expires = KTIME_MAX;
> +	if (hrtimer_active(&q->advance_timer)) {
> +		oper->cycle_time_correction =
> +			get_cycle_time_correction(oper, new_base_time,
> +						  curr_entry->end_time,
> +						  curr_entry);
>  
> -	/* If the new schedule starts before the next expiration, we
> -	 * reprogram it to the earliest one, so we change the admin
> -	 * schedule to the operational one at the right time.
> -	 */
> -	start = min_t(ktime_t, new_base_time, expires);
> -
> -	if (expires != KTIME_MAX &&
> -	    ktime_compare(start, new_base_time) == 0) {
> -		/* Since timer was changed to align to the new admin schedule,
> -		 * setting the variable below to a non-initialized value will
> -		 * indicate to advance_sched() to call switch_schedules() after
> -		 * this timer expires.

I would appreciate not changing things that you've established in
earlier changes. Try to keep stuff introduced earlier in a form that is
as close as possible to the final form.

> +		if (cycle_corr_active(oper->cycle_time_correction)) {
> +			/* This is the last entry we are running from oper,
> +			 * subsequent entry will take from the new admin.
> +			 */
> +			ktime_t	now = taprio_get_time(q);
> +			u64 gate_duration_left = ktime_sub(new_base_time, now);

What is special about "now" as a moment in time? Gate durations are
calculated relative to the moment when the sched_entry begins.

> +			struct qdisc_size_table *stab =
> +				rtnl_dereference(q->root->stab);

"q->root" is "sch".

> +			int num_tc = netdev_get_num_tc(dev);

It would be nice if you could pay some attention to the preferred
variable declaration style, i.e. longer lines come first. If you cannot
easily respect that, you could split the variable declarations from
their initialization.

> +
> +			oper->cycle_end_time = new_base_time;
> +			curr_entry->end_time = new_base_time;
> +			curr_entry->correction_active = true;
> +
> +			update_open_gate_duration(curr_entry, oper, num_tc,
> +						  gate_duration_left);

Recalculating open gate durations with a cycle time correction seems
very complicated, at least from this code path. What depends on this?
The data path only looks at the gate_close_time. Can we get away with
updating only the gate_close_time?

> +			update_gate_close_time(curr_entry, new_base_time, num_tc);
> +			taprio_update_queue_max_sdu(q, oper, stab);
> +			taprio_set_budgets(q, oper, curr_entry);

There's a lot of duplication between the correction management from
advance_sched() and the one from taprio_start_sched(). I wonder if some
of it can go into a common function.

> +		}
> +	}
> +
> +	if (!hrtimer_active(&q->advance_timer) ||
> +	    cycle_corr_active(oper->cycle_time_correction)) {
> +		/* Use new admin base time if :
> +		 * 1. there's no active oper
> +		 * 2. there's active oper and we will change to the new admin
> +		 * schedule after the current entry from oper ends
>  		 */
> -		oper->cycle_time_correction = 0;
> +		expires = new_base_time;
>  	}
>  
> -	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
> +	hrtimer_start(&q->advance_timer, expires, HRTIMER_MODE_ABS);
>  }
>  
>  static void taprio_set_picos_per_byte(struct net_device *dev,
> -- 
> 2.25.1
>

^ permalink raw reply

* Re: [PATCH net 1/3] dpll: fix pin dump crash after module unbind
From: Jiri Pirko @ 2023-11-09 13:18 UTC (permalink / raw)
  To: Kubalewski, Arkadiusz
  Cc: netdev@vger.kernel.org, vadim.fedorenko@linux.dev,
	Michalik, Michal, Olech, Milena, pabeni@redhat.com,
	kuba@kernel.org
In-Reply-To: <DM6PR11MB46576110B45D806064F437C49BAFA@DM6PR11MB4657.namprd11.prod.outlook.com>

Thu, Nov 09, 2023 at 10:49:49AM CET, arkadiusz.kubalewski@intel.com wrote:
>>From: Jiri Pirko <jiri@resnulli.us>
>>Sent: Wednesday, November 8, 2023 4:09 PM
>>
>>Wed, Nov 08, 2023 at 11:32:24AM CET, arkadiusz.kubalewski@intel.com wrote:
>>>Disallow dump of unregistered parent pins, it is possible when parent
>>>pin and dpll device registerer kernel module instance unbinds, and
>>>other kernel module instances of the same dpll device have pins
>>>registered with the parent pin. The user can invoke a pin-dump but as
>>>the parent was unregistered, thus shall not be accessed by the
>>>userspace, prevent that by checking if parent pin is still registered.
>>>
>>>Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
>>>Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>>>---
>>> drivers/dpll/dpll_netlink.c | 7 +++++++
>>> 1 file changed, 7 insertions(+)
>>>
>>>diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
>>>index a6dc3997bf5c..93fc6c4b8a78 100644
>>>--- a/drivers/dpll/dpll_netlink.c
>>>+++ b/drivers/dpll/dpll_netlink.c
>>>@@ -328,6 +328,13 @@ dpll_msg_add_pin_parents(struct sk_buff *msg, struct
>>dpll_pin *pin,
>>> 		void *parent_priv;
>>>
>>> 		ppin = ref->pin;
>>>+		/*
>>>+		 * dump parent only if it is registered, thus prevent crash on
>>>+		 * pin dump called when driver which registered the pin unbinds
>>>+		 * and different instance registered pin on that parent pin
>>
>>Read this sentence like 10 times, still don't get what you mean.
>>Shouldn't comments be easy to understand?
>>
>
>Hi,
>
>Hmm, wondering isn't it better to remove this comment at all?
>If you think it is needed I will rephrase it somehow..

I don't know if it is needed as I don't understand it :)
Just remove it.


>
>Thank you!
>Arkadiusz
>
>>
>>>+		 */
>>>+		if (!xa_get_mark(&dpll_pin_xa, ppin->id, DPLL_REGISTERED))
>>>+			continue;
>>> 		parent_priv = dpll_pin_on_dpll_priv(dpll_ref->dpll, ppin);
>>> 		ret = ops->state_on_pin_get(pin,
>>> 					    dpll_pin_on_pin_priv(ppin, pin),
>>>--
>>>2.38.1
>>>
>

^ permalink raw reply

* Re: [RFC net-next] net: don't dump stack on queue timeout
From: Jiri Pirko @ 2023-11-09 13:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem, netdev, edumazet, pabeni, syzbot+d55372214aff0faa1f1f, jhs,
	xiyou.wangcong
In-Reply-To: <20231109000901.949152-1-kuba@kernel.org>

Thu, Nov 09, 2023 at 01:09:01AM CET, kuba@kernel.org wrote:
>The top syzbot report for networking (#14 for the entire kernel)
>is the queue timeout splat. We kept it around for a long time,
>because in real life it provides pretty strong signal that
>something is wrong with the driver or the device.
>
>Removing it is also likely to break monitoring for those who
>track it as a kernel warning.
>
>Nevertheless, WARN()ings are best suited for catching kernel
>programming bugs. If a Tx queue gets starved due to a pause
>storm, priority configuration, or other weirdness - that's
>obviously a problem, but not a problem we can fix at
>the kernel level.
>
>Bite the bullet and convert the WARN() to a print.
>
>Before:
>
>  NETDEV WATCHDOG: eni1np1 (netdevsim): transmit queue 0 timed out 1975 ms
>  WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:525 dev_watchdog+0x39e/0x3b0
>  [... completely pointless stack trace of a timer follows ...]
>
>Now:
>
>  netdevsim netdevsim1 eni1np1: NETDEV WATCHDOG: CPU: 0: transmit queue 0 timed out 1769 ms
>
>Alternatively we could mark the drivers which syzbot has
>learned to abuse as "print-instead-of-WARN" selectively.
>
>Reported-by: syzbot+d55372214aff0faa1f1f@syzkaller.appspotmail.com
>Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Makes sense.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>

^ permalink raw reply

* [net-next RFC PATCH v6 4/4] dt-bindings: Document Marvell Aquantia PHY
From: Christian Marangi @ 2023-11-09 12:32 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Andrew Lunn,
	Heiner Kallweit, Russell King, Christian Marangi, Robert Marko,
	Vladimir Oltean, netdev, devicetree, linux-kernel
  Cc: Conor Dooley
In-Reply-To: <20231109123253.3933-1-ansuelsmth@gmail.com>

Document bindings for Marvell Aquantia PHY.

The Marvell Aquantia PHY require a firmware to work correctly and there
at least 3 way to load this firmware.

Describe all the different way and document the binding "firmware-name"
to load the PHY firmware from userspace.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
---
Changes v6:
- Add Reviewed-by tag
- Drop comments in dts examples
- Improve commit title
- Fix wrong reg in example
Changes v5:
- Drop extra entry not related to HW description
Changes v3:
- Make DT description more OS agnostic
- Use custom select to fix dtbs checks
Changes v2:
- Add DT patch

 .../bindings/net/marvell,aquantia.yaml        | 116 ++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/marvell,aquantia.yaml

diff --git a/Documentation/devicetree/bindings/net/marvell,aquantia.yaml b/Documentation/devicetree/bindings/net/marvell,aquantia.yaml
new file mode 100644
index 000000000000..68b2087a6722
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/marvell,aquantia.yaml
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/marvell,aquantia.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell Aquantia Ethernet PHY
+
+maintainers:
+  - Christian Marangi <ansuelsmth@gmail.com>
+
+description: |
+  Marvell Aquantia Ethernet PHY require a firmware to be loaded to actually
+  work.
+
+  This can be done and is implemented by OEM in 3 different way:
+    - Attached SPI flash directly to the PHY with the firmware. The PHY
+      will self load the firmware in the presence of this configuration.
+    - Read from a dedicated partition on system NAND declared in an
+      NVMEM cell, and loaded to the PHY using its mailbox interface.
+    - Manually provided firmware loaded from a file in the filesystem.
+
+allOf:
+  - $ref: ethernet-phy.yaml#
+
+select:
+  properties:
+    compatible:
+      contains:
+        enum:
+          - ethernet-phy-id03a1.b445
+          - ethernet-phy-id03a1.b460
+          - ethernet-phy-id03a1.b4a2
+          - ethernet-phy-id03a1.b4d0
+          - ethernet-phy-id03a1.b4e0
+          - ethernet-phy-id03a1.b5c2
+          - ethernet-phy-id03a1.b4b0
+          - ethernet-phy-id03a1.b662
+          - ethernet-phy-id03a1.b712
+          - ethernet-phy-id31c3.1c12
+  required:
+    - compatible
+
+properties:
+  reg:
+    maxItems: 1
+
+  firmware-name:
+    description: specify the name of PHY firmware to load
+
+  nvmem-cells:
+    description: phandle to the firmware nvmem cell
+    maxItems: 1
+
+  nvmem-cell-names:
+    const: firmware
+
+required:
+  - compatible
+  - reg
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    mdio {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        ethernet-phy@0 {
+            compatible = "ethernet-phy-id31c3.1c12",
+                         "ethernet-phy-ieee802.3-c45";
+
+            reg = <0>;
+            firmware-name = "AQR-G4_v5.4.C-AQR_CIG_WF-1945_0x8_ID44776_VER1630.cld";
+        };
+
+        ethernet-phy@1 {
+            compatible = "ethernet-phy-id31c3.1c12",
+                         "ethernet-phy-ieee802.3-c45";
+
+            reg = <1>;
+            nvmem-cells = <&aqr_fw>;
+            nvmem-cell-names = "firmware";
+        };
+    };
+
+    flash {
+        compatible = "jedec,spi-nor";
+        #address-cells = <1>;
+        #size-cells = <1>;
+
+        partitions {
+            compatible = "fixed-partitions";
+            #address-cells = <1>;
+            #size-cells = <1>;
+
+            /* ... */
+
+            partition@650000 {
+                compatible = "nvmem-cells";
+                label = "0:ethphyfw";
+                reg = <0x650000 0x80000>;
+                read-only;
+                #address-cells = <1>;
+                #size-cells = <1>;
+
+                aqr_fw: aqr_fw@0 {
+                    reg = <0x0 0x5f42a>;
+                };
+            };
+
+            /* ... */
+
+        };
+    };
-- 
2.40.1


^ permalink raw reply related

* [net-next RFC PATCH v6 3/4] net: phy: aquantia: add firmware load support
From: Christian Marangi @ 2023-11-09 12:32 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Andrew Lunn,
	Heiner Kallweit, Russell King, Christian Marangi, Robert Marko,
	Vladimir Oltean, netdev, devicetree, linux-kernel
In-Reply-To: <20231109123253.3933-1-ansuelsmth@gmail.com>

From: Robert Marko <robimarko@gmail.com>

Aquantia PHY-s require firmware to be loaded before they start operating.
It can be automatically loaded in case when there is a SPI-NOR connected
to Aquantia PHY-s or can be loaded from the host via MDIO.

This patch adds support for loading the firmware via MDIO as in most cases
there is no SPI-NOR being used to save on cost.
Firmware loading code itself is ported from mainline U-boot with cleanups.

The firmware has mixed values both in big and little endian.
PHY core itself is big-endian but it expects values to be in little-endian.
The firmware is little-endian but CRC-16 value for it is stored at the end
of firmware in big-endian.

It seems the PHY does the conversion internally from firmware that is
little-endian to the PHY that is big-endian on using the mailbox
but mailbox returns a big-endian CRC-16 to verify the written data
integrity.

Co-developed-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Robert Marko <robimarko@gmail.com>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
---
Changes v4:
- Drop inline
- Drop extra impossible check for negative values
Changes v3:
- Back to RFC due to merge window
- Use unaligned macro instead of memcpy
- Spam sanity check as firmware is evil
- Add print to signal the user the source of the fw (FS or NVMEM)
Changes v2:
- Move out of RFC
- Address sanity check for offsets
- Add additional comments on firmware load check
- Fix some typo
- Capitalize CRC in comments
- Rename load_sysfs to load_fs

 drivers/net/phy/aquantia/Kconfig             |   1 +
 drivers/net/phy/aquantia/Makefile            |   2 +-
 drivers/net/phy/aquantia/aquantia.h          |  32 ++
 drivers/net/phy/aquantia/aquantia_firmware.c | 367 +++++++++++++++++++
 drivers/net/phy/aquantia/aquantia_main.c     |   6 +
 5 files changed, 407 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/phy/aquantia/aquantia_firmware.c

diff --git a/drivers/net/phy/aquantia/Kconfig b/drivers/net/phy/aquantia/Kconfig
index 226146417a6a..a35de4b9b554 100644
--- a/drivers/net/phy/aquantia/Kconfig
+++ b/drivers/net/phy/aquantia/Kconfig
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config AQUANTIA_PHY
 	tristate "Aquantia PHYs"
+	select CRC_CCITT
 	help
 	  Currently supports the Aquantia AQ1202, AQ2104, AQR105, AQR405
diff --git a/drivers/net/phy/aquantia/Makefile b/drivers/net/phy/aquantia/Makefile
index 346f350bc084..aa77fb63c8ec 100644
--- a/drivers/net/phy/aquantia/Makefile
+++ b/drivers/net/phy/aquantia/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-aquantia-objs			+= aquantia_main.o
+aquantia-objs			+= aquantia_main.o aquantia_firmware.o
 ifdef CONFIG_HWMON
 aquantia-objs			+= aquantia_hwmon.o
 endif
diff --git a/drivers/net/phy/aquantia/aquantia.h b/drivers/net/phy/aquantia/aquantia.h
index f0c767c4fad1..9ed38972abdb 100644
--- a/drivers/net/phy/aquantia/aquantia.h
+++ b/drivers/net/phy/aquantia/aquantia.h
@@ -10,10 +10,35 @@
 #include <linux/phy.h>
 
 /* Vendor specific 1, MDIO_MMD_VEND1 */
+#define VEND1_GLOBAL_SC				0x0
+#define VEND1_GLOBAL_SC_SOFT_RESET		BIT(15)
+#define VEND1_GLOBAL_SC_LOW_POWER		BIT(11)
+
 #define VEND1_GLOBAL_FW_ID			0x0020
 #define VEND1_GLOBAL_FW_ID_MAJOR		GENMASK(15, 8)
 #define VEND1_GLOBAL_FW_ID_MINOR		GENMASK(7, 0)
 
+#define VEND1_GLOBAL_MAILBOX_INTERFACE1			0x0200
+#define VEND1_GLOBAL_MAILBOX_INTERFACE1_EXECUTE		BIT(15)
+#define VEND1_GLOBAL_MAILBOX_INTERFACE1_WRITE		BIT(14)
+#define VEND1_GLOBAL_MAILBOX_INTERFACE1_CRC_RESET	BIT(12)
+#define VEND1_GLOBAL_MAILBOX_INTERFACE1_BUSY		BIT(8)
+
+#define VEND1_GLOBAL_MAILBOX_INTERFACE2			0x0201
+#define VEND1_GLOBAL_MAILBOX_INTERFACE3			0x0202
+#define VEND1_GLOBAL_MAILBOX_INTERFACE3_MSW_ADDR_MASK	GENMASK(15, 0)
+#define VEND1_GLOBAL_MAILBOX_INTERFACE3_MSW_ADDR(x)	FIELD_PREP(VEND1_GLOBAL_MAILBOX_INTERFACE3_MSW_ADDR_MASK, (u16)((x) >> 16))
+#define VEND1_GLOBAL_MAILBOX_INTERFACE4			0x0203
+#define VEND1_GLOBAL_MAILBOX_INTERFACE4_LSW_ADDR_MASK	GENMASK(15, 2)
+#define VEND1_GLOBAL_MAILBOX_INTERFACE4_LSW_ADDR(x)	FIELD_PREP(VEND1_GLOBAL_MAILBOX_INTERFACE4_LSW_ADDR_MASK, (u16)(x))
+
+#define VEND1_GLOBAL_MAILBOX_INTERFACE5			0x0204
+#define VEND1_GLOBAL_MAILBOX_INTERFACE5_MSW_DATA_MASK	GENMASK(15, 0)
+#define VEND1_GLOBAL_MAILBOX_INTERFACE5_MSW_DATA(x)	FIELD_PREP(VEND1_GLOBAL_MAILBOX_INTERFACE5_MSW_DATA_MASK, (u16)((x) >> 16))
+#define VEND1_GLOBAL_MAILBOX_INTERFACE6			0x0205
+#define VEND1_GLOBAL_MAILBOX_INTERFACE6_LSW_DATA_MASK	GENMASK(15, 0)
+#define VEND1_GLOBAL_MAILBOX_INTERFACE6_LSW_DATA(x)	FIELD_PREP(VEND1_GLOBAL_MAILBOX_INTERFACE6_LSW_DATA_MASK, (u16)(x))
+
 /* The following registers all have similar layouts; first the registers... */
 #define VEND1_GLOBAL_CFG_10M			0x0310
 #define VEND1_GLOBAL_CFG_100M			0x031b
@@ -28,6 +53,11 @@
 #define VEND1_GLOBAL_CFG_RATE_ADAPT_PAUSE	2
 
 /* Vendor specific 1, MDIO_MMD_VEND2 */
+#define VEND1_GLOBAL_CONTROL2			0xc001
+#define VEND1_GLOBAL_CONTROL2_UP_RUN_STALL_RST	BIT(15)
+#define VEND1_GLOBAL_CONTROL2_UP_RUN_STALL_OVD	BIT(6)
+#define VEND1_GLOBAL_CONTROL2_UP_RUN_STALL	BIT(0)
+
 #define VEND1_THERMAL_PROV_HIGH_TEMP_FAIL	0xc421
 #define VEND1_THERMAL_PROV_LOW_TEMP_FAIL	0xc422
 #define VEND1_THERMAL_PROV_HIGH_TEMP_WARN	0xc423
@@ -83,3 +113,5 @@ int aqr_hwmon_probe(struct phy_device *phydev);
 #else
 static inline int aqr_hwmon_probe(struct phy_device *phydev) { return 0; }
 #endif
+
+int aqr_firmware_load(struct phy_device *phydev);
diff --git a/drivers/net/phy/aquantia/aquantia_firmware.c b/drivers/net/phy/aquantia/aquantia_firmware.c
new file mode 100644
index 000000000000..0267ef2a231a
--- /dev/null
+++ b/drivers/net/phy/aquantia/aquantia_firmware.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitfield.h>
+#include <linux/of.h>
+#include <linux/firmware.h>
+#include <linux/crc-ccitt.h>
+#include <linux/nvmem-consumer.h>
+
+#include <asm/unaligned.h>
+
+#include "aquantia.h"
+
+#define UP_RESET_SLEEP		100
+
+/* addresses of memory segments in the phy */
+#define DRAM_BASE_ADDR		0x3FFE0000
+#define IRAM_BASE_ADDR		0x40000000
+
+/* firmware image format constants */
+#define VERSION_STRING_SIZE		0x40
+#define VERSION_STRING_OFFSET		0x0200
+/* primary offset is written at an offset from the start of the fw blob */
+#define PRIMARY_OFFSET_OFFSET		0x8
+/* primary offset needs to be then added to a base offset */
+#define PRIMARY_OFFSET_SHIFT		12
+#define PRIMARY_OFFSET(x)		((x) << PRIMARY_OFFSET_SHIFT)
+#define HEADER_OFFSET			0x300
+
+struct aqr_fw_header {
+	u32 padding;
+	u8 iram_offset[3];
+	u8 iram_size[3];
+	u8 dram_offset[3];
+	u8 dram_size[3];
+} __packed;
+
+enum aqr_fw_src {
+	AQR_FW_SRC_NVMEM = 0,
+	AQR_FW_SRC_FS,
+};
+
+static const char * const aqr_fw_src_string[] = {
+	[AQR_FW_SRC_NVMEM] = "NVMEM",
+	[AQR_FW_SRC_FS] = "FS",
+};
+
+/* AQR firmware doesn't have fixed offsets for iram and dram section
+ * but instead provide an header with the offset to use on reading
+ * and parsing the firmware.
+ *
+ * AQR firmware can't be trusted and each offset is validated to be
+ * not negative and be in the size of the firmware itself.
+ */
+static bool aqr_fw_validate_get(size_t size, size_t offset, size_t get_size)
+{
+	return offset + get_size <= size;
+}
+
+static int aqr_fw_get_be16(const u8 *data, size_t offset, size_t size, u16 *value)
+{
+	if (!aqr_fw_validate_get(size, offset, sizeof(u16)))
+		return -EINVAL;
+
+	*value = get_unaligned_be16(data + offset);
+
+	return 0;
+}
+
+static int aqr_fw_get_le16(const u8 *data, size_t offset, size_t size, u16 *value)
+{
+	if (!aqr_fw_validate_get(size, offset, sizeof(u16)))
+		return -EINVAL;
+
+	*value = get_unaligned_le16(data + offset);
+
+	return 0;
+}
+
+static int aqr_fw_get_le24(const u8 *data, size_t offset, size_t size, u32 *value)
+{
+	if (!aqr_fw_validate_get(size, offset, sizeof(u8) * 3))
+		return -EINVAL;
+
+	*value = get_unaligned_le24(data + offset);
+
+	return 0;
+}
+
+/* load data into the phy's memory */
+static int aqr_fw_load_memory(struct phy_device *phydev, u32 addr,
+			      const u8 *data, size_t len)
+{
+	u16 crc = 0, up_crc;
+	size_t pos;
+
+	/* PHY expect addr in LE */
+	addr = cpu_to_le32(addr);
+
+	phy_write_mmd(phydev, MDIO_MMD_VEND1,
+		      VEND1_GLOBAL_MAILBOX_INTERFACE1,
+		      VEND1_GLOBAL_MAILBOX_INTERFACE1_CRC_RESET);
+	phy_write_mmd(phydev, MDIO_MMD_VEND1,
+		      VEND1_GLOBAL_MAILBOX_INTERFACE3,
+		      VEND1_GLOBAL_MAILBOX_INTERFACE3_MSW_ADDR(addr));
+	phy_write_mmd(phydev, MDIO_MMD_VEND1,
+		      VEND1_GLOBAL_MAILBOX_INTERFACE4,
+		      VEND1_GLOBAL_MAILBOX_INTERFACE4_LSW_ADDR(addr));
+
+	/* We assume and enforce the size to be word aligned.
+	 * If a firmware that is not word aligned is found, please report upstream.
+	 */
+	for (pos = 0; pos < len; pos += sizeof(u32)) {
+		u32 word = get_unaligned((const u32 *)(data + pos));
+
+		phy_write_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_MAILBOX_INTERFACE5,
+			      VEND1_GLOBAL_MAILBOX_INTERFACE5_MSW_DATA(word));
+		phy_write_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_MAILBOX_INTERFACE6,
+			      VEND1_GLOBAL_MAILBOX_INTERFACE6_LSW_DATA(word));
+
+		phy_write_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_MAILBOX_INTERFACE1,
+			      VEND1_GLOBAL_MAILBOX_INTERFACE1_EXECUTE |
+			      VEND1_GLOBAL_MAILBOX_INTERFACE1_WRITE);
+
+		/* calculate CRC as we load data to the mailbox.
+		 * We convert word to big-endiang as PHY is BE and mailbox will
+		 * return a BE CRC.
+		 */
+		word = cpu_to_be32(word);
+		crc = crc_ccitt_false(crc, (u8 *)&word, sizeof(word));
+	}
+
+	up_crc = phy_read_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_MAILBOX_INTERFACE2);
+	if (crc != up_crc) {
+		phydev_err(phydev, "CRC mismatch: calculated 0x%04x PHY 0x%04x\n",
+			   crc, up_crc);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int aqr_fw_boot(struct phy_device *phydev, const u8 *data, size_t size,
+		       enum aqr_fw_src fw_src)
+{
+	u16 calculated_crc, read_crc, read_primary_offset;
+	u32 iram_offset = 0, iram_size = 0;
+	u32 dram_offset = 0, dram_size = 0;
+	char version[VERSION_STRING_SIZE];
+	u32 primary_offset = 0;
+	int ret;
+
+	/* extract saved CRC at the end of the fw
+	 * CRC is saved in big-endian as PHY is BE
+	 */
+	ret = aqr_fw_get_be16(data, size - sizeof(u16), size, &read_crc);
+	if (ret) {
+		phydev_err(phydev, "bad firmware CRC in firmware\n");
+		return ret;
+	}
+	calculated_crc = crc_ccitt_false(0, data, size - sizeof(u16));
+	if (read_crc != calculated_crc) {
+		phydev_err(phydev, "bad firmware CRC: file 0x%04x calculated 0x%04x\n",
+			   read_crc, calculated_crc);
+		return -EINVAL;
+	}
+
+	/* Get the primary offset to extract DRAM and IRAM sections. */
+	ret = aqr_fw_get_le16(data, PRIMARY_OFFSET_OFFSET, size, &read_primary_offset);
+	if (ret) {
+		phydev_err(phydev, "bad primary offset in firmware\n");
+		return ret;
+	}
+	primary_offset = PRIMARY_OFFSET(read_primary_offset);
+
+	/* Find the DRAM and IRAM sections within the firmware file.
+	 * Make sure the fw_header is correctly in the firmware.
+	 */
+	if (!aqr_fw_validate_get(size, primary_offset + HEADER_OFFSET,
+				 sizeof(struct aqr_fw_header))) {
+		phydev_err(phydev, "bad fw_header in firmware\n");
+		return -EINVAL;
+	}
+
+	/* offset are in LE and values needs to be converted to cpu endian */
+	ret = aqr_fw_get_le24(data, primary_offset + HEADER_OFFSET +
+			      offsetof(struct aqr_fw_header, iram_offset),
+			      size, &iram_offset);
+	if (ret) {
+		phydev_err(phydev, "bad iram offset in firmware\n");
+		return ret;
+	}
+	ret = aqr_fw_get_le24(data, primary_offset + HEADER_OFFSET +
+			      offsetof(struct aqr_fw_header, iram_size),
+			      size, &iram_size);
+	if (ret) {
+		phydev_err(phydev, "invalid iram size in firmware\n");
+		return ret;
+	}
+	ret = aqr_fw_get_le24(data, primary_offset + HEADER_OFFSET +
+			      offsetof(struct aqr_fw_header, dram_offset),
+			      size, &dram_offset);
+	if (ret) {
+		phydev_err(phydev, "bad dram offset in firmware\n");
+		return ret;
+	}
+	ret = aqr_fw_get_le24(data, primary_offset + HEADER_OFFSET +
+			      offsetof(struct aqr_fw_header, dram_size),
+			      size, &dram_size);
+	if (ret) {
+		phydev_err(phydev, "invalid dram size in firmware\n");
+		return ret;
+	}
+
+	/* Increment the offset with the primary offset.
+	 * Validate iram/dram offset and size.
+	 */
+	iram_offset += primary_offset;
+	if (iram_size % sizeof(u32)) {
+		phydev_err(phydev, "iram size if not aligned to word size. Please report this upstream!\n");
+		return -EINVAL;
+	}
+	if (!aqr_fw_validate_get(size, iram_offset, iram_size)) {
+		phydev_err(phydev, "invalid iram offset for iram size\n");
+		return -EINVAL;
+	}
+
+	dram_offset += primary_offset;
+	if (dram_size % sizeof(u32)) {
+		phydev_err(phydev, "dram size if not aligned to word size. Please report this upstream!\n");
+		return -EINVAL;
+	}
+	if (!aqr_fw_validate_get(size, dram_offset, dram_size)) {
+		phydev_err(phydev, "invalid iram offset for iram size\n");
+		return -EINVAL;
+	}
+
+	phydev_dbg(phydev, "primary %d IRAM offset=%d size=%d DRAM offset=%d size=%d\n",
+		   primary_offset, iram_offset, iram_size, dram_offset, dram_size);
+
+	if (!aqr_fw_validate_get(size, dram_offset + VERSION_STRING_OFFSET,
+				 VERSION_STRING_SIZE)) {
+		phydev_err(phydev, "invalid version in firmware\n");
+		return -EINVAL;
+	}
+	strscpy(version, (char *)data + dram_offset + VERSION_STRING_OFFSET,
+		VERSION_STRING_SIZE);
+	if (version[0] == '\0') {
+		phydev_err(phydev, "invalid version in firmware\n");
+		return -EINVAL;
+	}
+	phydev_info(phydev, "loading firmware version '%s' from '%s'\n", version,
+		    aqr_fw_src_string[fw_src]);
+
+	/* stall the microcprocessor */
+	phy_write_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_CONTROL2,
+		      VEND1_GLOBAL_CONTROL2_UP_RUN_STALL | VEND1_GLOBAL_CONTROL2_UP_RUN_STALL_OVD);
+
+	phydev_dbg(phydev, "loading DRAM 0x%08x from offset=%d size=%d\n",
+		   DRAM_BASE_ADDR, dram_offset, dram_size);
+	ret = aqr_fw_load_memory(phydev, DRAM_BASE_ADDR, data + dram_offset,
+				 dram_size);
+	if (ret)
+		return ret;
+
+	phydev_dbg(phydev, "loading IRAM 0x%08x from offset=%d size=%d\n",
+		   IRAM_BASE_ADDR, iram_offset, iram_size);
+	ret = aqr_fw_load_memory(phydev, IRAM_BASE_ADDR, data + iram_offset,
+				 iram_size);
+	if (ret)
+		return ret;
+
+	/* make sure soft reset and low power mode are clear */
+	phy_clear_bits_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_SC,
+			   VEND1_GLOBAL_SC_SOFT_RESET | VEND1_GLOBAL_SC_LOW_POWER);
+
+	/* Release the microprocessor. UP_RESET must be held for 100 usec. */
+	phy_write_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_CONTROL2,
+		      VEND1_GLOBAL_CONTROL2_UP_RUN_STALL |
+		      VEND1_GLOBAL_CONTROL2_UP_RUN_STALL_OVD |
+		      VEND1_GLOBAL_CONTROL2_UP_RUN_STALL_RST);
+	usleep_range(UP_RESET_SLEEP, UP_RESET_SLEEP * 2);
+
+	phy_write_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_CONTROL2,
+		      VEND1_GLOBAL_CONTROL2_UP_RUN_STALL_OVD);
+
+	return 0;
+}
+
+static int aqr_firmware_load_nvmem(struct phy_device *phydev)
+{
+	struct nvmem_cell *cell;
+	size_t size;
+	u8 *buf;
+	int ret;
+
+	cell = nvmem_cell_get(&phydev->mdio.dev, "firmware");
+	if (IS_ERR(cell))
+		return PTR_ERR(cell);
+
+	buf = nvmem_cell_read(cell, &size);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto exit;
+	}
+
+	ret = aqr_fw_boot(phydev, buf, size, AQR_FW_SRC_NVMEM);
+	if (ret)
+		phydev_err(phydev, "firmware loading failed: %d\n", ret);
+
+exit:
+	nvmem_cell_put(cell);
+
+	return ret;
+}
+
+static int aqr_firmware_load_fs(struct phy_device *phydev)
+{
+	struct device *dev = &phydev->mdio.dev;
+	const struct firmware *fw;
+	const char *fw_name;
+	int ret;
+
+	ret = of_property_read_string(dev->of_node, "firmware-name",
+				      &fw_name);
+	if (ret)
+		return ret;
+
+	ret = request_firmware(&fw, fw_name, dev);
+	if (ret) {
+		phydev_err(phydev, "failed to find FW file %s (%d)\n",
+			   fw_name, ret);
+		goto exit;
+	}
+
+	ret = aqr_fw_boot(phydev, fw->data, fw->size, AQR_FW_SRC_FS);
+	if (ret)
+		phydev_err(phydev, "firmware loading failed: %d\n", ret);
+
+exit:
+	release_firmware(fw);
+
+	return ret;
+}
+
+int aqr_firmware_load(struct phy_device *phydev)
+{
+	int ret;
+
+	/* Check if the firmware is not already loaded by pooling
+	 * the current version returned by the PHY. If 0 is returned,
+	 * no firmware is loaded.
+	 */
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_FW_ID);
+	if (ret > 0)
+		goto exit;
+
+	ret = aqr_firmware_load_nvmem(phydev);
+	if (!ret)
+		goto exit;
+
+	ret = aqr_firmware_load_fs(phydev);
+	if (ret)
+		return ret;
+
+exit:
+	return 0;
+}
diff --git a/drivers/net/phy/aquantia/aquantia_main.c b/drivers/net/phy/aquantia/aquantia_main.c
index 4498426e9a52..cc4a97741c4a 100644
--- a/drivers/net/phy/aquantia/aquantia_main.c
+++ b/drivers/net/phy/aquantia/aquantia_main.c
@@ -658,11 +658,17 @@ static int aqr107_resume(struct phy_device *phydev)
 
 static int aqr107_probe(struct phy_device *phydev)
 {
+	int ret;
+
 	phydev->priv = devm_kzalloc(&phydev->mdio.dev,
 				    sizeof(struct aqr107_priv), GFP_KERNEL);
 	if (!phydev->priv)
 		return -ENOMEM;
 
+	ret = aqr_firmware_load(phydev);
+	if (ret)
+		return ret;
+
 	return aqr_hwmon_probe(phydev);
 }
 
-- 
2.40.1


^ permalink raw reply related

* [net-next RFC PATCH v6 2/4] net: phy: aquantia: move MMD_VEND define to header
From: Christian Marangi @ 2023-11-09 12:32 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Andrew Lunn,
	Heiner Kallweit, Russell King, Christian Marangi, Robert Marko,
	Vladimir Oltean, netdev, devicetree, linux-kernel
In-Reply-To: <20231109123253.3933-1-ansuelsmth@gmail.com>

Move MMD_VEND define to header to clean things up and in preparation for
firmware loading support that require some define placed in
aquantia_main.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
---
Changes v4:
- Add Reviewed-by tag
Changes v3:
- Add this patch

 drivers/net/phy/aquantia/aquantia.h       | 69 +++++++++++++++++++++++
 drivers/net/phy/aquantia/aquantia_hwmon.c | 14 -----
 drivers/net/phy/aquantia/aquantia_main.c  | 55 ------------------
 3 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/drivers/net/phy/aquantia/aquantia.h b/drivers/net/phy/aquantia/aquantia.h
index c684b65c642c..f0c767c4fad1 100644
--- a/drivers/net/phy/aquantia/aquantia.h
+++ b/drivers/net/phy/aquantia/aquantia.h
@@ -9,6 +9,75 @@
 #include <linux/device.h>
 #include <linux/phy.h>
 
+/* Vendor specific 1, MDIO_MMD_VEND1 */
+#define VEND1_GLOBAL_FW_ID			0x0020
+#define VEND1_GLOBAL_FW_ID_MAJOR		GENMASK(15, 8)
+#define VEND1_GLOBAL_FW_ID_MINOR		GENMASK(7, 0)
+
+/* The following registers all have similar layouts; first the registers... */
+#define VEND1_GLOBAL_CFG_10M			0x0310
+#define VEND1_GLOBAL_CFG_100M			0x031b
+#define VEND1_GLOBAL_CFG_1G			0x031c
+#define VEND1_GLOBAL_CFG_2_5G			0x031d
+#define VEND1_GLOBAL_CFG_5G			0x031e
+#define VEND1_GLOBAL_CFG_10G			0x031f
+/* ...and now the fields */
+#define VEND1_GLOBAL_CFG_RATE_ADAPT		GENMASK(8, 7)
+#define VEND1_GLOBAL_CFG_RATE_ADAPT_NONE	0
+#define VEND1_GLOBAL_CFG_RATE_ADAPT_USX		1
+#define VEND1_GLOBAL_CFG_RATE_ADAPT_PAUSE	2
+
+/* Vendor specific 1, MDIO_MMD_VEND2 */
+#define VEND1_THERMAL_PROV_HIGH_TEMP_FAIL	0xc421
+#define VEND1_THERMAL_PROV_LOW_TEMP_FAIL	0xc422
+#define VEND1_THERMAL_PROV_HIGH_TEMP_WARN	0xc423
+#define VEND1_THERMAL_PROV_LOW_TEMP_WARN	0xc424
+#define VEND1_THERMAL_STAT1			0xc820
+#define VEND1_THERMAL_STAT2			0xc821
+#define VEND1_THERMAL_STAT2_VALID		BIT(0)
+#define VEND1_GENERAL_STAT1			0xc830
+#define VEND1_GENERAL_STAT1_HIGH_TEMP_FAIL	BIT(14)
+#define VEND1_GENERAL_STAT1_LOW_TEMP_FAIL	BIT(13)
+#define VEND1_GENERAL_STAT1_HIGH_TEMP_WARN	BIT(12)
+#define VEND1_GENERAL_STAT1_LOW_TEMP_WARN	BIT(11)
+
+#define VEND1_GLOBAL_GEN_STAT2			0xc831
+#define VEND1_GLOBAL_GEN_STAT2_OP_IN_PROG	BIT(15)
+
+#define VEND1_GLOBAL_RSVD_STAT1			0xc885
+#define VEND1_GLOBAL_RSVD_STAT1_FW_BUILD_ID	GENMASK(7, 4)
+#define VEND1_GLOBAL_RSVD_STAT1_PROV_ID		GENMASK(3, 0)
+
+#define VEND1_GLOBAL_RSVD_STAT9			0xc88d
+#define VEND1_GLOBAL_RSVD_STAT9_MODE		GENMASK(7, 0)
+#define VEND1_GLOBAL_RSVD_STAT9_1000BT2		0x23
+
+#define VEND1_GLOBAL_INT_STD_STATUS		0xfc00
+#define VEND1_GLOBAL_INT_VEND_STATUS		0xfc01
+
+#define VEND1_GLOBAL_INT_STD_MASK		0xff00
+#define VEND1_GLOBAL_INT_STD_MASK_PMA1		BIT(15)
+#define VEND1_GLOBAL_INT_STD_MASK_PMA2		BIT(14)
+#define VEND1_GLOBAL_INT_STD_MASK_PCS1		BIT(13)
+#define VEND1_GLOBAL_INT_STD_MASK_PCS2		BIT(12)
+#define VEND1_GLOBAL_INT_STD_MASK_PCS3		BIT(11)
+#define VEND1_GLOBAL_INT_STD_MASK_PHY_XS1	BIT(10)
+#define VEND1_GLOBAL_INT_STD_MASK_PHY_XS2	BIT(9)
+#define VEND1_GLOBAL_INT_STD_MASK_AN1		BIT(8)
+#define VEND1_GLOBAL_INT_STD_MASK_AN2		BIT(7)
+#define VEND1_GLOBAL_INT_STD_MASK_GBE		BIT(6)
+#define VEND1_GLOBAL_INT_STD_MASK_ALL		BIT(0)
+
+#define VEND1_GLOBAL_INT_VEND_MASK		0xff01
+#define VEND1_GLOBAL_INT_VEND_MASK_PMA		BIT(15)
+#define VEND1_GLOBAL_INT_VEND_MASK_PCS		BIT(14)
+#define VEND1_GLOBAL_INT_VEND_MASK_PHY_XS	BIT(13)
+#define VEND1_GLOBAL_INT_VEND_MASK_AN		BIT(12)
+#define VEND1_GLOBAL_INT_VEND_MASK_GBE		BIT(11)
+#define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL1	BIT(2)
+#define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL2	BIT(1)
+#define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL3	BIT(0)
+
 #if IS_REACHABLE(CONFIG_HWMON)
 int aqr_hwmon_probe(struct phy_device *phydev);
 #else
diff --git a/drivers/net/phy/aquantia/aquantia_hwmon.c b/drivers/net/phy/aquantia/aquantia_hwmon.c
index 0da451e46f69..7b3c49c3bf49 100644
--- a/drivers/net/phy/aquantia/aquantia_hwmon.c
+++ b/drivers/net/phy/aquantia/aquantia_hwmon.c
@@ -13,20 +13,6 @@
 
 #include "aquantia.h"
 
-/* Vendor specific 1, MDIO_MMD_VEND2 */
-#define VEND1_THERMAL_PROV_HIGH_TEMP_FAIL	0xc421
-#define VEND1_THERMAL_PROV_LOW_TEMP_FAIL	0xc422
-#define VEND1_THERMAL_PROV_HIGH_TEMP_WARN	0xc423
-#define VEND1_THERMAL_PROV_LOW_TEMP_WARN	0xc424
-#define VEND1_THERMAL_STAT1			0xc820
-#define VEND1_THERMAL_STAT2			0xc821
-#define VEND1_THERMAL_STAT2_VALID		BIT(0)
-#define VEND1_GENERAL_STAT1			0xc830
-#define VEND1_GENERAL_STAT1_HIGH_TEMP_FAIL	BIT(14)
-#define VEND1_GENERAL_STAT1_LOW_TEMP_FAIL	BIT(13)
-#define VEND1_GENERAL_STAT1_HIGH_TEMP_WARN	BIT(12)
-#define VEND1_GENERAL_STAT1_LOW_TEMP_WARN	BIT(11)
-
 #if IS_REACHABLE(CONFIG_HWMON)
 
 static umode_t aqr_hwmon_is_visible(const void *data,
diff --git a/drivers/net/phy/aquantia/aquantia_main.c b/drivers/net/phy/aquantia/aquantia_main.c
index 334a6904ca5a..4498426e9a52 100644
--- a/drivers/net/phy/aquantia/aquantia_main.c
+++ b/drivers/net/phy/aquantia/aquantia_main.c
@@ -91,61 +91,6 @@
 #define MDIO_C22EXT_STAT_SGMII_TX_FRAME_ALIGN_ERR	0xd31a
 #define MDIO_C22EXT_STAT_SGMII_TX_RUNT_FRAMES		0xd31b
 
-/* Vendor specific 1, MDIO_MMD_VEND1 */
-#define VEND1_GLOBAL_FW_ID			0x0020
-#define VEND1_GLOBAL_FW_ID_MAJOR		GENMASK(15, 8)
-#define VEND1_GLOBAL_FW_ID_MINOR		GENMASK(7, 0)
-
-#define VEND1_GLOBAL_GEN_STAT2			0xc831
-#define VEND1_GLOBAL_GEN_STAT2_OP_IN_PROG	BIT(15)
-
-/* The following registers all have similar layouts; first the registers... */
-#define VEND1_GLOBAL_CFG_10M			0x0310
-#define VEND1_GLOBAL_CFG_100M			0x031b
-#define VEND1_GLOBAL_CFG_1G			0x031c
-#define VEND1_GLOBAL_CFG_2_5G			0x031d
-#define VEND1_GLOBAL_CFG_5G			0x031e
-#define VEND1_GLOBAL_CFG_10G			0x031f
-/* ...and now the fields */
-#define VEND1_GLOBAL_CFG_RATE_ADAPT		GENMASK(8, 7)
-#define VEND1_GLOBAL_CFG_RATE_ADAPT_NONE	0
-#define VEND1_GLOBAL_CFG_RATE_ADAPT_USX		1
-#define VEND1_GLOBAL_CFG_RATE_ADAPT_PAUSE	2
-
-#define VEND1_GLOBAL_RSVD_STAT1			0xc885
-#define VEND1_GLOBAL_RSVD_STAT1_FW_BUILD_ID	GENMASK(7, 4)
-#define VEND1_GLOBAL_RSVD_STAT1_PROV_ID		GENMASK(3, 0)
-
-#define VEND1_GLOBAL_RSVD_STAT9			0xc88d
-#define VEND1_GLOBAL_RSVD_STAT9_MODE		GENMASK(7, 0)
-#define VEND1_GLOBAL_RSVD_STAT9_1000BT2		0x23
-
-#define VEND1_GLOBAL_INT_STD_STATUS		0xfc00
-#define VEND1_GLOBAL_INT_VEND_STATUS		0xfc01
-
-#define VEND1_GLOBAL_INT_STD_MASK		0xff00
-#define VEND1_GLOBAL_INT_STD_MASK_PMA1		BIT(15)
-#define VEND1_GLOBAL_INT_STD_MASK_PMA2		BIT(14)
-#define VEND1_GLOBAL_INT_STD_MASK_PCS1		BIT(13)
-#define VEND1_GLOBAL_INT_STD_MASK_PCS2		BIT(12)
-#define VEND1_GLOBAL_INT_STD_MASK_PCS3		BIT(11)
-#define VEND1_GLOBAL_INT_STD_MASK_PHY_XS1	BIT(10)
-#define VEND1_GLOBAL_INT_STD_MASK_PHY_XS2	BIT(9)
-#define VEND1_GLOBAL_INT_STD_MASK_AN1		BIT(8)
-#define VEND1_GLOBAL_INT_STD_MASK_AN2		BIT(7)
-#define VEND1_GLOBAL_INT_STD_MASK_GBE		BIT(6)
-#define VEND1_GLOBAL_INT_STD_MASK_ALL		BIT(0)
-
-#define VEND1_GLOBAL_INT_VEND_MASK		0xff01
-#define VEND1_GLOBAL_INT_VEND_MASK_PMA		BIT(15)
-#define VEND1_GLOBAL_INT_VEND_MASK_PCS		BIT(14)
-#define VEND1_GLOBAL_INT_VEND_MASK_PHY_XS	BIT(13)
-#define VEND1_GLOBAL_INT_VEND_MASK_AN		BIT(12)
-#define VEND1_GLOBAL_INT_VEND_MASK_GBE		BIT(11)
-#define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL1	BIT(2)
-#define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL2	BIT(1)
-#define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL3	BIT(0)
-
 /* Sleep and timeout for checking if the Processor-Intensive
  * MDIO operation is finished
  */
-- 
2.40.1


^ permalink raw reply related

* [net-next RFC PATCH v6 1/4] net: phy: aquantia: move to separate directory
From: Christian Marangi @ 2023-11-09 12:32 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Andrew Lunn,
	Heiner Kallweit, Russell King, Christian Marangi, Robert Marko,
	Vladimir Oltean, netdev, devicetree, linux-kernel

Move aquantia PHY driver to separate driectory in preparation for
firmware loading support to keep things tidy.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
---
Changes v4:
- Keep order for kconfig config
Changes v3:
- Add this patch

 drivers/net/phy/Kconfig                         | 5 +----
 drivers/net/phy/Makefile                        | 6 +-----
 drivers/net/phy/aquantia/Kconfig                | 5 +++++
 drivers/net/phy/aquantia/Makefile               | 6 ++++++
 drivers/net/phy/{ => aquantia}/aquantia.h       | 0
 drivers/net/phy/{ => aquantia}/aquantia_hwmon.c | 0
 drivers/net/phy/{ => aquantia}/aquantia_main.c  | 0
 7 files changed, 13 insertions(+), 9 deletions(-)
 create mode 100644 drivers/net/phy/aquantia/Kconfig
 create mode 100644 drivers/net/phy/aquantia/Makefile
 rename drivers/net/phy/{ => aquantia}/aquantia.h (100%)
 rename drivers/net/phy/{ => aquantia}/aquantia_hwmon.c (100%)
 rename drivers/net/phy/{ => aquantia}/aquantia_main.c (100%)

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 421d2b62918f..25cfc5ded1da 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -96,10 +96,7 @@ config ADIN1100_PHY
 	  Currently supports the:
 	  - ADIN1100 - Robust,Industrial, Low Power 10BASE-T1L Ethernet PHY
 
-config AQUANTIA_PHY
-	tristate "Aquantia PHYs"
-	help
-	  Currently supports the Aquantia AQ1202, AQ2104, AQR105, AQR405
+source "drivers/net/phy/aquantia/Kconfig"
 
 config AX88796B_PHY
 	tristate "Asix PHYs"
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index c945ed9bd14b..f65e85c91fc1 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -35,11 +35,7 @@ obj-y				+= $(sfp-obj-y) $(sfp-obj-m)
 obj-$(CONFIG_ADIN_PHY)		+= adin.o
 obj-$(CONFIG_ADIN1100_PHY)	+= adin1100.o
 obj-$(CONFIG_AMD_PHY)		+= amd.o
-aquantia-objs			+= aquantia_main.o
-ifdef CONFIG_HWMON
-aquantia-objs			+= aquantia_hwmon.o
-endif
-obj-$(CONFIG_AQUANTIA_PHY)	+= aquantia.o
+obj-$(CONFIG_AQUANTIA_PHY)	+= aquantia/
 obj-$(CONFIG_AT803X_PHY)	+= at803x.o
 obj-$(CONFIG_AX88796B_PHY)	+= ax88796b.o
 obj-$(CONFIG_BCM54140_PHY)	+= bcm54140.o
diff --git a/drivers/net/phy/aquantia/Kconfig b/drivers/net/phy/aquantia/Kconfig
new file mode 100644
index 000000000000..226146417a6a
--- /dev/null
+++ b/drivers/net/phy/aquantia/Kconfig
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config AQUANTIA_PHY
+	tristate "Aquantia PHYs"
+	help
+	  Currently supports the Aquantia AQ1202, AQ2104, AQR105, AQR405
diff --git a/drivers/net/phy/aquantia/Makefile b/drivers/net/phy/aquantia/Makefile
new file mode 100644
index 000000000000..346f350bc084
--- /dev/null
+++ b/drivers/net/phy/aquantia/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+aquantia-objs			+= aquantia_main.o
+ifdef CONFIG_HWMON
+aquantia-objs			+= aquantia_hwmon.o
+endif
+obj-$(CONFIG_AQUANTIA_PHY)	+= aquantia.o
diff --git a/drivers/net/phy/aquantia.h b/drivers/net/phy/aquantia/aquantia.h
similarity index 100%
rename from drivers/net/phy/aquantia.h
rename to drivers/net/phy/aquantia/aquantia.h
diff --git a/drivers/net/phy/aquantia_hwmon.c b/drivers/net/phy/aquantia/aquantia_hwmon.c
similarity index 100%
rename from drivers/net/phy/aquantia_hwmon.c
rename to drivers/net/phy/aquantia/aquantia_hwmon.c
diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia/aquantia_main.c
similarity index 100%
rename from drivers/net/phy/aquantia_main.c
rename to drivers/net/phy/aquantia/aquantia_main.c
-- 
2.40.1


^ permalink raw reply related

* Re: [RFC Draft PATCHv2 net-next] Doc: update bridge doc
From: Hangbin Liu @ 2023-11-09 13:04 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: netdev, David S . Miller, David Ahern, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Ido Schimmel, Roopa Prabhu,
	Stephen Hemminger
In-Reply-To: <68045f82-4306-165b-c4b2-96432cc8c422@blackwall.org>

Hi Nikolay,

On Wed, Nov 01, 2023 at 01:29:34PM +0200, Nikolay Aleksandrov wrote:
> > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> > index fac351a93aed..6adc0c70e345 100644
> > --- a/include/uapi/linux/if_link.h
> > +++ b/include/uapi/linux/if_link.h
> > @@ -461,6 +461,238 @@ enum in6_addr_gen_mode {
> >   /* Bridge section */
> > +/**
> > + * DOC: The bridge emum defination
> 
> s/emum/enum/
> 
> Below the time is not in seconds though. It is expected in clock_t (seconds
> multiplied by USER_HZ) and also exported. That should be
> better explained as it has caused confusion a lot.

I have done most of the update. But for this I'm not sure how to update.
Should I explain the clock_t defination before all the enum definations?
And how should I describe the time in the enums?
> > + *
> > + * @IFLA_BR_MCAST_LAST_MEMBER_CNT
> > + *   Set multicast last member count, ie the number of queries the bridge
> > + *   will send before stopping forwarding a multicast group after a "leave"
> > + *   message has been received.
> 
> This needs to be explained better. Remove "ie", "It is the number of queries
> the bridge will send", this part needs to be extended what are these queries
> and are they group-specific or general etc. The interval
> and time values below need better explanations of their units and what
> they represent in general. I won't add a comment below each.
> 
> Also please remove the extra new lines between the comments and the
> definitions.

And this one. Which extra new lines do you mean? Are you meaning no blank line
after each enum? e.g.

@IFLA_BR_...
  some comments
@IFLA_BR_...
  some comments

Thanks
Hangbin

^ permalink raw reply

* Re: [PATCH net] net: ti: icss-iep: fix setting counter value
From: patchwork-bot+netdevbpf @ 2023-11-09 12:50 UTC (permalink / raw)
  To: Diogo Ivo
  Cc: davem, edumazet, kuba, pabeni, horms, danishanwar, vigneshr,
	rogerq, grygorii.strashko, m-karicheri2, jan.kiszka, netdev,
	baocheng.su
In-Reply-To: <20231107120037.1513546-1-diogo.ivo@siemens.com>

Hello:

This patch was applied to netdev/net.git (main)
by Paolo Abeni <pabeni@redhat.com>:

On Tue,  7 Nov 2023 12:00:36 +0000 you wrote:
> Currently icss_iep_set_counter() writes the upper 32-bits of the
> counter value to both the lower and upper counter registers, so
> fix this by writing the appropriate value to the lower register.
> 
> Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver")
> Signed-off-by: Diogo Ivo <diogo.ivo@siemens.com>
> 
> [...]

Here is the summary with links:
  - [net] net: ti: icss-iep: fix setting counter value
    https://git.kernel.org/netdev/net/c/83b9dda8afa4

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net v4 0/3] Fix large frames in the Gemini ethernet driver
From: Linus Walleij @ 2023-11-09 12:46 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: Hans Ulli Kroll, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Michał Mirosław, Andrew Lunn,
	linux-arm-kernel, netdev, linux-kernel
In-Reply-To: <20231109105037.zppxrr3bptd7a7i6@skbuf>

On Thu, Nov 9, 2023 at 11:50 AM Vladimir Oltean <olteanv@gmail.com> wrote:

> Thanks for being persistent with this! I hope we didn't miss today's
> "net" pull request :)

Hey thanks for one of the best review cycles I've ever had, really
really appreciated.

It's more important to be correct than to be fast so I don't worry
much about when the patch goes in.

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH net v4 0/3] Fix large frames in the Gemini ethernet driver
From: Paolo Abeni @ 2023-11-09 12:26 UTC (permalink / raw)
  To: Vladimir Oltean, Linus Walleij
  Cc: Hans Ulli Kroll, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Michał Mirosław, Andrew Lunn, linux-arm-kernel, netdev,
	linux-kernel
In-Reply-To: <20231109105037.zppxrr3bptd7a7i6@skbuf>

On Thu, 2023-11-09 at 12:50 +0200, Vladimir Oltean wrote:
> On Thu, Nov 09, 2023 at 10:03:11AM +0100, Linus Walleij wrote:
> > This is the result of a bug hunt for a problem with the
> > RTL8366RB DSA switch leading me wrong all over the place.
> > 
> > I am indebted to Vladimir Oltean who as usual pointed
> > out where the real problem was, many thanks!
> > 
> > Tryig to actually use big ("jumbo") frames on this
> > hardware uncovered the real bugs. Then I tested it on
> > the DSA switch and it indeed fixes the issue.
> > 
> > To make sure it also works fine with big frames on
> > non-DSA devices I also copied a large video file over
> > scp to a device with maximum frame size, the data
> > was transported in large TCP packets ending up in
> > 0x7ff sized frames using software checksumming at
> > ~2.0 MB/s.
> > 
> > If I set down the MTU to the standard 1500 bytes so
> > that hardware checksumming is used, the scp transfer
> > of the same file was slightly lower, ~1.8-1.9 MB/s.
> > 
> > Despite this not being the best test it shows that
> > we can now stress the hardware with large frames
> > and that software checksum works fine.
> > 
> > Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
> > ---
> 
> Thanks for being persistent with this! I hope we didn't miss today's
> "net" pull request :)

I fear this is a bit too late for today's PR. I hope it should not be a
big problem, since we are very early in the release cycle.

Cheers,

Paolo


^ permalink raw reply

* Re: [PATCH v2 net 5/7] net/sched: taprio: fix delayed switching to new schedule after timer expiry
From: Vladimir Oltean @ 2023-11-09 12:24 UTC (permalink / raw)
  To: Faizal Rahim
  Cc: Vinicius Costa Gomes, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev, linux-kernel
In-Reply-To: <20231107112023.676016-6-faizal.abdul.rahim@linux.intel.com>

On Tue, Nov 07, 2023 at 06:20:21AM -0500, Faizal Rahim wrote:
>  static void taprio_start_sched(struct Qdisc *sch,
> -			       ktime_t start, struct sched_gate_list *new)
> +			       ktime_t new_base_time,
> +			       struct sched_gate_list *new)
>  {
>  	struct taprio_sched *q = qdisc_priv(sch);
> -	ktime_t expires;
> +	struct sched_gate_list *oper = NULL;

No point in initializing with NULL if this will be overwritten later.

> +	ktime_t expires, start;
>  
>  	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
>  		return;
>  
> +	oper = rcu_dereference_protected(q->oper_sched,
> +					 lockdep_is_held(&q->current_entry_lock));
> +

^ permalink raw reply

* Re: [PATCH 15/22] arch: vdso: consolidate gettime prototypes
From: Michael Ellerman @ 2023-11-09 12:23 UTC (permalink / raw)
  To: Christophe Leroy, Arnd Bergmann, Arnd Bergmann, Andrew Morton,
	linux-kernel@vger.kernel.org, Masahiro Yamada,
	linux-kbuild@vger.kernel.org
  Cc: Matt Turner, Vineet Gupta, Russell King, Catalin Marinas,
	Will Deacon, Steven Rostedt, Masami Hiramatsu, Mark Rutland,
	guoren, Peter Zijlstra, Ard Biesheuvel, Huacai Chen, Greg Ungerer,
	Michal Simek, Thomas Bogendoerfer, Dinh Nguyen, Nicholas Piggin,
	Geoff Levand, Palmer Dabbelt, Heiko Carstens,
	John Paul Adrian Glaubitz, David S . Miller, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, x86@kernel.org, Helge Deller,
	Sudip Mukherjee, Greg Kroah-Hartman, Timur Tabi, Kent Overstreet,
	David Woodhouse, Naveen N. Rao, Anil S Keshavamurthy, Kees Cook,
	Vincenzo Frascino, Juri Lelli, Vincent Guittot, Nathan Chancellor,
	Nick Desaulniers, Nicolas Schier, Alexander Viro,
	Uwe Kleine-König, linux-alpha@vger.kernel.org,
	linux-snps-arc@lists.infradead.org,
	linux-arm-kernel@lists.infradead.org,
	linux-trace-kernel@vger.kernel.org, linux-csky@vger.kernel.org,
	loongarch@lists.linux.dev, linux-m68k@lists.linux-m68k.org,
	linux-mips@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	linux-riscv@lists.infradead.org, linux-s390@vger.kernel.org,
	linux-sh@vger.kernel.org, sparclinux@vger.kernel.org, Netdev,
	linux-parisc@vger.kernel.org, linux-usb@vger.kernel.org,
	linux-fbdev@vger.kernel.org, dri-devel@lists.freedesktop.org,
	linux-bcachefs@vger.kernel.org, linux-mtd@lists.infradead.org
In-Reply-To: <886df4e4-9fc2-ca52-e7e9-53688e6e821a@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 09/11/2023 à 11:18, Michael Ellerman a écrit :
>> "Arnd Bergmann" <arnd@arndb.de> writes:
>>> On Wed, Nov 8, 2023, at 19:31, Christophe Leroy wrote:
>>>> Le 08/11/2023 à 13:58, Arnd Bergmann a écrit :
>>>
>>>> powerpc has functions doing more or less the same, they are called
>>>> __c_kernel_clock_gettime() and alike with their prototypes siting in
>>>> arch/powerpc/include/asm/vdso/gettimeofday.h
>>>>
>>>> Should those prototypes be moved to include/vdso/gettime.h too and
>>>> eventually renamed, or are they considered too powerpc specific ?
>>>
>>> I don't actually know, my initial interpretation was that
>>> these function names are part of the user ABI for the vdso,
>>> but I never looked closely enough at how vdso works to
>>> be sure what the actual ABI is.
>> 
>> AFAIK the ABI is just the symbols we export, as defined in the linker
>> script:
>> 
>> /*
>>   * This controls what symbols we export from the DSO.
>>   */
>> VERSION
>> {
>> 	VDSO_VERSION_STRING {
>> 	global:
>> 		__kernel_get_syscall_map;
>> 		__kernel_gettimeofday;
>> 		__kernel_clock_gettime;
>> 		__kernel_clock_getres;
>> 		__kernel_get_tbfreq;
>> 		__kernel_sync_dicache;
>> 		__kernel_sigtramp_rt64;
>> 		__kernel_getcpu;
>> 		__kernel_time;
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/kernel/vdso/vdso64.lds.S?h=v6.6&#n117
>> 
>>> If __c_kernel_clock_gettime() etc are not part of the user-facing
>>> ABI, I think renaming them for consistency with the other
>>> architectures would be best.
>> 
>> The __c symbols are not part of the ABI, so we could rename them.
>> 
>> At the moment though they don't have the same prototype as the generic
>> versions, because we find the VDSO data in asm and pass it to the C
>> functions, eg:
>> 
>> int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
>> 			    const struct vdso_data *vd);
>> 
>> I think we can rework that though, by implementing
>> __arch_get_vdso_data() and getting the vdso_data in C. Then we'd be able
>> to share the prototypes.
>
> I think it would not a been good idea, it would be less performant, for 
> explanation see commit 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e876f0b69dc993e86ca7795e63e98385aa9a7ef3

Ah thanks. I was wondering why you had done it in asm.

It's a pity but you're right that's probably a measurable performance
hit for some of those calls.

cheers

^ permalink raw reply

* RE: [PATCH net 2/3] dpll: fix pin dump crash for rebound module
From: Kubalewski, Arkadiusz @ 2023-11-09 12:20 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev@vger.kernel.org, vadim.fedorenko@linux.dev,
	Michalik, Michal, Olech, Milena, pabeni@redhat.com,
	kuba@kernel.org
In-Reply-To: <ZUubagu6B+vbfBqm@nanopsycho>

>From: Jiri Pirko <jiri@resnulli.us>
>Sent: Wednesday, November 8, 2023 3:30 PM
>
>Wed, Nov 08, 2023 at 11:32:25AM CET, arkadiusz.kubalewski@intel.com wrote:
>>When a kernel module is unbound but the pin resources were not entirely
>>freed (other kernel module instance have had kept the reference to that
>>pin), and kernel module is again bound, the pin properties would not be
>>updated (the properties are only assigned when memory for the pin is
>>allocated), prop pointer still points to the kernel module memory of
>>the kernel module which was deallocated on the unbind.
>>
>>If the pin dump is invoked in this state, the result is a kernel crash.
>>Prevent the crash by storing persistent pin properties in dpll subsystem,
>>copy the content from the kernel module when pin is allocated, instead of
>>using memory of the kernel module.
>>
>>Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
>>Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
>>Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>>---
>> drivers/dpll/dpll_core.c    |  4 ++--
>> drivers/dpll/dpll_core.h    |  4 ++--
>> drivers/dpll/dpll_netlink.c | 28 ++++++++++++++--------------
>> 3 files changed, 18 insertions(+), 18 deletions(-)
>>
>>diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
>>index 3568149b9562..4077b562ba3b 100644
>>--- a/drivers/dpll/dpll_core.c
>>+++ b/drivers/dpll/dpll_core.c
>>@@ -442,7 +442,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct
>>module *module,
>> 		ret = -EINVAL;
>> 		goto err;
>> 	}
>>-	pin->prop = prop;
>>+	memcpy(&pin->prop, prop, sizeof(pin->prop));
>
>Odd, you don't care about the pointer within this structure?
>

Well, true. Need a fix.
Wondering if copying idea is better than just assigning prop pointer on
each call to dpll_pin_get(..) function (when pin already exists)?

Thank you!
Arkadiusz

>
>> 	refcount_set(&pin->refcount, 1);
>> 	xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC);
>> 	xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
>>@@ -634,7 +634,7 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent,
>>struct dpll_pin *pin,
>> 	unsigned long i, stop;
>> 	int ret;
>>
>>-	if (WARN_ON(parent->prop->type != DPLL_PIN_TYPE_MUX))
>>+	if (WARN_ON(parent->prop.type != DPLL_PIN_TYPE_MUX))
>> 		return -EINVAL;
>>
>> 	if (WARN_ON(!ops) ||
>>diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h
>>index 5585873c5c1b..717f715015c7 100644
>>--- a/drivers/dpll/dpll_core.h
>>+++ b/drivers/dpll/dpll_core.h
>>@@ -44,7 +44,7 @@ struct dpll_device {
>>  * @module:		module of creator
>>  * @dpll_refs:		hold referencees to dplls pin was registered with
>>  * @parent_refs:	hold references to parent pins pin was registered
>>with
>>- * @prop:		pointer to pin properties given by registerer
>>+ * @prop:		pin properties copied from the registerer
>>  * @rclk_dev_name:	holds name of device when pin can recover clock
>>from it
>>  * @refcount:		refcount
>>  **/
>>@@ -55,7 +55,7 @@ struct dpll_pin {
>> 	struct module *module;
>> 	struct xarray dpll_refs;
>> 	struct xarray parent_refs;
>>-	const struct dpll_pin_properties *prop;
>>+	struct dpll_pin_properties prop;
>> 	refcount_t refcount;
>> };
>>
>>diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
>>index 93fc6c4b8a78..963bbbbe6660 100644
>>--- a/drivers/dpll/dpll_netlink.c
>>+++ b/drivers/dpll/dpll_netlink.c
>>@@ -278,17 +278,17 @@ dpll_msg_add_pin_freq(struct sk_buff *msg, struct
>>dpll_pin *pin,
>> 	if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY, sizeof(freq), &freq,
>> 			  DPLL_A_PIN_PAD))
>> 		return -EMSGSIZE;
>>-	for (fs = 0; fs < pin->prop->freq_supported_num; fs++) {
>>+	for (fs = 0; fs < pin->prop.freq_supported_num; fs++) {
>> 		nest = nla_nest_start(msg, DPLL_A_PIN_FREQUENCY_SUPPORTED);
>> 		if (!nest)
>> 			return -EMSGSIZE;
>>-		freq = pin->prop->freq_supported[fs].min;
>>+		freq = pin->prop.freq_supported[fs].min;
>> 		if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MIN, sizeof(freq),
>> 				  &freq, DPLL_A_PIN_PAD)) {
>> 			nla_nest_cancel(msg, nest);
>> 			return -EMSGSIZE;
>> 		}
>>-		freq = pin->prop->freq_supported[fs].max;
>>+		freq = pin->prop.freq_supported[fs].max;
>> 		if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MAX, sizeof(freq),
>> 				  &freq, DPLL_A_PIN_PAD)) {
>> 			nla_nest_cancel(msg, nest);
>>@@ -304,9 +304,9 @@ static bool dpll_pin_is_freq_supported(struct dpll_pin
>>*pin, u32 freq)
>> {
>> 	int fs;
>>
>>-	for (fs = 0; fs < pin->prop->freq_supported_num; fs++)
>>-		if (freq >= pin->prop->freq_supported[fs].min &&
>>-		    freq <= pin->prop->freq_supported[fs].max)
>>+	for (fs = 0; fs < pin->prop.freq_supported_num; fs++)
>>+		if (freq >= pin->prop.freq_supported[fs].min &&
>>+		    freq <= pin->prop.freq_supported[fs].max)
>> 			return true;
>> 	return false;
>> }
>>@@ -403,7 +403,7 @@ static int
>> dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
>> 		     struct netlink_ext_ack *extack)
>> {
>>-	const struct dpll_pin_properties *prop = pin->prop;
>>+	const struct dpll_pin_properties *prop = &pin->prop;
>> 	struct dpll_pin_ref *ref;
>> 	int ret;
>>
>>@@ -696,7 +696,7 @@ dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32
>>parent_idx,
>> 	int ret;
>>
>> 	if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
>>-	      pin->prop->capabilities)) {
>>+	      pin->prop.capabilities)) {
>> 		NL_SET_ERR_MSG(extack, "state changing is not allowed");
>> 		return -EOPNOTSUPP;
>> 	}
>>@@ -732,7 +732,7 @@ dpll_pin_state_set(struct dpll_device *dpll, struct
>>dpll_pin *pin,
>> 	int ret;
>>
>> 	if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
>>-	      pin->prop->capabilities)) {
>>+	      pin->prop.capabilities)) {
>> 		NL_SET_ERR_MSG(extack, "state changing is not allowed");
>> 		return -EOPNOTSUPP;
>> 	}
>>@@ -759,7 +759,7 @@ dpll_pin_prio_set(struct dpll_device *dpll, struct
>>dpll_pin *pin,
>> 	int ret;
>>
>> 	if (!(DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE &
>>-	      pin->prop->capabilities)) {
>>+	      pin->prop.capabilities)) {
>> 		NL_SET_ERR_MSG(extack, "prio changing is not allowed");
>> 		return -EOPNOTSUPP;
>> 	}
>>@@ -787,7 +787,7 @@ dpll_pin_direction_set(struct dpll_pin *pin, struct
>>dpll_device *dpll,
>> 	int ret;
>>
>> 	if (!(DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE &
>>-	      pin->prop->capabilities)) {
>>+	      pin->prop.capabilities)) {
>> 		NL_SET_ERR_MSG(extack, "direction changing is not allowed");
>> 		return -EOPNOTSUPP;
>> 	}
>>@@ -817,8 +817,8 @@ dpll_pin_phase_adj_set(struct dpll_pin *pin, struct
>>nlattr *phase_adj_attr,
>> 	int ret;
>>
>> 	phase_adj = nla_get_s32(phase_adj_attr);
>>-	if (phase_adj > pin->prop->phase_range.max ||
>>-	    phase_adj < pin->prop->phase_range.min) {
>>+	if (phase_adj > pin->prop.phase_range.max ||
>>+	    phase_adj < pin->prop.phase_range.min) {
>> 		NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr,
>> 				    "phase adjust value not supported");
>> 		return -EINVAL;
>>@@ -999,7 +999,7 @@ dpll_pin_find(u64 clock_id, struct nlattr
>>*mod_name_attr,
>> 	unsigned long i;
>>
>> 	xa_for_each_marked(&dpll_pin_xa, i, pin, DPLL_REGISTERED) {
>>-		prop = pin->prop;
>>+		prop = &pin->prop;
>> 		cid_match = clock_id ? pin->clock_id == clock_id : true;
>> 		mod_match = mod_name_attr && module_name(pin->module) ?
>> 			!nla_strcmp(mod_name_attr,
>>--
>>2.38.1
>>


^ permalink raw reply

* Re: PRP with VLAN support - or how to contribute to a Linux network driver
From: Kristian Myrland Overskeid @ 2023-11-09 12:20 UTC (permalink / raw)
  To: Heiko Gerstung; +Cc: Andrew Lunn, netdev@vger.kernel.org
In-Reply-To: <DB0904D7-4F30-4A61-A4CB-48C7BFF4390F@meinberg.de>

If you simply remove the line "dev->features |=
NETIF_F_VLAN_CHALLENGED;" in hsr_device.c, the hsr-module is handling
vlan frames without any further modifications. Unless you need to send
vlan tagged supervision frames, I'm pretty sure the current
implementation works just as fine with vlan as without.

However, in my opinion, the discard-algorithm
(hsr_register_frame_out() in hsr_framereg.c) is not made for switched
networks. The problem with the current implementation is that it does
not account for frames arriving in a different order than it was sent
from a host. It simply checks if the sequence number of an arriving
frame is higher than the previous one. If the network has some sort of
priority, it must be expected that frames will arrive out of order
when the network load is big enough for the switches to start
prioritizing.

My solution was to add a linked list to the node struct, one for each
registered vlan id. It contains the vlan id, last sequence number and
time. On reception of a vlan frame to the HSR_PT_MASTER, it retrieves
the "node_seq_out" and "node_time_out" based on the vlan.

This works fine for me because all the prp nodes are connected to
trunk ports and the switches are prioritizing frames based on the vlan
tag.

If a prp node is connected to an access port, but the network is using
vlan priority, all sequence numbers and timestamps with the
corresponding vlan id must be kept in a hashed list. The list must be
regularly checked to remove elements before new frames with a wrapped
around sequence number can arrive.

ZHAW School of Engineering has made a prp program for both linux user
and kernel space with such a discard algorithm. The program does not
compile without some modifications, but the discard algorithm works
fine. The program is open source and can be found at
https://github.com/ZHAW-InES-Team/sw_stack_prp1.

tor. 9. nov. 2023 kl. 09:08 skrev Heiko Gerstung <heiko.gerstung@meinberg.de>:
>
> Am 08.11.23, 16:17 schrieb "Andrew Lunn" <andrew@lunn.ch <mailto:andrew@lunn.ch>>:
>
>
> >> I would like to discuss if it makes sense to remove the PRP
> >> functionality from the HSR driver (which is based on the bridge
> >> kernel module AFAICS) and instead implement PRP as a separate module
> >> (based on the Bonding driver, which would make more sense for PRP).
>
>
> > Seems like nobody replied. I don't know PRP or HSR, so i can only make
> > general remarks.
>
> Thank you for responding!
>
> > The general policy is that we don't rip something out and replace it
> > with new code. We try to improve what already exists to meet the
> > demands. This is partially because of backwards compatibility. There
> > could be users using the code as is. You cannot break that. Can you
> > step by step modify the current code to make use of bonding, and in
> > the process show you don't break the current use cases?
>
> Understood. I am not sure if we can change the hsr driver to gradually use a more bonding-like approach for prp and I believe this might not be required, as long as we can get VLAN support into it.
>
> > You also need to consider offloading to hardware. The bridge code has infrastructure
> > to offload. Does the bond driver? I've no idea about that.
>
> I do not know this either but would expect that the nature of bonding would not require offloading support (I do not see a potential for efficiency/performance improvements here, unlike HSR or PRP).
>
> >> Hoping for advise what the next steps could be. Happy to discuss
> >> this off-list as it may not be of interest for most people.
>
> > You probably want to get together with others who are interested in
> > PRP and HSR. linutronix, ti, microchip, etc.
>
> Yes, would love to do that and my hope was that I would find them here. I am not familiar with the "orphaned" status for a kernel module, but I would have expected that one of the mentioned parties interested in PRP/HSR would have adopted the module.
>
> > Andrew
>
> Again, thanks a lot for your comments and remarks, very useful.
>
> Heiko
>
>
>

^ permalink raw reply

* Re: [RFC PATCH v3 07/12] page-pool: device memory support
From: Mina Almasry @ 2023-11-09 12:20 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: netdev, linux-kernel, linux-arch, linux-kselftest, linux-media,
	dri-devel, linaro-mm-sig, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Jesper Dangaard Brouer,
	Ilias Apalodimas, Arnd Bergmann, David Ahern, Willem de Bruijn,
	Shuah Khan, Sumit Semwal, Christian König, Shakeel Butt,
	Jeroen de Borst, Praveen Kaligineedi
In-Reply-To: <a8ae22dc-5b85-9efe-16c7-d95d455828fa@huawei.com>

On Thu, Nov 9, 2023 at 1:30 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/11/9 11:20, Mina Almasry wrote:
> > On Wed, Nov 8, 2023 at 2:56 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> >
> > Agreed everything above is undoable.
> >
> >> But we might be able to do something as folio is doing now, mm subsystem
> >> is still seeing 'struct folio/page', but other subsystem like slab is using
> >> 'struct slab', and there is still some common fields shared between
> >> 'struct folio' and 'struct slab'.
> >>
> >
> > In my eyes this is almost exactly what I suggested in RFC v1 and got
> > immediately nacked with no room to negotiate. What we did for v1 is to
> > allocate struct pages for dma-buf to make dma-bufs look like struct
> > page to mm subsystem. Almost exactly what you're describing above.
>
> Maybe the above is where we have disagreement:
> Do we still need make dma-bufs look like struct page to mm subsystem?
> IMHO, the answer is no. We might only need to make dma-bufs look like
> struct page to net stack and page pool subsystem. I think that is already
> what this pacthset is trying to do, what I am suggesting is just make
> it more like 'struct page' to net stack and page pool subsystem, in order
> to try to avoid most of the 'if' checking in net stack and page pool
> subsystem.
>

First, most of the checking in the net stack is
skb_frag_not_readable(). dma-buf are fundamentally not kmap()able and
not readable. So we can't remove those, no matter what we do I think.
Can we agree on that? If so, lets discuss removing most of the ifs in
the page pool, only.

> > It's a no-go. I don't think renaming struct page to netmem is going to
> > move the needle (it also re-introduces code-churn). What I feel like I
> > learnt is that dma-bufs are not struct pages and can't be made to look
> > like one, I think.
> >
> >> As the netmem patchset, is devmem able to reuse the below 'struct netmem'
> >> and rename it to 'struct page_pool_iov'?
> >
> > I don't think so. For the reasons above, but also practically it
> > immediately falls apart. Consider this field in netmem:
> >
> > + * @flags: The same as the page flags.  Do not use directly.
> >
> > dma-buf don't have or support page-flags, and making dma-buf looks
> > like they support page flags or any page-like features (other than
> > dma_addr) seems extremely unacceptable to mm folks.
>
> As far as I tell, as we limit the devmem usage in netstack, the below
> is the related mm function call for 'struct page' for devmem:
> page_ref_*(): page->_refcount does not need changing

Sorry, I don't understand. Are you suggesting we call page_ref_add() &
page_ref_sub() on page_pool_iov? That is basically making
page_pool_iov look like struct page to the mm stack, since page_ref_*
are mm calls, which you say above we don't need to do. We will still
need to special case this, no?

> page_is_pfmemalloc(): which is corresponding to page->pp_magic, and
>                       devmem provider can set/unset it in it's 'alloc_pages'
>                       ops.

page_is_pfmemalloc() has nothing to do with page->pp_magic. It checks
page->lru.next to figure out if this is a pfmemalloc. page_pool_iov
has no page->lru.next. Still need to special case this?

> page_to_nid(): we may need to handle it differently somewhat like this
>                patch does as page_to_nid() may has different implementation
>                based on different configuration.

So you're saying we need to handle page_to_nid() differently for
devmem? So we're not going to be able to avoid the if statement.

> page_pool_iov_put_many(): as mentioned in other thread, if net stack is not
>                           calling page_pool_page_put_many() directly, we
>                           can reuse napi_pp_put_page() for devmem too, and
>                           handle the special case for devmem in 'release_page'
>                           ops.
>

page_pool_iov_put_many()/page_pool_iov_get_many() are called to do
refcounting before the page is released back to the provider. I'm not
seeing how we can handle the special case inside of 'release_page' -
that's too late, as far as I can tell.

The only way to remove the if statements in the page pool is to
implement what you said was not feasible in an earlier email. We would
define this struct:

struct netmem {
        /* common fields */
        refcount_t refcount;
        bool is_pfmemalloc;
        int nid;
        ......
        union {
                struct devmem{
                        struct dmabuf_genpool_chunk_owner *owner;
                };

                struct page * page;
        };
};

Then, we would require all memory providers to allocate struct netmem
for the memory and set the common fields, including ones that have
struct pages. For devmem, netmem->page will be NULL, because netmem
has no page.

If we do that, the page pool can ignore whether the underlying memory
is page or devmem, because it can use the common fields, example:

/* page_ref_count replacement */
netmem_ref_count(struct netmem* netmem) {
    return netmem->refcount;
}

/* page_ref_add replacement */
netmem_ref_add(struct netmem* netmem) {
   atomic_inc(netmem->refcount);
}

/* page_to_nid replacement */
netmem_nid(struct netmem* netmem) {
    return netmem->nid;
}

/* page_is_pfmemalloc() replacement */
netmem_is_pfmemalloc(struct netmem* netmem) {
    return netmem->is_pfmemalloc;
}

/* page_ref_sub replacement */
netmem_ref_sub(struct netmem* netmem) {
    atomic_sub(netmet->refcount);
    if (netmem->refcount == 0) {
                  /* release page to the memory provider.
                   * struct page memory provider will do put_page(),
                   * devmem will do something else */
           }
     }
}

I think this MAY BE technically feasible, but I'm not sure it's better:

1. It is a huge refactor to the page pool, lots of code churn. While
the page pool currently uses page*, it needs to be completely
refactored to use netmem*.
2. It causes extra memory usage. struct netmem needs to be allocated
for every struct page.
3. It has minimal perf upside. The page_is_page_pool_iov() checks
currently have minimal perf impact, and I demonstrated that to Jesper
in RFC v2.
4. It also may not be technically feasible. I'm not sure how netmem
interacts with skb_frag_t. I guess we replace struct page* bv_page
with struct netmem* bv_page, and add changes there.
5. Drivers need to be refactored to use netmem* instead of page*,
unless we cast netmem* to page* before returning to the driver.

Possibly other downsides, these are what I could immediately think of.

If I'm still misunderstanding your suggestion, it may be time to send
me a concrete code snippet of what you have in mind. I'm a bit
confused at the moment because the only avenue I see to remove the if
statements in the page pool is to define the struct that we agreed is
not feasible in earlier emails.

> >
> >> So that 'struct page' for normal
> >> memory and 'struct page_pool_iov' for devmem share the common fields used
> >> by page pool and net stack?
> >
> > Are you suggesting that we'd cast a netmem* to a page* and call core
> > mm APIs on it? It's basically what was happening with RFC v1, where
> > things that are not struct pages were made to look like struct pages.
> >
> > Also, there isn't much upside for what you're suggesting, I think. For
> > example I can align the refcount variable in struct page_pool_iov with
> > the refcount in struct page so that this works:
> >
> > put_page((struct page*)ppiov);
> >
> > but it's a disaster. Because put_page() will call __put_page() if the
> > page is freed, and __put_page() will try to return the page to the
> > buddy allocator!
>
> As what I suggested above, Can we handle this in devmem provider's
> 'release_page' ops instead of calling put_page() directly as for devmem.
>
> >
> >>  And we might be able to reuse the 'flags',
> >> '_pp_mapping_pad' and '_mapcount' for specific mem provider, which is enough
> >> for the devmem only requiring a single pointer to point to it's
> >> owner?
> >>
> >
> > All the above seems quite similar to RFC v1 again, using netmem
> > instead of struct page. In RFC v1 we re-used zone_device_data() for
> > the dma-buf owner equivalent.
>
> As we have added a few checkings to limit 'struct page' for devmem to
> be only used in net stack, we can decouple 'struct page' for devmem
> from mm subsystem, zone_device_data() is not really needed, right?
>
> If we can decouple 'struct page' for normal memory from mm subsystem
> through the folio work in the future, then we may define a more abstract
> structure for page pool and net stack instead of reusing 'struct page'
> from mm.
>
> >

--
Thanks,
Mina

^ permalink raw reply

* Re: [PATCH net 1/1] net: stmmac: fix MAC and phylink mismatch issue after resume with STMMAC_FLAG_USE_PHY_WOL enabled
From: Paolo Abeni @ 2023-11-09 12:13 UTC (permalink / raw)
  To: Russell King (Oracle), Gan Yi Fang
  Cc: Alexandre Torgue, Jose Abreu, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Maxime Coquelin, Joakim Zhang, netdev,
	linux-stm32, linux-arm-kernel, linux-kernel, Looi Hong Aun,
	Voon Weifeng, Song Yoong Siang
In-Reply-To: <ZUyjOEQHHnnbzwrV@shell.armlinux.org.uk>

On Thu, 2023-11-09 at 09:15 +0000, Russell King (Oracle) wrote:
> On Thu, Nov 09, 2023 at 01:00:27PM +0800, Gan Yi Fang wrote:
> > From: "Gan, Yi Fang" <yi.fang.gan@intel.com>
> > 
> > The issue happened when flag STMMAC_FLAG_USE_PHY_WOL is enabled.
> > It can be reproduced with steps below:
> > 1. Advertise only one speed on the host
> > 2. Enable the WoL on the host
> > 3. Suspend the host
> > 4. Wake up the host
> > 
> > When the WoL is disabled, both the PHY and MAC will suspend and wake up
> > with everything configured well. When WoL is enabled, the PHY needs to be
> > stay awake to receive the signal from remote client but MAC will enter
> > suspend mode.
> > 
> > When the MAC resumes from suspend, phylink_resume() will call
> > phylink_start() to start the phylink instance which will trigger the
> > phylink machine to invoke the mac_link_up callback function. The
> > stmmac_mac_link_up() will configure the MAC_CTRL_REG based on the current
> > link state. Then the stmmac_hw_setup() will be called to configure the MAC.
> > 
> > This sequence might cause mismatch of the link state between MAC and
> > phylink. This patch moves the phylink_resume() after stmamc_hw_setup() to
> > ensure the MAC is initialized before phylink is being configured.
> 
> Isn't this going to cause problems?
> 
> stmamc_hw_setup() calls stmmac_init_dma_engine(), which then calls
> stmmac_reset() - and stmmac_reset() can fail if the PHY clock isn't
> running, which is why phylink_resume() gets called before this.

@Gan Yi Fang: at very least we need a solid explanation in the commit
message why this change don't cause the above problems.

Thanks,

Paolo


^ permalink raw reply

* Re: [PATCH net] virtio_net: fix missing dma unmap for resize
From: Michael S. Tsirkin @ 2023-11-09 12:06 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: netdev, Jason Wang, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, virtualization
In-Reply-To: <20231106081832.668-1-xuanzhuo@linux.alibaba.com>

On Mon, Nov 06, 2023 at 04:18:32PM +0800, Xuan Zhuo wrote:
> For rq, we have three cases getting buffers from virtio core:
> 
> 1. virtqueue_get_buf{,_ctx}
> 2. virtqueue_detach_unused_buf
> 3. callback for virtqueue_resize
> 
> But in commit 295525e29a5b("virtio_net: merge dma operations when
> filling mergeable buffers"), I missed the dma unmap for the #3 case.
> 
> That will leak some memory, because I did not release the pages referred
> by the unused buffers.
> 
> If we do such script, we will make the system OOM.
> 
>     while true
>     do
>             ethtool -G ens4 rx 128
>             ethtool -G ens4 rx 256
>             free -m
>     done
> 
> Fixes: 295525e29a5b ("virtio_net: merge dma operations when filling mergeable buffers")
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  drivers/net/virtio_net.c | 43 ++++++++++++++++++++--------------------
>  1 file changed, 22 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index d16f592c2061..6423a3a007ce 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -408,6 +408,17 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
>  	return p;
>  }
>  
> +static void virtnet_rq_free_buf(struct virtnet_info *vi,
> +				struct receive_queue *rq, void *buf)
> +{
> +	if (vi->mergeable_rx_bufs)
> +		put_page(virt_to_head_page(buf));
> +	else if (vi->big_packets)
> +		give_pages(rq, buf);
> +	else
> +		put_page(virt_to_head_page(buf));
> +}
> +

>  static void enable_delayed_refill(struct virtnet_info *vi)
>  {
>  	spin_lock_bh(&vi->refill_lock);
> @@ -634,17 +645,6 @@ static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
>  	return buf;
>  }
>  
> -static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> -{
> -	void *buf;
> -
> -	buf = virtqueue_detach_unused_buf(rq->vq);
> -	if (buf && rq->do_dma)
> -		virtnet_rq_unmap(rq, buf, 0);
> -
> -	return buf;
> -}
> -
>  static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len)
>  {
>  	struct virtnet_rq_dma *dma;
> @@ -1764,7 +1764,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
>  	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
>  		pr_debug("%s: short packet %i\n", dev->name, len);
>  		DEV_STATS_INC(dev, rx_length_errors);
> -		virtnet_rq_free_unused_buf(rq->vq, buf);
> +		virtnet_rq_free_buf(vi, rq, buf);
>  		return;
>  	}
>  
> @@ -4034,14 +4034,15 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
>  static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
>  {
>  	struct virtnet_info *vi = vq->vdev->priv;
> +	struct receive_queue *rq;
>  	int i = vq2rxq(vq);
>  
> -	if (vi->mergeable_rx_bufs)
> -		put_page(virt_to_head_page(buf));
> -	else if (vi->big_packets)
> -		give_pages(&vi->rq[i], buf);
> -	else
> -		put_page(virt_to_head_page(buf));
> +	rq = &vi->rq[i];
> +
> +	if (rq->do_dma)
> +		virtnet_rq_unmap(rq, buf, 0);
> +
> +	virtnet_rq_free_buf(vi, rq, buf);
>  }
>  

So we have virtnet_rq_free_buf which sounds like it should free any
buf, and we have virtnet_rq_free_unused_buf which is only for unused.
Or so it would seem from names but this is not true.
Better function names?

>  static void free_unused_bufs(struct virtnet_info *vi)
> @@ -4057,10 +4058,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
>  	}
>  
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
> -		struct receive_queue *rq = &vi->rq[i];
> +		struct virtqueue *vq = vi->rq[i].vq;
>  
> -		while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> -			virtnet_rq_free_unused_buf(rq->vq, buf);
> +		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> +			virtnet_rq_free_unused_buf(vq, buf);
>  		cond_resched();
>  	}
>  }
> -- 
> 2.32.0.3.g01195cf9f


^ permalink raw reply

* Re: [PATCH v2 net 4/7] net/sched: taprio: get corrected value of cycle_time and interval
From: Vladimir Oltean @ 2023-11-09 12:01 UTC (permalink / raw)
  To: Faizal Rahim
  Cc: Vinicius Costa Gomes, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev, linux-kernel
In-Reply-To: <20231107112023.676016-5-faizal.abdul.rahim@linux.intel.com>

On Tue, Nov 07, 2023 at 06:20:20AM -0500, Faizal Rahim wrote:
> @@ -215,6 +216,31 @@ static void switch_schedules(struct taprio_sched *q,
>  	*admin = NULL;
>  }
>  
> +static bool cycle_corr_active(s64 cycle_time_correction)
> +{
> +	if (cycle_time_correction == INIT_CYCLE_TIME_CORRECTION)
> +		return false;
> +	else
> +		return true;
> +}
> @@ -259,14 +286,6 @@ static int duration_to_length(struct taprio_sched *q, u64 duration)
>  	return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte));
>  }
>  
> -static bool cycle_corr_active(s64 cycle_time_correction)
> -{
> -	if (cycle_time_correction == INIT_CYCLE_TIME_CORRECTION)
> -		return false;
> -	else
> -		return true;
> -}
> -

Don't move code around that you've introduced in earlier changes. Just
place it where it needs to be from the beginning.

^ permalink raw reply

* Re: [PATCH net-next v2 17/21] virtio_net: xsk: rx: skip dma unmap when rq is bind with AF_XDP
From: Michael S. Tsirkin @ 2023-11-09 12:00 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Jason Wang, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization, bpf
In-Reply-To: <1699528202.3090942-4-xuanzhuo@linux.alibaba.com>

On Thu, Nov 09, 2023 at 07:10:02PM +0800, Xuan Zhuo wrote:
> On Thu, 9 Nov 2023 03:15:03 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Tue, Nov 07, 2023 at 11:12:23AM +0800, Xuan Zhuo wrote:
> > > When rq is bound with AF_XDP, the buffer dma is managed
> > > by the AF_XDP APIs. So the buffer got from the virtio core should
> > > skip the dma unmap operation.
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> >
> >
> > I don't get it - is this like a bugfix?
> 
> I want focus on this. So let it as an independent commit.
> 
> > And why do we need our own flag and checks?
> > Doesn't virtio core DTRT?
> 
> 
> struct vring_virtqueue {
> 	[....]
> 
> 	/* Do DMA mapping by driver */
> 	bool premapped;
> 
> We can not.
> 
> So I add own flag.
> 
> Thanks.

Still don't get it. Why not check the premapped flag?

> 
> >
> > > ---
> > >  drivers/net/virtio/main.c       | 8 +++++---
> > >  drivers/net/virtio/virtio_net.h | 3 +++
> > >  drivers/net/virtio/xsk.c        | 1 +
> > >  3 files changed, 9 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> > > index 15943a22e17d..a318b2533b94 100644
> > > --- a/drivers/net/virtio/main.c
> > > +++ b/drivers/net/virtio/main.c
> > > @@ -430,7 +430,7 @@ static void *virtnet_rq_get_buf(struct virtnet_rq *rq, u32 *len, void **ctx)
> > >  	void *buf;
> > >
> > >  	buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > -	if (buf && rq->do_dma)
> > > +	if (buf && rq->do_dma_unmap)
> > >  		virtnet_rq_unmap(rq, buf, *len);
> > >
> > >  	return buf;
> > > @@ -561,8 +561,10 @@ static void virtnet_set_premapped(struct virtnet_info *vi)
> > >
> > >  		/* disable for big mode */
> > >  		if (vi->mergeable_rx_bufs || !vi->big_packets) {
> > > -			if (!virtqueue_set_dma_premapped(vi->rq[i].vq))
> > > +			if (!virtqueue_set_dma_premapped(vi->rq[i].vq)) {
> > >  				vi->rq[i].do_dma = true;
> > > +				vi->rq[i].do_dma_unmap = true;
> > > +			}
> > >  		}
> > >  	}
> > >  }
> > > @@ -3944,7 +3946,7 @@ void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > >
> > >  	rq = &vi->rq[i];
> > >
> > > -	if (rq->do_dma)
> > > +	if (rq->do_dma_unmap)
> > >  		virtnet_rq_unmap(rq, buf, 0);
> > >
> > >  	virtnet_rq_free_buf(vi, rq, buf);
> > > diff --git a/drivers/net/virtio/virtio_net.h b/drivers/net/virtio/virtio_net.h
> > > index 1242785e311e..2005d0cd22e2 100644
> > > --- a/drivers/net/virtio/virtio_net.h
> > > +++ b/drivers/net/virtio/virtio_net.h
> > > @@ -135,6 +135,9 @@ struct virtnet_rq {
> > >  	/* Do dma by self */
> > >  	bool do_dma;
> > >
> > > +	/* Do dma unmap after getting buf from virtio core. */
> > > +	bool do_dma_unmap;
> > > +
> > >  	struct {
> > >  		struct xsk_buff_pool *pool;
> > >
> > > diff --git a/drivers/net/virtio/xsk.c b/drivers/net/virtio/xsk.c
> > > index e737c3353212..b09c473c29fb 100644
> > > --- a/drivers/net/virtio/xsk.c
> > > +++ b/drivers/net/virtio/xsk.c
> > > @@ -210,6 +210,7 @@ static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct virtnet_rq *
> > >  		xdp_rxq_info_unreg(&rq->xsk.xdp_rxq);
> > >
> > >  	rq->xsk.pool = pool;
> > > +	rq->do_dma_unmap = !pool;
> > >
> > >  	virtnet_rx_resume(vi, rq);
> > >
> > > --
> > > 2.32.0.3.g01195cf9f
> >
> >


^ permalink raw reply

* Re: [PATCH net-next v2 14/21] virtio_net: xsk: tx: virtnet_free_old_xmit() distinguishes xsk buffer
From: Michael S. Tsirkin @ 2023-11-09 11:59 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Jason Wang, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization, bpf
In-Reply-To: <1699528568.0674586-6-xuanzhuo@linux.alibaba.com>

On Thu, Nov 09, 2023 at 07:16:08PM +0800, Xuan Zhuo wrote:
> On Thu, 9 Nov 2023 06:11:49 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Tue, Nov 07, 2023 at 11:12:20AM +0800, Xuan Zhuo wrote:
> > > virtnet_free_old_xmit distinguishes three type ptr(skb, xdp frame, xsk
> > > buffer) by the last bits of the pointer.
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > >  drivers/net/virtio/virtio_net.h | 18 ++++++++++++++++--
> > >  drivers/net/virtio/xsk.h        |  5 +++++
> > >  2 files changed, 21 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio/virtio_net.h b/drivers/net/virtio/virtio_net.h
> > > index a431a2c1ee47..a13d6d301fdb 100644
> > > --- a/drivers/net/virtio/virtio_net.h
> > > +++ b/drivers/net/virtio/virtio_net.h
> > > @@ -225,6 +225,11 @@ struct virtnet_info {
> > >  	struct failover *failover;
> > >  };
> > >
> > > +static inline bool virtnet_is_skb_ptr(void *ptr)
> > > +{
> > > +	return !((unsigned long)ptr & VIRTIO_XMIT_DATA_MASK);
> > > +}
> > > +
> > >  static inline bool virtnet_is_xdp_frame(void *ptr)
> > >  {
> > >  	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
> > > @@ -235,6 +240,8 @@ static inline struct xdp_frame *virtnet_ptr_to_xdp(void *ptr)
> > >  	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
> > >  }
> > >
> > > +static inline u32 virtnet_ptr_to_xsk(void *ptr);
> > > +
> >
> > I don't understand why you need this here.
> 
> The below function virtnet_free_old_xmit needs this.
> 
> Thanks.

I don't understand why is virtnet_free_old_xmit inline, either.

> >
> >
> > >  static inline void *virtnet_sq_unmap(struct virtnet_sq *sq, void *data)
> > >  {
> > >  	struct virtnet_sq_dma *next, *head;
> > > @@ -261,11 +268,12 @@ static inline void *virtnet_sq_unmap(struct virtnet_sq *sq, void *data)
> > >  static inline void virtnet_free_old_xmit(struct virtnet_sq *sq, bool in_napi,
> > >  					 u64 *bytes, u64 *packets)
> > >  {
> > > +	unsigned int xsknum = 0;
> > >  	unsigned int len;
> > >  	void *ptr;
> > >
> > >  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> > > -		if (!virtnet_is_xdp_frame(ptr)) {
> > > +		if (virtnet_is_skb_ptr(ptr)) {
> > >  			struct sk_buff *skb;
> > >
> > >  			if (sq->do_dma)
> > > @@ -277,7 +285,7 @@ static inline void virtnet_free_old_xmit(struct virtnet_sq *sq, bool in_napi,
> > >
> > >  			*bytes += skb->len;
> > >  			napi_consume_skb(skb, in_napi);
> > > -		} else {
> > > +		} else if (virtnet_is_xdp_frame(ptr)) {
> > >  			struct xdp_frame *frame;
> > >
> > >  			if (sq->do_dma)
> > > @@ -287,9 +295,15 @@ static inline void virtnet_free_old_xmit(struct virtnet_sq *sq, bool in_napi,
> > >
> > >  			*bytes += xdp_get_frame_len(frame);
> > >  			xdp_return_frame(frame);
> > > +		} else {
> > > +			*bytes += virtnet_ptr_to_xsk(ptr);
> > > +			++xsknum;
> > >  		}
> > >  		(*packets)++;
> > >  	}
> > > +
> > > +	if (xsknum)
> > > +		xsk_tx_completed(sq->xsk.pool, xsknum);
> > >  }
> > >
> > >  static inline bool virtnet_is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
> > > diff --git a/drivers/net/virtio/xsk.h b/drivers/net/virtio/xsk.h
> > > index 1bd19dcda649..7ebc9bda7aee 100644
> > > --- a/drivers/net/virtio/xsk.h
> > > +++ b/drivers/net/virtio/xsk.h
> > > @@ -14,6 +14,11 @@ static inline void *virtnet_xsk_to_ptr(u32 len)
> > >  	return (void *)(p | VIRTIO_XSK_FLAG);
> > >  }
> > >
> > > +static inline u32 virtnet_ptr_to_xsk(void *ptr)
> > > +{
> > > +	return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET;
> > > +}
> > > +
> > >  int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp);
> > >  bool virtnet_xsk_xmit(struct virtnet_sq *sq, struct xsk_buff_pool *pool,
> > >  		      int budget);
> > > --
> > > 2.32.0.3.g01195cf9f
> >
> >


^ permalink raw reply

* Re: [PATCH net-next v2 12/21] virtio_net: xsk: tx: support tx
From: Michael S. Tsirkin @ 2023-11-09 11:58 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Jason Wang, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, virtualization, bpf
In-Reply-To: <1699527983.483377-3-xuanzhuo@linux.alibaba.com>

On Thu, Nov 09, 2023 at 07:06:23PM +0800, Xuan Zhuo wrote:
> On Thu, 9 Nov 2023 03:09:00 -0500, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Tue, Nov 07, 2023 at 11:12:18AM +0800, Xuan Zhuo wrote:
> > > The driver's tx napi is very important for XSK. It is responsible for
> > > obtaining data from the XSK queue and sending it out.
> > >
> > > At the beginning, we need to trigger tx napi.
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > >  drivers/net/virtio/main.c       |  12 +++-
> > >  drivers/net/virtio/virtio_net.h |   3 +-
> > >  drivers/net/virtio/xsk.c        | 110 ++++++++++++++++++++++++++++++++
> > >  drivers/net/virtio/xsk.h        |  13 ++++
> > >  4 files changed, 136 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> > > index 6c608b3ce27d..ff6bc764089d 100644
> > > --- a/drivers/net/virtio/main.c
> > > +++ b/drivers/net/virtio/main.c
> > > @@ -2074,6 +2074,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > >  	struct virtnet_info *vi = sq->vq->vdev->priv;
> > >  	unsigned int index = vq2txq(sq->vq);
> > >  	struct netdev_queue *txq;
> > > +	int busy = 0;
> > >  	int opaque;
> > >  	bool done;
> > >
> > > @@ -2086,11 +2087,20 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > >  	txq = netdev_get_tx_queue(vi->dev, index);
> > >  	__netif_tx_lock(txq, raw_smp_processor_id());
> > >  	virtqueue_disable_cb(sq->vq);
> > > -	free_old_xmit(sq, true);
> > > +
> > > +	if (sq->xsk.pool)
> > > +		busy |= virtnet_xsk_xmit(sq, sq->xsk.pool, budget);
> >
> > You use bitwise or on errno values? What's going on here?
> 
> virtnet_xsk_xmit() return that it is busy or not. Not the errno.
> Here just record whether this handler is busy or not.


Ah I see it's a bool. So make busy a bool too.


> >
> >
> > > +	else
> > > +		free_old_xmit(sq, true);
> > >
> > >  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> > >  		netif_tx_wake_queue(txq);
> > >
> > > +	if (busy) {
> > > +		__netif_tx_unlock(txq);
> > > +		return budget;
> > > +	}
> > > +
> > >  	opaque = virtqueue_enable_cb_prepare(sq->vq);
> > >
> > >  	done = napi_complete_done(napi, 0);
> > > diff --git a/drivers/net/virtio/virtio_net.h b/drivers/net/virtio/virtio_net.h
> > > index 442af4673bf8..1c21af47e13c 100644
> > > --- a/drivers/net/virtio/virtio_net.h
> > > +++ b/drivers/net/virtio/virtio_net.h
> > > @@ -9,7 +9,8 @@
> > >  #include <net/xdp_sock_drv.h>
> > >
> > >  #define VIRTIO_XDP_FLAG	BIT(0)
> > > -#define VIRTIO_XMIT_DATA_MASK (VIRTIO_XDP_FLAG)
> > > +#define VIRTIO_XSK_FLAG	BIT(1)
> > > +#define VIRTIO_XMIT_DATA_MASK (VIRTIO_XDP_FLAG | VIRTIO_XSK_FLAG)
> > >
> > >  /* RX packet size EWMA. The average packet size is used to determine the packet
> > >   * buffer size when refilling RX rings. As the entire RX ring may be refilled
> > > diff --git a/drivers/net/virtio/xsk.c b/drivers/net/virtio/xsk.c
> > > index 8b397787603f..caa448308232 100644
> > > --- a/drivers/net/virtio/xsk.c
> > > +++ b/drivers/net/virtio/xsk.c
> > > @@ -4,9 +4,119 @@
> > >   */
> > >
> > >  #include "virtio_net.h"
> > > +#include "xsk.h"
> > >
> > >  static struct virtio_net_hdr_mrg_rxbuf xsk_hdr;
> > >
> > > +static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> > > +{
> > > +	sg->dma_address = addr;
> > > +	sg->length = len;
> > > +}
> > > +
> > > +static void virtnet_xsk_check_queue(struct virtnet_sq *sq)
> > > +{
> > > +	struct virtnet_info *vi = sq->vq->vdev->priv;
> > > +	struct net_device *dev = vi->dev;
> > > +	int qnum = sq - vi->sq;
> > > +
> > > +	/* If it is a raw buffer queue, it does not check whether the status
> > > +	 * of the queue is stopped when sending. So there is no need to check
> > > +	 * the situation of the raw buffer queue.
> > > +	 */
> > > +	if (virtnet_is_xdp_raw_buffer_queue(vi, qnum))
> > > +		return;
> > > +
> > > +	/* If this sq is not the exclusive queue of the current cpu,
> > > +	 * then it may be called by start_xmit, so check it running out
> > > +	 * of space.
> > > +	 *
> > > +	 * Stop the queue to avoid getting packets that we are
> > > +	 * then unable to transmit. Then wait the tx interrupt.
> > > +	 */
> > > +	if (sq->vq->num_free < 2 + MAX_SKB_FRAGS)
> >
> > what does MAX_SKB_FRAGS have to do with it? And where's 2 coming from?
> 
> check_sq_full_and_disable()
> 
> Thanks.


This is one example of duplication I was talking about earlier.

> >
> > > +		netif_stop_subqueue(dev, qnum);
> > > +}
> > > +
> > > +static int virtnet_xsk_xmit_one(struct virtnet_sq *sq,
> > > +				struct xsk_buff_pool *pool,
> > > +				struct xdp_desc *desc)
> > > +{
> > > +	struct virtnet_info *vi;
> > > +	dma_addr_t addr;
> > > +
> > > +	vi = sq->vq->vdev->priv;
> > > +
> > > +	addr = xsk_buff_raw_get_dma(pool, desc->addr);
> > > +	xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len);
> > > +
> > > +	sg_init_table(sq->sg, 2);
> > > +
> > > +	sg_fill_dma(sq->sg, sq->xsk.hdr_dma_address, vi->hdr_len);
> > > +	sg_fill_dma(sq->sg + 1, addr, desc->len);
> > > +
> > > +	return virtqueue_add_outbuf(sq->vq, sq->sg, 2,
> > > +				    virtnet_xsk_to_ptr(desc->len), GFP_ATOMIC);
> > > +}
> > > +
> > > +static int virtnet_xsk_xmit_batch(struct virtnet_sq *sq,
> > > +				  struct xsk_buff_pool *pool,
> > > +				  unsigned int budget,
> > > +				  u64 *kicks)
> > > +{
> > > +	struct xdp_desc *descs = pool->tx_descs;
> > > +	u32 nb_pkts, max_pkts, i;
> > > +	bool kick = false;
> > > +	int err;
> > > +
> > > +	/* Every xsk tx packet needs two desc(virtnet header and packet). So we
> > > +	 * use sq->vq->num_free / 2 as the limitation.
> > > +	 */
> > > +	max_pkts = min_t(u32, budget, sq->vq->num_free / 2);
> > > +
> > > +	nb_pkts = xsk_tx_peek_release_desc_batch(pool, max_pkts);
> > > +	if (!nb_pkts)
> > > +		return 0;
> > > +
> > > +	for (i = 0; i < nb_pkts; i++) {
> > > +		err = virtnet_xsk_xmit_one(sq, pool, &descs[i]);
> > > +		if (unlikely(err))
> > > +			break;
> > > +
> > > +		kick = true;
> > > +	}
> > > +
> > > +	if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
> > > +		(*kicks)++;
> > > +
> > > +	return i;
> > > +}
> > > +
> > > +bool virtnet_xsk_xmit(struct virtnet_sq *sq, struct xsk_buff_pool *pool,
> > > +		      int budget)
> > > +{
> > > +	u64 bytes = 0, packets = 0, kicks = 0;
> > > +	int sent;
> > > +
> > > +	virtnet_free_old_xmit(sq, true, &bytes, &packets);
> > > +
> > > +	sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
> > > +
> > > +	virtnet_xsk_check_queue(sq);
> > > +
> > > +	u64_stats_update_begin(&sq->stats.syncp);
> > > +	u64_stats_add(&sq->stats.packets, packets);
> > > +	u64_stats_add(&sq->stats.bytes, bytes);
> > > +	u64_stats_add(&sq->stats.kicks, kicks);
> > > +	u64_stats_add(&sq->stats.xdp_tx,  sent);
> > > +	u64_stats_update_end(&sq->stats.syncp);
> > > +
> > > +	if (xsk_uses_need_wakeup(pool))
> > > +		xsk_set_tx_need_wakeup(pool);
> > > +
> > > +	return sent == budget;
> > > +}
> > > +
> > >  static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct virtnet_rq *rq,
> > >  				    struct xsk_buff_pool *pool)
> > >  {
> > > diff --git a/drivers/net/virtio/xsk.h b/drivers/net/virtio/xsk.h
> > > index 1918285c310c..73ca8cd5308b 100644
> > > --- a/drivers/net/virtio/xsk.h
> > > +++ b/drivers/net/virtio/xsk.h
> > > @@ -3,5 +3,18 @@
> > >  #ifndef __XSK_H__
> > >  #define __XSK_H__
> > >
> > > +#define VIRTIO_XSK_FLAG_OFFSET	4
> > > +
> > > +static inline void *virtnet_xsk_to_ptr(u32 len)
> > > +{
> > > +	unsigned long p;
> > > +
> > > +	p = len << VIRTIO_XSK_FLAG_OFFSET;
> > > +
> > > +	return (void *)(p | VIRTIO_XSK_FLAG);
> > > +}
> > > +
> > >  int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp);
> > > +bool virtnet_xsk_xmit(struct virtnet_sq *sq, struct xsk_buff_pool *pool,
> > > +		      int budget);
> > >  #endif
> > > --
> > > 2.32.0.3.g01195cf9f
> >


^ permalink raw reply

* Re: [PATCH v2 net 6/7] net/sched: taprio: fix q->current_entry is NULL before its expiry
From: Vladimir Oltean @ 2023-11-09 11:55 UTC (permalink / raw)
  To: Faizal Rahim
  Cc: Vinicius Costa Gomes, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev, linux-kernel
In-Reply-To: <20231107112023.676016-7-faizal.abdul.rahim@linux.intel.com>

On Tue, Nov 07, 2023 at 06:20:22AM -0500, Faizal Rahim wrote:
> Fix the issue of prematurely setting q->current_entry to NULL in the
> setup_first_end_time() function when a new admin schedule arrives
> while the oper schedule is still running but hasn't transitioned yet.
> This premature setting causes problems because any reference to
> q->current_entry, such as in taprio_dequeue(), will result in NULL
> during this period, which is incorrect. q->current_entry should remain
> valid until the currently running entry expires.
> 
> To address this issue, only set q->current_entry to NULL when there is
> no oper schedule currently running.
> 
> Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler")

The "Fixes" tag represents the commit where the code started becoming a
problem, not when the code that you're changing was first introduced.

I find it hard to believe that the problem was in commit 5a781ccbd19e
("tc: Add support for configuring the taprio scheduler"), because we
didn't support admin -> oper dynamic schedules back then.

> Signed-off-by: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
> ---
>  net/sched/sch_taprio.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
> index 01b114edec30..c60e9e7ac193 100644
> --- a/net/sched/sch_taprio.c
> +++ b/net/sched/sch_taprio.c
> @@ -1375,7 +1375,8 @@ static void setup_first_end_time(struct taprio_sched *q,
>  			first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]);
>  	}
>  
> -	rcu_assign_pointer(q->current_entry, NULL);
> +	if (!hrtimer_active(&q->advance_timer))
> +		rcu_assign_pointer(q->current_entry, NULL);
>  }
>  
>  static void taprio_start_sched(struct Qdisc *sch,
> -- 
> 2.25.1
>

^ permalink raw reply

* Re: [PATCH v2 net 5/7] net/sched: taprio: fix delayed switching to new schedule after timer expiry
From: Vladimir Oltean @ 2023-11-09 11:50 UTC (permalink / raw)
  To: Faizal Rahim
  Cc: Vinicius Costa Gomes, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev, linux-kernel
In-Reply-To: <20231107112023.676016-6-faizal.abdul.rahim@linux.intel.com>

On Tue, Nov 07, 2023 at 06:20:21AM -0500, Faizal Rahim wrote:
> If a new GCL is triggered and the new admin base time falls before the
> expiry of advance_timer (current running entry from oper),
> taprio_start_sched() resets the current advance_timer expiry to the
> new admin base time. However, upon expiry, advance_sched() doesn't
> immediately switch to the admin schedule. It continues running entries
> from the old oper schedule, and only switches to the new admin schedule
> much later. Ideally, if the advance_timer is shorten to align with the
> new admin base time, when the timer expires, advance_sched() should
> trigger switch_schedules() at the beginning.
> 
> To resolve this issue, set the cycle_time_correction to a non-initialized
> value in taprio_start_sched(). advance_sched() will use it to initiate
> switch_schedules() at the beginning.
> 
> Fixes: a3d43c0d56f1 ("taprio: Add support adding an admin schedule")

Did the commit you blame really introduce this issue, or was it your
rework to trigger switch_schedules() based on the correction?

> Signed-off-by: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
> ---
>  net/sched/sch_taprio.c | 21 ++++++++++++++++++---
>  1 file changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
> index f18a5fe12f0c..01b114edec30 100644
> --- a/net/sched/sch_taprio.c
> +++ b/net/sched/sch_taprio.c
> @@ -1379,14 +1379,19 @@ static void setup_first_end_time(struct taprio_sched *q,
>  }
>  
>  static void taprio_start_sched(struct Qdisc *sch,
> -			       ktime_t start, struct sched_gate_list *new)
> +			       ktime_t new_base_time,
> +			       struct sched_gate_list *new)
>  {
>  	struct taprio_sched *q = qdisc_priv(sch);
> -	ktime_t expires;
> +	struct sched_gate_list *oper = NULL;
> +	ktime_t expires, start;
>  
>  	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
>  		return;
>  
> +	oper = rcu_dereference_protected(q->oper_sched,
> +					 lockdep_is_held(&q->current_entry_lock));
> +
>  	expires = hrtimer_get_expires(&q->advance_timer);
>  	if (expires == 0)
>  		expires = KTIME_MAX;
> @@ -1395,7 +1400,17 @@ static void taprio_start_sched(struct Qdisc *sch,
>  	 * reprogram it to the earliest one, so we change the admin
>  	 * schedule to the operational one at the right time.
>  	 */
> -	start = min_t(ktime_t, start, expires);
> +	start = min_t(ktime_t, new_base_time, expires);
> +
> +	if (expires != KTIME_MAX &&
> +	    ktime_compare(start, new_base_time) == 0) {
> +		/* Since timer was changed to align to the new admin schedule,
> +		 * setting the variable below to a non-initialized value will

I find the wording "setting the variable below to a non-initialized value"
confusing. 0 is non-initialized? You're talking about a value different
than INIT_CYCLE_TIME_CORRECTION. What about "setting a specific cycle
correction will indicate ..."?

> +		 * indicate to advance_sched() to call switch_schedules() after
> +		 * this timer expires.
> +		 */
> +		oper->cycle_time_correction = 0;

Why 0 and not ktime_sub(new_base_time, oper->cycle_end_time)? Doesn't
the precise correction value make a difference?

> +	}
>  
>  	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
>  }
> -- 
> 2.25.1
>

^ permalink raw reply

* Re: [PATCH net v2 1/2] r8169: add handling DASH when DASH is disabled
From: Paolo Abeni @ 2023-11-09 11:49 UTC (permalink / raw)
  To: ChunHao Lin, hkallweit1
  Cc: nic_swsd, davem, edumazet, kuba, netdev, linux-kernel, stable
In-Reply-To: <20231108184849.2925-2-hau@realtek.com>

On Thu, 2023-11-09 at 02:48 +0800, ChunHao Lin wrote:
> For devices that support DASH, even DASH is disabled, there may still
> exist a default firmware that will influence device behavior.
> So driver needs to handle DASH for devices that support DASH, no
> matter the DASH status is.
> 
> This patch also prepare for "fix DASH deviceis network lost issue".
> 
> Signed-off-by: ChunHao Lin <hau@realtek.com>

You should include the fixes tag you already added in v1 and your Sob
should come as the last tag

The same applies to the next patch 

> Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>

It's not clear where/when Heiner provided the above tag for this patch.
I hope that was off-list.

> Cc: stable@vger.kernel.org
> ---
>  drivers/net/ethernet/realtek/r8169_main.c | 35 ++++++++++++++++-------
>  1 file changed, 25 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
> index 0c76c162b8a9..108dc75050ba 100644
> --- a/drivers/net/ethernet/realtek/r8169_main.c
> +++ b/drivers/net/ethernet/realtek/r8169_main.c
> @@ -624,6 +624,7 @@ struct rtl8169_private {
>  
>  	unsigned supports_gmii:1;
>  	unsigned aspm_manageable:1;
> +	unsigned dash_enabled:1;
>  	dma_addr_t counters_phys_addr;
>  	struct rtl8169_counters *counters;
>  	struct rtl8169_tc_offsets tc_offset;
> @@ -1253,14 +1254,26 @@ static bool r8168ep_check_dash(struct rtl8169_private *tp)
>  	return r8168ep_ocp_read(tp, 0x128) & BIT(0);
>  }
>  
> -static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp)
> +static bool rtl_dash_is_enabled(struct rtl8169_private *tp)
> +{
> +	switch (tp->dash_type) {
> +	case RTL_DASH_DP:
> +		return r8168dp_check_dash(tp);
> +	case RTL_DASH_EP:
> +		return r8168ep_check_dash(tp);
> +	default:
> +		return false;
> +	}
> +}
> +
> +static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp)
>  {
>  	switch (tp->mac_version) {
>  	case RTL_GIGA_MAC_VER_28:
>  	case RTL_GIGA_MAC_VER_31:
> -		return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE;
> +		return RTL_DASH_DP;
>  	case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53:
> -		return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE;
> +		return RTL_DASH_EP;
>  	default:
>  		return RTL_DASH_NONE;
>  	}
> @@ -1453,7 +1466,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts)
>  
>  	device_set_wakeup_enable(tp_to_dev(tp), wolopts);
>  
> -	if (tp->dash_type == RTL_DASH_NONE) {
> +	if (!tp->dash_enabled) {
>  		rtl_set_d3_pll_down(tp, !wolopts);
>  		tp->dev->wol_enabled = wolopts ? 1 : 0;
>  	}
> @@ -2512,7 +2525,7 @@ static void rtl_wol_enable_rx(struct rtl8169_private *tp)
>  
>  static void rtl_prepare_power_down(struct rtl8169_private *tp)
>  {
> -	if (tp->dash_type != RTL_DASH_NONE)
> +	if (tp->dash_enabled)
>  		return;
>  
>  	if (tp->mac_version == RTL_GIGA_MAC_VER_32 ||
> @@ -4869,7 +4882,7 @@ static int rtl8169_runtime_idle(struct device *device)
>  {
>  	struct rtl8169_private *tp = dev_get_drvdata(device);
>  
> -	if (tp->dash_type != RTL_DASH_NONE)
> +	if (tp->dash_enabled)
>  		return -EBUSY;
>  
>  	if (!netif_running(tp->dev) || !netif_carrier_ok(tp->dev))
> @@ -4896,7 +4909,7 @@ static void rtl_shutdown(struct pci_dev *pdev)
>  	rtl_rar_set(tp, tp->dev->perm_addr);
>  
>  	if (system_state == SYSTEM_POWER_OFF &&
> -	    tp->dash_type == RTL_DASH_NONE) {
> +		!tp->dash_enabled) {

Since you have to repost, please maintain the correct indentation
above:

	if (system_state == SYSTEM_POWER_OFF &&
	    !tp->dash_enabled) {

        ^^^^
spaces here.


Cheers,

Paolo


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox