Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 2/3] net: phy: make phy_enable_interrupts() non-static
From: Enguerrand de Ribaucourt @ 2022-12-20 13:19 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, woojung.huh, davem, UNGLinuxDriver,
	Enguerrand de Ribaucourt
In-Reply-To: <9235D6609DB808459E95D78E17F2E43D408987FF@CHN-SV-EXMX02.mchp-main.com>

Currently, phy_disable_interrupts() allows to disable interrupts without
freeing them. However, the only exported function to re-enable them is
phy_request_interrupt(), which also requests them again. It should be
possible to re-enable interrupts without re-allocating them.

This is required for lan78xx.c where we want to enable/disable the phy
interrupts during lan78xx_link_status_change().

Signed-off-by: Enguerrand de Ribaucourt <enguerrand.de-ribaucourt@savoirfairelinux.com>
---
 drivers/net/phy/phy.c | 3 ++-
 include/linux/phy.h   | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 33250da76466..4168cf54aa59 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1040,10 +1040,11 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
  * phy_enable_interrupts - Enable the interrupts from the PHY side
  * @phydev: target phy_device struct
  */
-static int phy_enable_interrupts(struct phy_device *phydev)
+int phy_enable_interrupts(struct phy_device *phydev)
 {
 	return phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED);
 }
+EXPORT_SYMBOL(phy_enable_interrupts);
 
 /**
  * phy_request_interrupt - request and enable interrupt for a PHY device
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 71eeb4e3b1fd..d2350f0dc566 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1715,6 +1715,7 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd);
 int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
 int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd);
 int phy_disable_interrupts(struct phy_device *phydev);
+int phy_enable_interrupts(struct phy_device *phydev);
 void phy_request_interrupt(struct phy_device *phydev);
 void phy_free_interrupt(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
-- 
2.25.1


^ permalink raw reply related

* [PATCH v3 3/3] net: lan78xx: prevent LAN88XX specific operations
From: Enguerrand de Ribaucourt @ 2022-12-20 13:19 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, woojung.huh, davem, UNGLinuxDriver,
	Enguerrand de Ribaucourt
In-Reply-To: <9235D6609DB808459E95D78E17F2E43D408987FF@CHN-SV-EXMX02.mchp-main.com>

Some operations during the cable switch workaround modify the register
LAN88XX_INT_MASK of the PHY. However, this register is specific to the
LAN8835 PHY. For instance, if a DP8322I PHY is connected to the LAN7801,
that register (0x19), corresponds to the LED and MAC address
configuration, resulting in unapropriate behavior.

Use the generic phy interrupt functions instead.

Fixes: 89b36fb5e532 ("lan78xx: Lan7801 Support for Fixed PHY")
Signed-off-by: Enguerrand de Ribaucourt <enguerrand.de-ribaucourt@savoirfairelinux.com>
---
 drivers/net/usb/lan78xx.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index f18ab8e220db..0f279082b924 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -28,6 +28,7 @@
 #include <linux/phy_fixed.h>
 #include <linux/of_mdio.h>
 #include <linux/of_net.h>
+#include <linux/phy.h>
 #include "lan78xx.h"
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
@@ -2123,10 +2124,7 @@ static void lan78xx_link_status_change(struct net_device *net)
 	 * at forced 100 F/H mode.
 	 */
 	if (!phydev->autoneg && (phydev->speed == 100)) {
-		/* disable phy interrupt */
-		temp = phy_read(phydev, LAN88XX_INT_MASK);
-		temp &= ~LAN88XX_INT_MASK_MDINTPIN_EN_;
-		phy_write(phydev, LAN88XX_INT_MASK, temp);
+		phy_disable_interrupts(phydev);
 
 		temp = phy_read(phydev, MII_BMCR);
 		temp &= ~(BMCR_SPEED100 | BMCR_SPEED1000);
@@ -2134,13 +2132,7 @@ static void lan78xx_link_status_change(struct net_device *net)
 		temp |= BMCR_SPEED100;
 		phy_write(phydev, MII_BMCR, temp); /* set to 100 later */
 
-		/* clear pending interrupt generated while workaround */
-		temp = phy_read(phydev, LAN88XX_INT_STS);
-
-		/* enable phy interrupt back */
-		temp = phy_read(phydev, LAN88XX_INT_MASK);
-		temp |= LAN88XX_INT_MASK_MDINTPIN_EN_;
-		phy_write(phydev, LAN88XX_INT_MASK, temp);
+		phy_enable_interrupts(phydev);
 	}
 }
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH v3 1/3] net: phy: add EXPORT_SYMBOL to phy_disable_interrupts()
From: Enguerrand de Ribaucourt @ 2022-12-20 13:19 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, woojung.huh, davem, UNGLinuxDriver,
	Enguerrand de Ribaucourt
In-Reply-To: <9235D6609DB808459E95D78E17F2E43D408987FF@CHN-SV-EXMX02.mchp-main.com>

It seems EXPORT_SYMBOL was forgotten when phy_disable_interrupts() was
made non static. For consistency with the other exported functions in
this file, EXPORT_SYMBOL should be used.

Fixes: 3dd4ef1bdbac ("net: phy: make phy_disable_interrupts() non-static")
Signed-off-by: Enguerrand de Ribaucourt <enguerrand.de-ribaucourt@savoirfairelinux.com>
---
 drivers/net/phy/phy.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index e5b6cb1a77f9..33250da76466 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -992,6 +992,7 @@ int phy_disable_interrupts(struct phy_device *phydev)
 	/* Disable PHY interrupts */
 	return phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
 }
+EXPORT_SYMBOL(phy_disable_interrupts);
 
 /**
  * phy_interrupt - PHY interrupt handler
-- 
2.25.1


^ permalink raw reply related

* [PATCH v3 0/3] net: lan78xx: prevent LAN88XX specific operations
From: Enguerrand de Ribaucourt @ 2022-12-20 13:19 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, woojung.huh, davem, UNGLinuxDriver
In-Reply-To: <9235D6609DB808459E95D78E17F2E43D408987FF@CHN-SV-EXMX02.mchp-main.com>

This patch series replaces lan78xx.c code specific to the LAN8835 phy with the
generic phy functions, making it compatible with other phys.

To that end, the exports of phy_disable_interrupts() and phy_enable_interrupts()
needed to be refreshed.


^ permalink raw reply

* Re: [PATCH 1/2] ip/ip6_gre: Fix changing addr gen mode not generating IPv6 link local address
From: Paolo Abeni @ 2022-12-20 13:16 UTC (permalink / raw)
  To: Thomas Winter, davem, yoshfuji, dsahern, kuba, a, netdev,
	linux-kernel
In-Reply-To: <20221219010619.1826599-2-Thomas.Winter@alliedtelesis.co.nz>

On Mon, 2022-12-19 at 14:06 +1300, Thomas Winter wrote:
> Commit e5dd729460ca changed the code path so that GRE tunnels
> generate an IPv6 address based on the tunnel source address.
> It also changed the code path so GRE tunnels don't call addrconf_addr_gen
> in addrconf_dev_config which is called by addrconf_sysctl_addr_gen_mode
> when the IN6_ADDR_GEN_MODE is changed.
> 
> This patch aims to fix this issue by moving the code in addrconf_notify
> which calls the addr gen for GRE and SIT into a separate function
> and calling it in the places that expect the IPv6 address to be
> generated.
> 
> The previous addrconf_dev_config is renamed to addrconf_eth_config
> since it only expected eth type interfaces and follows the
> addrconf_gre/sit_config format.
> 
> Fixes: e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address")
> Signed-off-by: Thomas Winter <Thomas.Winter@alliedtelesis.co.nz>
> ---
>  net/ipv6/addrconf.c | 47 +++++++++++++++++++++++++--------------------
>  1 file changed, 26 insertions(+), 21 deletions(-)
> 
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index 6dcf034835ec..e9d7ec03316d 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -3355,7 +3355,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
>  	}
>  }
>  
> -static void addrconf_dev_config(struct net_device *dev)
> +static void addrconf_eth_config(struct net_device *dev)

You are creating a new function with the name of an existing one, while
renaming the latter. IMHO this makes the patch hard to review as there
are some existing call side for the old name, which we likelly want to
explicitly see here.

>  {
>  	struct inet6_dev *idev;
>  
> @@ -3447,6 +3447,30 @@ static void addrconf_gre_config(struct net_device *dev)
>  }
>  #endif
>  
> +static void addrconf_dev_config(struct net_device *dev)
> +{
> +	switch (dev->type) {
> +#if IS_ENABLED(CONFIG_IPV6_SIT)
> +	case ARPHRD_SIT:
> +		addrconf_sit_config(dev);
> +		break;
> +#endif
> +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
> +	case ARPHRD_IP6GRE:
> +	case ARPHRD_IPGRE:
> +		addrconf_gre_config(dev);
> +		break;
> +#endif
> +	case ARPHRD_LOOPBACK:
> +		init_loopback(dev);
> +		break;

If I read correctly, this change will cause unneeded attempt to re-add
the loopback address on the loopback device when the lo.addr_gen_mode
sysfs entry is touched. I think such side effect should be avoided.

Thanks,

Paolo


^ permalink raw reply

* kernel BUG in __skb_gso_segment
From: Tudor Ambarus @ 2022-12-20 13:12 UTC (permalink / raw)
  To: mst, jasowang, virtualization, edumazet, davem, kuba, pabeni,
	netdev, willemb, syzkaller, liuhangbin
  Cc: linux-kernel, joneslee

Hi,

There's a bug [1] reported by syzkaller in linux-5.15.y that I'd like
to squash. The commit in stable that introduces the bug is:
b99c71f90978 net: skip virtio_net_hdr_set_proto if protocol already set
The upstream commit for this is:
1ed1d592113959f00cc552c3b9f47ca2d157768f

I discovered that in mainline this bug was squashed by the following
commits:
e9d3f80935b6 ("net/af_packet: make sure to pull mac header")
dfed913e8b55 ("net/af_packet: add VLAN support for AF_PACKET SOCK_RAW GSO")

I'm seeking for some guidance on how to fix linux-5.15.y. From what I
understand, the bug in stable is triggered because we end up with a
header offset of 18, that eventually triggers the GSO crash in
__skb_pull. If I revert the commit in culprit from linux-5.15.y, we'll
end up with a header offset of 14, the bug is not hit and the packet is 
dropped at validate_xmit_skb() time. I'm wondering if reverting it is
the right thing to do, as the commit is marked as a fix. Backporting the
2 commits from mainline is not an option as they introduce new support.
Would such a patch be better than reverting the offending commit?

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index a960de68ac69..188b6f05e5bd 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -25,7 +25,7 @@ static inline bool virtio_net_hdr_match_proto(__be16 
protocol, __u8 gso_type)
  static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
                                            const struct virtio_net_hdr 
*hdr)
  {
-       if (skb->protocol)
+       if (skb->protocol && skb->protocol != htons(ETH_P_ALL))
                 return 0;

         switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {


Thanks!
ta

[1] 
https://syzkaller.appspot.com/bug?id=ce5575575f074c33ff80d104f5baee26f22e95f5

^ permalink raw reply related

* Re: [PATCH v2] net: lan78xx: prevent LAN88XX specific operations
From: Enguerrand de Ribaucourt @ 2022-12-20 13:10 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: netdev, woojung huh, davem, UNGLinuxDriver
In-Reply-To: <d382c89ca5dc8675ed88efeae62f4adc0e72d6c0.camel@redhat.com>

> From: "Paolo Abeni" <pabeni@redhat.com>
> To: "Enguerrand de Ribaucourt" <enguerrand.de-ribaucourt@savoirfairelinux.com>,
> "netdev" <netdev@vger.kernel.org>
> Cc: "woojung huh" <woojung.huh@microchip.com>, "davem" <davem@davemloft.net>,
> "UNGLinuxDriver" <UNGLinuxDriver@microchip.com>
> Sent: Tuesday, December 20, 2022 1:45:08 PM
> Subject: Re: [PATCH v2] net: lan78xx: prevent LAN88XX specific operations

> On Tue, 2022-12-20 at 12:37 +0100, Enguerrand de Ribaucourt wrote:
> > Some operations during the cable switch workaround modify the register
> > LAN88XX_INT_MASK of the PHY. However, this register is specific to the
> > LAN8835 PHY. For instance, if a DP8322I PHY is connected to the LAN7801,
> > that register (0x19), corresponds to the LED and MAC address
> > configuration, resulting in unapropriate behavior.

> > Use the generic phy interrupt functions instead.

> > Fixes: 89b36fb5e532 ("lan78xx: Lan7801 Support for Fixed PHY")
> > Reviewed-by: Paolo Abeni <pabeni@redhat.com>;
>> Signed-off-by: Enguerrand de Ribaucourt
> > <enguerrand.de-ribaucourt@savoirfairelinux.com>
> > ---
> > drivers/net/usb/lan78xx.c | 14 +++-----------
> > 1 file changed, 3 insertions(+), 11 deletions(-)

> > diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
> > index f18ab8e220db..65d5d54994ff 100644
> > --- a/drivers/net/usb/lan78xx.c
> > +++ b/drivers/net/usb/lan78xx.c
> > @@ -28,6 +28,7 @@
> > #include <linux/phy_fixed.h>
> > #include <linux/of_mdio.h>
> > #include <linux/of_net.h>
> > +#include <linux/phy.h>
> > #include "lan78xx.h"

> > #define DRIVER_AUTHOR "WOOJUNG HUH <woojung.huh@microchip.com>"
>> @@ -2123,10 +2124,7 @@ static void lan78xx_link_status_change(struct net_device
> > *net)
> > * at forced 100 F/H mode.
> > */
> > if (!phydev->autoneg && (phydev->speed == 100)) {
> > - /* disable phy interrupt */
> > - temp = phy_read(phydev, LAN88XX_INT_MASK);
> > - temp &= ~LAN88XX_INT_MASK_MDINTPIN_EN_;
> > - phy_write(phydev, LAN88XX_INT_MASK, temp);
> > + phy_disable_interrupts(phydev);

> > temp = phy_read(phydev, MII_BMCR);
> > temp &= ~(BMCR_SPEED100 | BMCR_SPEED1000);
>> @@ -2134,13 +2132,7 @@ static void lan78xx_link_status_change(struct net_device
> > *net)
> > temp |= BMCR_SPEED100;
> > phy_write(phydev, MII_BMCR, temp); /* set to 100 later */

> > - /* clear pending interrupt generated while workaround */
> > - temp = phy_read(phydev, LAN88XX_INT_STS);
> > -
> > - /* enable phy interrupt back */
> > - temp = phy_read(phydev, LAN88XX_INT_MASK);
> > - temp |= LAN88XX_INT_MASK_MDINTPIN_EN_;
> > - phy_write(phydev, LAN88XX_INT_MASK, temp);
> > + phy_request_interrupt(phydev);
> > }
> > }


> Oops, this does not even build... please take your time testing the
> code before sending patches to the ML.

I did verify that the patch built with the driver configured as built-in. I'll
investigate if there's a problem when built as a module.

> Paolo

^ permalink raw reply

* [PATCH net] bonding: fix lockdep splat in bond_miimon_commit()
From: Eric Dumazet @ 2022-12-20 13:08 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, eric.dumazet, Eric Dumazet, syzbot, Hangbin Liu,
	Jay Vosburgh, Veaceslav Falico, Andy Gospodarek

bond_miimon_commit() is run while RTNL is held, not RCU.

WARNING: suspicious RCU usage
6.1.0-syzkaller-09671-g89529367293c #0 Not tainted
-----------------------------
drivers/net/bonding/bond_main.c:2704 suspicious rcu_dereference_check() usage!

Fixes: e95cc44763a4 ("bonding: do failover when high prio link up")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Hangbin Liu <liuhangbin@gmail.com>
Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
---
 drivers/net/bonding/bond_main.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b4c65783960a5aa14de5d64aeea190f02a04be44..0363ce597661422b82a7d33ef001151b275f9ada 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2654,10 +2654,12 @@ static void bond_miimon_link_change(struct bonding *bond,
 
 static void bond_miimon_commit(struct bonding *bond)
 {
-	struct slave *slave, *primary;
+	struct slave *slave, *primary, *active;
 	bool do_failover = false;
 	struct list_head *iter;
 
+	ASSERT_RTNL();
+
 	bond_for_each_slave(bond, slave, iter) {
 		switch (slave->link_new_state) {
 		case BOND_LINK_NOCHANGE:
@@ -2700,8 +2702,8 @@ static void bond_miimon_commit(struct bonding *bond)
 
 			bond_miimon_link_change(bond, slave, BOND_LINK_UP);
 
-			if (!rcu_access_pointer(bond->curr_active_slave) || slave == primary ||
-			    slave->prio > rcu_dereference(bond->curr_active_slave)->prio)
+			active = rtnl_dereference(bond->curr_active_slave);
+			if (!active || slave == primary || slave->prio > active->prio)
 				do_failover = true;
 
 			continue;
-- 
2.39.0.314.g84b9a713c41-goog


^ permalink raw reply related

* Re: [PATCH] wifi: rtl8xxxu: fixing transmisison failure for rtl8192eu
From: Bitterblue Smith @ 2022-12-20 13:03 UTC (permalink / raw)
  To: Ping-Ke Shih, Jun ASAKA, Jes.Sorensen@gmail.com
  Cc: kvalo@kernel.org, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com,
	linux-wireless@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <3b4124ebabcb4ceaae89cd9ccf84c7de@realtek.com>

On 20/12/2022 07:44, Ping-Ke Shih wrote:
> 
> 
>> -----Original Message-----
>> From: Jun ASAKA <JunASAKA@zzy040330.moe>
>> Sent: Saturday, December 17, 2022 11:07 AM
>> To: Jes.Sorensen@gmail.com
>> Cc: kvalo@kernel.org; davem@davemloft.net; edumazet@google.com; kuba@kernel.org; pabeni@redhat.com;
>> linux-wireless@vger.kernel.org; netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Jun ASAKA
>> <JunASAKA@zzy040330.moe>
>> Subject: [PATCH] wifi: rtl8xxxu: fixing transmisison failure for rtl8192eu
>>
>> Fixing transmission failure which results in
>> "authentication with ... timed out". This can be
>> fixed by disable the REG_TXPAUSE.
>>
>> Signed-off-by: Jun ASAKA <JunASAKA@zzy040330.moe>
>> ---
>>  drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c | 5 +++++
>>  1 file changed, 5 insertions(+)
>>
>> diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c
>> b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c
>> index a7d76693c02d..9d0ed6760cb6 100644
>> --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c
>> +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c
>> @@ -1744,6 +1744,11 @@ static void rtl8192e_enable_rf(struct rtl8xxxu_priv *priv)
>>  	val8 = rtl8xxxu_read8(priv, REG_PAD_CTRL1);
>>  	val8 &= ~BIT(0);
>>  	rtl8xxxu_write8(priv, REG_PAD_CTRL1, val8);
>> +
>> +	/*
>> +	 * Fix transmission failure of rtl8192e.
>> +	 */
>> +	rtl8xxxu_write8(priv, REG_TXPAUSE, 0x00);
> 
> I trace when rtl8xxxu set REG_TXPAUSE=0xff that will stop TX.
> The occasions include RF calibration, LPS mode (called by power off), and
> going to stop. So, I think RF calibration does TX pause but not restore
> settings after calibration, and causes TX stuck. As the flow I traced,
> this patch looks reasonable. But, I wonder why other people don't meet
> this problem.
> 
Other people have this problem too:
https://bugzilla.kernel.org/show_bug.cgi?id=196769
https://bugzilla.kernel.org/show_bug.cgi?id=216746

The RF calibration does restore REG_TXPAUSE at the end. What happens is
when you plug in the device, something (mac80211? wpa_supplicant?) calls
rtl8xxxu_start(), then rtl8xxxu_stop(), then rtl8xxxu_start() again.
rtl8xxxu_stop() sets REG_TXPAUSE to 0xff and nothing sets it back to 0.

^ permalink raw reply

* Re: wifi: ath9k: use proper statements in conditionals
From: Kalle Valo @ 2022-12-20 13:02 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Toke Høiland-Jørgensen, Pavel Skripkin, Arnd Bergmann,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Tetsuo Handa, linux-wireless, netdev, linux-kernel
In-Reply-To: <20221215165553.1950307-1-arnd@kernel.org>

Arnd Bergmann <arnd@kernel.org> wrote:

> From: Arnd Bergmann <arnd@arndb.de>
> 
> A previous cleanup patch accidentally broke some conditional
> expressions by replacing the safe "do {} while (0)" constructs
> with empty macros. gcc points this out when extra warnings
> are enabled:
> 
> drivers/net/wireless/ath/ath9k/hif_usb.c: In function 'ath9k_skb_queue_complete':
> drivers/net/wireless/ath/ath9k/hif_usb.c:251:57: error: suggest braces around empty body in an 'else' statement [-Werror=empty-body]
>   251 |                         TX_STAT_INC(hif_dev, skb_failed);
> 
> Make both sets of macros proper expressions again.
> 
> Fixes: d7fc76039b74 ("ath9k: htc: clean up statistics macros")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>

Patch applied to wireless.git, thanks.

b7dc753fe33a wifi: ath9k: use proper statements in conditionals

-- 
https://patchwork.kernel.org/project/linux-wireless/patch/20221215165553.1950307-1-arnd@kernel.org/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches


^ permalink raw reply

* Re: [PATCH] wifi: mt76: mt7996: select CONFIG_RELAY
From: Kalle Valo @ 2022-12-20 13:01 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Felix Fietkau, Lorenzo Bianconi, Ryder Lee, Arnd Bergmann,
	Shayne Chen, Sean Wang, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Matthias Brugger, linux-wireless,
	netdev, linux-arm-kernel, linux-mediatek, linux-kernel
In-Reply-To: <20221215163133.4152299-1-arnd@kernel.org>

Arnd Bergmann <arnd@kernel.org> wrote:

> From: Arnd Bergmann <arnd@arndb.de>
> 
> Without CONFIG_RELAY, the driver fails to link:
> 
> ERROR: modpost: "relay_flush" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined!
> ERROR: modpost: "relay_switch_subbuf" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined!
> ERROR: modpost: "relay_open" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined!
> ERROR: modpost: "relay_reset" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined!
> ERROR: modpost: "relay_file_operations" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined!
> 
> The same change was done in mt7915 for the corresponding copy of the code.
> 
> Fixes: 98686cd21624 ("wifi: mt76: mt7996: add driver for MediaTek Wi-Fi 7 (802.11be) devices")
> See-also: 988845c9361a ("mt76: mt7915: add support for passing chip/firmware debug data to user space")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Patch applied to wireless.git, thanks.

37fc9ad1617a wifi: mt76: mt7996: select CONFIG_RELAY

-- 
https://patchwork.kernel.org/project/linux-wireless/patch/20221215163133.4152299-1-arnd@kernel.org/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches


^ permalink raw reply

* [PATCH v1] qlcnic: prevent ->dcb use-after-free on qlcnic_dcb_enable() failure
From: Daniil Tatianin @ 2022-12-20 12:56 UTC (permalink / raw)
  To: Shahed Shaikh
  Cc: Daniil Tatianin, Manish Chopra, GR-Linux-NIC-Dev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, netdev, linux-kernel

adapter->dcb would get silently freed inside qlcnic_dcb_enable() in
case qlcnic_dcb_attach() would return an error, which always happens
under OOM conditions. This would lead to use-after-free because both
of the existing callers invoke qlcnic_dcb_get_info() on the obtained
pointer, which is potentially freed at that point.

Propagate errors from qlcnic_dcb_enable(), and instead free the dcb
pointer at callsite.

Found by Linux Verification Center (linuxtesting.org) with the SVACE
static analysis tool.

Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru>
---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c | 9 ++++++++-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h       | 5 ++---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c      | 9 ++++++++-
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c
index dbb800769cb6..465f149d94d4 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c
@@ -2505,7 +2505,14 @@ int qlcnic_83xx_init(struct qlcnic_adapter *adapter)
 		goto disable_mbx_intr;
 
 	qlcnic_83xx_clear_function_resources(adapter);
-	qlcnic_dcb_enable(adapter->dcb);
+
+	err = qlcnic_dcb_enable(adapter->dcb);
+	if (err) {
+		qlcnic_clear_dcb_ops(adapter->dcb);
+		adapter->dcb = NULL;
+		goto disable_mbx_intr;
+	}
+
 	qlcnic_83xx_initialize_nic(adapter, 1);
 	qlcnic_dcb_get_info(adapter->dcb);
 
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h
index 7519773eaca6..e1460f9c38bf 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h
@@ -112,9 +112,8 @@ static inline void qlcnic_dcb_init_dcbnl_ops(struct qlcnic_dcb *dcb)
 		dcb->ops->init_dcbnl_ops(dcb);
 }
 
-static inline void qlcnic_dcb_enable(struct qlcnic_dcb *dcb)
+static inline int qlcnic_dcb_enable(struct qlcnic_dcb *dcb)
 {
-	if (dcb && qlcnic_dcb_attach(dcb))
-		qlcnic_clear_dcb_ops(dcb);
+	return dcb ? qlcnic_dcb_attach(dcb) : 0;
 }
 #endif
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index 28476b982bab..36ba15fc9776 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -2599,7 +2599,14 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			 "Device does not support MSI interrupts\n");
 
 	if (qlcnic_82xx_check(adapter)) {
-		qlcnic_dcb_enable(adapter->dcb);
+		err = qlcnic_dcb_enable(adapter->dcb);
+		if (err) {
+			qlcnic_clear_dcb_ops(adapter->dcb);
+			adapter->dcb = NULL;
+			dev_err(&pdev->dev, "Failed to enable DCB\n");
+			goto err_out_free_hw;
+		}
+
 		qlcnic_dcb_get_info(adapter->dcb);
 		err = qlcnic_setup_intr(adapter);
 
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v2] net: lan78xx: prevent LAN88XX specific operations
From: Enguerrand de Ribaucourt @ 2022-12-20 12:47 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: netdev, woojung huh, davem, UNGLinuxDriver
In-Reply-To: <1061700ecedf92911d474a675bd3c47354ab600a.camel@redhat.com>

----- Original Message -----
> From: "Paolo Abeni" <pabeni@redhat.com>
> To: "Enguerrand de Ribaucourt" <enguerrand.de-ribaucourt@savoirfairelinux.com>, "netdev" <netdev@vger.kernel.org>
> Cc: "woojung huh" <woojung.huh@microchip.com>, "davem" <davem@davemloft.net>, "UNGLinuxDriver"
> <UNGLinuxDriver@microchip.com>
> Sent: Tuesday, December 20, 2022 1:41:08 PM
> Subject: Re: [PATCH v2] net: lan78xx: prevent LAN88XX specific operations

> On Tue, 2022-12-20 at 12:37 +0100, Enguerrand de Ribaucourt wrote:
> > Some operations during the cable switch workaround modify the register
> > LAN88XX_INT_MASK of the PHY. However, this register is specific to the
> > LAN8835 PHY. For instance, if a DP8322I PHY is connected to the LAN7801,
> > that register (0x19), corresponds to the LED and MAC address
> > configuration, resulting in unapropriate behavior.

> > Use the generic phy interrupt functions instead.

> > Fixes: 89b36fb5e532 ("lan78xx: Lan7801 Support for Fixed PHY")
> > Reviewed-by: Paolo Abeni <pabeni@redhat.com>;

> You should not attach this tag (or acked-by) on your own.

Thanks, I'm still new with the patching process.

> The following is not even the code I was _asking_ about...

>> Signed-off-by: Enguerrand de Ribaucourt
> > <enguerrand.de-ribaucourt@savoirfairelinux.com>
> > ---
> > drivers/net/usb/lan78xx.c | 14 +++-----------
> > 1 file changed, 3 insertions(+), 11 deletions(-)

> > diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
> > index f18ab8e220db..65d5d54994ff 100644
> > --- a/drivers/net/usb/lan78xx.c
> > +++ b/drivers/net/usb/lan78xx.c
> > @@ -28,6 +28,7 @@
> > #include <linux/phy_fixed.h>
> > #include <linux/of_mdio.h>
> > #include <linux/of_net.h>
> > +#include <linux/phy.h>
> > #include "lan78xx.h"

> > #define DRIVER_AUTHOR "WOOJUNG HUH <woojung.huh@microchip.com>"
>> @@ -2123,10 +2124,7 @@ static void lan78xx_link_status_change(struct net_device
> > *net)
> > * at forced 100 F/H mode.
> > */
> > if (!phydev->autoneg && (phydev->speed == 100)) {
> > - /* disable phy interrupt */
> > - temp = phy_read(phydev, LAN88XX_INT_MASK);
> > - temp &= ~LAN88XX_INT_MASK_MDINTPIN_EN_;
> > - phy_write(phydev, LAN88XX_INT_MASK, temp);
> > + phy_disable_interrupts(phydev);

> > temp = phy_read(phydev, MII_BMCR);
> > temp &= ~(BMCR_SPEED100 | BMCR_SPEED1000);
>> @@ -2134,13 +2132,7 @@ static void lan78xx_link_status_change(struct net_device
> > *net)
> > temp |= BMCR_SPEED100;
> > phy_write(phydev, MII_BMCR, temp); /* set to 100 later */

> > - /* clear pending interrupt generated while workaround */
> > - temp = phy_read(phydev, LAN88XX_INT_STS);
> > -
> > - /* enable phy interrupt back */
> > - temp = phy_read(phydev, LAN88XX_INT_MASK);
> > - temp |= LAN88XX_INT_MASK_MDINTPIN_EN_;
> > - phy_write(phydev, LAN88XX_INT_MASK, temp);
> > + phy_request_interrupt(phydev);

> This looks wrong. Should probably be:

> phy_enable_interrupts(phydev);

phy_enable_interrupts isn't exported in the header. I'll add a
dedicated commit for that.

> Paolo

^ permalink raw reply

* Re: [PATCH v2] net: lan78xx: prevent LAN88XX specific operations
From: Paolo Abeni @ 2022-12-20 12:45 UTC (permalink / raw)
  To: Enguerrand de Ribaucourt, netdev; +Cc: woojung.huh, davem, UNGLinuxDriver
In-Reply-To: <20221220113733.714233-1-enguerrand.de-ribaucourt@savoirfairelinux.com>

On Tue, 2022-12-20 at 12:37 +0100, Enguerrand de Ribaucourt wrote:
> Some operations during the cable switch workaround modify the register
> LAN88XX_INT_MASK of the PHY. However, this register is specific to the
> LAN8835 PHY. For instance, if a DP8322I PHY is connected to the LAN7801,
> that register (0x19), corresponds to the LED and MAC address
> configuration, resulting in unapropriate behavior.
> 
> Use the generic phy interrupt functions instead.
> 
> Fixes: 89b36fb5e532 ("lan78xx: Lan7801 Support for Fixed PHY")
> Reviewed-by: Paolo Abeni <pabeni@redhat.com>;
> Signed-off-by: Enguerrand de Ribaucourt <enguerrand.de-ribaucourt@savoirfairelinux.com>
> ---
>  drivers/net/usb/lan78xx.c | 14 +++-----------
>  1 file changed, 3 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
> index f18ab8e220db..65d5d54994ff 100644
> --- a/drivers/net/usb/lan78xx.c
> +++ b/drivers/net/usb/lan78xx.c
> @@ -28,6 +28,7 @@
>  #include <linux/phy_fixed.h>
>  #include <linux/of_mdio.h>
>  #include <linux/of_net.h>
> +#include <linux/phy.h>
>  #include "lan78xx.h"
>  
>  #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
> @@ -2123,10 +2124,7 @@ static void lan78xx_link_status_change(struct net_device *net)
>  	 * at forced 100 F/H mode.
>  	 */
>  	if (!phydev->autoneg && (phydev->speed == 100)) {
> -		/* disable phy interrupt */
> -		temp = phy_read(phydev, LAN88XX_INT_MASK);
> -		temp &= ~LAN88XX_INT_MASK_MDINTPIN_EN_;
> -		phy_write(phydev, LAN88XX_INT_MASK, temp);
> +		phy_disable_interrupts(phydev);
>  
>  		temp = phy_read(phydev, MII_BMCR);
>  		temp &= ~(BMCR_SPEED100 | BMCR_SPEED1000);
> @@ -2134,13 +2132,7 @@ static void lan78xx_link_status_change(struct net_device *net)
>  		temp |= BMCR_SPEED100;
>  		phy_write(phydev, MII_BMCR, temp); /* set to 100 later */
>  
> -		/* clear pending interrupt generated while workaround */
> -		temp = phy_read(phydev, LAN88XX_INT_STS);
> -
> -		/* enable phy interrupt back */
> -		temp = phy_read(phydev, LAN88XX_INT_MASK);
> -		temp |= LAN88XX_INT_MASK_MDINTPIN_EN_;
> -		phy_write(phydev, LAN88XX_INT_MASK, temp);
> +		phy_request_interrupt(phydev);
>  	}
>  }
> 

Oops, this does not even build... please take your time testing the
code before sending patches to the ML.

Paolo


^ permalink raw reply

* Re: [PATCH v2] net: lan78xx: prevent LAN88XX specific operations
From: Paolo Abeni @ 2022-12-20 12:41 UTC (permalink / raw)
  To: Enguerrand de Ribaucourt, netdev; +Cc: woojung.huh, davem, UNGLinuxDriver
In-Reply-To: <20221220113733.714233-1-enguerrand.de-ribaucourt@savoirfairelinux.com>

On Tue, 2022-12-20 at 12:37 +0100, Enguerrand de Ribaucourt wrote:
> Some operations during the cable switch workaround modify the register
> LAN88XX_INT_MASK of the PHY. However, this register is specific to the
> LAN8835 PHY. For instance, if a DP8322I PHY is connected to the LAN7801,
> that register (0x19), corresponds to the LED and MAC address
> configuration, resulting in unapropriate behavior.
> 
> Use the generic phy interrupt functions instead.
> 
> Fixes: 89b36fb5e532 ("lan78xx: Lan7801 Support for Fixed PHY")
> Reviewed-by: Paolo Abeni <pabeni@redhat.com>;

You should not attach this tag (or acked-by) on your own. 

The following is not even the code I was _asking_ about...

> Signed-off-by: Enguerrand de Ribaucourt <enguerrand.de-ribaucourt@savoirfairelinux.com>
> ---
>  drivers/net/usb/lan78xx.c | 14 +++-----------
>  1 file changed, 3 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
> index f18ab8e220db..65d5d54994ff 100644
> --- a/drivers/net/usb/lan78xx.c
> +++ b/drivers/net/usb/lan78xx.c
> @@ -28,6 +28,7 @@
>  #include <linux/phy_fixed.h>
>  #include <linux/of_mdio.h>
>  #include <linux/of_net.h>
> +#include <linux/phy.h>
>  #include "lan78xx.h"
>  
>  #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
> @@ -2123,10 +2124,7 @@ static void lan78xx_link_status_change(struct net_device *net)
>  	 * at forced 100 F/H mode.
>  	 */
>  	if (!phydev->autoneg && (phydev->speed == 100)) {
> -		/* disable phy interrupt */
> -		temp = phy_read(phydev, LAN88XX_INT_MASK);
> -		temp &= ~LAN88XX_INT_MASK_MDINTPIN_EN_;
> -		phy_write(phydev, LAN88XX_INT_MASK, temp);
> +		phy_disable_interrupts(phydev);
>  
>  		temp = phy_read(phydev, MII_BMCR);
>  		temp &= ~(BMCR_SPEED100 | BMCR_SPEED1000);
> @@ -2134,13 +2132,7 @@ static void lan78xx_link_status_change(struct net_device *net)
>  		temp |= BMCR_SPEED100;
>  		phy_write(phydev, MII_BMCR, temp); /* set to 100 later */
>  
> -		/* clear pending interrupt generated while workaround */
> -		temp = phy_read(phydev, LAN88XX_INT_STS);
> -
> -		/* enable phy interrupt back */
> -		temp = phy_read(phydev, LAN88XX_INT_MASK);
> -		temp |= LAN88XX_INT_MASK_MDINTPIN_EN_;
> -		phy_write(phydev, LAN88XX_INT_MASK, temp);
> +		phy_request_interrupt(phydev);

This looks wrong. Should probably be:

	phy_enable_interrupts(phydev);


Paolo


^ permalink raw reply

* Re: [net-next] ipv6: fix routing cache overflow for raw sockets
From: Paolo Abeni @ 2022-12-20 12:35 UTC (permalink / raw)
  To: Jon Maxwell, davem
  Cc: edumazet, kuba, yoshfuji, dsahern, netdev, linux-kernel
In-Reply-To: <20221218234801.579114-1-jmaxwell37@gmail.com>

On Mon, 2022-12-19 at 10:48 +1100, Jon Maxwell wrote:
> Sending Ipv6 packets in a loop via a raw socket triggers an issue where a 
> route is cloned by ip6_rt_cache_alloc() for each packet sent. This quickly 
> consumes the Ipv6 max_size threshold which defaults to 4096 resulting in 
> these warnings:
> 
> [1]   99.187805] dst_alloc: 7728 callbacks suppressed
> [2] Route cache is full: consider increasing sysctl net.ipv6.route.max_size.
> .
> .
> [300] Route cache is full: consider increasing sysctl net.ipv6.route.max_size.

If I read correctly, the maximum number of dst that the raw socket can
use this way is limited by the number of packets it allows via the
sndbuf limit, right?

Are other FLOWI_FLAG_KNOWN_NH users affected, too? e.g. nf_dup_ipv6,
ipvs, seg6?

@DavidA: why do we need to create RTF_CACHE clones for KNOWN_NH flows?

Thanks,

Paolo


^ permalink raw reply

* Re: [PATCH net-next v1 1/2] net: marvell: prestera: Add router ipv6 ABI
From: Piotr Raczynski @ 2022-12-20 12:21 UTC (permalink / raw)
  To: Yevhen Orlov
  Cc: netdev@vger.kernel.org, Volodymyr Mytnyk, Taras Chornyi,
	Mickey Rachamim, Serhiy Pshyk, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Andrew Lunn, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <Y5+RSF0Had10xizI@yorlov.ow.s>

On Sun, Dec 18, 2022 at 11:16:40PM +0100, Yevhen Orlov wrote:
> There are only lpm add/del for ipv6 needed.
> Nexthops indexes shared with ipv4.
> 
> Limitations:
> - Only "local" and "main" tables supported
> - Only generic interfaces supported for router (no bridges or vlans)
> 
Yevhen, currently net-next is closed.

Please repost when net-next reopens after Jan 2nd.

> Co-developed-by: Taras Chornyi <taras.chornyi@plvision.eu>
> Signed-off-by: Taras Chornyi <taras.chornyi@plvision.eu>
> Co-developed-by: Elad Nachman <enachman@marvell.com>
> Signed-off-by: Elad Nachman <enachman@marvell.com>
> Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
> ---
>  .../ethernet/marvell/prestera/prestera_hw.c   | 34 +++++++++++++++++++
>  .../ethernet/marvell/prestera/prestera_hw.h   |  4 +++
>  .../marvell/prestera/prestera_router_hw.c     | 33 ++++++++++++++----
>  3 files changed, 65 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_hw.c
> index fc6f7d2746e8..13341056599a 100644
> --- a/drivers/net/ethernet/marvell/prestera/prestera_hw.c
> +++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.c
> @@ -540,6 +540,11 @@ struct prestera_msg_iface {
>  	u8 __pad[3];
>  };
>  
> +enum prestera_msg_ip_addr_v {
> +	PRESTERA_MSG_IPV4 = 0,
> +	PRESTERA_MSG_IPV6
> +};
> +
>  struct prestera_msg_ip_addr {
>  	union {
>  		__be32 ipv4;
> @@ -2088,6 +2093,35 @@ int prestera_hw_lpm_del(struct prestera_switch *sw, u16 vr_id,
>  			    sizeof(req));
>  }
>  
> +int prestera_hw_lpm6_add(struct prestera_switch *sw, u16 vr_id,
> +			 __u8 *dst, u32 dst_len, u32 grp_id)
> +{
> +	struct prestera_msg_lpm_req req;
> +
> +	req.dst.v = PRESTERA_MSG_IPV6;
> +	memcpy(&req.dst.u.ipv6, dst, 16);
> +	req.dst_len = __cpu_to_le32(dst_len);
> +	req.vr_id = __cpu_to_le16(vr_id);
> +	req.grp_id = __cpu_to_le32(grp_id);
> +
> +	return prestera_cmd(sw, PRESTERA_CMD_TYPE_ROUTER_LPM_ADD, &req.cmd,
> +			    sizeof(req));
> +}
> +
> +int prestera_hw_lpm6_del(struct prestera_switch *sw, u16 vr_id,
> +			 __u8 *dst, u32 dst_len)
> +{
> +	struct prestera_msg_lpm_req req;
> +
> +	req.dst.v = PRESTERA_MSG_IPV6;
> +	memcpy(&req.dst.u.ipv6, dst, 16);
> +	req.dst_len = __cpu_to_le32(dst_len);
> +	req.vr_id = __cpu_to_le16(vr_id);
> +
> +	return prestera_cmd(sw, PRESTERA_CMD_TYPE_ROUTER_LPM_DELETE, &req.cmd,
> +			    sizeof(req));
> +}
> +
>  int prestera_hw_nh_entries_set(struct prestera_switch *sw, int count,
>  			       struct prestera_neigh_info *nhs, u32 grp_id)
>  {
> diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.h b/drivers/net/ethernet/marvell/prestera/prestera_hw.h
> index 0a929279e1ce..8769be6752bc 100644
> --- a/drivers/net/ethernet/marvell/prestera/prestera_hw.h
> +++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.h
> @@ -266,6 +266,10 @@ int prestera_hw_lpm_add(struct prestera_switch *sw, u16 vr_id,
>  			__be32 dst, u32 dst_len, u32 grp_id);
>  int prestera_hw_lpm_del(struct prestera_switch *sw, u16 vr_id,
>  			__be32 dst, u32 dst_len);
> +int prestera_hw_lpm6_add(struct prestera_switch *sw, u16 vr_id,
> +			 __u8 *dst, u32 dst_len, u32 grp_id);
> +int prestera_hw_lpm6_del(struct prestera_switch *sw, u16 vr_id,
> +			 __u8 *dst, u32 dst_len);
>  
>  /* NH API */
>  int prestera_hw_nh_entries_set(struct prestera_switch *sw, int count,
> diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
> index 02faaea2aefa..1c6d0cdbdfdf 100644
> --- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
> +++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
> @@ -581,8 +581,16 @@ static void __prestera_fib_node_destruct(struct prestera_switch *sw,
>  	struct prestera_vr *vr;
>  
>  	vr = fib_node->info.vr;
> -	prestera_hw_lpm_del(sw, vr->hw_vr_id, fib_node->key.addr.u.ipv4,
> -			    fib_node->key.prefix_len);
> +	if (fib_node->key.addr.v == PRESTERA_IPV4)
> +		prestera_hw_lpm_del(sw, vr->hw_vr_id, fib_node->key.addr.u.ipv4,
> +				    fib_node->key.prefix_len);
> +	else if (fib_node->key.addr.v == PRESTERA_IPV6)
> +		prestera_hw_lpm6_del(sw, vr->hw_vr_id,
> +				     (u8 *)&fib_node->key.addr.u.ipv6.s6_addr,
> +				     fib_node->key.prefix_len);
> +	else
> +		WARN(1, "Invalid address version. Memory corrupted?");
> +
>  	switch (fib_node->info.type) {
>  	case PRESTERA_FIB_TYPE_UC_NH:
>  		prestera_nexthop_group_put(sw, fib_node->info.nh_grp);
> @@ -661,8 +669,16 @@ prestera_fib_node_create(struct prestera_switch *sw,
>  		goto err_nh_grp_get;
>  	}
>  
> -	err = prestera_hw_lpm_add(sw, vr->hw_vr_id, key->addr.u.ipv4,
> -				  key->prefix_len, grp_id);
> +	if (key->addr.v == PRESTERA_IPV4)
> +		err = prestera_hw_lpm_add(sw, vr->hw_vr_id, key->addr.u.ipv4,
> +					  key->prefix_len, grp_id);
> +	else if (key->addr.v == PRESTERA_IPV6)
> +		err = prestera_hw_lpm6_add(sw, vr->hw_vr_id,
> +					   (u8 *)&key->addr.u.ipv6.s6_addr,
> +					   key->prefix_len, grp_id);
> +	else
> +		WARN(1, "Invalid address version. Memory corrupted?");
> +
>  	if (err)
>  		goto err_lpm_add;
>  
> @@ -674,8 +690,13 @@ prestera_fib_node_create(struct prestera_switch *sw,
>  	return fib_node;
>  
>  err_ht_insert:
> -	prestera_hw_lpm_del(sw, vr->hw_vr_id, key->addr.u.ipv4,
> -			    key->prefix_len);
> +	if (key->addr.v == PRESTERA_IPV4)
> +		prestera_hw_lpm_del(sw, vr->hw_vr_id, key->addr.u.ipv4,
> +				    key->prefix_len);
> +	else if (key->addr.v == PRESTERA_IPV6)
> +		prestera_hw_lpm6_del(sw, vr->hw_vr_id,
> +				     (u8 *)&key->addr.u.ipv6.s6_addr,
> +				     key->prefix_len);
>  err_lpm_add:
>  	if (fib_type == PRESTERA_FIB_TYPE_UC_NH)
>  		prestera_nexthop_group_put(sw, fib_node->info.nh_grp);
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH] ath10k: snoc: enable threaded napi on WCN3990
From: Manikanta Pubbisetty @ 2022-12-20 12:14 UTC (permalink / raw)
  To: Abhishek Kumar, kvalo
  Cc: ath10k, linux-wireless, linux-kernel, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni
In-Reply-To: <20221220075215.1.Ic12e347e0d61a618124b742614e82bbd5d770173@changeid>

On 12/20/2022 1:25 PM, Abhishek Kumar wrote:
> NAPI poll can be done in threaded context along with soft irq
> context. Threaded context can be scheduled efficiently, thus
> creating less of bottleneck during Rx processing. This patch is
> to enable threaded NAPI on ath10k driver.
> 
> Based on testing, it was observed that on WCN3990, the CPU0 reaches
> 100% utilization when napi runs in softirq context. At the same
> time the other CPUs are at low consumption percentage. This
> does not allow device to reach its maximum throughput potential.
> After enabling threaded napi, CPU load is balanced across all CPUs
> and following improvments were observed:
> - UDP_RX increase by ~22-25%
> - TCP_RX increase by ~15%
> 
> Tested-on: WCN3990 hw1.0 SNOC WLAN.HL.3.2.2-00696-QCAHLSWMTPL-1
> Signed-off-by: Abhishek Kumar <kuabhs@chromium.org>
> ---
> 
>   drivers/net/wireless/ath/ath10k/core.c | 16 ++++++++++++++++
>   drivers/net/wireless/ath/ath10k/hw.h   |  2 ++
>   drivers/net/wireless/ath/ath10k/snoc.c |  3 +++
>   3 files changed, 21 insertions(+)
> 
> diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
> index 5eb131ab916fd..ee4b6ba508c81 100644
> --- a/drivers/net/wireless/ath/ath10k/core.c
> +++ b/drivers/net/wireless/ath/ath10k/core.c
> @@ -100,6 +100,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA988X_HW_2_0_VERSION,
> @@ -140,6 +141,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA9887_HW_1_0_VERSION,
> @@ -181,6 +183,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA6174_HW_3_2_VERSION,
> @@ -217,6 +220,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA6174_HW_2_1_VERSION,
> @@ -257,6 +261,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA6174_HW_2_1_VERSION,
> @@ -297,6 +302,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA6174_HW_3_0_VERSION,
> @@ -337,6 +343,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA6174_HW_3_2_VERSION,
> @@ -381,6 +388,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA99X0_HW_2_0_DEV_VERSION,
> @@ -427,6 +435,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA9984_HW_1_0_DEV_VERSION,
> @@ -480,6 +489,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA9888_HW_2_0_DEV_VERSION,
> @@ -530,6 +540,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA9377_HW_1_0_DEV_VERSION,
> @@ -570,6 +581,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA9377_HW_1_1_DEV_VERSION,
> @@ -612,6 +624,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA9377_HW_1_1_DEV_VERSION,
> @@ -645,6 +658,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = QCA4019_HW_1_0_DEV_VERSION,
> @@ -692,6 +706,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = false,
>   		.use_fw_tx_credits = true,
>   		.delay_unmap_buffer = false,
> +		.enable_threaded_napi = false,
>   	},
>   	{
>   		.id = WCN3990_HW_1_0_DEV_VERSION,
> @@ -725,6 +740,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
>   		.hw_restart_disconnect = true,
>   		.use_fw_tx_credits = false,
>   		.delay_unmap_buffer = true,
> +		.enable_threaded_napi = true,
>   	},
>   };
>   
> diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
> index 9643031a4427a..adf3076b96503 100644
> --- a/drivers/net/wireless/ath/ath10k/hw.h
> +++ b/drivers/net/wireless/ath/ath10k/hw.h
> @@ -639,6 +639,8 @@ struct ath10k_hw_params {
>   	bool use_fw_tx_credits;
>   
>   	bool delay_unmap_buffer;
> +
> +	bool enable_threaded_napi;
>   };
>   
>   struct htt_resp;
> diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
> index cfcb759a87dea..b94150fb6ef06 100644
> --- a/drivers/net/wireless/ath/ath10k/snoc.c
> +++ b/drivers/net/wireless/ath/ath10k/snoc.c
> @@ -927,6 +927,9 @@ static int ath10k_snoc_hif_start(struct ath10k *ar)
>   
>   	bitmap_clear(ar_snoc->pending_ce_irqs, 0, CE_COUNT_MAX);
>   
> +	if (ar->hw_params.enable_threaded_napi)
> +		dev_set_threaded(&ar->napi_dev, true);
> +

Since this is done in the API specific to WCN3990, we do not need 
hw_param for this.

Thanks,
Manikanta

^ permalink raw reply

* Re: [RFC PATCH v1 0/2] virtio/vsock: fix mutual rx/tx hungup
From: Arseniy Krasnov @ 2022-12-20 12:09 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Stefan Hajnoczi, edumazet@google.com, David S. Miller,
	Jakub Kicinski, Paolo Abeni, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, virtualization@lists.linux-foundation.org,
	kvm@vger.kernel.org, kernel, Krasnov Arseniy
In-Reply-To: <20221220104312.5efhzu5ildj5smnn@sgarzare-redhat>

On 20.12.2022 13:43, Stefano Garzarella wrote:
> On Tue, Dec 20, 2022 at 09:23:17AM +0000, Arseniy Krasnov wrote:
>> On 20.12.2022 11:33, Stefano Garzarella wrote:
>>> On Tue, Dec 20, 2022 at 07:14:27AM +0000, Arseniy Krasnov wrote:
>>>> On 19.12.2022 18:41, Stefano Garzarella wrote:
>>>>
>>>> Hello!
>>>>
>>>>> Hi Arseniy,
>>>>>
>>>>> On Sat, Dec 17, 2022 at 8:42 PM Arseniy Krasnov <AVKrasnov@sberdevices.ru> wrote:
>>>>>>
>>>>>> Hello,
>>>>>>
>>>>>> seems I found strange thing(may be a bug) where sender('tx' later) and
>>>>>> receiver('rx' later) could stuck forever. Potential fix is in the first
>>>>>> patch, second patch contains reproducer, based on vsock test suite.
>>>>>> Reproducer is simple: tx just sends data to rx by 'write() syscall, rx
>>>>>> dequeues it using 'read()' syscall and uses 'poll()' for waiting. I run
>>>>>> server in host and client in guest.
>>>>>>
>>>>>> rx side params:
>>>>>> 1) SO_VM_SOCKETS_BUFFER_SIZE is 256Kb(e.g. default).
>>>>>> 2) SO_RCVLOWAT is 128Kb.
>>>>>>
>>>>>> What happens in the reproducer step by step:
>>>>>>
>>>>>
>>>>> I put the values of the variables involved to facilitate understanding:
>>>>>
>>>>> RX: buf_alloc = 256 KB; fwd_cnt = 0; last_fwd_cnt = 0;
>>>>>     free_space = buf_alloc - (fwd_cnt - last_fwd_cnt) = 256 KB
>>>>>
>>>>> The credit update is sent if
>>>>> free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE [64 KB]
>>>>>
>>>>>> 1) tx tries to send 256Kb + 1 byte (in a single 'write()')
>>>>>> 2) tx sends 256Kb, data reaches rx (rx_bytes == 256Kb)
>>>>>> 3) tx waits for space in 'write()' to send last 1 byte
>>>>>> 4) rx does poll(), (rx_bytes >= rcvlowat) 256Kb >= 128Kb, POLLIN is set
>>>>>> 5) rx reads 64Kb, credit update is not sent due to *
>>>>>
>>>>> RX: buf_alloc = 256 KB; fwd_cnt = 64 KB; last_fwd_cnt = 0;
>>>>>     free_space = 192 KB
>>>>>
>>>>>> 6) rx does poll(), (rx_bytes >= rcvlowat) 192Kb >= 128Kb, POLLIN is set
>>>>>> 7) rx reads 64Kb, credit update is not sent due to *
>>>>>
>>>>> RX: buf_alloc = 256 KB; fwd_cnt = 128 KB; last_fwd_cnt = 0;
>>>>>     free_space = 128 KB
>>>>>
>>>>>> 8) rx does poll(), (rx_bytes >= rcvlowat) 128Kb >= 128Kb, POLLIN is set
>>>>>> 9) rx reads 64Kb, credit update is not sent due to *
>>>>>
>>>>> Right, (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) is still false.
>>>>>
>>>>> RX: buf_alloc = 256 KB; fwd_cnt = 196 KB; last_fwd_cnt = 0;
>>>>>     free_space = 64 KB
>>>>>
>>>>>> 10) rx does poll(), (rx_bytes < rcvlowat) 64Kb < 128Kb, rx waits in poll()
>>>>>
>>>>> I agree that the TX is stuck because we are not sending the credit
>>>>> update, but also if RX sends the credit update at step 9, RX won't be
>>>>> woken up at step 10, right?
>>>>
>>>> Yes, RX will sleep, but TX will wake up and as we inform TX how much
>>>> free space we have, now there are two cases for TX:
>>>> 1) send "small" rest of data(e.g. without blocking again), leave 'write()'
>>>>   and continue execution. RX still waits in 'poll()'. Later TX will
>>>>   send enough data to wake up RX.
>>>> 2) send "big" rest of data - if rest is too big to leave 'write()' and TX
>>>>   will wait again for the free space - it will be able to send enough data
>>>>   to wake up RX as we compared 'rx_bytes' with rcvlowat value in RX.
>>>
>>> Right, so I'd update the test to behave like this.
>> Sorry, You mean vsock_test? To cover TX waiting for free space at RX, thus checking
>> this kernel patch logic?
> 
> Yep, I mean the test that you added in this series.
Ok
> 
>>> And I'd explain better the problem we are going to fix in the commit message.
>> Ok
>>>
>>>>>
>>>>>>
>>>>>> * is optimization in 'virtio_transport_stream_do_dequeue()' which
>>>>>>   sends OP_CREDIT_UPDATE only when we have not too much space -
>>>>>>   less than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
>>>>>>
>>>>>> Now tx side waits for space inside write() and rx waits in poll() for
>>>>>> 'rx_bytes' to reach SO_RCVLOWAT value. Both sides will wait forever. I
>>>>>> think, possible fix is to send credit update not only when we have too
>>>>>> small space, but also when number of bytes in receive queue is smaller
>>>>>> than SO_RCVLOWAT thus not enough to wake up sleeping reader. I'm not
>>>>>> sure about correctness of this idea, but anyway - I think that problem
>>>>>> above exists. What do You think?
>>>>>
>>>>> I'm not sure, I have to think more about it, but if RX reads less than
>>>>> SO_RCVLOWAT, I expect it's normal to get to a case of stuck.
>>>>>
>>>>> In this case we are only unstucking TX, but even if it sends that single
>>>>> byte, RX is still stuck and not consuming it, so it was useless to wake
>>>>> up TX if RX won't consume it anyway, right?
>>>>
>>>> 1) I think it is not useless, because we inform(not just wake up) TX that
>>>> there is free space at RX side - as i mentioned above.
>>>> 2) Anyway i think that this situation is a little bit strange: TX thinks that
>>>> there is no free space at RX and waits for it, but there is free space at RX!
>>>> At the same time, RX waits in poll() forever - it is ready to get new portion
>>>> of data to return POLLIN, but TX "thinks" exactly opposite thing - RX is full
>>>> of data. Of course, if there will be just stalls in TX data handling - it will
>>>> be ok - just performance degradation, but TX stucks forever.
>>>
>>> We did it to avoid a lot of credit update messages.
>> Yes, i see
>>> Anyway I think here the main point is why RX is setting SO_RCVLOWAT to 128 KB and then reads only half of it?
>>>
>>> So I think if the users set SO_RCVLOWAT to a value and then RX reads less then it, is expected to get stuck.
>> That a really interesting question, I've found nothing about this case in Google(not sure for 100%) or POSIX. But,
>> i can modify reproducer: it sets SO_RCVLOWAT to 128Kb BEFORE entering its last poll where it will stuck. In this
>> case behaviour looks more legal: it uses default SO_RCVLOWAT of 1, read 64Kb each time. Finally it sets SO_RCVLOWAT
>> to 128Kb(and imagine that it prepares 128Kb 'read()' buffer) and enters poll() - we will get same effect: TX will wait
>> for space, RX waits in 'poll()'.
> 
> Good point!
> 
>>>
>>> Anyway, since the change will not impact the default behaviour (SO_RCVLOWAT = 1) we can merge this patch, but IMHO we need to explain the case better and improve the test.
>> I see, of course I'm not sure about this change, just want to ask someone who knows this code better
> 
> Yes, it's an RFC, so you did well! :-)
So ok, I'll prepare RFC version of this patchset(e.g. CV with explanation, kernel patch and test for it)
> 
>>>
>>>>
>>>>>
>>>>> If RX woke up (e.g. SO_RCVLOWAT = 64KB) and read the remaining 64KB,
>>>>> then it would still send the credit update even without this patch and
>>>>> TX will send the 1 byte.
>>>>
>>>> But how RX will wake up in this case? E.g. it calls poll() without timeout,
>>>> connection is established, RX ignores signal
>>>
>>> RX will wake up because SO_RCVLOWAT is 64KB and there are 64 KB in the buffer. Then RX will read it and send the credit update to TX because
>>> free_space is 0.
>> IIUC, i'm talking about 10 steps above, e.g. RX will never wake up, because TX is waiting for space.
> 
> Yep, but if RX uses SO_RCVLOWAT = 64 KB instead of 128 KB (I mean if RX reads all the bytes that it's waiting as it specified in SO_RCVLOWAT), then RX will send the credit message.
> 
> But there is the case that you mentioned, when SO_RCVLOWAT is chagend while executing.
I'll use this case for test
> 
> Thanks,
> Stefano
> 
Thanks, Arseniy


^ permalink raw reply

* Re: [RFC PATCH v5 0/4] vsock: update tools and error handling
From: Arseniy Krasnov @ 2022-12-20 12:06 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: David S. Miller, edumazet@google.com, Paolo Abeni, Jakub Kicinski,
	linux-kernel@vger.kernel.org,
	virtualization@lists.linux-foundation.org, netdev@vger.kernel.org,
	kernel, Bobby Eshleman, Krasnov Arseniy
In-Reply-To: <20221220103824.w7xcwsg3o2mls7cs@sgarzare-redhat>

On 20.12.2022 13:38, Stefano Garzarella wrote:
> On Tue, Dec 20, 2022 at 07:16:38AM +0000, Arseniy Krasnov wrote:
>> Patchset consists of two parts:
>>
>> 1) Kernel patch
>> One patch from Bobby Eshleman. I took single patch from Bobby:
>> https://lore.kernel.org/lkml/d81818b868216c774613dd03641fcfe63cc55a45
>> .1660362668.git.bobby.eshleman@bytedance.com/ and use only part for
>> af_vsock.c, as VMCI and Hyper-V parts were rejected.
>>
>> I used it, because for SOCK_SEQPACKET big messages handling was broken -
>> ENOMEM was returned instead of EMSGSIZE. And anyway, current logic which
>> always replaces any error code returned by transport to ENOMEM looks
>> strange for me also(for example in EMSGSIZE case it was changed to
>> ENOMEM).
>>
>> 2) Tool patches
>> Since there is work on several significant updates for vsock(virtio/
>> vsock especially): skbuff, DGRAM, zerocopy rx/tx, so I think that this
>> patchset will be useful.
>>
>> This patchset updates vsock tests and tools a little bit. First of all
>> it updates test suite: two new tests are added. One test is reworked
>> message bound test. Now it is more complex. Instead of sending 1 byte
>> messages with one MSG_EOR bit, it sends messages of random length(one
>> half of messages are smaller than page size, second half are bigger)
>> with random number of MSG_EOR bits set. Receiver also don't know total
>> number of messages. Message bounds control is maintained by hash sum
>> of messages length calculation. Second test is for SOCK_SEQPACKET - it
>> tries to send message with length more than allowed. I think both tests
>> will be useful for DGRAM support also.
>>
>> Third thing that this patchset adds is small utility to test vsock
>> performance for both rx and tx. I think this util could be useful as
>> 'iperf'/'uperf', because:
>> 1) It is small comparing to 'iperf' or 'uperf', so it very easy to add
>>   new mode or feature to it(especially vsock specific).
>> 2) It allows to set SO_RCVLOWAT and SO_VM_SOCKETS_BUFFER_SIZE option.
>>   Whole throughtput depends on both parameters.
>> 3) It is located in the kernel source tree, so it could be updated by
>>   the same patchset which changes related kernel functionality in vsock.
>>
>> I used this util very often to check performance of my rx zerocopy
>> support(this tool has rx zerocopy support, but not in this patchset).
>>
>> Here is comparison of outputs from three utils: 'iperf', 'uperf' and
>> 'vsock_perf'. In all three cases sender was at guest side. rx and
>> tx buffers were always 64Kb(because by default 'uperf' uses 8K).
>>
>> iperf:
>>
>>   [ ID] Interval           Transfer     Bitrate
>>   [  5]   0.00-10.00  sec  12.8 GBytes  11.0 Gbits/sec sender
>>   [  5]   0.00-10.00  sec  12.8 GBytes  11.0 Gbits/sec receiver
>>
>> uperf:
>>
>>   Total     16.27GB /  11.36(s) =    12.30Gb/s       23455op/s
>>
>> vsock_perf:
>>
>>   tx performance: 12.301529 Gbits/s
>>   rx performance: 12.288011 Gbits/s
>>
>> Results are almost same in all three cases.
> 
> Thanks for checking this!
> 
>>
>> Patchset was rebased and tested on skbuff v8 patch from Bobby Eshleman:
>> https://lore.kernel.org/netdev/20221215043645.3545127-1-bobby.eshleman@bytedance.com/
> 
> I reviewed all the patches, in the last one there is just to update the README, so I think it is ready for net-next (when it will re-open).
Thanks! I'll fix it(just forgot about README) and send v6 with 'net-next' tag when net-next will be opened
> 
> Thanks,
> Stefano
> 


^ permalink raw reply

* [bpf-next v2 5/5] samples/bpf: use BPF_KSYSCALL macro in syscall tracing programs
From: Daniel T. Lee @ 2022-12-20 11:59 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, Andrii Nakryiko,
	Yonghong Song
  Cc: bpf, netdev
In-Reply-To: <20221220115928.11979-1-danieltimlee@gmail.com>

This commit enhances the syscall tracing programs by using the
BPF_SYSCALL macro to reduce the inconvenience of parsing arguments from
pt_regs. By simplifying argument extraction, bpf program will become
clear to understand.

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
---
 samples/bpf/map_perf_test.bpf.c               | 26 ++++++++-----------
 .../bpf/test_current_task_under_cgroup.bpf.c  |  4 ++-
 samples/bpf/test_probe_write_user.bpf.c       | 12 ++++-----
 3 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/samples/bpf/map_perf_test.bpf.c b/samples/bpf/map_perf_test.bpf.c
index 0c7885057ffe..3cdeba2afe12 100644
--- a/samples/bpf/map_perf_test.bpf.c
+++ b/samples/bpf/map_perf_test.bpf.c
@@ -101,7 +101,7 @@ struct {
 } lru_hash_lookup_map SEC(".maps");
 
 SEC("ksyscall/getuid")
-int stress_hmap(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_hmap)
 {
 	u32 key = bpf_get_current_pid_tgid();
 	long init_val = 1;
@@ -119,7 +119,7 @@ int stress_hmap(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/geteuid")
-int stress_percpu_hmap(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_percpu_hmap)
 {
 	u32 key = bpf_get_current_pid_tgid();
 	long init_val = 1;
@@ -136,7 +136,7 @@ int stress_percpu_hmap(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/getgid")
-int stress_hmap_alloc(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_hmap_alloc)
 {
 	u32 key = bpf_get_current_pid_tgid();
 	long init_val = 1;
@@ -153,7 +153,7 @@ int stress_hmap_alloc(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/getegid")
-int stress_percpu_hmap_alloc(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_percpu_hmap_alloc)
 {
 	u32 key = bpf_get_current_pid_tgid();
 	long init_val = 1;
@@ -168,11 +168,10 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
 	}
 	return 0;
 }
-
 SEC("ksyscall/connect")
-int stress_lru_hmap_alloc(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_lru_hmap_alloc, int fd, struct sockaddr_in *uservaddr,
+		 int addrlen)
 {
-	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
 	char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn";
 	union {
 		u16 dst6[8];
@@ -185,14 +184,11 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx)
 			u32 key;
 		};
 	} test_params;
-	struct sockaddr_in6 *in6;
+	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)uservaddr;
 	u16 test_case;
-	int addrlen, ret;
 	long val = 1;
 	u32 key = 0;
-
-	in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs);
-	addrlen = (int)PT_REGS_PARM3_CORE(real_regs);
+	int ret;
 
 	if (addrlen != sizeof(*in6))
 		return 0;
@@ -250,7 +246,7 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/gettid")
-int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_lpm_trie_map_alloc)
 {
 	union {
 		u32 b32[2];
@@ -272,7 +268,7 @@ int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/getpgid")
-int stress_hash_map_lookup(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_hash_map_lookup)
 {
 	u32 key = 1, i;
 	long *value;
@@ -285,7 +281,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/getppid")
-int stress_array_map_lookup(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_array_map_lookup)
 {
 	u32 key = 1, i;
 	long *value;
diff --git a/samples/bpf/test_current_task_under_cgroup.bpf.c b/samples/bpf/test_current_task_under_cgroup.bpf.c
index 0b059cee3cba..58b9cf7ed659 100644
--- a/samples/bpf/test_current_task_under_cgroup.bpf.c
+++ b/samples/bpf/test_current_task_under_cgroup.bpf.c
@@ -8,6 +8,8 @@
 #include "vmlinux.h"
 #include <linux/version.h>
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
 
 struct {
 	__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
@@ -25,7 +27,7 @@ struct {
 
 /* Writes the last PID that called sync to a map at index 0 */
 SEC("ksyscall/sync")
-int bpf_prog1(struct pt_regs *ctx)
+int BPF_KSYSCALL(bpf_prog1)
 {
 	u64 pid = bpf_get_current_pid_tgid();
 	int idx = 0;
diff --git a/samples/bpf/test_probe_write_user.bpf.c b/samples/bpf/test_probe_write_user.bpf.c
index a0f10c5ca273..a4f3798b7fb0 100644
--- a/samples/bpf/test_probe_write_user.bpf.c
+++ b/samples/bpf/test_probe_write_user.bpf.c
@@ -27,24 +27,22 @@ struct {
  * of course, across platforms, and over time, the ABI may change.
  */
 SEC("ksyscall/connect")
-int bpf_prog1(struct pt_regs *ctx)
+int BPF_KSYSCALL(bpf_prog1, int fd, struct sockaddr_in *uservaddr,
+		 int addrlen)
 {
-	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
-	void *sockaddr_arg = (void *)PT_REGS_PARM2_CORE(real_regs);
-	int sockaddr_len = (int)PT_REGS_PARM3_CORE(real_regs);
 	struct sockaddr_in new_addr, orig_addr = {};
 	struct sockaddr_in *mapped_addr;
 
-	if (sockaddr_len > sizeof(orig_addr))
+	if (addrlen > sizeof(orig_addr))
 		return 0;
 
-	if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0)
+	if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), uservaddr) != 0)
 		return 0;
 
 	mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr);
 	if (mapped_addr != NULL) {
 		memcpy(&new_addr, mapped_addr, sizeof(new_addr));
-		bpf_probe_write_user(sockaddr_arg, &new_addr,
+		bpf_probe_write_user(uservaddr, &new_addr,
 				     sizeof(new_addr));
 	}
 	return 0;
-- 
2.34.1


^ permalink raw reply related

* [bpf-next v2 3/5] samples/bpf: change _kern suffix to .bpf with syscall tracing program
From: Daniel T. Lee @ 2022-12-20 11:59 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, Andrii Nakryiko,
	Yonghong Song
  Cc: bpf, netdev
In-Reply-To: <20221220115928.11979-1-danieltimlee@gmail.com>

Currently old compile rule (CLANG-bpf) doesn't contains VMLINUX_H define
flag which is essential for the bpf program that includes "vmlinux.h".
Also old compile rule doesn't directly specify the compile target as bpf,
instead it uses bunch of extra options with clang followed by long chain
of commands. (e.g. clang | opt | llvm-dis | llc)

In Makefile, there is already new compile rule which is more simple and
neat. And it also has -D__VMLINUX_H__ option. By just changing the _kern
suffix to .bpf will inherit the benefit of the new CLANG-BPF compile
target.

Also, this commit adds dummy gnu/stub.h to the samples/bpf directory.
As commit 1c2dd16add7e ("selftests/bpf: get rid of -D__x86_64__") noted,
compiling with 'clang -target bpf' will raise an error with stubs.h
unless workaround (-D__x86_64) is used. This commit solves this problem
by adding dummy stub.h to make /usr/include/features.h to follow the
expected path as the same way selftests/bpf dealt with.

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
---
Changes in V2:
- add gnu/stub.h hack to fix compile error with 'clang -target bpf'

 samples/bpf/Makefile                                   | 10 +++++-----
 samples/bpf/gnu/stubs.h                                |  1 +
 .../bpf/{map_perf_test_kern.c => map_perf_test.bpf.c}  |  0
 samples/bpf/map_perf_test_user.c                       |  2 +-
 ...oup_kern.c => test_current_task_under_cgroup.bpf.c} |  0
 samples/bpf/test_current_task_under_cgroup_user.c      |  2 +-
 ...e_write_user_kern.c => test_probe_write_user.bpf.c} |  0
 samples/bpf/test_probe_write_user_user.c               |  2 +-
 .../bpf/{trace_output_kern.c => trace_output.bpf.c}    |  0
 samples/bpf/trace_output_user.c                        |  2 +-
 samples/bpf/{tracex2_kern.c => tracex2.bpf.c}          |  0
 samples/bpf/tracex2_user.c                             |  2 +-
 12 files changed, 11 insertions(+), 10 deletions(-)
 create mode 100644 samples/bpf/gnu/stubs.h
 rename samples/bpf/{map_perf_test_kern.c => map_perf_test.bpf.c} (100%)
 rename samples/bpf/{test_current_task_under_cgroup_kern.c => test_current_task_under_cgroup.bpf.c} (100%)
 rename samples/bpf/{test_probe_write_user_kern.c => test_probe_write_user.bpf.c} (100%)
 rename samples/bpf/{trace_output_kern.c => trace_output.bpf.c} (100%)
 rename samples/bpf/{tracex2_kern.c => tracex2.bpf.c} (100%)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 727da3c5879b..22039a0a5b35 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -125,21 +125,21 @@ always-y += sockex1_kern.o
 always-y += sockex2_kern.o
 always-y += sockex3_kern.o
 always-y += tracex1_kern.o
-always-y += tracex2_kern.o
+always-y += tracex2.bpf.o
 always-y += tracex3_kern.o
 always-y += tracex4_kern.o
 always-y += tracex5_kern.o
 always-y += tracex6_kern.o
 always-y += tracex7_kern.o
 always-y += sock_flags_kern.o
-always-y += test_probe_write_user_kern.o
-always-y += trace_output_kern.o
+always-y += test_probe_write_user.bpf.o
+always-y += trace_output.bpf.o
 always-y += tcbpf1_kern.o
 always-y += tc_l2_redirect_kern.o
 always-y += lathist_kern.o
 always-y += offwaketime_kern.o
 always-y += spintest_kern.o
-always-y += map_perf_test_kern.o
+always-y += map_perf_test.bpf.o
 always-y += test_overhead_tp_kern.o
 always-y += test_overhead_raw_tp_kern.o
 always-y += test_overhead_kprobe_kern.o
@@ -147,7 +147,7 @@ always-y += parse_varlen.o parse_simple.o parse_ldabs.o
 always-y += test_cgrp2_tc_kern.o
 always-y += xdp1_kern.o
 always-y += xdp2_kern.o
-always-y += test_current_task_under_cgroup_kern.o
+always-y += test_current_task_under_cgroup.bpf.o
 always-y += trace_event_kern.o
 always-y += sampleip_kern.o
 always-y += lwt_len_hist_kern.o
diff --git a/samples/bpf/gnu/stubs.h b/samples/bpf/gnu/stubs.h
new file mode 100644
index 000000000000..719225b16626
--- /dev/null
+++ b/samples/bpf/gnu/stubs.h
@@ -0,0 +1 @@
+/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test.bpf.c
similarity index 100%
rename from samples/bpf/map_perf_test_kern.c
rename to samples/bpf/map_perf_test.bpf.c
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index 1bb53f4b29e1..d2fbcf963cdf 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -443,7 +443,7 @@ int main(int argc, char **argv)
 	if (argc > 4)
 		max_cnt = atoi(argv[4]);
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup.bpf.c
similarity index 100%
rename from samples/bpf/test_current_task_under_cgroup_kern.c
rename to samples/bpf/test_current_task_under_cgroup.bpf.c
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
index 6fb25906835e..9726ed2a8a8b 100644
--- a/samples/bpf/test_current_task_under_cgroup_user.c
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -21,7 +21,7 @@ int main(int argc, char **argv)
 	char filename[256];
 	int map_fd[2];
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user.bpf.c
similarity index 100%
rename from samples/bpf/test_probe_write_user_kern.c
rename to samples/bpf/test_probe_write_user.bpf.c
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
index 00ccfb834e45..2a539aec4116 100644
--- a/samples/bpf/test_probe_write_user_user.c
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -24,7 +24,7 @@ int main(int ac, char **argv)
 	mapped_addr_in = (struct sockaddr_in *)&mapped_addr;
 	tmp_addr_in = (struct sockaddr_in *)&tmp_addr;
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output.bpf.c
similarity index 100%
rename from samples/bpf/trace_output_kern.c
rename to samples/bpf/trace_output.bpf.c
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index 371732f9cf8e..d316fd2c8e24 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -51,7 +51,7 @@ int main(int argc, char **argv)
 	char filename[256];
 	FILE *f;
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2.bpf.c
similarity index 100%
rename from samples/bpf/tracex2_kern.c
rename to samples/bpf/tracex2.bpf.c
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index 089e408abd7a..2131f1648cf1 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -123,7 +123,7 @@ int main(int ac, char **argv)
 	int i, j = 0;
 	FILE *f;
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
-- 
2.34.1


^ permalink raw reply related

* [bpf-next v2 4/5] samples/bpf: fix tracex2 by using BPF_KSYSCALL macro
From: Daniel T. Lee @ 2022-12-20 11:59 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, Andrii Nakryiko,
	Yonghong Song
  Cc: bpf, netdev
In-Reply-To: <20221220115928.11979-1-danieltimlee@gmail.com>

Currently, there is a problem with tracex2, as it doesn't print the
histogram properly and the results are misleading. (all results report
as 0)

The problem is caused by a change in arguments of the function to which
the kprobe connects. This tracex2 bpf program uses kprobe (attached
to __x64_sys_write) to figure out the size of the write system call. In
order to achieve this, the third argument 'count' must be intact.

The following is a prototype of the sys_write variant. (checked with
pfunct)

    ~/git/linux$ pfunct -P fs/read_write.o | grep sys_write
    ssize_t ksys_write(unsigned int fd, const char  * buf, size_t count);
    long int __x64_sys_write(const struct pt_regs  * regs);
    ... cross compile with s390x ...
    long int __s390_sys_write(struct pt_regs * regs);

Since the nature of SYSCALL_WRAPPER function wraps the argument once,
additional process of argument extraction is required to properly parse
the argument.

    #define BPF_KSYSCALL(name, args...)
    ... snip ...
    struct pt_regs *regs = LINUX_HAS_SYSCALL_WRAPPER                    \
			   ? (struct pt_regs *)PT_REGS_PARM1(ctx)       \
			   : ctx;                                       \

In order to fix this problem, the BPF_SYSCALL macro has been used. This
reduces the hassle of parsing arguments from pt_regs. Since the macro
uses the CORE version of argument extraction, additional portability
comes too.

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
---
 samples/bpf/tracex2.bpf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/tracex2.bpf.c b/samples/bpf/tracex2.bpf.c
index a712eefc742e..0a5c75b367be 100644
--- a/samples/bpf/tracex2.bpf.c
+++ b/samples/bpf/tracex2.bpf.c
@@ -8,6 +8,7 @@
 #include <linux/version.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -76,14 +77,13 @@ struct {
 } my_hist_map SEC(".maps");
 
 SEC("ksyscall/write")
-int bpf_prog3(struct pt_regs *ctx)
+int BPF_KSYSCALL(bpf_prog3, unsigned int fd, const char *buf, size_t count)
 {
-	long write_size = PT_REGS_PARM3(ctx);
 	long init_val = 1;
 	long *value;
 	struct hist_key key;
 
-	key.index = log2l(write_size);
+	key.index = log2l(count);
 	key.pid_tgid = bpf_get_current_pid_tgid();
 	key.uid_gid = bpf_get_current_uid_gid();
 	bpf_get_current_comm(&key.comm, sizeof(key.comm));
-- 
2.34.1


^ permalink raw reply related

* [bpf-next v2 2/5] samples/bpf: use vmlinux.h instead of implicit headers in syscall tracing program
From: Daniel T. Lee @ 2022-12-20 11:59 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, Andrii Nakryiko,
	Yonghong Song
  Cc: bpf, netdev
In-Reply-To: <20221220115928.11979-1-danieltimlee@gmail.com>

This commit applies vmlinux.h to syscall tracing program. This change
allows the bpf program to refer to the internal structure as a single
"vmlinux.h" instead of including each header referenced by the bpf
program.

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
---
 samples/bpf/map_perf_test_kern.c                  | 5 ++---
 samples/bpf/test_current_task_under_cgroup_kern.c | 4 +---
 samples/bpf/test_probe_write_user_kern.c          | 5 ++---
 samples/bpf/trace_output_kern.c                   | 3 +--
 samples/bpf/tracex2_kern.c                        | 4 +---
 5 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index 874e2f7e3d5d..0c7885057ffe 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -4,10 +4,9 @@
  * modify it under the terms of version 2 of the GNU General Public
  * License as published by the Free Software Foundation.
  */
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
+#include "vmlinux.h"
+#include <errno.h>
 #include <linux/version.h>
-#include <uapi/linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c
index 541fc861b984..0b059cee3cba 100644
--- a/samples/bpf/test_current_task_under_cgroup_kern.c
+++ b/samples/bpf/test_current_task_under_cgroup_kern.c
@@ -5,11 +5,9 @@
  * License as published by the Free Software Foundation.
  */
 
-#include <linux/ptrace.h>
-#include <uapi/linux/bpf.h>
+#include "vmlinux.h"
 #include <linux/version.h>
 #include <bpf/bpf_helpers.h>
-#include <uapi/linux/utsname.h>
 
 struct {
 	__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c
index d60cabaaf753..a0f10c5ca273 100644
--- a/samples/bpf/test_probe_write_user_kern.c
+++ b/samples/bpf/test_probe_write_user_kern.c
@@ -4,9 +4,8 @@
  * modify it under the terms of version 2 of the GNU General Public
  * License as published by the Free Software Foundation.
  */
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <uapi/linux/bpf.h>
+#include "vmlinux.h"
+#include <string.h>
 #include <linux/version.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c
index a481abf8c4c5..565a73b51b04 100644
--- a/samples/bpf/trace_output_kern.c
+++ b/samples/bpf/trace_output_kern.c
@@ -1,6 +1,5 @@
-#include <linux/ptrace.h>
+#include "vmlinux.h"
 #include <linux/version.h>
-#include <uapi/linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 
 struct {
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
index 82091facb83c..a712eefc742e 100644
--- a/samples/bpf/tracex2_kern.c
+++ b/samples/bpf/tracex2_kern.c
@@ -4,10 +4,8 @@
  * modify it under the terms of version 2 of the GNU General Public
  * License as published by the Free Software Foundation.
  */
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
+#include "vmlinux.h"
 #include <linux/version.h>
-#include <uapi/linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-- 
2.34.1


^ permalink raw reply related

* [bpf-next v2 1/5] samples/bpf: use kyscall instead of kprobe in syscall tracing program
From: Daniel T. Lee @ 2022-12-20 11:59 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, Andrii Nakryiko,
	Yonghong Song
  Cc: bpf, netdev
In-Reply-To: <20221220115928.11979-1-danieltimlee@gmail.com>

Syscall tracing using kprobe is quite unstable. Since it uses the exact
name of the kernel function, the program might broke due to the rename
of a function. The problem can also be caused by a changes in the
arguments of the function to which the kprobe connects.

In this commit, ksyscall is used instead of kprobe. By using ksyscall,
libbpf will detect the appropriate kernel function name.
(e.g. sys_write -> __s390_sys_write). This eliminates the need to worry
about which wrapper function to attach in order to parse arguments.

In addition, ksyscall provides more fine method with attaching system
call, the coarse SYSCALL helper at trace_common.h can be removed.

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
---
 samples/bpf/map_perf_test_kern.c                | 17 ++++++++---------
 .../bpf/test_current_task_under_cgroup_kern.c   |  3 +--
 samples/bpf/test_map_in_map_kern.c              |  1 -
 samples/bpf/test_probe_write_user_kern.c        |  3 +--
 samples/bpf/trace_common.h                      | 13 -------------
 samples/bpf/trace_output_kern.c                 |  3 +--
 samples/bpf/tracex2_kern.c                      |  3 +--
 7 files changed, 12 insertions(+), 31 deletions(-)
 delete mode 100644 samples/bpf/trace_common.h

diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index 7342c5b2f278..874e2f7e3d5d 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -11,7 +11,6 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
-#include "trace_common.h"
 
 #define MAX_ENTRIES 1000
 #define MAX_NR_CPUS 1024
@@ -102,7 +101,7 @@ struct {
 	__uint(max_entries, MAX_ENTRIES);
 } lru_hash_lookup_map SEC(".maps");
 
-SEC("kprobe/" SYSCALL(sys_getuid))
+SEC("ksyscall/getuid")
 int stress_hmap(struct pt_regs *ctx)
 {
 	u32 key = bpf_get_current_pid_tgid();
@@ -120,7 +119,7 @@ int stress_hmap(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/" SYSCALL(sys_geteuid))
+SEC("ksyscall/geteuid")
 int stress_percpu_hmap(struct pt_regs *ctx)
 {
 	u32 key = bpf_get_current_pid_tgid();
@@ -137,7 +136,7 @@ int stress_percpu_hmap(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/" SYSCALL(sys_getgid))
+SEC("ksyscall/getgid")
 int stress_hmap_alloc(struct pt_regs *ctx)
 {
 	u32 key = bpf_get_current_pid_tgid();
@@ -154,7 +153,7 @@ int stress_hmap_alloc(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/" SYSCALL(sys_getegid))
+SEC("ksyscall/getegid")
 int stress_percpu_hmap_alloc(struct pt_regs *ctx)
 {
 	u32 key = bpf_get_current_pid_tgid();
@@ -171,7 +170,7 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/" SYSCALL(sys_connect))
+SEC("ksyscall/connect")
 int stress_lru_hmap_alloc(struct pt_regs *ctx)
 {
 	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
@@ -251,7 +250,7 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/" SYSCALL(sys_gettid))
+SEC("ksyscall/gettid")
 int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
 {
 	union {
@@ -273,7 +272,7 @@ int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/" SYSCALL(sys_getpgid))
+SEC("ksyscall/getpgid")
 int stress_hash_map_lookup(struct pt_regs *ctx)
 {
 	u32 key = 1, i;
@@ -286,7 +285,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/" SYSCALL(sys_getppid))
+SEC("ksyscall/getppid")
 int stress_array_map_lookup(struct pt_regs *ctx)
 {
 	u32 key = 1, i;
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c
index fbd43e2bb4d3..541fc861b984 100644
--- a/samples/bpf/test_current_task_under_cgroup_kern.c
+++ b/samples/bpf/test_current_task_under_cgroup_kern.c
@@ -10,7 +10,6 @@
 #include <linux/version.h>
 #include <bpf/bpf_helpers.h>
 #include <uapi/linux/utsname.h>
-#include "trace_common.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
@@ -27,7 +26,7 @@ struct {
 } perf_map SEC(".maps");
 
 /* Writes the last PID that called sync to a map at index 0 */
-SEC("kprobe/" SYSCALL(sys_sync))
+SEC("ksyscall/sync")
 int bpf_prog1(struct pt_regs *ctx)
 {
 	u64 pid = bpf_get_current_pid_tgid();
diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c
index b0200c8eac09..0e17f9ade5c5 100644
--- a/samples/bpf/test_map_in_map_kern.c
+++ b/samples/bpf/test_map_in_map_kern.c
@@ -13,7 +13,6 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
-#include "trace_common.h"
 
 #define MAX_NR_PORTS 65536
 
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c
index 220a96438d75..d60cabaaf753 100644
--- a/samples/bpf/test_probe_write_user_kern.c
+++ b/samples/bpf/test_probe_write_user_kern.c
@@ -11,7 +11,6 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
-#include "trace_common.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -28,7 +27,7 @@ struct {
  * This example sits on a syscall, and the syscall ABI is relatively stable
  * of course, across platforms, and over time, the ABI may change.
  */
-SEC("kprobe/" SYSCALL(sys_connect))
+SEC("ksyscall/connect")
 int bpf_prog1(struct pt_regs *ctx)
 {
 	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h
deleted file mode 100644
index 8cb5400aed1f..000000000000
--- a/samples/bpf/trace_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef __TRACE_COMMON_H
-#define __TRACE_COMMON_H
-
-#ifdef __x86_64__
-#define SYSCALL(SYS) "__x64_" __stringify(SYS)
-#elif defined(__s390x__)
-#define SYSCALL(SYS) "__s390x_" __stringify(SYS)
-#else
-#define SYSCALL(SYS)  __stringify(SYS)
-#endif
-
-#endif
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c
index b64815af0943..a481abf8c4c5 100644
--- a/samples/bpf/trace_output_kern.c
+++ b/samples/bpf/trace_output_kern.c
@@ -2,7 +2,6 @@
 #include <linux/version.h>
 #include <uapi/linux/bpf.h>
 #include <bpf/bpf_helpers.h>
-#include "trace_common.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
@@ -11,7 +10,7 @@ struct {
 	__uint(max_entries, 2);
 } my_map SEC(".maps");
 
-SEC("kprobe/" SYSCALL(sys_write))
+SEC("ksyscall/write")
 int bpf_prog1(struct pt_regs *ctx)
 {
 	struct S {
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
index 93e0b7680b4f..82091facb83c 100644
--- a/samples/bpf/tracex2_kern.c
+++ b/samples/bpf/tracex2_kern.c
@@ -10,7 +10,6 @@
 #include <uapi/linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-#include "trace_common.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -78,7 +77,7 @@ struct {
 	__uint(max_entries, 1024);
 } my_hist_map SEC(".maps");
 
-SEC("kprobe/" SYSCALL(sys_write))
+SEC("ksyscall/write")
 int bpf_prog3(struct pt_regs *ctx)
 {
 	long write_size = PT_REGS_PARM3(ctx);
-- 
2.34.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox