Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net v2 2/2] r8169: fix network lost after resume on DASH systems
From: ChunHao Lin @ 2023-11-08 18:48 UTC (permalink / raw)
  To: hkallweit1
  Cc: nic_swsd, davem, edumazet, kuba, pabeni, netdev, linux-kernel,
	ChunHao Lin, stable
In-Reply-To: <20231108184849.2925-1-hau@realtek.com>

Device that support DASH may be reseted or powered off during suspend.
So driver needs to handle DASH during system suspend and resume. Or
DASH firmware will influence device behavior and causes network lost.

Signed-off-by: ChunHao Lin <hau@realtek.com>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Cc: stable@vger.kernel.org
---
 drivers/net/ethernet/realtek/r8169_main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 108dc75050ba..5027cd595fe6 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -4661,10 +4661,16 @@ static void rtl8169_down(struct rtl8169_private *tp)
 	rtl8169_cleanup(tp);
 	rtl_disable_exit_l1(tp);
 	rtl_prepare_power_down(tp);
+
+	if (tp->dash_type != RTL_DASH_NONE)
+		rtl8168_driver_stop(tp);
 }
 
 static void rtl8169_up(struct rtl8169_private *tp)
 {
+	if (tp->dash_type != RTL_DASH_NONE)
+		rtl8168_driver_start(tp);
+
 	pci_set_master(tp->pci_dev);
 	phy_init_hw(tp->phydev);
 	phy_resume(tp->phydev);
-- 
2.39.2


^ permalink raw reply related

* [PATCH net v2 1/2] r8169: add handling DASH when DASH is disabled
From: ChunHao Lin @ 2023-11-08 18:48 UTC (permalink / raw)
  To: hkallweit1
  Cc: nic_swsd, davem, edumazet, kuba, pabeni, netdev, linux-kernel,
	ChunHao Lin, stable
In-Reply-To: <20231108184849.2925-1-hau@realtek.com>

For devices that support DASH, even DASH is disabled, there may still
exist a default firmware that will influence device behavior.
So driver needs to handle DASH for devices that support DASH, no
matter the DASH status is.

This patch also prepare for "fix DASH deviceis network lost issue".

Signed-off-by: ChunHao Lin <hau@realtek.com>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Cc: stable@vger.kernel.org
---
 drivers/net/ethernet/realtek/r8169_main.c | 35 ++++++++++++++++-------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 0c76c162b8a9..108dc75050ba 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -624,6 +624,7 @@ struct rtl8169_private {
 
 	unsigned supports_gmii:1;
 	unsigned aspm_manageable:1;
+	unsigned dash_enabled:1;
 	dma_addr_t counters_phys_addr;
 	struct rtl8169_counters *counters;
 	struct rtl8169_tc_offsets tc_offset;
@@ -1253,14 +1254,26 @@ static bool r8168ep_check_dash(struct rtl8169_private *tp)
 	return r8168ep_ocp_read(tp, 0x128) & BIT(0);
 }
 
-static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp)
+static bool rtl_dash_is_enabled(struct rtl8169_private *tp)
+{
+	switch (tp->dash_type) {
+	case RTL_DASH_DP:
+		return r8168dp_check_dash(tp);
+	case RTL_DASH_EP:
+		return r8168ep_check_dash(tp);
+	default:
+		return false;
+	}
+}
+
+static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp)
 {
 	switch (tp->mac_version) {
 	case RTL_GIGA_MAC_VER_28:
 	case RTL_GIGA_MAC_VER_31:
-		return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE;
+		return RTL_DASH_DP;
 	case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53:
-		return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE;
+		return RTL_DASH_EP;
 	default:
 		return RTL_DASH_NONE;
 	}
@@ -1453,7 +1466,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts)
 
 	device_set_wakeup_enable(tp_to_dev(tp), wolopts);
 
-	if (tp->dash_type == RTL_DASH_NONE) {
+	if (!tp->dash_enabled) {
 		rtl_set_d3_pll_down(tp, !wolopts);
 		tp->dev->wol_enabled = wolopts ? 1 : 0;
 	}
@@ -2512,7 +2525,7 @@ static void rtl_wol_enable_rx(struct rtl8169_private *tp)
 
 static void rtl_prepare_power_down(struct rtl8169_private *tp)
 {
-	if (tp->dash_type != RTL_DASH_NONE)
+	if (tp->dash_enabled)
 		return;
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_32 ||
@@ -4869,7 +4882,7 @@ static int rtl8169_runtime_idle(struct device *device)
 {
 	struct rtl8169_private *tp = dev_get_drvdata(device);
 
-	if (tp->dash_type != RTL_DASH_NONE)
+	if (tp->dash_enabled)
 		return -EBUSY;
 
 	if (!netif_running(tp->dev) || !netif_carrier_ok(tp->dev))
@@ -4896,7 +4909,7 @@ static void rtl_shutdown(struct pci_dev *pdev)
 	rtl_rar_set(tp, tp->dev->perm_addr);
 
 	if (system_state == SYSTEM_POWER_OFF &&
-	    tp->dash_type == RTL_DASH_NONE) {
+		!tp->dash_enabled) {
 		pci_wake_from_d3(pdev, tp->saved_wolopts);
 		pci_set_power_state(pdev, PCI_D3hot);
 	}
@@ -5254,7 +5267,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1);
 	tp->aspm_manageable = !rc;
 
-	tp->dash_type = rtl_check_dash(tp);
+	tp->dash_type = rtl_get_dash_type(tp);
+	tp->dash_enabled = rtl_dash_is_enabled(tp);
 
 	tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK;
 
@@ -5325,7 +5339,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* configure chip for default features */
 	rtl8169_set_features(dev, dev->features);
 
-	if (tp->dash_type == RTL_DASH_NONE) {
+	if (!tp->dash_enabled) {
 		rtl_set_d3_pll_down(tp, true);
 	} else {
 		rtl_set_d3_pll_down(tp, false);
@@ -5365,7 +5379,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			    "ok" : "ko");
 
 	if (tp->dash_type != RTL_DASH_NONE) {
-		netdev_info(dev, "DASH enabled\n");
+		netdev_info(dev, "DASH %s\n",
+			    tp->dash_enabled ? "enabled" : "disabled");
 		rtl8168_driver_start(tp);
 	}
 
-- 
2.39.2


^ permalink raw reply related

* Re: [PATCH iwl-net v3] ice: fix DDP package download for packages without signature segment
From: Simon Horman @ 2023-11-08 18:59 UTC (permalink / raw)
  To: Paul Greenwalt
  Cc: intel-wired-lan, netdev, jesse.brandeburg, anthony.l.nguyen,
	davem, kuba, tony.brelinski, Dan Nowlin, Maciej Fijalkowski,
	Wojciech Drewek, Jacob Keller
In-Reply-To: <20231107173227.862417-1-paul.greenwalt@intel.com>

On Tue, Nov 07, 2023 at 12:32:27PM -0500, Paul Greenwalt wrote:
> From: Dan Nowlin <dan.nowlin@intel.com>
> 
> Commit 3cbdb0343022 ("ice: Add support for E830 DDP package segment")
> incorrectly removed support for package download for packages without a
> signature segment. These packages include the signature buffer inline
> in the configurations buffers, and not in a signature segment.
> 
> Fix package download by providing download support for both packages
> with (ice_download_pkg_with_sig_seg()) and without signature segment
> (ice_download_pkg_without_sig_seg()).
> 
> Fixes: 3cbdb0343022 ("ice: Add support for E830 DDP package segment")
> Reported-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> Closes: https://lore.kernel.org/netdev/ZUT50a94kk2pMGKb@boxer/
> Tested-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
> Signed-off-by: Dan Nowlin <dan.nowlin@intel.com>
> Signed-off-by: Paul Greenwalt <paul.greenwalt@intel.com>

Reviewed-by: Simon Horman <horms@kernel.org>


^ permalink raw reply

* Re: EIO on send with UDP_SEGMENT
From: Willem de Bruijn @ 2023-11-08 19:11 UTC (permalink / raw)
  To: Jakub Sitnicki; +Cc: Willem de Bruijn, netdev, kernel-team
In-Reply-To: <87fs1gkthv.fsf@cloudflare.com>

On Wed, Nov 8, 2023 at 1:08 PM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> On Wed, Nov 08, 2023 at 10:10 AM -05, Willem de Bruijn wrote:
> > On Wed, Nov 8, 2023 at 6:03 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> [...]
>
> >> Do you think the restriction in udp_send_skb can be lifted or tweaked?
> >
> > The argument against has been that segmentation offload offers no
> > performance benefit if the stack has to fall back onto software
> > checksumming.
>
> Interesting. Thanks for sharing the context. Must admit, it would have
> not been my first guess that the software GSO+checksum itself is not
> worth it. Despite it happening late on the TX path.

The heuristic is that checksum during copy_from_user is cheap, while
checksum after qdisc dequeue might have to read cold memory.

There will be cases where the data is warm. So YMMV. But that is the
basis for the choice.

^ permalink raw reply

* Re: [PATCH net-next 1/1] ptp: clockmatrix: support 32-bit address space
From: Rahul Rameshbabu @ 2023-11-08 19:11 UTC (permalink / raw)
  To: Min Li; +Cc: richardcochran, lee, linux-kernel, netdev, Min Li
In-Reply-To: <MW5PR03MB6932FF68AEEF51E83EDDB89CA0A8A@MW5PR03MB6932.namprd03.prod.outlook.com>

On Wed, 08 Nov, 2023 13:28:30 -0500 Min Li <lnimi@hotmail.com> wrote:
> From: Min Li <min.li.xe@renesas.com>
>
> We used to assume 0x2010xxxx address. Now that
> we need to access 0x2011xxxx address, we need
> to support read/write the whole 32-bit address space.
>
> Signed-off-by: Min Li <min.li.xe@renesas.com>
> ---
>  drivers/ptp/ptp_clockmatrix.c    |  72 ++--
>  drivers/ptp/ptp_clockmatrix.h    |  33 +-
>  include/linux/mfd/idt8a340_reg.h | 542 ++++++++++++++++---------------
>  3 files changed, 340 insertions(+), 307 deletions(-)
>
> diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c
> index f6f9d4adce04..875841892842 100644
> --- a/drivers/ptp/ptp_clockmatrix.c
> +++ b/drivers/ptp/ptp_clockmatrix.c

<snip>

> @@ -1705,10 +1720,14 @@ static s32 idtcm_getmaxphase(struct ptp_clock_info *ptp __always_unused)
>  }
>  
>  /*
> - * Internal function for implementing support for write phase offset
> + * Maximum absolute value for write phase offset in picoseconds

This logic should be part of getmaxphase rather than adjphase. Due to
the existing implementation of getmaxphase in this driver, the checks
added to the driver in this patch are no longer applicable or reachable
based on the value of MAX_ABS_WRITE_PHASE_PICOSECONDS.

  https://lore.kernel.org/netdev/20230612211500.309075-8-rrameshbabu@nvidia.com/

>   *
>   * @channel:  channel
>   * @delta_ns: delta in nanoseconds
> + *
> + * Destination signed register is 32-bit register in resolution of 50ps
> + *
> + * 0x7fffffff * 50 =  2147483647 * 50 = 107374182350
>   */
>  static int _idtcm_adjphase(struct idtcm_channel *channel, s32 delta_ns)
>  {
> @@ -1717,6 +1736,7 @@ static int _idtcm_adjphase(struct idtcm_channel *channel, s32 delta_ns)
>  	u8 i;
>  	u8 buf[4] = {0};
>  	s32 phase_50ps;
> +	s64 offset_ps;
>  
>  	if (channel->mode != PTP_PLL_MODE_WRITE_PHASE) {
>  		err = channel->configure_write_phase(channel);
> @@ -1724,7 +1744,19 @@ static int _idtcm_adjphase(struct idtcm_channel *channel, s32 delta_ns)
>  			return err;
>  	}
>  
> -	phase_50ps = div_s64((s64)delta_ns * 1000, 50);
> +	offset_ps = (s64)delta_ns * 1000;
> +
> +	/*
> +	 * Check for 32-bit signed max * 50:
> +	 *
> +	 * 0x7fffffff * 50 =  2147483647 * 50 = 107374182350
> +	 */
> +	if (offset_ps > MAX_ABS_WRITE_PHASE_PICOSECONDS)
> +		offset_ps = MAX_ABS_WRITE_PHASE_PICOSECONDS;
> +	else if (offset_ps < -MAX_ABS_WRITE_PHASE_PICOSECONDS)
> +		offset_ps = -MAX_ABS_WRITE_PHASE_PICOSECONDS;
> +
> +	phase_50ps = div_s64(offset_ps, 50);

This logic is unreachable because of idtcm_getmaxphase. Please remove
this hunk. The point of getmaxphase is to standardize the behavior of
what happens when a user passes an adjustment value not supported by a
device rather than letting device drivers define different behaviors for
handling this case.

  https://lore.kernel.org/netdev/20230612211500.309075-6-rrameshbabu@nvidia.com/

>  
>  	for (i = 0; i < 4; i++) {
>  		buf[i] = phase_50ps & 0xff;
> diff --git a/drivers/ptp/ptp_clockmatrix.h b/drivers/ptp/ptp_clockmatrix.h
> index 7c17c4f7f573..a0aa88c8a4ab 100644
> --- a/drivers/ptp/ptp_clockmatrix.h
> +++ b/drivers/ptp/ptp_clockmatrix.h
> @@ -19,6 +19,7 @@
>  #define MAX_REF_CLK	(16)
>  
>  #define MAX_ABS_WRITE_PHASE_NANOSECONDS (107374182L)
> +#define MAX_ABS_WRITE_PHASE_PICOSECONDS (107374182350LL)

This macro is not needed due to reasons previously mentioned.

<snip>

--
Thanks,

Rahul Rameshbabu

^ permalink raw reply

* Re: [PATCH 09/22] [v2] arch: fix asm-offsets.c building with -Wmissing-prototypes
From: Sam Ravnborg @ 2023-11-08 19:10 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Andrew Morton, linux-kernel, Masahiro Yamada, linux-kbuild,
	Arnd Bergmann, Matt Turner, Vineet Gupta, Russell King,
	Catalin Marinas, Will Deacon, Steven Rostedt, Masami Hiramatsu,
	Mark Rutland, Guo Ren, Peter Zijlstra, Ard Biesheuvel,
	Huacai Chen, Greg Ungerer, Michal Simek, Thomas Bogendoerfer,
	Dinh Nguyen, Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Geoff Levand, Palmer Dabbelt, Heiko Carstens,
	John Paul Adrian Glaubitz, David S. Miller, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, x86, Helge Deller, Sudip Mukherjee,
	Greg Kroah-Hartman, Timur Tabi, Kent Overstreet, David Woodhouse,
	Naveen N. Rao, Anil S Keshavamurthy, Kees Cook, Vincenzo Frascino,
	Juri Lelli, Vincent Guittot, Nathan Chancellor, Nick Desaulniers,
	Nicolas Schier, Al Viro, Uwe Kleine-König, linux-alpha,
	linux-snps-arc, linux-arm-kernel, linux-trace-kernel, linux-csky,
	loongarch, linux-m68k, linux-mips, linuxppc-dev, linux-riscv,
	linux-s390, linux-sh, sparclinux, netdev, linux-parisc, linux-usb,
	linux-fbdev, dri-devel, linux-bcachefs, linux-mtd
In-Reply-To: <20231108125843.3806765-10-arnd@kernel.org>

On Wed, Nov 08, 2023 at 01:58:30PM +0100, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
> 
> When -Wmissing-prototypes is enabled, the some asm-offsets.c files fail
> to build, even when this warning is disabled in the Makefile for normal
> files:
> 
> arch/sparc/kernel/asm-offsets.c:22:5: error: no previous prototype for 'sparc32_foo' [-Werror=missing-prototypes]
> arch/sparc/kernel/asm-offsets.c:48:5: error: no previous prototype for 'foo' [-Werror=missing-prototypes]
> 
> Address this by making use of the same trick as x86, marking these
> functions as 'static __used' to avoid the need for a prototype
> by not drop them in dead-code elimination.
> 
> Suggested-by: Masahiro Yamada <masahiroy@kernel.org>
> Link: https://lore.kernel.org/lkml/CAK7LNARfEmFk0Du4Hed19eX_G6tUC5wG0zP+L1AyvdpOF4ybXQ@mail.gmail.com/
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Looks good. I sometimes looks at sparc patches so I looked at this one.
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>

^ permalink raw reply

* Re: [PATCH 15/22] arch: vdso: consolidate gettime prototypes
From: Arnd Bergmann @ 2023-11-08 19:37 UTC (permalink / raw)
  To: Christophe Leroy, Arnd Bergmann, Andrew Morton,
	linux-kernel@vger.kernel.org, Masahiro Yamada,
	linux-kbuild@vger.kernel.org
  Cc: Matt Turner, Vineet Gupta, Russell King, Catalin Marinas,
	Will Deacon, Steven Rostedt, Masami Hiramatsu, Mark Rutland,
	guoren, Peter Zijlstra, Ard Biesheuvel, Huacai Chen, Greg Ungerer,
	Michal Simek, Thomas Bogendoerfer, Dinh Nguyen, Michael Ellerman,
	Nicholas Piggin, Geoff Levand, Palmer Dabbelt, Heiko Carstens,
	John Paul Adrian Glaubitz, David S . Miller, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, x86@kernel.org, Helge Deller,
	Sudip Mukherjee, Greg Kroah-Hartman, Timur Tabi, Kent Overstreet,
	David Woodhouse, Naveen N. Rao, Anil S Keshavamurthy, Kees Cook,
	Vincenzo Frascino, Juri Lelli, Vincent Guittot, Nathan Chancellor,
	Nick Desaulniers, Nicolas Schier, Alexander Viro,
	Uwe Kleine-König, linux-alpha@vger.kernel.org,
	linux-snps-arc@lists.infradead.org,
	linux-arm-kernel@lists.infradead.org,
	linux-trace-kernel@vger.kernel.org, linux-csky@vger.kernel.org,
	loongarch@lists.linux.dev, linux-m68k@lists.linux-m68k.org,
	linux-mips@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	linux-riscv@lists.infradead.org, linux-s390@vger.kernel.org,
	linux-sh@vger.kernel.org, sparclinux@vger.kernel.org, Netdev,
	linux-parisc@vger.kernel.org, linux-usb@vger.kernel.org,
	linux-fbdev@vger.kernel.org, dri-devel@lists.freedesktop.org,
	linux-bcachefs@vger.kernel.org, linux-mtd@lists.infradead.org
In-Reply-To: <ecedb0f1-9543-35c6-18bd-723e6bf21173@csgroup.eu>

On Wed, Nov 8, 2023, at 19:31, Christophe Leroy wrote:
> Le 08/11/2023 à 13:58, Arnd Bergmann a écrit :

> powerpc has functions doing more or less the same, they are called 
> __c_kernel_clock_gettime() and alike with their prototypes siting in 
> arch/powerpc/include/asm/vdso/gettimeofday.h
>
> Should those prototypes be moved to include/vdso/gettime.h too and 
> eventually renamed, or are they considered too powerpc specific ?

I don't actually know, my initial interpretation was that
these function names are part of the user ABI for the vdso,
but I never looked closely enough at how vdso works to
be sure what the actual ABI is.

If __c_kernel_clock_gettime() etc are not part of the user-facing
ABI, I think renaming them for consistency with the other
architectures would be best.

     Arnd

^ permalink raw reply

* Re: [PATCH] Documentation: Document the Netlink spec
From: Jonathan Corbet @ 2023-11-08 20:27 UTC (permalink / raw)
  To: Breno Leitao; +Cc: linux-doc, netdev, kuba, pabeni, edumazet
In-Reply-To: <20231103135622.250314-1-leitao@debian.org>

Breno Leitao <leitao@debian.org> writes:

> This is a Sphinx extension that parses the Netlink YAML spec files
> (Documentation/netlink/specs/), and generates a rst file to be
> displayed into Documentation pages.
>
> Create a new Documentation/networking/netlink_spec page, and a sub-page
> for each Netlink spec that needs to be documented, such as ethtool,
> devlink, netdev, etc.
>
> Create a Sphinx directive extension that reads the YAML spec
> (located under Documentation/netlink/specs), parses it and returns a RST
> string that is inserted where the Sphinx directive was called.

So I finally had a chance to look a bit at this; I have a few
impressions.

First of all, if you put something silly into one of the YAML files, it
kills the whole docs build, which is ... not desirable:

> Exception occurred:
>   File "/usr/lib64/python3.11/site-packages/yaml/scanner.py", line 577, in fetch_value
>     raise ScannerError(None, None,
> yaml.scanner.ScannerError: mapping values are not allowed here
>   in "/stuff/k/git/kernel/Documentation/netlink/specs/ovs_datapath.yaml", line 14, column 9
> 

That error needs to be caught and handled in some more graceful way.

I do have to wonder, though, whether a sphinx extension is the right way
to solve this problem.  You're essentially implementing a filter that
turns one YAML file into one RST file; might it be better to keep that
outside of sphinx as a standalone script, invoked by the Makefile?

Note that I'm asking because I wonder, I'm not saying I would block an
extension-based implementation.

Thanks,

jon

^ permalink raw reply

* [PATCH net] net: set SOCK_RCU_FREE before inserting socket into hashtable
From: Stanislav Fomichev @ 2023-11-08 20:28 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni, Stanislav Fomichev

We've started to see the following kernel traces:

 WARNING: CPU: 83 PID: 0 at net/core/filter.c:6641 sk_lookup+0x1bd/0x1d0

 Call Trace:
  <IRQ>
  __bpf_skc_lookup+0x10d/0x120
  bpf_sk_lookup+0x48/0xd0
  bpf_sk_lookup_tcp+0x19/0x20
  bpf_prog_<redacted>+0x37c/0x16a3
  cls_bpf_classify+0x205/0x2e0
  tcf_classify+0x92/0x160
  __netif_receive_skb_core+0xe52/0xf10
  __netif_receive_skb_list_core+0x96/0x2b0
  napi_complete_done+0x7b5/0xb70
  <redacted>_poll+0x94/0xb0
  net_rx_action+0x163/0x1d70
  __do_softirq+0xdc/0x32e
  asm_call_irq_on_stack+0x12/0x20
  </IRQ>
  do_softirq_own_stack+0x36/0x50
  do_softirq+0x44/0x70

I'm not 100% what is causing them. It might be some kernel change or
new code path in the bpf program. But looking at the code,
I'm assuming the issue has been there for a while.

__inet_hash can race with lockless (rcu) readers on the other cpus:

  __inet_hash
    __sk_nulls_add_node_rcu
    <- (bpf triggers here)
    sock_set_flag(SOCK_RCU_FREE)

Let's move the SOCK_RCU_FREE part up a bit, before we are inserting
the socket into hashtables. Note, that the race is really harmless;
the bpf callers are handling this situation (where listener socket
doesn't have SOCK_RCU_FREE set) correctly, so the only
annoyance is a WARN_ONCE (so not 100% sure whether it should
wait until net-next instead).

For the fixes tag, I'm using the original commit which added the flag.

Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 net/ipv4/inet_hashtables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 598c1b114d2c..a532f749e477 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -751,12 +751,12 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		if (err)
 			goto unlock;
 	}
+	sock_set_flag(sk, SOCK_RCU_FREE);
 	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
 		sk->sk_family == AF_INET6)
 		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
 	else
 		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
-	sock_set_flag(sk, SOCK_RCU_FREE);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
 	spin_unlock(&ilb2->lock);
-- 
2.42.0.869.gea05f2083d-goog


^ permalink raw reply related

* Re: [PATCH 16/22] bcachefs: mark bch2_target_to_text_sb() static
From: Kent Overstreet @ 2023-11-08 20:19 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Andrew Morton, linux-kernel, Masahiro Yamada, linux-kbuild,
	Arnd Bergmann, Matt Turner, Vineet Gupta, Russell King,
	Catalin Marinas, Will Deacon, Steven Rostedt, Masami Hiramatsu,
	Mark Rutland, Guo Ren, Peter Zijlstra, Ard Biesheuvel,
	Huacai Chen, Greg Ungerer, Michal Simek, Thomas Bogendoerfer,
	Dinh Nguyen, Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Geoff Levand, Palmer Dabbelt, Heiko Carstens,
	John Paul Adrian Glaubitz, David S. Miller, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, x86, Helge Deller, Sudip Mukherjee,
	Greg Kroah-Hartman, Timur Tabi, David Woodhouse, Naveen N. Rao,
	Anil S Keshavamurthy, Kees Cook, Vincenzo Frascino, Juri Lelli,
	Vincent Guittot, Nathan Chancellor, Nick Desaulniers,
	Nicolas Schier, Al Viro, Uwe Kleine-König, linux-alpha,
	linux-snps-arc, linux-arm-kernel, linux-trace-kernel, linux-csky,
	loongarch, linux-m68k, linux-mips, linuxppc-dev, linux-riscv,
	linux-s390, linux-sh, sparclinux, netdev, linux-parisc, linux-usb,
	linux-fbdev, dri-devel, linux-bcachefs, linux-mtd
In-Reply-To: <20231108125843.3806765-17-arnd@kernel.org>

On Wed, Nov 08, 2023 at 01:58:37PM +0100, Arnd Bergmann wrote:
> From: Arnd Bergmann <arnd@arndb.de>
> 
> bch2_target_to_text_sb() is only called in the file it is defined in,
> and it has no extern prototype:
> 
> fs/bcachefs/disk_groups.c:583:6: error: no previous prototype for 'bch2_target_to_text_sb' [-Werror=missing-prototypes]
> 
> Mark it static to avoid the warning and have the code better optimized.
> 
> Fixes: bf0d9e89de2e ("bcachefs: Split apart bch2_target_to_text(), bch2_target_to_text_sb()")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

This is already fixed in my tree.

^ permalink raw reply

* Re: [PATCH iwl-net] i40e: Fix max frame size check
From: Jesse Brandeburg @ 2023-11-08 20:38 UTC (permalink / raw)
  To: Ivan Vecera, intel-wired-lan
  Cc: Jacob Keller, Wojciech Drewek, Tony Nguyen, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	open list:NETWORKING DRIVERS
In-Reply-To: <20231108151018.72670-1-ivecera@redhat.com>

On 11/8/2023 7:10 AM, Ivan Vecera wrote:
> Commit 3a2c6ced90e1 ("i40e: Add a check to see if MFS is set") added
> a check for port's MFS (max frame size) value. The value is stored
> in PRTGL_SAH register for each physical port. According datasheet this
> register is defined as:
> 
> PRTGL_SAH[PRT]: (0x001E2140 + 0x4*PRT, PRT=0...3)
> 
> where PRT is physical port number.

<trimmed lkml, and a couple of non-existent intel addresses>

Was there an actual problem here? I suspect if you read all the
registers for each PF's BAR, you'll find that all 4 report the same,
correct value, for the perspective of the BAR they're being read from.

The i40e hardware does this (somewhat non-obvious) for *lots* of port
specific registers, and what happens is no matter which of the 4 you
read the value from, you'll get the right "port specific" value. This is
because the hardware designers decided to make a different "view" on the
register set depending on which PF you access it from. The only time
these offsets matter is when the part is in debug mode or when the
firmware is reading the internal registers (from the internal firmware
register space - which has no aliasing) that rely on the correct offset.

In this case, I think your change won't make any functional difference,
but I can see why you want to make the change as the code doesn't match
the datasheet's definition of the register.

That all said, unless you can prove a problem, I'm relatively sure that
nothing is wrong here in functionality or code. And if you go this
route, there might be a lot of other registers to fix that have the same
aliasing.

I apologize for the confusing manuals and header file, it's complicated
but in practice works really well. Effectively you can't read other
port's registers by accident.

Here was my experiment showing the aliasing on X722. You'll note that
the lower 16 bits are a MAC address that doesn't change no matter which
register you read.

device 20:0.0
0x1e2140 == 0x26008245
0x1e2144 == 0x26008245
0x1e2148 == 0x26008245
0x1e214c == 0x26008245
device 20:0.1
0x1e2140 == 0x26008345
0x1e2144 == 0x26008345
0x1e2148 == 0x26008345
0x1e214c == 0x26008345
device 20:0.2
0x1e2140 == 0x26008445
0x1e2144 == 0x26008445
0x1e2148 == 0x26008445
0x1e214c == 0x26008445
device 20:0.3
0x1e2140 == 0x26008545
0x1e2144 == 0x26008545
0x1e2148 == 0x26008545
0x1e214c == 0x26008545

lspci -d ::0200
20:00.0 Ethernet controller: Intel Corporation Ethernet Connection X722
for 10GBASE-T (rev 04)
20:00.1 Ethernet controller: Intel Corporation Ethernet Connection X722
for 10GBASE-T (rev 04)
20:00.2 Ethernet controller: Intel Corporation Ethernet Connection X722
for 10GbE SFP+ (rev 04)
20:00.3 Ethernet controller: Intel Corporation Ethernet Connection X722
for 10GbE SFP+ (rev 04)

Hope this helps!

^ permalink raw reply

* Re: [PATCH 10/22] microblaze: include linux/cpu.h for trap_init() prototype
From: Geert Uytterhoeven @ 2023-11-08 20:42 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Andrew Morton, linux-kernel, Masahiro Yamada, linux-kbuild,
	Arnd Bergmann, Matt Turner, Vineet Gupta, Russell King,
	Catalin Marinas, Will Deacon, Steven Rostedt, Masami Hiramatsu,
	Mark Rutland, Guo Ren, Peter Zijlstra, Ard Biesheuvel,
	Huacai Chen, Greg Ungerer, Michal Simek, Thomas Bogendoerfer,
	Dinh Nguyen, Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Geoff Levand, Palmer Dabbelt, Heiko Carstens,
	John Paul Adrian Glaubitz, David S. Miller, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, x86, Helge Deller, Sudip Mukherjee,
	Greg Kroah-Hartman, Timur Tabi, Kent Overstreet, David Woodhouse,
	Naveen N. Rao, Anil S Keshavamurthy, Kees Cook, Vincenzo Frascino,
	Juri Lelli, Vincent Guittot, Nathan Chancellor, Nick Desaulniers,
	Nicolas Schier, Al Viro, Uwe Kleine-König, linux-alpha,
	linux-snps-arc, linux-arm-kernel, linux-trace-kernel, linux-csky,
	loongarch, linux-m68k, linux-mips, linuxppc-dev, linux-riscv,
	linux-s390, linux-sh, sparclinux, netdev, linux-parisc, linux-usb,
	linux-fbdev, dri-devel, linux-bcachefs, linux-mtd
In-Reply-To: <20231108125843.3806765-11-arnd@kernel.org>

Hi Arnd,

On Wed, Nov 8, 2023 at 2:01 PM Arnd Bergmann <arnd@kernel.org> wrote:
> From: Arnd Bergmann <arnd@arndb.de>
>
> Microblaze runs into a single -Wmissing-prototypes warning when that is
> enabled:
>
> arch/microblaze/kernel/traps.c:21:6: warning: no previous prototype for 'trap_init' [-Wmissing-prototypes]
>
> Include the right header to avoid this.
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Thanks for your patch!

>  arch/alpha/kernel/traps.c      | 1 +
>  arch/csky/include/asm/traps.h  | 2 --
>  arch/csky/kernel/traps.c       | 1 +
>  arch/m68k/coldfire/vectors.c   | 3 +--
>  arch/m68k/coldfire/vectors.h   | 3 ---

Ah, so this is where the m68k changes listed in the cover letter are
hiding ;-)

>  arch/microblaze/kernel/traps.c | 1 +
>  arch/sparc/kernel/traps_32.c   | 1 +
>  arch/sparc/kernel/traps_64.c   | 1 +
>  arch/x86/include/asm/traps.h   | 1 -
>  arch/x86/kernel/traps.c        | 1 +
>  10 files changed, 7 insertions(+), 8 deletions(-)
>  delete mode 100644 arch/m68k/coldfire/vectors.h

Obviously the non-microblaze changes should be spun off in separate
patches.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH net] net: set SOCK_RCU_FREE before inserting socket into hashtable
From: Eric Dumazet @ 2023-11-08 20:58 UTC (permalink / raw)
  To: Stanislav Fomichev; +Cc: netdev, davem, kuba, pabeni
In-Reply-To: <20231108202819.1932920-1-sdf@google.com>

On Wed, Nov 8, 2023 at 9:28 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> We've started to see the following kernel traces:
>
>  WARNING: CPU: 83 PID: 0 at net/core/filter.c:6641 sk_lookup+0x1bd/0x1d0
>
>  Call Trace:
>   <IRQ>
>   __bpf_skc_lookup+0x10d/0x120
>   bpf_sk_lookup+0x48/0xd0
>   bpf_sk_lookup_tcp+0x19/0x20
>   bpf_prog_<redacted>+0x37c/0x16a3
>   cls_bpf_classify+0x205/0x2e0
>   tcf_classify+0x92/0x160
>   __netif_receive_skb_core+0xe52/0xf10
>   __netif_receive_skb_list_core+0x96/0x2b0
>   napi_complete_done+0x7b5/0xb70
>   <redacted>_poll+0x94/0xb0
>   net_rx_action+0x163/0x1d70
>   __do_softirq+0xdc/0x32e
>   asm_call_irq_on_stack+0x12/0x20
>   </IRQ>
>   do_softirq_own_stack+0x36/0x50
>   do_softirq+0x44/0x70
>
> I'm not 100% what is causing them. It might be some kernel change or
> new code path in the bpf program. But looking at the code,
> I'm assuming the issue has been there for a while.
>
> __inet_hash can race with lockless (rcu) readers on the other cpus:
>
>   __inet_hash
>     __sk_nulls_add_node_rcu
>     <- (bpf triggers here)
>     sock_set_flag(SOCK_RCU_FREE)
>
> Let's move the SOCK_RCU_FREE part up a bit, before we are inserting
> the socket into hashtables. Note, that the race is really harmless;
> the bpf callers are handling this situation (where listener socket
> doesn't have SOCK_RCU_FREE set) correctly, so the only
> annoyance is a WARN_ONCE (so not 100% sure whether it should
> wait until net-next instead).
>
> For the fixes tag, I'm using the original commit which added the flag.

When this commit added the flag, precise location of the
sock_set_flag(sk, SOCK_RCU_FREE)
did not matter, because the thread calling __inet_hash() owns a reference on sk.

SOCK_RCU_FREE was tested only at dismantle time.

Back then BPF was not able yet to perform lookups, and double check if
SOCK_RCU_FREE
was set or not.

Checking SOCK_RCU_FREE _after_ the lookup to infer if a refcount has
been taken came
with commit 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")

I think we can be more precise and help future debugging, in case more problems
need investigations.

Can you augment the changelog and use a different Fixes: tag ?

With that,

Reviewed-by: Eric Dumazet <edumazet@google.com>

>
> Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood")
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  net/ipv4/inet_hashtables.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 598c1b114d2c..a532f749e477 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -751,12 +751,12 @@ int __inet_hash(struct sock *sk, struct sock *osk)
>                 if (err)
>                         goto unlock;
>         }
> +       sock_set_flag(sk, SOCK_RCU_FREE);
>         if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
>                 sk->sk_family == AF_INET6)
>                 __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
>         else
>                 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> -       sock_set_flag(sk, SOCK_RCU_FREE);
>         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  unlock:
>         spin_unlock(&ilb2->lock);
> --
> 2.42.0.869.gea05f2083d-goog
>

^ permalink raw reply

* Re: [PATCH net] net: set SOCK_RCU_FREE before inserting socket into hashtable
From: Stanislav Fomichev @ 2023-11-08 21:01 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, davem, kuba, pabeni
In-Reply-To: <CANn89iJnX6sm1UHbU6TKzoWJJyNLGjpN_amb8bkmgnLk8Qj_gQ@mail.gmail.com>

On Wed, Nov 8, 2023 at 12:58 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Nov 8, 2023 at 9:28 PM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > We've started to see the following kernel traces:
> >
> >  WARNING: CPU: 83 PID: 0 at net/core/filter.c:6641 sk_lookup+0x1bd/0x1d0
> >
> >  Call Trace:
> >   <IRQ>
> >   __bpf_skc_lookup+0x10d/0x120
> >   bpf_sk_lookup+0x48/0xd0
> >   bpf_sk_lookup_tcp+0x19/0x20
> >   bpf_prog_<redacted>+0x37c/0x16a3
> >   cls_bpf_classify+0x205/0x2e0
> >   tcf_classify+0x92/0x160
> >   __netif_receive_skb_core+0xe52/0xf10
> >   __netif_receive_skb_list_core+0x96/0x2b0
> >   napi_complete_done+0x7b5/0xb70
> >   <redacted>_poll+0x94/0xb0
> >   net_rx_action+0x163/0x1d70
> >   __do_softirq+0xdc/0x32e
> >   asm_call_irq_on_stack+0x12/0x20
> >   </IRQ>
> >   do_softirq_own_stack+0x36/0x50
> >   do_softirq+0x44/0x70
> >
> > I'm not 100% what is causing them. It might be some kernel change or
> > new code path in the bpf program. But looking at the code,
> > I'm assuming the issue has been there for a while.
> >
> > __inet_hash can race with lockless (rcu) readers on the other cpus:
> >
> >   __inet_hash
> >     __sk_nulls_add_node_rcu
> >     <- (bpf triggers here)
> >     sock_set_flag(SOCK_RCU_FREE)
> >
> > Let's move the SOCK_RCU_FREE part up a bit, before we are inserting
> > the socket into hashtables. Note, that the race is really harmless;
> > the bpf callers are handling this situation (where listener socket
> > doesn't have SOCK_RCU_FREE set) correctly, so the only
> > annoyance is a WARN_ONCE (so not 100% sure whether it should
> > wait until net-next instead).
> >
> > For the fixes tag, I'm using the original commit which added the flag.
>
> When this commit added the flag, precise location of the
> sock_set_flag(sk, SOCK_RCU_FREE)
> did not matter, because the thread calling __inet_hash() owns a reference on sk.
>
> SOCK_RCU_FREE was tested only at dismantle time.
>
> Back then BPF was not able yet to perform lookups, and double check if
> SOCK_RCU_FREE
> was set or not.
>
> Checking SOCK_RCU_FREE _after_ the lookup to infer if a refcount has
> been taken came
> with commit 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")
>
> I think we can be more precise and help future debugging, in case more problems
> need investigations.
>
> Can you augment the changelog and use a different Fixes: tag ?
>
> With that,
>
> Reviewed-by: Eric Dumazet <edumazet@google.com>

Sure, thank you for the timeline! Will resend shortly with the updated
changelog.

> >
> > Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood")
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  net/ipv4/inet_hashtables.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> > index 598c1b114d2c..a532f749e477 100644
> > --- a/net/ipv4/inet_hashtables.c
> > +++ b/net/ipv4/inet_hashtables.c
> > @@ -751,12 +751,12 @@ int __inet_hash(struct sock *sk, struct sock *osk)
> >                 if (err)
> >                         goto unlock;
> >         }
> > +       sock_set_flag(sk, SOCK_RCU_FREE);
> >         if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
> >                 sk->sk_family == AF_INET6)
> >                 __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> >         else
> >                 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> > -       sock_set_flag(sk, SOCK_RCU_FREE);
> >         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> >  unlock:
> >         spin_unlock(&ilb2->lock);
> > --
> > 2.42.0.869.gea05f2083d-goog
> >

^ permalink raw reply

* Re: [PATCH v9 bpf-next 02/17] bpf: add BPF token delegation mount options to BPF FS
From: Andrii Nakryiko @ 2023-11-08 21:09 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andrii Nakryiko, bpf, netdev, paul, linux-fsdevel,
	linux-security-module, keescook, kernel-team, sargun
In-Reply-To: <20231108-ungeeignet-uhren-698f16b4b36b@brauner>

On Wed, Nov 8, 2023 at 5:51 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Fri, Nov 03, 2023 at 12:05:08PM -0700, Andrii Nakryiko wrote:
> > Add few new mount options to BPF FS that allow to specify that a given
> > BPF FS instance allows creation of BPF token (added in the next patch),
> > and what sort of operations are allowed under BPF token. As such, we get
> > 4 new mount options, each is a bit mask
> >   - `delegate_cmds` allow to specify which bpf() syscall commands are
> >     allowed with BPF token derived from this BPF FS instance;
> >   - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies
> >     a set of allowable BPF map types that could be created with BPF token;
> >   - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies
> >     a set of allowable BPF program types that could be loaded with BPF token;
> >   - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies
> >     a set of allowable BPF program attach types that could be loaded with
> >     BPF token; delegate_progs and delegate_attachs are meant to be used
> >     together, as full BPF program type is, in general, determined
> >     through both program type and program attach type.
> >
> > Currently, these mount options accept the following forms of values:
> >   - a special value "any", that enables all possible values of a given
> >   bit set;
> >   - numeric value (decimal or hexadecimal, determined by kernel
> >   automatically) that specifies a bit mask value directly;
> >   - all the values for a given mount option are combined, if specified
> >   multiple times. E.g., `mount -t bpf nodev /path/to/mount -o
> >   delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3
> >   mask.
> >
> > Ideally, more convenient (for humans) symbolic form derived from
> > corresponding UAPI enums would be accepted (e.g., `-o
> > delegate_progs=kprobe|tracepoint`) and I intend to implement this, but
> > it requires a bunch of UAPI header churn, so I postponed it until this
> > feature lands upstream or at least there is a definite consensus that
> > this feature is acceptable and is going to make it, just to minimize
> > amount of wasted effort and not increase amount of non-essential code to
> > be reviewed.
> >
> > Attentive reader will notice that BPF FS is now marked as
> > FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init
> > user namespace as long as the process has sufficient *namespaced*
> > capabilities within that user namespace. But in reality we still
> > restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in
> > init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added
> > to allow creating BPF FS context object (i.e., fsopen("bpf")) from
> > inside unprivileged process inside non-init userns, to capture that
> > userns as the owning userns. It will still be required to pass this
> > context object back to privileged process to instantiate and mount it.
> >
> > This manipulation is important, because capturing non-init userns as the
> > owning userns of BPF FS instance (super block) allows to use that userns
> > to constraint BPF token to that userns later on (see next patch). So
> > creating BPF FS with delegation inside unprivileged userns will restrict
> > derived BPF token objects to only "work" inside that intended userns,
> > making it scoped to a intended "container".
> >
> > There is a set of selftests at the end of the patch set that simulates
> > this sequence of steps and validates that everything works as intended.
> > But careful review is requested to make sure there are no missed gaps in
> > the implementation and testing.
> >
> > All this is based on suggestions and discussions with Christian Brauner
> > ([0]), to the best of my ability to follow all the implications.
>
> "who will not be held responsible for any CVE future or present as he's
>  not sure whether bpf token is a good idea in general"
>
> I'm not opposing it because it's really not my subsystem. But it'd be
> nice if you also added a disclaimer that I'm not endorsing this. :)
>

Sure, I'll clarify. I still appreciate your reviewing everything and
pointing out all the gotchas (like the reconfiguration and other
stuff), thanks!

> A comment below.
>
> >
> >   [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  include/linux/bpf.h | 10 ++++++
> >  kernel/bpf/inode.c  | 88 +++++++++++++++++++++++++++++++++++++++------
> >  2 files changed, 88 insertions(+), 10 deletions(-)
> >

[...]

> >       opt = fs_parse(fc, bpf_fs_parameters, param, &result);
> >       if (opt < 0) {
> > @@ -665,6 +692,25 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
> >       case OPT_MODE:
> >               opts->mode = result.uint_32 & S_IALLUGO;
> >               break;
> > +     case OPT_DELEGATE_CMDS:
> > +     case OPT_DELEGATE_MAPS:
> > +     case OPT_DELEGATE_PROGS:
> > +     case OPT_DELEGATE_ATTACHS:
> > +             if (strcmp(param->string, "any") == 0) {
> > +                     msk = ~0ULL;
> > +             } else {
> > +                     err = kstrtou64(param->string, 0, &msk);
> > +                     if (err)
> > +                             return err;
> > +             }
> > +             switch (opt) {
> > +             case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break;
> > +             case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break;
> > +             case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break;
> > +             case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break;
> > +             default: return -EINVAL;
> > +             }
> > +             break;
> >       }
>
> So just to repeat that this will allow a container to set it's own
> delegation options:
>
>         # unprivileged container
>
>         fd_fs = fsopen();
>         fsconfig(fd_fs, FSCONFIG_BLA_BLA, "give-me-all-the-delegation");
>
>         # Now hand of that fd_fs to a privileged process
>
>         fsconfig(fd_fs, FSCONFIG_CREATE_CMD, ...)
>
> This means the container manager can't be part of your threat model
> because you need to trust it to set delegation options.
>
> But if the container manager is part of your threat model then you can
> never trust an fd_fs handed to you because the container manager might
> have enabled arbitrary delegation privileges.
>
> There's ways around this:
>
> (1) kernel: Account for this in the kernel and require privileges when
>     setting delegation options.

What sort of privilege would that be? We are in an unprivileged user
namespace, so that would have to be some ns_capable() checks or
something? I can add ns_capable(CAP_BPF), but what else did you have
in mind?

I think even if we say that privileged parent does FSCONFIG_SET_STRING
and unprivileged child just does sys_fsopen("bpf", 0) and nothing
more, we still can't be sure that child won't race with parent and set
FSCONFIG_SET_STRING at the same time. Because they both have access to
the same fs_fd.

> (2) userspace: A trusted helper that allocates an fs_context fd in
>     the target user namespace, then sets delegation options and creates
>     superblock.
>
> (1) Is more restrictive but also more secure. (2) is less restrictive
> but requires more care from userspace.
>
> Either way I would probably consider writing a document detailing
> various delegation scenarios and possible pitfalls and implications
> before advertising it.
>
> If you choose (2) then you also need to be aware that the security of
> this also hinges on bpffs not allowing to reconfigure parameters once it
> has been mounted. Otherwise an unprivileged container can change
> delegation options.
>
> I would recommend that you either add a dummy bpf_reconfigure() method
> with a comment in it or you add a comment on top of bpf_context_ops.
> Something like:
>
> /*
>  * Unprivileged mounts of bpffs are owned by the user namespace they are
>  * mounted in. That means unprivileged users can change vfs mount
>  * options (ro<->rw, nosuid, etc.).
>  *
>  * They currently cannot change bpffs specific mount options such as
>  * delegation settings. If that is ever implemented it is necessary to
>  * require rivileges in the initial namespace. Otherwise unprivileged
>  * users can change delegation options to whatever they want.
>  */

Yep, I will add a custom callback. I think we can allow reconfiguring
towards less permissive delegation subset, but I'll need to look at
the specifics to see if we can support that easily.

^ permalink raw reply

* Re: [PATCH v9 bpf-next 03/17] bpf: introduce BPF token object
From: Andrii Nakryiko @ 2023-11-08 21:09 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andrii Nakryiko, bpf, netdev, paul, linux-fsdevel,
	linux-security-module, keescook, kernel-team, sargun
In-Reply-To: <20231108-verbuchen-unteilbar-9005061e2b48@brauner>

On Wed, Nov 8, 2023 at 6:28 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Fri, Nov 03, 2023 at 12:05:09PM -0700, Andrii Nakryiko wrote:
> > Add new kind of BPF kernel object, BPF token. BPF token is meant to
> > allow delegating privileged BPF functionality, like loading a BPF
> > program or creating a BPF map, from privileged process to a *trusted*
> > unprivileged process, all while have a good amount of control over which
> > privileged operations could be performed using provided BPF token.
> >
> > This is achieved through mounting BPF FS instance with extra delegation
> > mount options, which determine what operations are delegatable, and also
> > constraining it to the owning user namespace (as mentioned in the
> > previous patch).
> >
> > BPF token itself is just a derivative from BPF FS and can be created
> > through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts
> > a path specification (using the usual fd + string path combo) to a BPF
> > FS mount. Currently, BPF token "inherits" delegated command, map types,
> > prog type, and attach type bit sets from BPF FS as is. In the future,
> > having an BPF token as a separate object with its own FD, we can allow
> > to further restrict BPF token's allowable set of things either at the creation
> > time or after the fact, allowing the process to guard itself further
> > from, e.g., unintentionally trying to load undesired kind of BPF
> > programs. But for now we keep things simple and just copy bit sets as is.
> >
> > When BPF token is created from BPF FS mount, we take reference to the
> > BPF super block's owning user namespace, and then use that namespace for
> > checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN}
> > capabilities that are normally only checked against init userns (using
> > capable()), but now we check them using ns_capable() instead (if BPF
> > token is provided). See bpf_token_capable() for details.
> >
> > Such setup means that BPF token in itself is not sufficient to grant BPF
> > functionality. User namespaced process has to *also* have necessary
> > combination of capabilities inside that user namespace. So while
> > previously CAP_BPF was useless when granted within user namespace, now
> > it gains a meaning and allows container managers and sys admins to have
> > a flexible control over which processes can and need to use BPF
> > functionality within the user namespace (i.e., container in practice).
> > And BPF FS delegation mount options and derived BPF tokens serve as
> > a per-container "flag" to grant overall ability to use bpf() (plus further
> > restrict on which parts of bpf() syscalls are treated as namespaced).
> >
> > Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF)
> > within the BPF FS owning user namespace, rounding up the ns_capable()
> > story of BPF token.
> >
> > The alternative to creating BPF token object was:
> >   a) not having any extra object and just pasing BPF FS path to each
> >      relevant bpf() command. This seems suboptimal as it's racy (mount
> >      under the same path might change in between checking it and using it
> >      for bpf() command). And also less flexible if we'd like to further
>
> I don't understand "mount under the same path might change in between
> checking it and using it for bpf() command".
>
> Just require userspace to open() the bpffs instance and pass that fd to
> bpf() just as you're doing right now. If that is racy then the current
> implementation is even more so because it is passing:
>
> bpffs_path_fd
> bpffs_pathname
>
> and then performs a lookup. More on that below.

Yes, this is a result of my initial confusion with how O_PATH-based
open() works. You are right that it's not racy, I'll update the
message.

>
> I want to point out that most of this code here is unnecessary if you
> use the bpffs fd itself as a token. But that's your decision. I'm just
> saying that I'm not sure the critique that it's racy is valid.

Ack.

>
> >      restrict ourselves compared to all the delegated functionality
> >      allowed on BPF FS.
> >   b) use non-bpf() interface, e.g., ioctl(), but otherwise also create
> >      a dedicated FD that would represent a token-like functionality. This
> >      doesn't seem superior to having a proper bpf() command, so
> >      BPF_TOKEN_CREATE was chosen.
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  include/linux/bpf.h            |  41 +++++++
> >  include/uapi/linux/bpf.h       |  39 +++++++
> >  kernel/bpf/Makefile            |   2 +-
> >  kernel/bpf/inode.c             |  17 ++-
> >  kernel/bpf/syscall.c           |  17 +++
> >  kernel/bpf/token.c             | 197 +++++++++++++++++++++++++++++++++
> >  tools/include/uapi/linux/bpf.h |  39 +++++++
> >  7 files changed, 342 insertions(+), 10 deletions(-)
> >  create mode 100644 kernel/bpf/token.c
> >

[...]

> > +
> > +#define BPF_TOKEN_INODE_NAME "bpf-token"
> > +
> > +static const struct inode_operations bpf_token_iops = { };
> > +
> > +static const struct file_operations bpf_token_fops = {
> > +     .release        = bpf_token_release,
> > +     .show_fdinfo    = bpf_token_show_fdinfo,
> > +};
> > +
> > +int bpf_token_create(union bpf_attr *attr)
> > +{
> > +     struct bpf_mount_opts *mnt_opts;
> > +     struct bpf_token *token = NULL;
> > +     struct user_namespace *userns;
> > +     struct inode *inode;
> > +     struct file *file;
> > +     struct path path;
> > +     umode_t mode;
> > +     int err, fd;
> > +
> > +     err = user_path_at(attr->token_create.bpffs_path_fd,
> > +                        u64_to_user_ptr(attr->token_create.bpffs_pathname),
> > +                        LOOKUP_FOLLOW | LOOKUP_EMPTY, &path);
>
> Do you really need bpffs_path_fd and bpffs_pathname?
> This seems unnecessar as you're forcing a lookup that's best done in
> userspace through regular open() apis. So I would just make this:
>
> struct { /* struct used by BPF_TOKEN_CREATE command */
>         __u32           flags;
>         __u32           bpffs_path_fd;
> } token_create;
>
> In bpf_token_create() you can then just do:
>
>         struct fd f;
>         struct path path;
>
>         f = fdget(attr->token_create.bpffs_path_fd);
>         if (!f.file)
>                 return -EBADF;
>
>         *path = f.file->f_path;
>         path_get(path);
>         fdput(f);
>

Yes, you are right. I'll simplify this part, thanks.


> > +     if (err)
> > +             return err;
> > +
> > +     if (path.mnt->mnt_root != path.dentry) {
> > +             err = -EINVAL;
> > +             goto out_path;
> > +     }
> > +     if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
> > +             err = -EINVAL;
> > +             goto out_path;
> > +     }

[...]

^ permalink raw reply

* Re: [PATCH 10/22] microblaze: include linux/cpu.h for trap_init() prototype
From: Geert Uytterhoeven @ 2023-11-08 21:10 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Arnd Bergmann, Andrew Morton, linux-kernel, Masahiro Yamada,
	linux-kbuild, Matt Turner, Vineet Gupta, Russell King,
	Catalin Marinas, Will Deacon, Steven Rostedt, Masami Hiramatsu,
	Mark Rutland, guoren, Peter Zijlstra, Ard Biesheuvel, Huacai Chen,
	Greg Ungerer, Michal Simek, Thomas Bogendoerfer, Dinh Nguyen,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, Geoff Levand,
	Palmer Dabbelt, Heiko Carstens, John Paul Adrian Glaubitz,
	David S . Miller, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	x86, Helge Deller, Sudip Mukherjee, Greg Kroah-Hartman,
	Timur Tabi, Kent Overstreet, David Woodhouse, Naveen N. Rao,
	Anil S Keshavamurthy, Kees Cook, Vincenzo Frascino, Juri Lelli,
	Vincent Guittot, Nathan Chancellor, Nick Desaulniers,
	Nicolas Schier, Alexander Viro, Uwe Kleine-König,
	linux-alpha, linux-snps-arc, linux-arm-kernel, linux-trace-kernel,
	linux-csky@vger.kernel.org, loongarch, linux-m68k, linux-mips,
	linuxppc-dev, linux-riscv, linux-s390, linux-sh, sparclinux,
	Netdev, linux-parisc, linux-usb, linux-fbdev, dri-devel,
	linux-bcachefs@vger.kernel.org, linux-mtd
In-Reply-To: <e7753f82-c3de-48fc-955d-59773222aaa9@app.fastmail.com>

Hi Arnd,

On Wed, Nov 8, 2023 at 10:07 PM Arnd Bergmann <arnd@arndb.de> wrote:
> On Wed, Nov 8, 2023, at 21:42, Geert Uytterhoeven wrote:
> > On Wed, Nov 8, 2023 at 2:01 PM Arnd Bergmann <arnd@kernel.org> wrote:
> >> From: Arnd Bergmann <arnd@arndb.de>
> >>
> >> Microblaze runs into a single -Wmissing-prototypes warning when that is
> >> enabled:
> >>
> >> arch/microblaze/kernel/traps.c:21:6: warning: no previous prototype for 'trap_init' [-Wmissing-prototypes]
> >>
> >> Include the right header to avoid this.
> >>
> >> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> >
> > Thanks for your patch!
> >
> >>  arch/alpha/kernel/traps.c      | 1 +
> >>  arch/csky/include/asm/traps.h  | 2 --
> >>  arch/csky/kernel/traps.c       | 1 +
> >>  arch/m68k/coldfire/vectors.c   | 3 +--
> >>  arch/m68k/coldfire/vectors.h   | 3 ---
> >
> > Ah, so this is where the m68k changes listed in the cover letter are
> > hiding ;-)
> >
> >>  arch/microblaze/kernel/traps.c | 1 +
> >>  arch/sparc/kernel/traps_32.c   | 1 +
> >>  arch/sparc/kernel/traps_64.c   | 1 +
> >>  arch/x86/include/asm/traps.h   | 1 -
> >>  arch/x86/kernel/traps.c        | 1 +
> >>  10 files changed, 7 insertions(+), 8 deletions(-)
> >>  delete mode 100644 arch/m68k/coldfire/vectors.h
> >
> > Obviously the non-microblaze changes should be spun off in separate
> > patches.
>
> I messed up one of my rebases here and accidentally sent
> the wrong changelog text. My intention was to have the
> combined patch but with this text:
>
>     arch: include linux/cpu.h for trap_init() prototype
>
>     some architectures run into a -Wmissing-prototypes warning
>     for trap_init()
>
>     arch/microblaze/kernel/traps.c:21:6: warning: no previous prototype for 'trap_init' [-Wmissing-prototypes]
>
>     Include the right header to avoid this consistently, removing
>     the extra declarations on m68k and x86 that were added as local
>     workarounds already.
>
>     Signed-off-by: Arnd Bergmann <arnd@arndb.de>

That makes sense, although it's hard to combine this with "my preference
would be for the patches to make it through the respective subsystem
maintainer trees"...

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [PATCH net v2] net: set SOCK_RCU_FREE before inserting socket into hashtable
From: Stanislav Fomichev @ 2023-11-08 21:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni, Stanislav Fomichev

We've started to see the following kernel traces:

 WARNING: CPU: 83 PID: 0 at net/core/filter.c:6641 sk_lookup+0x1bd/0x1d0

 Call Trace:
  <IRQ>
  __bpf_skc_lookup+0x10d/0x120
  bpf_sk_lookup+0x48/0xd0
  bpf_sk_lookup_tcp+0x19/0x20
  bpf_prog_<redacted>+0x37c/0x16a3
  cls_bpf_classify+0x205/0x2e0
  tcf_classify+0x92/0x160
  __netif_receive_skb_core+0xe52/0xf10
  __netif_receive_skb_list_core+0x96/0x2b0
  napi_complete_done+0x7b5/0xb70
  <redacted>_poll+0x94/0xb0
  net_rx_action+0x163/0x1d70
  __do_softirq+0xdc/0x32e
  asm_call_irq_on_stack+0x12/0x20
  </IRQ>
  do_softirq_own_stack+0x36/0x50
  do_softirq+0x44/0x70

__inet_hash can race with lockless (rcu) readers on the other cpus:

  __inet_hash
    __sk_nulls_add_node_rcu
    <- (bpf triggers here)
    sock_set_flag(SOCK_RCU_FREE)

Let's move the SOCK_RCU_FREE part up a bit, before we are inserting
the socket into hashtables. Note, that the race is really harmless;
the bpf callers are handling this situation (where listener socket
doesn't have SOCK_RCU_FREE set) correctly, so the only
annoyance is a WARN_ONCE.

More details from Eric regarding SOCK_RCU_FREE timeline:

Commit 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under
synflood") added SOCK_RCU_FREE. At that time, the precise location of
sock_set_flag(sk, SOCK_RCU_FREE) did not matter, because the thread calling
__inet_hash() owns a reference on sk. SOCK_RCU_FREE was only tested
at dismantle time.

Commit 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")
started checking SOCK_RCU_FREE _after_ the lookup to infer whether
the refcount has been taken care of.

Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 net/ipv4/inet_hashtables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 598c1b114d2c..a532f749e477 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -751,12 +751,12 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		if (err)
 			goto unlock;
 	}
+	sock_set_flag(sk, SOCK_RCU_FREE);
 	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
 		sk->sk_family == AF_INET6)
 		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
 	else
 		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
-	sock_set_flag(sk, SOCK_RCU_FREE);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
 	spin_unlock(&ilb2->lock);
-- 
2.42.0.869.gea05f2083d-goog

^ permalink raw reply related

* Re: [PATCH 10/22] microblaze: include linux/cpu.h for trap_init() prototype
From: Arnd Bergmann @ 2023-11-08 21:07 UTC (permalink / raw)
  To: Geert Uytterhoeven, Arnd Bergmann
  Cc: Andrew Morton, linux-kernel, Masahiro Yamada, linux-kbuild,
	Matt Turner, Vineet Gupta, Russell King, Catalin Marinas,
	Will Deacon, Steven Rostedt, Masami Hiramatsu, Mark Rutland,
	guoren, Peter Zijlstra, Ard Biesheuvel, Huacai Chen, Greg Ungerer,
	Michal Simek, Thomas Bogendoerfer, Dinh Nguyen, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy, Geoff Levand, Palmer Dabbelt,
	Heiko Carstens, John Paul Adrian Glaubitz, David S . Miller,
	Andy Lutomirski, Thomas Gleixner, Ingo Molnar, x86, Helge Deller,
	Sudip Mukherjee, Greg Kroah-Hartman, Timur Tabi, Kent Overstreet,
	David Woodhouse, Naveen N. Rao, Anil S Keshavamurthy, Kees Cook,
	Vincenzo Frascino, Juri Lelli, Vincent Guittot, Nathan Chancellor,
	Nick Desaulniers, Nicolas Schier, Alexander Viro,
	Uwe Kleine-König, linux-alpha, linux-snps-arc,
	linux-arm-kernel, linux-trace-kernel, linux-csky@vger.kernel.org,
	loongarch, linux-m68k, linux-mips, linuxppc-dev, linux-riscv,
	linux-s390, linux-sh, sparclinux, Netdev, linux-parisc, linux-usb,
	linux-fbdev, dri-devel, linux-bcachefs@vger.kernel.org, linux-mtd
In-Reply-To: <CAMuHMdXgdn_cMq0YeqPu3sUeM5cEYbCoodxu8XwCGiRJ-vFsyw@mail.gmail.com>

On Wed, Nov 8, 2023, at 21:42, Geert Uytterhoeven wrote:
>
> On Wed, Nov 8, 2023 at 2:01 PM Arnd Bergmann <arnd@kernel.org> wrote:
>> From: Arnd Bergmann <arnd@arndb.de>
>>
>> Microblaze runs into a single -Wmissing-prototypes warning when that is
>> enabled:
>>
>> arch/microblaze/kernel/traps.c:21:6: warning: no previous prototype for 'trap_init' [-Wmissing-prototypes]
>>
>> Include the right header to avoid this.
>>
>> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
>
> Thanks for your patch!
>
>>  arch/alpha/kernel/traps.c      | 1 +
>>  arch/csky/include/asm/traps.h  | 2 --
>>  arch/csky/kernel/traps.c       | 1 +
>>  arch/m68k/coldfire/vectors.c   | 3 +--
>>  arch/m68k/coldfire/vectors.h   | 3 ---
>
> Ah, so this is where the m68k changes listed in the cover letter are
> hiding ;-)
>
>>  arch/microblaze/kernel/traps.c | 1 +
>>  arch/sparc/kernel/traps_32.c   | 1 +
>>  arch/sparc/kernel/traps_64.c   | 1 +
>>  arch/x86/include/asm/traps.h   | 1 -
>>  arch/x86/kernel/traps.c        | 1 +
>>  10 files changed, 7 insertions(+), 8 deletions(-)
>>  delete mode 100644 arch/m68k/coldfire/vectors.h
>
> Obviously the non-microblaze changes should be spun off in separate
> patches.

I messed up one of my rebases here and accidentally sent
the wrong changelog text. My intention was to have the
combined patch but with this text:

    arch: include linux/cpu.h for trap_init() prototype
    
    some architectures run into a -Wmissing-prototypes warning
    for trap_init()
    
    arch/microblaze/kernel/traps.c:21:6: warning: no previous prototype for 'trap_init' [-Wmissing-prototypes]
    
    Include the right header to avoid this consistently, removing
    the extra declarations on m68k and x86 that were added as local
    workarounds already.
    
    Signed-off-by: Arnd Bergmann <arnd@arndb.de>


I made the same mistake with the "arch: add do_page_fault prototypes"
patch that was missing an explanation.

      Arnd

^ permalink raw reply

* Re: BPF/XDP: kernel panic when removing an interface that is an xdp_redirect target
From: Nelson, Shannon @ 2023-11-08 21:30 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, Jesper Dangaard Brouer,
	David Ahern, Jakub Kicinski, netdev, bpf, Daniel Borkmann,
	Alexei Starovoitov, Andrii Nakryiko
In-Reply-To: <87h6lxy3zq.fsf@toke.dk>

On 11/7/2023 7:31 AM, Toke Høiland-Jørgensen wrote:
> 
> "Nelson, Shannon" <shannon.nelson@amd.com> writes:
> 
>> While testing new code to support XDP in the ionic driver we found that
>> we could panic the kernel by running a bind/unbind loop on the target
>> interface of an xdp_redirect action.  Obviously this is a stress test
>> that is abusing the system, but it does point to a window of opportunity
>> in bq_enqueue() and bq_xmit_all().  I believe that while the validity of
>> the target interface has been checked in __xdp_enqueue(), the interface
>> can be unbound by the time either bq_enqueue() or bq_xmit_all() tries to
>> use the interface.  There is no locking or reference taken on the
>> interface to hold it in place before the target’s ndo_xdp_xmit() is called.
>>
>> Below is a stack trace that our tester captured while running our test
>> code on a RHEL 9.2 kernel – yes, I know, unpublished driver code on a
>> non-upstream kernel.  But if you look at the current upstream code in
>> kernel/bpf/devmap.c I think you can see what we ran into.
>>
>> Other than telling users to not abuse the system with a bind/unbind
>> loop, is there something we can do to limit the potential pain here?
>> Without knowing what interfaces might be targeted by the users’ XDP
>> programs, is there a step the originating driver can do to take
>> precautions?  Did we simply miss a step in the driver, or is this an
>> actual problem in the devmap code?
> 
> Sounds like a driver bug :)

Entirely possible, wouldn't be our first ... :-)

> 
> The XDP redirect flow guarantees that all outstanding packets are
> flushed within a single NAPI cycle, as documented here:
> https://docs.kernel.org/bpf/redirect.html
> 
> So basically, the driver should be doing a two-step teardown: remove
> global visibility of the resource in question, wait for all concurrent
> users to finish, and *then* free the data structure. This corresponds to
> the usual RCU protection: resources should be kept alive until all
> concurrent RCU critical sections have exited on all CPUs. So if your
> driver is removing an interface's data structure without waiting for
> concurrent NAPI cycles to finish, that's a bug in the driver.
> 
> This kind of thing is what the synchronize_net() function is for; for a
> usage example, see veth_napi_del_range(). My guess would be that you're
> missing this as part of your driver teardown flow?

Essentially, the first thing we do in the remove function is to call 
unregister_netdev(), which has synchronize_net() in the path, so I don't 
think this is missing from our scenario, but thanks for the hint, I'll 
keep this in mind.  I do see there are a couple of net drivers that are 
more aggressive about calling it directly in some other parts of the 
logic - I don't think that has a bearing on this issue, but I'll keep it 
in mind.

> 
> Another source of a bug like this could be that your driver does not in
> fact call xdp_do_flush() before exiting its NAPI cycle, so that there
> will be packets from the previous cycle in the bq queue, in which case
> the assumption mentioned in the linked document obviously breaks down.
> But that would also be a driver bug :)

We do call the xdp_do_flush() at the end of the NAPI cycle, just before 
calling napi_complete_done().

> 
> -Toke
> 

Thanks for the notes - I'll have our tester spend some more time with 
this using other drivers/interfaces as the targets to see if we can 
gather more information on the scenario.

sln


^ permalink raw reply

* Re: [PATCH net] indirect_call_wrapper: Fix typo in INDIRECT_CALL_$NR kerneldoc
From: Simon Horman @ 2023-11-08 21:45 UTC (permalink / raw)
  To: Tobias Klauser; +Cc: Paolo Abeni, netdev, linux-kernel
In-Reply-To: <20231107080117.29511-1-tklauser@distanz.ch>

On Tue, Nov 07, 2023 at 09:01:17AM +0100, Tobias Klauser wrote:
> Fix a small typo in the kerneldoc comment of the INDIRECT_CALL_$NR
> macro.
> 
> Signed-off-by: Tobias Klauser <tklauser@distanz.ch>

Hi Tobias,

I am fine with this change but I don't think it fits the guidelines for
a bug fix. So I think it would be better targeted at net-next.

	Subject: [PATCH net-next] ...

If so, the following applies:

## Form letter - net-next-closed

The merge window for v6.7 has begun and therefore net-next is closed
for new drivers, features, code refactoring and optimizations.
We are currently accepting bug fixes only.

Please repost when net-next reopens after November 12th.

RFC patches sent for review only are obviously welcome at any time.

See: https://www.kernel.org/doc/html/next/process/maintainer-netdev.html#development-cycle

^ permalink raw reply

* [PATCH net v3] hv_netvsc: Mark VF as slave before exposing it to user-mode
From: longli @ 2023-11-08 22:06 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	linux-hyperv, netdev, linux-kernel
  Cc: Long Li

From: Long Li <longli@microsoft.com>

When a VF is being exposed form the kernel, it should be marked as "slave"
before exposing to the user-mode. The VF is not usable without netvsc running
as master. The user-mode should never see a VF without the "slave" flag.

An example of a user-mode program depending on this flag is cloud-init
(https://github.com/canonical/cloud-init/blob/19.3/cloudinit/net/__init__.py)
When scanning interfaces, it checks on if this interface has a master to
decide if it should be configured. There are other user-mode programs perform
similar checks.

This commit moves the code of setting the slave flag to the time before VF is
exposed to user-mode.

Signed-off-by: Long Li <longli@microsoft.com>
---

Change since v1:
Use a new function to handle NETDEV_POST_INIT.

Change since v2:
Add "net" in subject. Add more details on the user-mode program behavior.

 drivers/net/hyperv/netvsc_drv.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ec77fb9dcf89..fdad58dcc6a8 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2206,9 +2206,6 @@ static int netvsc_vf_join(struct net_device *vf_netdev,
 		goto upper_link_failed;
 	}
 
-	/* set slave flag before open to prevent IPv6 addrconf */
-	vf_netdev->flags |= IFF_SLAVE;
-
 	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
 
 	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
@@ -2320,11 +2317,9 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
 	 */
 	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
 		ndev = hv_get_drvdata(ndev_ctx->device_ctx);
-		if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) {
-			netdev_notice(vf_netdev,
-				      "falling back to mac addr based matching\n");
+		if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr) ||
+		    ether_addr_equal(vf_netdev->dev_addr, ndev->perm_addr))
 			return ndev;
-		}
 	}
 
 	netdev_notice(vf_netdev,
@@ -2332,6 +2327,19 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
 	return NULL;
 }
 
+static int netvsc_prepare_slave(struct net_device *vf_netdev)
+{
+	struct net_device *ndev;
+
+	ndev = get_netvsc_byslot(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
+
+	/* set slave flag before open to prevent IPv6 addrconf */
+	vf_netdev->flags |= IFF_SLAVE;
+	return NOTIFY_DONE;
+}
+
 static int netvsc_register_vf(struct net_device *vf_netdev)
 {
 	struct net_device_context *net_device_ctx;
@@ -2753,6 +2761,8 @@ static int netvsc_netdev_event(struct notifier_block *this,
 		return NOTIFY_DONE;
 
 	switch (event) {
+	case NETDEV_POST_INIT:
+		return netvsc_prepare_slave(event_dev);
 	case NETDEV_REGISTER:
 		return netvsc_register_vf(event_dev);
 	case NETDEV_UNREGISTER:
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v2 net 1/7] net/sched: taprio: fix too early schedules switching
From: Vladimir Oltean @ 2023-11-08 22:27 UTC (permalink / raw)
  To: Faizal Rahim
  Cc: Vinicius Costa Gomes, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	netdev, linux-kernel
In-Reply-To: <20231107112023.676016-2-faizal.abdul.rahim@linux.intel.com>

On Tue, Nov 07, 2023 at 06:20:17AM -0500, Faizal Rahim wrote:
> In the current taprio code for dynamic schedule change,
> admin/oper schedule switching happens immediately when
> should_change_schedules() is true. Then the last entry of
> the old admin schedule stops being valid anymore from
> taprio_dequeue_from_txq’s perspective.

Admittedly, I may have become a bit detached from this code base in the
past months, but I don't understand the reasoning here.

Could you please explain what makes the last entry of the old admin
schedule be invalid from taprio_dequeue_from_txq()'s perspective?

What I see is that when should_change_schedules() is true, we change
q->oper_sched and q->admin_sched through the switch_schedules() call,
but we don't change q->current_entry, so I fail to understand the
connection you are implying.

On the other hand (and I see I did mention this in the other thread),
it seems that taprio_skb_exceeds_queue_max_sdu() - called from the
enqueue() path - looks at q->oper_sched, and that's a valid reason why
we'd want to delay the schedule switch until admin's actual base time,
rather than the current oper's cycle_end_time.

But please, let's spare no expense in providing a proper problem
description, justification for the change and Fixes: tag. This is not
optional.

> To solve this, we have to delay the switch_schedules() call via
> the new cycle_time_correction variable. The variable serves 2
> purposes:
> 1. Upon entering advance_sched(), if the value is set to a
> non-initialized value, it indicates that we need to change
> schedule.
> 2. Store the cycle time correction value which will be used for
> negative or positive correction.

It needs to be stated much more clearly that only purpose 1 is relevant
here (I would even go as far as to omit its secondary purpose here).
The only reason we are using the correction variable is because it
happens that we'll need that in later changes.

> 
> Fixes: a3d43c0d56f1 ("taprio: Add support adding an admin schedule")

I believe that since the only observable problem has to do with
taprio_skb_exceeds_queue_max_sdu(), the Fixes: tag should be the commit
which added that logic. Which is:

Fixes: a878fd46fe43 ("net/sched: keep the max_frm_len information inside struct sched_gate_list")

> Signed-off-by: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
> ---
>  net/sched/sch_taprio.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
> index 2e1949de4171..dee103647823 100644
> --- a/net/sched/sch_taprio.c
> +++ b/net/sched/sch_taprio.c
> @@ -41,6 +41,7 @@ static struct static_key_false taprio_have_working_mqprio;
>  #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
>  #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
>  #define TAPRIO_FLAGS_INVALID U32_MAX
> +#define INIT_CYCLE_TIME_CORRECTION S64_MIN

I would prefer naming it CYCLE_TIME_CORRECTION_INVALID or _UNSPEC.
It is not just used as the "initial" value.

>  
>  struct sched_entry {
>  	/* Durations between this GCL entry and the GCL entry where the
> @@ -75,6 +76,7 @@ struct sched_gate_list {
>  	ktime_t cycle_end_time;
>  	s64 cycle_time;
>  	s64 cycle_time_extension;
> +	s64 cycle_time_correction;
>  	s64 base_time;
>  };
>  
> @@ -940,8 +942,10 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
>  	admin = rcu_dereference_protected(q->admin_sched,
>  					  lockdep_is_held(&q->current_entry_lock));
>  
> -	if (!oper)
> +	if (!oper || oper->cycle_time_correction != INIT_CYCLE_TIME_CORRECTION) {

You could introduce even as early as this change a "static bool
sched_switch_pending(struct sched_gate_list *oper)" function, which
incorporates the entire body of this "if" expression.

> +		oper->cycle_time_correction = INIT_CYCLE_TIME_CORRECTION;
>  		switch_schedules(q, &admin, &oper);
> +	}
>  
>  	/* This can happen in two cases: 1. this is the very first run
>  	 * of this function (i.e. we weren't running any schedule
> @@ -981,7 +985,7 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
>  		 * schedule runs.
>  		 */
>  		end_time = sched_base_time(admin);
> -		switch_schedules(q, &admin, &oper);
> +		oper->cycle_time_correction = 0;
>  	}
>  
>  	next->end_time = end_time;
> @@ -1174,6 +1178,7 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
>  	}
>  
>  	taprio_calculate_gate_durations(q, new);
> +	new->cycle_time_correction = INIT_CYCLE_TIME_CORRECTION;
>  
>  	return 0;
>  }
> -- 
> 2.25.1
>

^ permalink raw reply

* [PATCH][ncsi] Revert NCSI link loss/gain commit
From: Johnathan Mantey @ 2023-11-08 22:29 UTC (permalink / raw)
  To: netdev

The NCSI commit
ncsi: Propagate carrier gain/loss events to the NCSI controller
introduced unwanted behavior.

The intent for the commit was to be able to detect carrier loss/gain
for just the NIC connected to the BMC. The unwanted effect is a
carrier loss for auxiliary paths also causes the BMC to lose
carrier. The BMC never regains carrier despite the secondary NIC
regaining a link.

This change, when merged, needs to be backported to stable kernels.

Signed-off-by: Johnathan Mantey <johnathanx.mantey@intel.com>
---
 net/ncsi/ncsi-aen.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index f8854bff286c..62fb1031763d 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -89,11 +89,6 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	if ((had_link == has_link) || chained)
 		return 0;

-	if (had_link)
-		netif_carrier_off(ndp->ndev.dev);
-	else
-		netif_carrier_on(ndp->ndev.dev);
-
 	if (!ndp->multi_package && !nc->package->multi_channel) {
 		if (had_link) {
 			ndp->flags |= NCSI_DEV_RESHUFFLE;
-- 
2.41.0

^ permalink raw reply related

* [PATCH net-next v4] hv_netvsc: Mark VF as slave before exposing it to user-mode
From: longli @ 2023-11-08 22:56 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	linux-hyperv, netdev, linux-kernel
  Cc: Long Li

From: Long Li <longli@microsoft.com>

When a VF is being exposed form the kernel, it should be marked as "slave"
before exposing to the user-mode. The VF is not usable without netvsc running
as master. The user-mode should never see a VF without the "slave" flag.

An example of a user-mode program depending on this flag is cloud-init
(https://github.com/canonical/cloud-init/blob/19.3/cloudinit/net/__init__.py)
When scanning interfaces, it checks on if this interface has a master to
decide if it should be configured. There are other user-mode programs perform
similar checks.

This commit moves the code of setting the slave flag to the time before VF is
exposed to user-mode.

Signed-off-by: Long Li <longli@microsoft.com>
---

Change since v1:
Use a new function to handle NETDEV_POST_INIT.

Change since v2:
Add "net" in subject. Add more details on the user-mode program behavior.

Change since v3:
Change target to net-next.

 drivers/net/hyperv/netvsc_drv.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ec77fb9dcf89..fdad58dcc6a8 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2206,9 +2206,6 @@ static int netvsc_vf_join(struct net_device *vf_netdev,
 		goto upper_link_failed;
 	}
 
-	/* set slave flag before open to prevent IPv6 addrconf */
-	vf_netdev->flags |= IFF_SLAVE;
-
 	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
 
 	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
@@ -2320,11 +2317,9 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
 	 */
 	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
 		ndev = hv_get_drvdata(ndev_ctx->device_ctx);
-		if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) {
-			netdev_notice(vf_netdev,
-				      "falling back to mac addr based matching\n");
+		if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr) ||
+		    ether_addr_equal(vf_netdev->dev_addr, ndev->perm_addr))
 			return ndev;
-		}
 	}
 
 	netdev_notice(vf_netdev,
@@ -2332,6 +2327,19 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
 	return NULL;
 }
 
+static int netvsc_prepare_slave(struct net_device *vf_netdev)
+{
+	struct net_device *ndev;
+
+	ndev = get_netvsc_byslot(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
+
+	/* set slave flag before open to prevent IPv6 addrconf */
+	vf_netdev->flags |= IFF_SLAVE;
+	return NOTIFY_DONE;
+}
+
 static int netvsc_register_vf(struct net_device *vf_netdev)
 {
 	struct net_device_context *net_device_ctx;
@@ -2753,6 +2761,8 @@ static int netvsc_netdev_event(struct notifier_block *this,
 		return NOTIFY_DONE;
 
 	switch (event) {
+	case NETDEV_POST_INIT:
+		return netvsc_prepare_slave(event_dev);
 	case NETDEV_REGISTER:
 		return netvsc_register_vf(event_dev);
 	case NETDEV_UNREGISTER:
-- 
2.34.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox