public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses
@ 2024-11-24 12:38 Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 02/19] s390/pci: Use topology ID for multi-function devices Sasha Levin
                   ` (17 more replies)
  0 siblings, 18 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Niklas Schnelle, Gerd Bayer, Heiko Carstens, Sasha Levin, gor,
	agordeev, gerald.schaefer, jroedel, mjrosato, lukas, linux-s390

From: Niklas Schnelle <schnelle@linux.ibm.com>

[ Upstream commit 0467cdde8c4320bbfdb31a8cff1277b202f677fc ]

Instead of relying on the observed but not architected firmware behavior
that PCI functions from the same card are listed in ascending RID order
in clp_list_pci(), ensure this by sorting. To allow for sorting, separate
the initial clp_list_pci() from the creation of the virtual PCI busses.

Note that fundamentally in our per-PCI function hotplug design non RID
order of discovery is still possible. For example when the two PFs of
a two port NIC are hotplugged after initial boot and in descending RID
order. In this case the virtual PCI bus would be created by the second
PF using that PF's UID as domain number instead of that of the first PF.
Thus the domain number would then change from the UID of the second PF
to that of the first PF on reboot but there is really nothing we can do
about that since changing domain numbers at runtime seems even worse.
This only impacts the domain number as the RIDs are consistent and thus
even with just the second PF visible it will show up in the correct
position on the virtual bus.

Reviewed-by: Gerd Bayer <gbayer@linux.ibm.com>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/s390/include/asm/pci.h |  5 ++-
 arch/s390/pci/pci.c         | 69 ++++++++++++++++++++++++++++++++-----
 arch/s390/pci/pci_clp.c     | 12 ++++---
 arch/s390/pci/pci_event.c   | 13 ++++---
 4 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 9d920ced60475..45e87c7c122a6 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -130,6 +130,7 @@ struct zpci_dev {
 	u16		vfn;		/* virtual function number */
 	u16		pchid;		/* physical channel ID */
 	u16		maxstbl;	/* Maximum store block size */
+	u16		rid;		/* RID as supplied by firmware */
 	u8		pfgid;		/* function group ID */
 	u8		pft;		/* pci function type */
 	u8		port;
@@ -210,12 +211,14 @@ extern struct airq_iv *zpci_aif_sbv;
 ----------------------------------------------------------------------------- */
 /* Base stuff */
 struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state);
+int zpci_add_device(struct zpci_dev *zdev);
 int zpci_enable_device(struct zpci_dev *);
 int zpci_disable_device(struct zpci_dev *);
 int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh);
 int zpci_deconfigure_device(struct zpci_dev *zdev);
 void zpci_device_reserved(struct zpci_dev *zdev);
 bool zpci_is_device_configured(struct zpci_dev *zdev);
+int zpci_scan_devices(void);
 
 int zpci_hot_reset_device(struct zpci_dev *zdev);
 int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *);
@@ -225,7 +228,7 @@ void zpci_update_fh(struct zpci_dev *zdev, u32 fh);
 
 /* CLP */
 int clp_setup_writeback_mio(void);
-int clp_scan_pci_devices(void);
+int clp_scan_pci_devices(struct list_head *scan_list);
 int clp_query_pci_fn(struct zpci_dev *zdev);
 int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as);
 int clp_disable_fh(struct zpci_dev *zdev, u32 *fh);
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index bd9624c20b802..b7efa96776eac 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -29,6 +29,7 @@
 #include <linux/pci.h>
 #include <linux/printk.h>
 #include <linux/lockdep.h>
+#include <linux/list_sort.h>
 
 #include <asm/isc.h>
 #include <asm/airq.h>
@@ -785,7 +786,6 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
 	struct zpci_dev *zdev;
 	int rc;
 
-	zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state);
 	zdev = kzalloc(sizeof(*zdev), GFP_KERNEL);
 	if (!zdev)
 		return ERR_PTR(-ENOMEM);
@@ -805,6 +805,19 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
 	mutex_init(&zdev->fmb_lock);
 	mutex_init(&zdev->kzdev_lock);
 
+	return zdev;
+
+error:
+	zpci_dbg(0, "crt fid:%x, rc:%d\n", fid, rc);
+	kfree(zdev);
+	return ERR_PTR(rc);
+}
+
+int zpci_add_device(struct zpci_dev *zdev)
+{
+	int rc;
+
+	zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", zdev->fid, zdev->fh, zdev->state);
 	rc = zpci_init_iommu(zdev);
 	if (rc)
 		goto error;
@@ -816,15 +829,13 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
 	spin_lock(&zpci_list_lock);
 	list_add_tail(&zdev->entry, &zpci_list);
 	spin_unlock(&zpci_list_lock);
-
-	return zdev;
+	return 0;
 
 error_destroy_iommu:
 	zpci_destroy_iommu(zdev);
 error:
-	zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc);
-	kfree(zdev);
-	return ERR_PTR(rc);
+	zpci_dbg(0, "add fid:%x, rc:%d\n", zdev->fid, rc);
+	return rc;
 }
 
 bool zpci_is_device_configured(struct zpci_dev *zdev)
@@ -1082,6 +1093,49 @@ bool zpci_is_enabled(void)
 	return s390_pci_initialized;
 }
 
+static int zpci_cmp_rid(void *priv, const struct list_head *a,
+			const struct list_head *b)
+{
+	struct zpci_dev *za = container_of(a, struct zpci_dev, entry);
+	struct zpci_dev *zb = container_of(b, struct zpci_dev, entry);
+
+	/*
+	 * PCI functions without RID available maintain original order
+	 * between themselves but sort before those with RID.
+	 */
+	if (za->rid == zb->rid)
+		return za->rid_available > zb->rid_available;
+	/*
+	 * PCI functions with RID sort by RID ascending.
+	 */
+	return za->rid > zb->rid;
+}
+
+static void zpci_add_devices(struct list_head *scan_list)
+{
+	struct zpci_dev *zdev, *tmp;
+
+	list_sort(NULL, scan_list, &zpci_cmp_rid);
+	list_for_each_entry_safe(zdev, tmp, scan_list, entry) {
+		list_del_init(&zdev->entry);
+		zpci_add_device(zdev);
+	}
+}
+
+int zpci_scan_devices(void)
+{
+	LIST_HEAD(scan_list);
+	int rc;
+
+	rc = clp_scan_pci_devices(&scan_list);
+	if (rc)
+		return rc;
+
+	zpci_add_devices(&scan_list);
+	zpci_bus_scan_busses();
+	return 0;
+}
+
 static int __init pci_base_init(void)
 {
 	int rc;
@@ -1111,10 +1165,9 @@ static int __init pci_base_init(void)
 	if (rc)
 		goto out_irq;
 
-	rc = clp_scan_pci_devices();
+	rc = zpci_scan_devices();
 	if (rc)
 		goto out_find;
-	zpci_bus_scan_busses();
 
 	s390_pci_initialized = 1;
 	return 0;
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 6f55a59a08711..f7430086e9739 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -164,8 +164,10 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
 	zdev->port = response->port;
 	zdev->uid = response->uid;
 	zdev->fmb_length = sizeof(u32) * response->fmb_len;
-	zdev->rid_available = response->rid_avail;
 	zdev->is_physfn = response->is_physfn;
+	zdev->rid_available = response->rid_avail;
+	if (zdev->rid_available)
+		zdev->rid = response->rid;
 	if (!s390_pci_no_rid && zdev->rid_available)
 		zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN;
 
@@ -407,6 +409,7 @@ static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid,
 
 static void __clp_add(struct clp_fh_list_entry *entry, void *data)
 {
+	struct list_head *scan_list = data;
 	struct zpci_dev *zdev;
 
 	if (!entry->vendor_id)
@@ -417,10 +420,11 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data)
 		zpci_zdev_put(zdev);
 		return;
 	}
-	zpci_create_device(entry->fid, entry->fh, entry->config_state);
+	zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state);
+	list_add_tail(&zdev->entry, scan_list);
 }
 
-int clp_scan_pci_devices(void)
+int clp_scan_pci_devices(struct list_head *scan_list)
 {
 	struct clp_req_rsp_list_pci *rrb;
 	int rc;
@@ -429,7 +433,7 @@ int clp_scan_pci_devices(void)
 	if (!rrb)
 		return -ENOMEM;
 
-	rc = clp_list_pci(rrb, NULL, __clp_add);
+	rc = clp_list_pci(rrb, scan_list, __clp_add);
 
 	clp_free_block(rrb);
 	return rc;
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index d4f19d33914cb..47f934f4e828e 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -340,6 +340,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 			zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED);
 			if (IS_ERR(zdev))
 				break;
+			zpci_add_device(zdev);
 		} else {
 			/* the configuration request may be stale */
 			if (zdev->state != ZPCI_FN_STATE_STANDBY)
@@ -349,10 +350,14 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 		zpci_scan_configured_device(zdev, ccdf->fh);
 		break;
 	case 0x0302: /* Reserved -> Standby */
-		if (!zdev)
-			zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY);
-		else
+		if (!zdev) {
+			zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY);
+			if (IS_ERR(zdev))
+				break;
+			zpci_add_device(zdev);
+		} else {
 			zpci_update_fh(zdev, ccdf->fh);
+		}
 		break;
 	case 0x0303: /* Deconfiguration requested */
 		if (zdev) {
@@ -381,7 +386,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 		break;
 	case 0x0306: /* 0x308 or 0x302 for multiple devices */
 		zpci_remove_reserved_devices();
-		clp_scan_pci_devices();
+		zpci_scan_devices();
 		break;
 	case 0x0308: /* Standby -> Reserved */
 		if (!zdev)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 02/19] s390/pci: Use topology ID for multi-function devices
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 03/19] s390/pci: Ignore RID for isolated VFs Sasha Levin
                   ` (16 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Niklas Schnelle, Gerd Bayer, Heiko Carstens, Sasha Levin, gor,
	agordeev, gerald.schaefer, jgg, mjrosato, lukas, wintera,
	linux-s390

From: Niklas Schnelle <schnelle@linux.ibm.com>

[ Upstream commit 126034faaac5f356822c4a9bebfa75664da11056 ]

The newly introduced topology ID (TID) field in the CLP Query PCI
Function explicitly identifies groups of PCI functions whose RIDs belong
to the same (sub-)topology. When available use the TID instead of the
PCHID to match zPCI busses/domains for multi-function devices. Note that
currently only a single PCI bus per TID is supported. This change is
required because in future machines the PCHID will not identify a PCI
card but a specific port in the case of some multi-port NICs while from
a PCI point of view the entire card is a subtopology.

Reviewed-by: Gerd Bayer <gbayer@linux.ibm.com>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/s390/include/asm/pci.h     |  9 ++++++---
 arch/s390/include/asm/pci_clp.h |  8 +++++---
 arch/s390/pci/pci_bus.c         | 17 ++++++++++-------
 arch/s390/pci/pci_clp.c         |  3 +++
 4 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 45e87c7c122a6..e107eda7af86f 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -107,9 +107,10 @@ struct zpci_bus {
 	struct list_head	resources;
 	struct list_head	bus_next;
 	struct resource		bus_resource;
-	int			pchid;
+	int			topo;		/* TID if topo_is_tid, PCHID otherwise */
 	int			domain_nr;
-	bool			multifunction;
+	u8			multifunction	: 1;
+	u8			topo_is_tid	: 1;
 	enum pci_bus_speed	max_bus_speed;
 };
 
@@ -131,6 +132,7 @@ struct zpci_dev {
 	u16		pchid;		/* physical channel ID */
 	u16		maxstbl;	/* Maximum store block size */
 	u16		rid;		/* RID as supplied by firmware */
+	u16		tid;		/* Topology for which RID is valid */
 	u8		pfgid;		/* function group ID */
 	u8		pft;		/* pci function type */
 	u8		port;
@@ -141,7 +143,8 @@ struct zpci_dev {
 	u8		is_physfn	: 1;
 	u8		util_str_avail	: 1;
 	u8		irqs_registered	: 1;
-	u8		reserved	: 2;
+	u8		tid_avail	: 1;
+	u8		reserved	: 1;
 	unsigned int	devfn;		/* DEVFN part of the RID*/
 
 	u8 pfip[CLP_PFIP_NR_SEGMENTS];	/* pci function internal path */
diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h
index f0c677ddd2706..14afb9ce91f3b 100644
--- a/arch/s390/include/asm/pci_clp.h
+++ b/arch/s390/include/asm/pci_clp.h
@@ -110,7 +110,8 @@ struct clp_req_query_pci {
 struct clp_rsp_query_pci {
 	struct clp_rsp_hdr hdr;
 	u16 vfn;			/* virtual fn number */
-	u16			:  3;
+	u16			:  2;
+	u16 tid_avail		:  1;
 	u16 rid_avail		:  1;
 	u16 is_physfn		:  1;
 	u16 reserved1		:  1;
@@ -130,8 +131,9 @@ struct clp_rsp_query_pci {
 	u64 edma;			/* end dma as */
 #define ZPCI_RID_MASK_DEVFN 0x00ff
 	u16 rid;			/* BUS/DEVFN PCI address */
-	u16 reserved0;
-	u32 reserved[10];
+	u32 reserved0;
+	u16 tid;
+	u32 reserved[9];
 	u32 uid;			/* user defined id */
 	u8 util_str[CLP_UTIL_STR_LEN];	/* utility string */
 	u32 reserved2[16];
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index daa5d7450c7d3..54879e773e4a3 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -232,13 +232,13 @@ static void zpci_bus_put(struct zpci_bus *zbus)
 	kref_put(&zbus->kref, zpci_bus_release);
 }
 
-static struct zpci_bus *zpci_bus_get(int pchid)
+static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
 {
 	struct zpci_bus *zbus;
 
 	mutex_lock(&zbus_list_lock);
 	list_for_each_entry(zbus, &zbus_list, bus_next) {
-		if (pchid == zbus->pchid) {
+		if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) {
 			kref_get(&zbus->kref);
 			goto out_unlock;
 		}
@@ -249,7 +249,7 @@ static struct zpci_bus *zpci_bus_get(int pchid)
 	return zbus;
 }
 
-static struct zpci_bus *zpci_bus_alloc(int pchid)
+static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
 {
 	struct zpci_bus *zbus;
 
@@ -257,7 +257,8 @@ static struct zpci_bus *zpci_bus_alloc(int pchid)
 	if (!zbus)
 		return NULL;
 
-	zbus->pchid = pchid;
+	zbus->topo = topo;
+	zbus->topo_is_tid = topo_is_tid;
 	INIT_LIST_HEAD(&zbus->bus_next);
 	mutex_lock(&zbus_list_lock);
 	list_add_tail(&zbus->bus_next, &zbus_list);
@@ -321,8 +322,9 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev)
 
 int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops)
 {
+	bool topo_is_tid = zdev->tid_avail;
 	struct zpci_bus *zbus = NULL;
-	int rc = -EBADF;
+	int topo, rc = -EBADF;
 
 	if (zpci_nb_devices == ZPCI_NR_DEVICES) {
 		pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n",
@@ -333,11 +335,12 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops)
 	if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS)
 		return -EINVAL;
 
+	topo = topo_is_tid ? zdev->tid : zdev->pchid;
 	if (!s390_pci_no_rid && zdev->rid_available)
-		zbus = zpci_bus_get(zdev->pchid);
+		zbus = zpci_bus_get(topo, topo_is_tid);
 
 	if (!zbus) {
-		zbus = zpci_bus_alloc(zdev->pchid);
+		zbus = zpci_bus_alloc(topo, topo_is_tid);
 		if (!zbus)
 			return -ENOMEM;
 	}
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index f7430086e9739..e222036874e51 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -170,6 +170,9 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
 		zdev->rid = response->rid;
 	if (!s390_pci_no_rid && zdev->rid_available)
 		zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN;
+	zdev->tid_avail = response->tid_avail;
+	if (zdev->tid_avail)
+		zdev->tid = response->tid;
 
 	memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip));
 	if (response->util_str_avail) {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 03/19] s390/pci: Ignore RID for isolated VFs
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 02/19] s390/pci: Use topology ID for multi-function devices Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 04/19] epoll: annotate racy check Sasha Levin
                   ` (15 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Niklas Schnelle, Gerd Bayer, Heiko Carstens, Sasha Levin,
	gerald.schaefer, gor, agordeev, linux-s390

From: Niklas Schnelle <schnelle@linux.ibm.com>

[ Upstream commit 25f39d3dcb48bbc824a77d16b3d977f0f3713cfe ]

Ensure that VFs used in isolation, that is with their parent PF not
visible to the configuration but with their RID exposed, are treated
compatibly with existing isolated VF use cases without exposed RID
including RoCE Express VFs. This allows creating configurations where
one LPAR manages PFs while their child VFs are used by other LPARs. This
gives the LPAR managing the PFs a role analogous to that of the
hypervisor in a typical use case of passing child VFs to guests.

Instead of creating a multifunction struct zpci_bus whenever a PCI
function with RID exposed is discovered only create such a bus for
configured physical functions and only consider multifunction busses
when searching for an existing bus. Additionally only set zdev->devfn to
the devfn part of the RID once the function is added to a multifunction
bus.

This also fixes probing of more than 7 such isolated VFs from the same
physical bus. This is because common PCI code in pci_scan_slot() only
looks for more functions when pdev->multifunction is set, which, somewhat
counterintuitively, is not the case for VFs.

Note that PFs are guaranteed to be looked at before their child VFs because
we sort the zpci_list by RID ascending.

Reviewed-by: Gerd Bayer <gbayer@linux.ibm.com>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/s390/pci/pci_bus.c | 33 ++++++++++++++++++++-------------
 arch/s390/pci/pci_clp.c |  2 --
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index 54879e773e4a3..1b74a000ff645 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -168,9 +168,16 @@ void zpci_bus_scan_busses(void)
 	mutex_unlock(&zbus_list_lock);
 }
 
+static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev)
+{
+	return !s390_pci_no_rid && zdev->rid_available &&
+		zpci_is_device_configured(zdev) &&
+		!zdev->vfn;
+}
+
 /* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus
  * @zbus: the zbus holding the zdevices
- * @fr: PCI root function that will determine the bus's domain, and bus speeed
+ * @fr: PCI root function that will determine the bus's domain, and bus speed
  * @ops: the pci operations
  *
  * The PCI function @fr determines the domain (its UID), multifunction property
@@ -188,7 +195,7 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s
 		return domain;
 
 	zbus->domain_nr = domain;
-	zbus->multifunction = fr->rid_available;
+	zbus->multifunction = zpci_bus_is_multifunction_root(fr);
 	zbus->max_bus_speed = fr->max_bus_speed;
 
 	/*
@@ -238,6 +245,8 @@ static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
 
 	mutex_lock(&zbus_list_lock);
 	list_for_each_entry(zbus, &zbus_list, bus_next) {
+		if (!zbus->multifunction)
+			continue;
 		if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) {
 			kref_get(&zbus->kref);
 			goto out_unlock;
@@ -293,19 +302,22 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev)
 {
 	int rc = -EINVAL;
 
+	if (zbus->multifunction) {
+		if (!zdev->rid_available) {
+			WARN_ONCE(1, "rid_available not set for multifunction\n");
+			return rc;
+		}
+		zdev->devfn = zdev->rid & ZPCI_RID_MASK_DEVFN;
+	}
+
 	if (zbus->function[zdev->devfn]) {
 		pr_err("devfn %04x is already assigned\n", zdev->devfn);
 		return rc;
 	}
-
 	zdev->zbus = zbus;
 	zbus->function[zdev->devfn] = zdev;
 	zpci_nb_devices++;
 
-	if (zbus->multifunction && !zdev->rid_available) {
-		WARN_ONCE(1, "rid_available not set for multifunction\n");
-		goto error;
-	}
 	rc = zpci_init_slot(zdev);
 	if (rc)
 		goto error;
@@ -332,13 +344,8 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops)
 		return -ENOSPC;
 	}
 
-	if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS)
-		return -EINVAL;
-
 	topo = topo_is_tid ? zdev->tid : zdev->pchid;
-	if (!s390_pci_no_rid && zdev->rid_available)
-		zbus = zpci_bus_get(topo, topo_is_tid);
-
+	zbus = zpci_bus_get(topo, topo_is_tid);
 	if (!zbus) {
 		zbus = zpci_bus_alloc(topo, topo_is_tid);
 		if (!zbus)
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index e222036874e51..74dac6da03d5b 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -168,8 +168,6 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
 	zdev->rid_available = response->rid_avail;
 	if (zdev->rid_available)
 		zdev->rid = response->rid;
-	if (!s390_pci_no_rid && zdev->rid_available)
-		zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN;
 	zdev->tid_avail = response->tid_avail;
 	if (zdev->tid_avail)
 		zdev->tid = response->tid;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 04/19] epoll: annotate racy check
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 02/19] s390/pci: Use topology ID for multi-function devices Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 03/19] s390/pci: Ignore RID for isolated VFs Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 05/19] kselftest/arm64: Log fp-stress child startup errors to stdout Sasha Levin
                   ` (14 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Christian Brauner, Jan Kara, syzbot+3b6b32dc50537a49bb4a,
	Sasha Levin, viro, linux-fsdevel

From: Christian Brauner <brauner@kernel.org>

[ Upstream commit 6474353a5e3d0b2cf610153cea0c61f576a36d0a ]

Epoll relies on a racy fastpath check during __fput() in
eventpoll_release() to avoid the hit of pointlessly acquiring a
semaphore. Annotate that race by using WRITE_ONCE() and READ_ONCE().

Link: https://lore.kernel.org/r/66edfb3c.050a0220.3195df.001a.GAE@google.com
Link: https://lore.kernel.org/r/20240925-fungieren-anbauen-79b334b00542@brauner
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: syzbot+3b6b32dc50537a49bb4a@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/eventpoll.c            | 6 ++++--
 include/linux/eventpoll.h | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542f0bd88..90fbab6b6f036 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 	to_free = NULL;
 	head = file->f_ep;
 	if (head->first == &epi->fllink && !epi->fllink.next) {
-		file->f_ep = NULL;
+		/* See eventpoll_release() for details. */
+		WRITE_ONCE(file->f_ep, NULL);
 		if (!is_file_epoll(file)) {
 			struct epitems_head *v;
 			v = container_of(head, struct epitems_head, epitems);
@@ -1603,7 +1604,8 @@ static int attach_epitem(struct file *file, struct epitem *epi)
 			spin_unlock(&file->f_lock);
 			goto allocate;
 		}
-		file->f_ep = head;
+		/* See eventpoll_release() for details. */
+		WRITE_ONCE(file->f_ep, head);
 		to_free = NULL;
 	}
 	hlist_add_head_rcu(&epi->fllink, file->f_ep);
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 3337745d81bd6..0c0d00fcd131f 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -42,7 +42,7 @@ static inline void eventpoll_release(struct file *file)
 	 * because the file in on the way to be removed and nobody ( but
 	 * eventpoll ) has still a reference to this file.
 	 */
-	if (likely(!file->f_ep))
+	if (likely(!READ_ONCE(file->f_ep)))
 		return;
 
 	/*
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 05/19] kselftest/arm64: Log fp-stress child startup errors to stdout
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (2 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 04/19] epoll: annotate racy check Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 06/19] s390/cpum_sf: Handle CPU hotplug remove during sampling Sasha Levin
                   ` (13 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Mark Brown, Catalin Marinas, Sasha Levin, will, shuah,
	mark.rutland, linux-arm-kernel, linux-kselftest

From: Mark Brown <broonie@kernel.org>

[ Upstream commit dca93d29845dfed60910ba13dbfb6ae6a0e19f6d ]

Currently if we encounter an error between fork() and exec() of a child
process we log the error to stderr. This means that the errors don't get
annotated with the child information which makes diagnostics harder and
means that if we miss the exit signal from the child we can deadlock
waiting for output from the child. Improve robustness and output quality
by logging to stdout instead.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241023-arm64-fp-stress-exec-fail-v1-1-ee3c62932c15@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 tools/testing/selftests/arm64/fp/fp-stress.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index faac24bdefeb9..80f22789504d6 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -79,7 +79,7 @@ static void child_start(struct child_data *child, const char *program)
 		 */
 		ret = dup2(pipefd[1], 1);
 		if (ret == -1) {
-			fprintf(stderr, "dup2() %d\n", errno);
+			printf("dup2() %d\n", errno);
 			exit(EXIT_FAILURE);
 		}
 
@@ -89,7 +89,7 @@ static void child_start(struct child_data *child, const char *program)
 		 */
 		ret = dup2(startup_pipe[0], 3);
 		if (ret == -1) {
-			fprintf(stderr, "dup2() %d\n", errno);
+			printf("dup2() %d\n", errno);
 			exit(EXIT_FAILURE);
 		}
 
@@ -107,16 +107,15 @@ static void child_start(struct child_data *child, const char *program)
 		 */
 		ret = read(3, &i, sizeof(i));
 		if (ret < 0)
-			fprintf(stderr, "read(startp pipe) failed: %s (%d)\n",
-				strerror(errno), errno);
+			printf("read(startp pipe) failed: %s (%d)\n",
+			       strerror(errno), errno);
 		if (ret > 0)
-			fprintf(stderr, "%d bytes of data on startup pipe\n",
-				ret);
+			printf("%d bytes of data on startup pipe\n", ret);
 		close(3);
 
 		ret = execl(program, program, NULL);
-		fprintf(stderr, "execl(%s) failed: %d (%s)\n",
-			program, errno, strerror(errno));
+		printf("execl(%s) failed: %d (%s)\n",
+		       program, errno, strerror(errno));
 
 		exit(EXIT_FAILURE);
 	} else {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 06/19] s390/cpum_sf: Handle CPU hotplug remove during sampling
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (3 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 05/19] kselftest/arm64: Log fp-stress child startup errors to stdout Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 07/19] block: RCU protect disk->conv_zones_bitmap Sasha Levin
                   ` (12 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Thomas Richter, Hendrik Brueckner, Heiko Carstens, Sasha Levin,
	gor, agordeev, sumanthk, linux-s390

From: Thomas Richter <tmricht@linux.ibm.com>

[ Upstream commit a0bd7dacbd51c632b8e2c0500b479af564afadf3 ]

CPU hotplug remove handling triggers the following function
call sequence:

   CPUHP_AP_PERF_S390_SF_ONLINE  --> s390_pmu_sf_offline_cpu()
   ...
   CPUHP_AP_PERF_ONLINE          --> perf_event_exit_cpu()

The s390 CPUMF sampling CPU hotplug handler invokes:

 s390_pmu_sf_offline_cpu()
 +-->  cpusf_pmu_setup()
       +--> setup_pmc_cpu()
            +--> deallocate_buffers()

This function de-allocates all sampling data buffers (SDBs) allocated
for that CPU at event initialization. It also clears the
PMU_F_RESERVED bit. The CPU is gone and cannot be sampled.

With the event still being active on the removed CPU, the CPU event
hotplug support in kernel performance subsystem triggers the
following function calls on the removed CPU:

  perf_event_exit_cpu()
  +--> perf_event_exit_cpu_context()
       +--> __perf_event_exit_context()
	    +--> __perf_remove_from_context()
	         +--> event_sched_out()
	              +--> cpumsf_pmu_del()
	                   +--> cpumsf_pmu_stop()
                                +--> hw_perf_event_update()

to stop and remove the event. During removal of the event, the
sampling device driver tries to read out the remaining samples from
the sample data buffers (SDBs). But they have already been freed
(and may have been re-assigned). This may lead to a use-after-free
situation, in which case the samples are most likely invalid. In the
best case the memory has not been reassigned and still contains
valid data.

Remedy this situation and check if the CPU is still in reserved
state (bit PMU_F_RESERVED set). In this case the SDBs have not been
released and contain valid data. This is always the case when
the event is removed (and no CPU hotplug off occurred).
If the PMU_F_RESERVED bit is not set, the SDB buffers are gone.

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Hendrik Brueckner <brueckner@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/s390/kernel/perf_cpum_sf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 5b765e3ccf0ca..625833a1dccd6 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1780,7 +1780,9 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags)
 	event->hw.state |= PERF_HES_STOPPED;
 
 	if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
-		hw_perf_event_update(event, 1);
+		/* CPU hotplug off removes SDBs. No samples to extract. */
+		if (cpuhw->flags & PMU_F_RESERVED)
+			hw_perf_event_update(event, 1);
 		event->hw.state |= PERF_HES_UPTODATE;
 	}
 	perf_pmu_enable(event->pmu);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 07/19] block: RCU protect disk->conv_zones_bitmap
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (4 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 06/19] s390/cpum_sf: Handle CPU hotplug remove during sampling Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 08/19] btrfs: don't take dev_replace rwsem on task already holding it Sasha Levin
                   ` (11 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Damien Le Moal, Christoph Hellwig, Johannes Thumshirn, Jens Axboe,
	Sasha Levin, linux-block

From: Damien Le Moal <dlemoal@kernel.org>

[ Upstream commit d7cb6d7414ea1b33536fa6d11805cb8dceec1f97 ]

Ensure that a disk revalidation changing the conventional zones bitmap
of a disk does not cause invalid memory references when using the
disk_zone_is_conv() helper by RCU protecting the disk->conv_zones_bitmap
pointer.

disk_zone_is_conv() is modified to operate under the RCU read lock and
the function disk_set_conv_zones_bitmap() is added to update a disk
conv_zones_bitmap pointer using rcu_replace_pointer() with the disk
zone_wplugs_lock spinlock held.

disk_free_zone_resources() is modified to call
disk_set_conv_zones_bitmap() with a NULL bitmap pointer to free the disk
conv_zones_bitmap. disk_set_conv_zones_bitmap() is also used in
disk_update_zone_resources() to set the new (revalidated) bitmap and
free the old one.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20241107064300.227731-2-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 block/blk-zoned.c      | 43 ++++++++++++++++++++++++++++++------------
 include/linux/blkdev.h |  2 +-
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index af19296fa50df..74e39545562d5 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -350,9 +350,15 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 
 static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
 {
-	if (!disk->conv_zones_bitmap)
-		return false;
-	return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
+	unsigned long *bitmap;
+	bool is_conv;
+
+	rcu_read_lock();
+	bitmap = rcu_dereference(disk->conv_zones_bitmap);
+	is_conv = bitmap && test_bit(disk_zone_no(disk, sector), bitmap);
+	rcu_read_unlock();
+
+	return is_conv;
 }
 
 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
@@ -1455,6 +1461,24 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
 	disk->zone_wplugs_hash_bits = 0;
 }
 
+static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
+					       unsigned long *bitmap)
+{
+	unsigned int nr_conv_zones = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	if (bitmap)
+		nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
+	bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
+				     lockdep_is_held(&disk->zone_wplugs_lock));
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+	kfree_rcu_mightsleep(bitmap);
+
+	return nr_conv_zones;
+}
+
 void disk_free_zone_resources(struct gendisk *disk)
 {
 	if (!disk->zone_wplugs_pool)
@@ -1478,8 +1502,7 @@ void disk_free_zone_resources(struct gendisk *disk)
 	mempool_destroy(disk->zone_wplugs_pool);
 	disk->zone_wplugs_pool = NULL;
 
-	bitmap_free(disk->conv_zones_bitmap);
-	disk->conv_zones_bitmap = NULL;
+	disk_set_conv_zones_bitmap(disk, NULL);
 	disk->zone_capacity = 0;
 	disk->last_zone_capacity = 0;
 	disk->nr_zones = 0;
@@ -1538,17 +1561,15 @@ static int disk_update_zone_resources(struct gendisk *disk,
 				      struct blk_revalidate_zone_args *args)
 {
 	struct request_queue *q = disk->queue;
-	unsigned int nr_seq_zones, nr_conv_zones = 0;
+	unsigned int nr_seq_zones, nr_conv_zones;
 	unsigned int pool_size;
 	struct queue_limits lim;
 
 	disk->nr_zones = args->nr_zones;
 	disk->zone_capacity = args->zone_capacity;
 	disk->last_zone_capacity = args->last_zone_capacity;
-	swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
-	if (disk->conv_zones_bitmap)
-		nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
-					      disk->nr_zones);
+	nr_conv_zones =
+		disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
 	if (nr_conv_zones >= disk->nr_zones) {
 		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
 			disk->disk_name, nr_conv_zones, disk->nr_zones);
@@ -1823,8 +1844,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 		disk_free_zone_resources(disk);
 	blk_mq_unfreeze_queue(q);
 
-	kfree(args.conv_zones_bitmap);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50c3b959da281..3027ce2de7319 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -194,7 +194,7 @@ struct gendisk {
 	unsigned int		nr_zones;
 	unsigned int		zone_capacity;
 	unsigned int		last_zone_capacity;
-	unsigned long		*conv_zones_bitmap;
+	unsigned long __rcu	*conv_zones_bitmap;
 	unsigned int            zone_wplugs_hash_bits;
 	spinlock_t              zone_wplugs_lock;
 	struct mempool_s	*zone_wplugs_pool;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 08/19] btrfs: don't take dev_replace rwsem on task already holding it
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (5 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 07/19] block: RCU protect disk->conv_zones_bitmap Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 09/19] btrfs: zlib: make the compression path to handle sector size < page size Sasha Levin
                   ` (10 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Johannes Thumshirn, Filipe Manana, David Sterba, Sasha Levin, clm,
	josef, linux-btrfs

From: Johannes Thumshirn <johannes.thumshirn@wdc.com>

[ Upstream commit 8cca35cb29f81eba3e96ec44dad8696c8a2f9138 ]

Running fstests btrfs/011 with MKFS_OPTIONS="-O rst" to force the usage of
the RAID stripe-tree, we get the following splat from lockdep:

 BTRFS info (device sdd): dev_replace from /dev/sdd (devid 1) to /dev/sdb started

 ============================================
 WARNING: possible recursive locking detected
 6.11.0-rc3-btrfs-for-next #599 Not tainted
 --------------------------------------------
 btrfs/2326 is trying to acquire lock:
 ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250

 but task is already holding lock:
 ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&fs_info->dev_replace.rwsem);
   lock(&fs_info->dev_replace.rwsem);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 1 lock held by btrfs/2326:
  #0: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250

 stack backtrace:
 CPU: 1 UID: 0 PID: 2326 Comm: btrfs Not tainted 6.11.0-rc3-btrfs-for-next #599
 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
 Call Trace:
  <TASK>
  dump_stack_lvl+0x5b/0x80
  __lock_acquire+0x2798/0x69d0
  ? __pfx___lock_acquire+0x10/0x10
  ? __pfx___lock_acquire+0x10/0x10
  lock_acquire+0x19d/0x4a0
  ? btrfs_map_block+0x39f/0x2250
  ? __pfx_lock_acquire+0x10/0x10
  ? find_held_lock+0x2d/0x110
  ? lock_is_held_type+0x8f/0x100
  down_read+0x8e/0x440
  ? btrfs_map_block+0x39f/0x2250
  ? __pfx_down_read+0x10/0x10
  ? do_raw_read_unlock+0x44/0x70
  ? _raw_read_unlock+0x23/0x40
  btrfs_map_block+0x39f/0x2250
  ? btrfs_dev_replace_by_ioctl+0xd69/0x1d00
  ? btrfs_bio_counter_inc_blocked+0xd9/0x2e0
  ? __kasan_slab_alloc+0x6e/0x70
  ? __pfx_btrfs_map_block+0x10/0x10
  ? __pfx_btrfs_bio_counter_inc_blocked+0x10/0x10
  ? kmem_cache_alloc_noprof+0x1f2/0x300
  ? mempool_alloc_noprof+0xed/0x2b0
  btrfs_submit_chunk+0x28d/0x17e0
  ? __pfx_btrfs_submit_chunk+0x10/0x10
  ? bvec_alloc+0xd7/0x1b0
  ? bio_add_folio+0x171/0x270
  ? __pfx_bio_add_folio+0x10/0x10
  ? __kasan_check_read+0x20/0x20
  btrfs_submit_bio+0x37/0x80
  read_extent_buffer_pages+0x3df/0x6c0
  btrfs_read_extent_buffer+0x13e/0x5f0
  read_tree_block+0x81/0xe0
  read_block_for_search+0x4bd/0x7a0
  ? __pfx_read_block_for_search+0x10/0x10
  btrfs_search_slot+0x78d/0x2720
  ? __pfx_btrfs_search_slot+0x10/0x10
  ? lock_is_held_type+0x8f/0x100
  ? kasan_save_track+0x14/0x30
  ? __kasan_slab_alloc+0x6e/0x70
  ? kmem_cache_alloc_noprof+0x1f2/0x300
  btrfs_get_raid_extent_offset+0x181/0x820
  ? __pfx_lock_acquire+0x10/0x10
  ? __pfx_btrfs_get_raid_extent_offset+0x10/0x10
  ? down_read+0x194/0x440
  ? __pfx_down_read+0x10/0x10
  ? do_raw_read_unlock+0x44/0x70
  ? _raw_read_unlock+0x23/0x40
  btrfs_map_block+0x5b5/0x2250
  ? __pfx_btrfs_map_block+0x10/0x10
  scrub_submit_initial_read+0x8fe/0x11b0
  ? __pfx_scrub_submit_initial_read+0x10/0x10
  submit_initial_group_read+0x161/0x3a0
  ? lock_release+0x20e/0x710
  ? __pfx_submit_initial_group_read+0x10/0x10
  ? __pfx_lock_release+0x10/0x10
  scrub_simple_mirror.isra.0+0x3eb/0x580
  scrub_stripe+0xe4d/0x1440
  ? lock_release+0x20e/0x710
  ? __pfx_scrub_stripe+0x10/0x10
  ? __pfx_lock_release+0x10/0x10
  ? do_raw_read_unlock+0x44/0x70
  ? _raw_read_unlock+0x23/0x40
  scrub_chunk+0x257/0x4a0
  scrub_enumerate_chunks+0x64c/0xf70
  ? __mutex_unlock_slowpath+0x147/0x5f0
  ? __pfx_scrub_enumerate_chunks+0x10/0x10
  ? bit_wait_timeout+0xb0/0x170
  ? __up_read+0x189/0x700
  ? scrub_workers_get+0x231/0x300
  ? up_write+0x490/0x4f0
  btrfs_scrub_dev+0x52e/0xcd0
  ? create_pending_snapshots+0x230/0x250
  ? __pfx_btrfs_scrub_dev+0x10/0x10
  btrfs_dev_replace_by_ioctl+0xd69/0x1d00
  ? lock_acquire+0x19d/0x4a0
  ? __pfx_btrfs_dev_replace_by_ioctl+0x10/0x10
  ? lock_release+0x20e/0x710
  ? btrfs_ioctl+0xa09/0x74f0
  ? __pfx_lock_release+0x10/0x10
  ? do_raw_spin_lock+0x11e/0x240
  ? __pfx_do_raw_spin_lock+0x10/0x10
  btrfs_ioctl+0xa14/0x74f0
  ? lock_acquire+0x19d/0x4a0
  ? find_held_lock+0x2d/0x110
  ? __pfx_btrfs_ioctl+0x10/0x10
  ? lock_release+0x20e/0x710
  ? do_sigaction+0x3f0/0x860
  ? __pfx_do_vfs_ioctl+0x10/0x10
  ? do_raw_spin_lock+0x11e/0x240
  ? lockdep_hardirqs_on_prepare+0x270/0x3e0
  ? _raw_spin_unlock_irq+0x28/0x50
  ? do_sigaction+0x3f0/0x860
  ? __pfx_do_sigaction+0x10/0x10
  ? __x64_sys_rt_sigaction+0x18e/0x1e0
  ? __pfx___x64_sys_rt_sigaction+0x10/0x10
  ? __x64_sys_close+0x7c/0xd0
  __x64_sys_ioctl+0x137/0x190
  do_syscall_64+0x71/0x140
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
 RIP: 0033:0x7f0bd1114f9b
 Code: Unable to access opcode bytes at 0x7f0bd1114f71.
 RSP: 002b:00007ffc8a8c3130 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f0bd1114f9b
 RDX: 00007ffc8a8c35e0 RSI: 00000000ca289435 RDI: 0000000000000003
 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000007
 R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffc8a8c6c85
 R13: 00000000398e72a0 R14: 0000000000004361 R15: 0000000000000004
  </TASK>

This happens because on RAID stripe-tree filesystems we recurse back into
btrfs_map_block() on scrub to perform the logical to device physical
mapping.

But as the device replace task is already holding the dev_replace::rwsem
we deadlock.

So don't take the dev_replace::rwsem in case our task is the task performing
the device replace.

Suggested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/dev-replace.c | 2 ++
 fs/btrfs/fs.h          | 2 ++
 fs/btrfs/volumes.c     | 8 +++++---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 83d5cdd77f293..604399e59a3d1 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 		return ret;
 
 	down_write(&dev_replace->rwsem);
+	dev_replace->replace_task = current;
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -994,6 +995,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
 	fs_devices->rw_devices++;
 
+	dev_replace->replace_task = NULL;
 	up_write(&dev_replace->rwsem);
 	btrfs_rm_dev_replace_blocked(fs_info);
 
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 79f64e383eddf..cbfb225858a59 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -317,6 +317,8 @@ struct btrfs_dev_replace {
 
 	struct percpu_counter bio_counter;
 	wait_queue_head_t replace_wait;
+
+	struct task_struct *replace_task;
 };
 
 /*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eb51b609190fb..920df7585b0d1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6481,13 +6481,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	max_len = btrfs_max_io_len(map, map_offset, &io_geom);
 	*length = min_t(u64, map->chunk_len - map_offset, max_len);
 
-	down_read(&dev_replace->rwsem);
+	if (dev_replace->replace_task != current)
+		down_read(&dev_replace->rwsem);
+
 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
 	/*
 	 * Hold the semaphore for read during the whole operation, write is
 	 * requested at commit time but must wait.
 	 */
-	if (!dev_replace_is_ongoing)
+	if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
 		up_read(&dev_replace->rwsem);
 
 	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
@@ -6627,7 +6629,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	bioc->mirror_num = io_geom.mirror_num;
 
 out:
-	if (dev_replace_is_ongoing) {
+	if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
 		lockdep_assert_held(&dev_replace->rwsem);
 		/* Unlock and let waiting writers proceed */
 		up_read(&dev_replace->rwsem);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 09/19] btrfs: zlib: make the compression path to handle sector size < page size
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (6 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 08/19] btrfs: don't take dev_replace rwsem on task already holding it Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-25 15:20   ` David Sterba
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 10/19] btrfs: make extent_range_clear_dirty_for_io() to handle sector size < page size cases Sasha Levin
                   ` (9 subsequent siblings)
  17 siblings, 1 reply; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Qu Wenruo, David Sterba, Sasha Levin, clm, josef, linux-btrfs

From: Qu Wenruo <wqu@suse.com>

[ Upstream commit f6ebedb09bb276256e084196e2322562dc4aac10 ]

Inside zlib_compress_folios(), each time we switch the input page cache,
the @start is increased by PAGE_SIZE.

But for the incoming compression support for sector size < page size
(previously we support compression only when the range is fully page
aligned), this is not going to handle the following case:

    0          32K         64K          96K
    |          |///////////||///////////|

@start has the initial value 32K, indicating the start filepos of the
to-be-compressed range.

And when grabbing the first page as input, we always call "start +=
PAGE_SIZE;".

But since @start is starting at 32K, it will be increased by 64K,
resulting in it being 96K for the next range, causing an incorrect input
range and corruption for the future subpage compression.

Fix it by only increasing @start by the input size.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/zlib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 100abc00b794c..ddf0d5a448a74 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
 				pg_off = offset_in_page(start);
 				cur_len = btrfs_calc_input_length(orig_end, start);
 				data_in = kmap_local_folio(in_folio, pg_off);
-				start += PAGE_SIZE;
+				start += cur_len;
 				workspace->strm.next_in = data_in;
 				workspace->strm.avail_in = cur_len;
 			}
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 10/19] btrfs: make extent_range_clear_dirty_for_io() to handle sector size < page size cases
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (7 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 09/19] btrfs: zlib: make the compression path to handle sector size < page size Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-25 15:21   ` David Sterba
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 11/19] btrfs: avoid unnecessary device path update for the same device Sasha Levin
                   ` (8 subsequent siblings)
  17 siblings, 1 reply; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Qu Wenruo, David Sterba, Sasha Levin, clm, josef, linux-btrfs

From: Qu Wenruo <wqu@suse.com>

[ Upstream commit a4ef54dbb576032ba31a646a5ffc8a26a83cb92c ]

For btrfs with sector size < page size (e.g. 4K sector size, 64K page
size) and the sector perfect compression support enabled, the
following dirty range can lead to problems:

   0     32K     64K     96K    128K
   |     |///////||//////|    |/|
                              124K

In above case, if we start writeback for that inode, the last dirty
range [124K, 128K) will not be submitted and cause reserved space
leakage:

- Start writeback for page 0
  We find the range [32K, 96K) is suitable for compression, and queue it
  into a workqueue to do the delayed compression and submission.

- Compression happens for range [32K, 96K)
  Function extent_range_clear_dirty_for_io() is called, however it is
  only doing full page handling, not considering any of the extra bitmaps
  for subpage cases.

  That function will clear page dirty for both page 0 and page 64K.

- Writeback for the inode is done
  Because page 64K has its dirty flag cleared, it will not be considered
  as a writeback target.

This means the range [124K, 128K) will not be submitted, and reserved
space for it will be leaked.

Fix this problem by using the subpage helper to clear the dirty flag.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e4ca1e7d2e58..686d39309410f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -902,7 +902,8 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e
 				ret = PTR_ERR(folio);
 			continue;
 		}
-		folio_clear_dirty_for_io(folio);
+		btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
+					      end + 1 - start);
 		folio_put(folio);
 	}
 	return ret;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 11/19] btrfs: avoid unnecessary device path update for the same device
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (8 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 10/19] btrfs: make extent_range_clear_dirty_for_io() to handle sector size < page size cases Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 12/19] btrfs: canonicalize the device path before adding it Sasha Levin
                   ` (7 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Qu Wenruo, Filipe Manana, Fabian Vogt, David Sterba, Sasha Levin,
	clm, josef, linux-btrfs

From: Qu Wenruo <wqu@suse.com>

[ Upstream commit 2e8b6bc0ab41ce41e6dfcc204b6cc01d5abbc952 ]

[PROBLEM]
It is very common for udev to trigger a device scan, and every time a
mounted btrfs device gets re-scanned through different soft links, we will
get some unnecessary device path updates. This is especially common
for LVM based storage:

 # lvs
  scratch1 test -wi-ao---- 10.00g
  scratch2 test -wi-a----- 10.00g
  scratch3 test -wi-a----- 10.00g
  scratch4 test -wi-a----- 10.00g
  scratch5 test -wi-a----- 10.00g
  test     test -wi-a----- 10.00g

 # mkfs.btrfs -f /dev/test/scratch1
 # mount /dev/test/scratch1 /mnt/btrfs
 # dmesg -c
 [  205.705234] BTRFS: device fsid 7be2602f-9e35-4ecf-a6ff-9e91d2c182c9 devid 1 transid 6 /dev/mapper/test-scratch1 (253:4) scanned by mount (1154)
 [  205.710864] BTRFS info (device dm-4): first mount of filesystem 7be2602f-9e35-4ecf-a6ff-9e91d2c182c9
 [  205.711923] BTRFS info (device dm-4): using crc32c (crc32c-intel) checksum algorithm
 [  205.713856] BTRFS info (device dm-4): using free-space-tree
 [  205.722324] BTRFS info (device dm-4): checking UUID tree

So far so good, but even if we just touched any soft link of
"dm-4", we will get quite some unnecessary device path updates.

 # touch /dev/mapper/test-scratch1
 # dmesg -c
 [  469.295796] BTRFS info: devid 1 device path /dev/mapper/test-scratch1 changed to /dev/dm-4 scanned by (udev-worker) (1221)
 [  469.300494] BTRFS info: devid 1 device path /dev/dm-4 changed to /dev/mapper/test-scratch1 scanned by (udev-worker) (1221)

Such device path rename is unnecessary and can lead to random path
change due to the udev race.

[CAUSE]
Inside device_list_add(), we are using a very primitive way of checking
if the device has changed: strcmp().

It can never handle links well, no matter if they are hard or soft links.

So every different link of the same device will be treated as a different
device, causing the unnecessary device path update.

[FIX]
Introduce a helper, is_same_device(), and use path_equal() to properly
detect the same block device.
So that the different soft links won't trigger the rename race.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Link: https://bugzilla.suse.com/show_bug.cgi?id=1230641
Reported-by: Fabian Vogt <fvogt@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/volumes.c | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 920df7585b0d1..5e75a4e3a5be5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -732,6 +732,42 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
 	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
 }
 
+static bool is_same_device(struct btrfs_device *device, const char *new_path)
+{
+	struct path old = { .mnt = NULL, .dentry = NULL };
+	struct path new = { .mnt = NULL, .dentry = NULL };
+	char *old_path = NULL;
+	bool is_same = false;
+	int ret;
+
+	if (!device->name)
+		goto out;
+
+	old_path = kzalloc(PATH_MAX, GFP_NOFS);
+	if (!old_path)
+		goto out;
+
+	rcu_read_lock();
+	ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
+	rcu_read_unlock();
+	if (ret < 0)
+		goto out;
+
+	ret = kern_path(old_path, LOOKUP_FOLLOW, &old);
+	if (ret)
+		goto out;
+	ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
+	if (ret)
+		goto out;
+	if (path_equal(&old, &new))
+		is_same = true;
+out:
+	kfree(old_path);
+	path_put(&old);
+	path_put(&new);
+	return is_same;
+}
+
 /*
  * Add new device to list of registered devices
  *
@@ -852,7 +888,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 				MAJOR(path_devt), MINOR(path_devt),
 				current->comm, task_pid_nr(current));
 
-	} else if (!device->name || strcmp(device->name->str, path)) {
+	} else if (!device->name || !is_same_device(device, path)) {
 		/*
 		 * When FS is already mounted.
 		 * 1. If you are here and if the device->name is NULL that
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 12/19] btrfs: canonicalize the device path before adding it
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (9 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 11/19] btrfs: avoid unnecessary device path update for the same device Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb cache miss for btree search Sasha Levin
                   ` (6 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Qu Wenruo, Filipe Manana, Fabian Vogt, David Sterba, Sasha Levin,
	clm, josef, linux-btrfs

From: Qu Wenruo <wqu@suse.com>

[ Upstream commit 7e06de7c83a746e58d4701e013182af133395188 ]

[PROBLEM]
Currently btrfs accepts any file path for its device, resulting in some
weird situations:

 # ./mount_by_fd /dev/test/scratch1  /mnt/btrfs/

The program has the following source code:

 #include <fcntl.h>
 #include <stdio.h>
 #include <sys/mount.h>

 int main(int argc, char *argv[]) {
	int fd = open(argv[1], O_RDWR);
	char path[256];
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	return mount(path, argv[2], "btrfs", 0, NULL);
 }

Then we can have the following weird device path:

 BTRFS: device fsid 2378be81-fe12-46d2-a9e8-68cf08dd98d5 devid 1 transid 7 /proc/self/fd/3 (253:2) scanned by mount_by_fd (18440)

Normally it's not a big deal, and later udev can trigger a device path
rename. But if udev didn't trigger, the device path "/proc/self/fd/3"
will show up in mtab.

[CAUSE]
For filename "/proc/self/fd/3", it means the opened file descriptor 3.
In above case, it's exactly the device we want to open, aka points to
"/dev/test/scratch1" which is another symlink pointing to "/dev/dm-2".

Inside the kernel we resolve the mount source using LOOKUP_FOLLOW, which
follows the symbolic link and grabs the proper block device.

But inside btrfs we also save the filename into btrfs_device::name, and
utilize that member to report our mount source, which leads to the above
situation.

[FIX]
Instead of unconditionally trusting the path, check if the original file
(not following the symbolic link) is inside "/dev/"; if not, then
manually look up the path to its final destination, and use that as our
device path.

This allows us to still use symbolic links, like
"/dev/mapper/test-scratch" from LVM2, which is required for fstests runs
with LVM2 setup.

And for really weird names, like the above case, we resolve it to
"/dev/dm-2" instead.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Link: https://bugzilla.suse.com/show_bug.cgi?id=1230641
Reported-by: Fabian Vogt <fvogt@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/volumes.c | 87 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e75a4e3a5be5..5895397364aac 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -732,6 +732,78 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
 	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
 }
 
+/*
+ * We can have very weird soft links passed in.
+ * One example is "/proc/self/fd/<fd>", which can be a soft link to
+ * a block device.
+ *
+ * But it's never a good idea to use those weird names.
+ * Here we check if the path (not following symlinks) is a good one inside
+ * "/dev/".
+ */
+static bool is_good_dev_path(const char *dev_path)
+{
+	struct path path = { .mnt = NULL, .dentry = NULL };
+	char *path_buf = NULL;
+	char *resolved_path;
+	bool is_good = false;
+	int ret;
+
+	if (!dev_path)
+		goto out;
+
+	path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!path_buf)
+		goto out;
+
+	/*
+	 * Do not follow soft link, just check if the original path is inside
+	 * "/dev/".
+	 */
+	ret = kern_path(dev_path, 0, &path);
+	if (ret)
+		goto out;
+	resolved_path = d_path(&path, path_buf, PATH_MAX);
+	if (IS_ERR(resolved_path))
+		goto out;
+	if (strncmp(resolved_path, "/dev/", strlen("/dev/")))
+		goto out;
+	is_good = true;
+out:
+	kfree(path_buf);
+	path_put(&path);
+	return is_good;
+}
+
+static int get_canonical_dev_path(const char *dev_path, char *canonical)
+{
+	struct path path = { .mnt = NULL, .dentry = NULL };
+	char *path_buf = NULL;
+	char *resolved_path;
+	int ret;
+
+	if (!dev_path) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!path_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
+	if (ret)
+		goto out;
+	resolved_path = d_path(&path, path_buf, PATH_MAX);
+	ret = strscpy(canonical, resolved_path, PATH_MAX);
+out:
+	kfree(path_buf);
+	path_put(&path);
+	return ret;
+}
+
 static bool is_same_device(struct btrfs_device *device, const char *new_path)
 {
 	struct path old = { .mnt = NULL, .dentry = NULL };
@@ -1419,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
 	bool new_device_added = false;
 	struct btrfs_device *device = NULL;
 	struct file *bdev_file;
+	char *canonical_path = NULL;
 	u64 bytenr;
 	dev_t devt;
 	int ret;
 
 	lockdep_assert_held(&uuid_mutex);
 
+	if (!is_good_dev_path(path)) {
+		canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (canonical_path) {
+			ret = get_canonical_dev_path(path, canonical_path);
+			if (ret < 0) {
+				kfree(canonical_path);
+				canonical_path = NULL;
+			}
+		}
+	}
 	/*
 	 * Avoid an exclusive open here, as the systemd-udev may initiate the
 	 * device scan which may race with the user's mount or mkfs command,
@@ -1469,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
 		goto free_disk_super;
 	}
 
-	device = device_list_add(path, disk_super, &new_device_added);
+	device = device_list_add(canonical_path ? : path, disk_super,
+				 &new_device_added);
 	if (!IS_ERR(device) && new_device_added)
 		btrfs_free_stale_devices(device->devt, device);
 
@@ -1478,6 +1562,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
 
 error_bdev_put:
 	fput(bdev_file);
+	kfree(canonical_path);
 
 	return device;
 }
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb cache miss for btree search
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (10 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 12/19] btrfs: canonicalize the device path before adding it Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-25 11:23   ` Filipe Manana
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 14/19] btrfs: do not clear read-only when adding sprout device Sasha Levin
                   ` (5 subsequent siblings)
  17 siblings, 1 reply; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Robbie Ko, Filipe Manana, David Sterba, Sasha Levin, clm, josef,
	linux-btrfs

From: Robbie Ko <robbieko@synology.com>

[ Upstream commit 99785998ed1cea142e20f4904ced26537a37bf74 ]

When crawling btree, if an eb cache miss occurs, we change to use the eb
read lock and release all previous locks (including the parent lock) to
reduce lock contention.

If an eb cache miss occurs in a leaf and IO is needed, before this change
we released locks only from level 2 and up, and read the leaf's content
from disk while holding a lock on its parent (level 1), causing
unnecessary lock contention on the parent. After this change we release
locks from level 1 and up, lock level 0, and then read the leaf's content
from disk.

Because we have prepared the check parameters and we hold the read lock
on the eb, we can ensure that no race occurs during the check that would
cause unexpected errors.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/ctree.c | 101 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 31 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0cc919d15b144..dd92acd66624f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1515,12 +1515,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	struct btrfs_tree_parent_check check = { 0 };
 	u64 blocknr;
 	u64 gen;
-	struct extent_buffer *tmp;
-	int ret;
+	struct extent_buffer *tmp = NULL;
+	int ret = 0;
 	int parent_level;
-	bool unlock_up;
+	int err;
+	bool read_tmp = false;
+	bool tmp_locked = false;
+	bool path_released = false;
 
-	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
 	blocknr = btrfs_node_blockptr(*eb_ret, slot);
 	gen = btrfs_node_ptr_generation(*eb_ret, slot);
 	parent_level = btrfs_header_level(*eb_ret);
@@ -1551,68 +1553,105 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 			 */
 			if (btrfs_verify_level_key(tmp,
 					parent_level - 1, &check.first_key, gen)) {
-				free_extent_buffer(tmp);
-				return -EUCLEAN;
+				ret = -EUCLEAN;
+				goto out;
 			}
 			*eb_ret = tmp;
-			return 0;
+			tmp = NULL;
+			ret = 0;
+			goto out;
 		}
 
 		if (p->nowait) {
-			free_extent_buffer(tmp);
-			return -EAGAIN;
+			ret = -EAGAIN;
+			goto out;
 		}
 
-		if (unlock_up)
+		if (!p->skip_locking) {
 			btrfs_unlock_up_safe(p, level + 1);
-
-		/* now we're allowed to do a blocking uptodate check */
-		ret = btrfs_read_extent_buffer(tmp, &check);
-		if (ret) {
-			free_extent_buffer(tmp);
+			tmp_locked = true;
+			btrfs_tree_read_lock(tmp);
 			btrfs_release_path(p);
-			return ret;
+			ret = -EAGAIN;
+			path_released = true;
 		}
 
-		if (unlock_up)
-			ret = -EAGAIN;
+		/* Now we're allowed to do a blocking uptodate check. */
+		err = btrfs_read_extent_buffer(tmp, &check);
+		if (err) {
+			ret = err;
+			goto out;
+		}
 
+		if (ret == 0) {
+			ASSERT(!tmp_locked);
+			*eb_ret = tmp;
+			tmp = NULL;
+		}
 		goto out;
 	} else if (p->nowait) {
-		return -EAGAIN;
+		ret = -EAGAIN;
+		goto out;
 	}
 
-	if (unlock_up) {
+	if (!p->skip_locking) {
 		btrfs_unlock_up_safe(p, level + 1);
 		ret = -EAGAIN;
-	} else {
-		ret = 0;
 	}
 
 	if (p->reada != READA_NONE)
 		reada_for_search(fs_info, p, level, slot, key->objectid);
 
-	tmp = read_tree_block(fs_info, blocknr, &check);
+	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
 	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		tmp = NULL;
+		goto out;
+	}
+	read_tmp = true;
+
+	if (!p->skip_locking) {
+		ASSERT(ret == -EAGAIN);
+		tmp_locked = true;
+		btrfs_tree_read_lock(tmp);
 		btrfs_release_path(p);
-		return PTR_ERR(tmp);
+		path_released = true;
+	}
+
+	/* Now we're allowed to do a blocking uptodate check. */
+	err = btrfs_read_extent_buffer(tmp, &check);
+	if (err) {
+		ret = err;
+		goto out;
 	}
+
 	/*
 	 * If the read above didn't mark this buffer up to date,
 	 * it will never end up being up to date.  Set ret to EIO now
 	 * and give up so that our caller doesn't loop forever
 	 * on our EAGAINs.
 	 */
-	if (!extent_buffer_uptodate(tmp))
+	if (!extent_buffer_uptodate(tmp)) {
 		ret = -EIO;
+		goto out;
+	}
 
-out:
 	if (ret == 0) {
+		ASSERT(!tmp_locked);
 		*eb_ret = tmp;
-	} else {
-		free_extent_buffer(tmp);
-		btrfs_release_path(p);
+		tmp = NULL;
+	}
+out:
+	if (tmp) {
+		if (tmp_locked)
+			btrfs_tree_read_unlock(tmp);
+		if (read_tmp && ret && ret != -EAGAIN)
+			free_extent_buffer_stale(tmp);
+		else
+			free_extent_buffer(tmp);
 	}
+	if (ret && !path_released)
+		btrfs_release_path(p);
 
 	return ret;
 }
@@ -2198,7 +2237,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		err = read_block_for_search(root, p, &b, level, slot, key);
-		if (err == -EAGAIN)
+		if (err == -EAGAIN && !p->nowait)
 			goto again;
 		if (err) {
 			ret = err;
@@ -2325,7 +2364,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
 		}
 
 		err = read_block_for_search(root, p, &b, level, slot, key);
-		if (err == -EAGAIN)
+		if (err == -EAGAIN && !p->nowait)
 			goto again;
 		if (err) {
 			ret = err;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 14/19] btrfs: do not clear read-only when adding sprout device
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (11 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb cache miss for btree search Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 15/19] btrfs: fix warning on PTR_ERR() against NULL device at btrfs_control_ioctl() Sasha Levin
                   ` (4 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Boris Burkov, Qu Wenruo, David Sterba, Sasha Levin, clm, josef,
	linux-btrfs

From: Boris Burkov <boris@bur.io>

[ Upstream commit 70958a949d852cbecc3d46127bf0b24786df0130 ]

If you follow the seed/sprout wiki, it suggests the following workflow:

btrfstune -S 1 seed_dev
mount seed_dev mnt
btrfs device add sprout_dev
mount -o remount,rw mnt

The first mount mounts the FS readonly, which results in not setting
BTRFS_FS_OPEN, and setting the readonly bit on the sb. The device add
somewhat surprisingly clears the readonly bit on the sb (though the
mount is still practically readonly, from the user's perspective...).
Finally, the remount checks the readonly bit on the sb against the flag
and sees no change, so it does not run the code intended to run on
ro->rw transitions, leaving BTRFS_FS_OPEN unset.

As a result, when the cleaner_kthread runs, it sees no BTRFS_FS_OPEN and
does no work. This results in leaking deleted snapshots until we run out
of space.

I propose fixing it at the first departure from what feels reasonable:
when we clear the readonly bit on the sb during device add.

A new fstest I have written reproduces the bug and confirms the fix.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Boris Burkov <boris@bur.io>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/volumes.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5895397364aac..0c4d14c59ebec 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2842,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
 
 	if (seeding_dev) {
-		btrfs_clear_sb_rdonly(sb);
-
 		/* GFP_KERNEL allocation must not be under device_list_mutex */
 		seed_devices = btrfs_init_sprout(fs_info);
 		if (IS_ERR(seed_devices)) {
@@ -2986,8 +2984,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	mutex_unlock(&fs_info->chunk_mutex);
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 error_trans:
-	if (seeding_dev)
-		btrfs_set_sb_rdonly(sb);
 	if (trans)
 		btrfs_end_transaction(trans);
 error_free_zone:
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 15/19] btrfs: fix warning on PTR_ERR() against NULL device at btrfs_control_ioctl()
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (12 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 14/19] btrfs: do not clear read-only when adding sprout device Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 16/19] md/raid1: Handle bio_split() errors Sasha Levin
                   ` (3 subsequent siblings)
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Filipe Manana, Qu Wenruo, David Sterba, Sasha Levin, clm, josef,
	linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

[ Upstream commit 2342d6595b608eec94187a17dc112dd4c2a812fa ]

Smatch complains about calling PTR_ERR() against a NULL pointer:

  fs/btrfs/super.c:2272 btrfs_control_ioctl() warn: passing zero to 'PTR_ERR'

Fix this by calling PTR_ERR() against the device pointer only if it
contains an error.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/super.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c64d071341223..4505995eec342 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2256,7 +2256,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
 		if (IS_ERR_OR_NULL(device)) {
 			mutex_unlock(&uuid_mutex);
-			ret = PTR_ERR(device);
+			if (IS_ERR(device))
+				ret = PTR_ERR(device);
+			else
+				ret = 0;
 			break;
 		}
 		ret = !(device->fs_devices->num_devices ==
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 16/19] md/raid1: Handle bio_split() errors
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (13 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 15/19] btrfs: fix warning on PTR_ERR() against NULL device at btrfs_control_ioctl() Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-25  8:55   ` John Garry
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 17/19] kselftest/arm64: Corrupt P0 in the irritator when testing SSVE Sasha Levin
                   ` (2 subsequent siblings)
  17 siblings, 1 reply; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: John Garry, Yu Kuai, Hannes Reinecke, Jens Axboe, Sasha Levin,
	song, linux-raid

From: John Garry <john.g.garry@oracle.com>

[ Upstream commit b1a7ad8b5c4fa28325ee7b369a2d545d3e16ccde ]

Add proper bio_split() error handling. For any error, call
raid_end_bio_io() and return.

For the case of an error in the write path, we need to undo the increment
in the rdev pending count and NULLify the r1_bio->bios[] pointers.

For read path failure, we need to undo rdev pending count increment from
the earlier read_balance() call.

Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20241111112150.3756529-6-john.g.garry@oracle.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/md/raid1.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6c9d24203f39f..7e023e9303c8a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1322,7 +1322,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	const enum req_op op = bio_op(bio);
 	const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
 	int max_sectors;
-	int rdisk;
+	int rdisk, error;
 	bool r1bio_existed = !!r1_bio;
 
 	/*
@@ -1383,6 +1383,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
 					      gfp, &conf->bio_split);
+
+		if (IS_ERR(split)) {
+			error = PTR_ERR(split);
+			goto err_handle;
+		}
 		bio_chain(split, bio);
 		submit_bio_noacct(bio);
 		bio = split;
@@ -1410,6 +1415,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	read_bio->bi_private = r1_bio;
 	mddev_trace_remap(mddev, read_bio, r1_bio->sector);
 	submit_bio_noacct(read_bio);
+	return;
+
+err_handle:
+	atomic_dec(&mirror->rdev->nr_pending);
+	bio->bi_status = errno_to_blk_status(error);
+	set_bit(R1BIO_Uptodate, &r1_bio->state);
+	raid_end_bio_io(r1_bio);
 }
 
 static void raid1_write_request(struct mddev *mddev, struct bio *bio,
@@ -1417,7 +1429,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 {
 	struct r1conf *conf = mddev->private;
 	struct r1bio *r1_bio;
-	int i, disks;
+	int i, disks, k, error;
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	int first_clone;
@@ -1576,6 +1588,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
 					      GFP_NOIO, &conf->bio_split);
+
+		if (IS_ERR(split)) {
+			error = PTR_ERR(split);
+			goto err_handle;
+		}
 		bio_chain(split, bio);
 		submit_bio_noacct(bio);
 		bio = split;
@@ -1660,6 +1677,18 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 	/* In case raid1d snuck in to freeze_array */
 	wake_up_barrier(conf);
+	return;
+err_handle:
+	for (k = 0; k < i; k++) {
+		if (r1_bio->bios[k]) {
+			rdev_dec_pending(conf->mirrors[k].rdev, mddev);
+			r1_bio->bios[k] = NULL;
+		}
+	}
+
+	bio->bi_status = errno_to_blk_status(error);
+	set_bit(R1BIO_Uptodate, &r1_bio->state);
+	raid_end_bio_io(r1_bio);
 }
 
 static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 17/19] kselftest/arm64: Corrupt P0 in the irritator when testing SSVE
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (14 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 16/19] md/raid1: Handle bio_split() errors Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 18/19] kselftest/arm64: Don't leak pipe fds in pac.exec_sign_all() Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 19/19] ext4: partial zero eof block on unaligned inode size extension Sasha Levin
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Mark Brown, Catalin Marinas, Sasha Levin, will, shuah,
	mark.rutland, thiago.bauermann, linux-arm-kernel, linux-kselftest

From: Mark Brown <broonie@kernel.org>

[ Upstream commit 3e360ef0c0a1fb6ce9a302e40b8057c41ba8a9d2 ]

When building for streaming SVE the irritator for SVE skips updates of both
P0 and FFR. While FFR is skipped since it might not be present, there is no
reason to skip corrupting P0, so switch to an instruction valid in streaming
mode and move the ifdef.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-3-c4b9622e36ee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 tools/testing/selftests/arm64/fp/sve-test.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
index fff60e2a25add..4fcb492aee1fb 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -304,9 +304,9 @@ function irritator_handler
 	movi	v0.8b, #1
 	movi	v9.16b, #2
 	movi	v31.8b, #3
-#ifndef SSVE
 	// And P0
-	rdffr	p0.b
+	ptrue	p0.d
+#ifndef SSVE
 	// And FFR
 	wrffr	p15.b
 #endif
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 18/19] kselftest/arm64: Don't leak pipe fds in pac.exec_sign_all()
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (15 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 17/19] kselftest/arm64: Corrupt P0 in the irritator when testing SSVE Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 19/19] ext4: partial zero eof block on unaligned inode size extension Sasha Levin
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Mark Brown, Catalin Marinas, Sasha Levin, will, shuah,
	linux-arm-kernel, linux-kselftest

From: Mark Brown <broonie@kernel.org>

[ Upstream commit 27141b690547da5650a420f26ec369ba142a9ebb ]

The PAC exec_sign_all() test spawns some child processes, creating pipes
to be stdin and stdout for the child. It cleans up most of the file
descriptors that are created as part of this but neglects to clean up the
parent end of the child stdin and stdout. Add the missing close() calls.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241111-arm64-pac-test-collisions-v1-1-171875f37e44@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 tools/testing/selftests/arm64/pauth/pac.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c
index b743daa772f55..5a07b3958fbf2 100644
--- a/tools/testing/selftests/arm64/pauth/pac.c
+++ b/tools/testing/selftests/arm64/pauth/pac.c
@@ -182,6 +182,9 @@ int exec_sign_all(struct signatures *signed_vals, size_t val)
 		return -1;
 	}
 
+	close(new_stdin[1]);
+	close(new_stdout[0]);
+
 	return 0;
 }
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH AUTOSEL 6.12 19/19] ext4: partial zero eof block on unaligned inode size extension
  2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
                   ` (16 preceding siblings ...)
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 18/19] kselftest/arm64: Don't leak pipe fds in pac.exec_sign_all() Sasha Levin
@ 2024-11-24 12:38 ` Sasha Levin
  17 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-11-24 12:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Brian Foster, Theodore Ts'o, Sasha Levin, adilger.kernel,
	linux-ext4

From: Brian Foster <bfoster@redhat.com>

[ Upstream commit c7fc0366c65628fd69bfc310affec4918199aae2 ]

Using mapped writes, it's technically possible to expose stale
post-eof data on a truncate up operation. Consider the following
example:

$ xfs_io -fc "pwrite 0 2k" -c "mmap 0 4k" -c "mwrite 2k 2k" \
	-c "truncate 8k" -c "pread -v 2k 16" <file>
...
00000800:  58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58  XXXXXXXXXXXXXXXX
...

This shows that the post-eof data written via mwrite lands within
EOF after a truncate up. While this is deliberate in the test case, the
behavior is somewhat unpredictable because writeback does post-eof
zeroing, and writeback can occur at any time in the background. For
example, an fsync inserted between the mwrite and truncate causes
the subsequent read to instead return zeroes. This basically means
that there is a race window in this situation between any subsequent
extending operation and writeback that dictates whether post-eof
data is exposed to the file or zeroed.

To prevent this problem, perform partial block zeroing as part of
the various inode size extending operations that are susceptible to
it. For truncate extension, zero around the original eof similar to
how truncate down does partial zeroing of the new eof. For extension
via writes and fallocate related operations, zero the newly exposed
range of the file to cover any partial zeroing that must occur at
the original and new eof blocks.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Link: https://patch.msgid.link/20240919160741.208162-2-bfoster@redhat.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ext4/extents.c |  7 ++++++-
 fs/ext4/inode.c   | 51 +++++++++++++++++++++++++++++++++--------------
 2 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 34e25eee65219..20a0a5c0bfd93 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4482,7 +4482,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	int depth = 0;
 	struct ext4_map_blocks map;
 	unsigned int credits;
-	loff_t epos;
+	loff_t epos, old_size = i_size_read(inode);
 
 	BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
 	map.m_lblk = offset;
@@ -4541,6 +4541,11 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 			if (ext4_update_inode_size(inode, epos) & 0x1)
 				inode_set_mtime_to_ts(inode,
 						      inode_get_ctime(inode));
+			if (epos > old_size) {
+				pagecache_isize_extended(inode, old_size, epos);
+				ext4_zero_partial_blocks(handle, inode,
+						     old_size, epos - old_size);
+			}
 		}
 		ret2 = ext4_mark_inode_dirty(handle, inode);
 		ext4_update_inode_fsync_trans(handle, inode, 1);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 54bdd4884fe67..f460418e2bdae 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1307,8 +1307,10 @@ static int ext4_write_end(struct file *file,
 	folio_unlock(folio);
 	folio_put(folio);
 
-	if (old_size < pos && !verity)
+	if (old_size < pos && !verity) {
 		pagecache_isize_extended(inode, old_size, pos);
+		ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+	}
 	/*
 	 * Don't mark the inode dirty under folio lock. First, it unnecessarily
 	 * makes the holding time of folio lock longer. Second, it forces lock
@@ -1423,8 +1425,10 @@ static int ext4_journalled_write_end(struct file *file,
 	folio_unlock(folio);
 	folio_put(folio);
 
-	if (old_size < pos && !verity)
+	if (old_size < pos && !verity) {
 		pagecache_isize_extended(inode, old_size, pos);
+		ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+	}
 
 	if (size_changed) {
 		ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2985,7 +2989,8 @@ static int ext4_da_do_write_end(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	loff_t old_size = inode->i_size;
 	bool disksize_changed = false;
-	loff_t new_i_size;
+	loff_t new_i_size, zero_len = 0;
+	handle_t *handle;
 
 	if (unlikely(!folio_buffers(folio))) {
 		folio_unlock(folio);
@@ -3029,18 +3034,21 @@ static int ext4_da_do_write_end(struct address_space *mapping,
 	folio_unlock(folio);
 	folio_put(folio);
 
-	if (old_size < pos)
+	if (pos > old_size) {
 		pagecache_isize_extended(inode, old_size, pos);
+		zero_len = pos - old_size;
+	}
 
-	if (disksize_changed) {
-		handle_t *handle;
+	if (!disksize_changed && !zero_len)
+		return copied;
 
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle))
-			return PTR_ERR(handle);
-		ext4_mark_inode_dirty(handle, inode);
-		ext4_journal_stop(handle);
-	}
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (zero_len)
+		ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
 
 	return copied;
 }
@@ -5426,6 +5434,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		}
 
 		if (attr->ia_size != inode->i_size) {
+			/* attach jbd2 jinode for EOF folio tail zeroing */
+			if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
+			    oldsize & (inode->i_sb->s_blocksize - 1)) {
+				error = ext4_inode_attach_jinode(inode);
+				if (error)
+					goto err_out;
+			}
+
 			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 			if (IS_ERR(handle)) {
 				error = PTR_ERR(handle);
@@ -5436,12 +5452,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 				orphan = 1;
 			}
 			/*
-			 * Update c/mtime on truncate up, ext4_truncate() will
-			 * update c/mtime in shrink case below
+			 * Update c/mtime and tail zero the EOF folio on
+			 * truncate up. ext4_truncate() handles the shrink case
+			 * below.
 			 */
-			if (!shrink)
+			if (!shrink) {
 				inode_set_mtime_to_ts(inode,
 						      inode_set_ctime_current(inode));
+				if (oldsize & (inode->i_sb->s_blocksize - 1))
+					ext4_block_truncate_page(handle,
+							inode->i_mapping, oldsize);
+			}
 
 			if (shrink)
 				ext4_fc_track_range(handle, inode,
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH AUTOSEL 6.12 16/19] md/raid1: Handle bio_split() errors
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 16/19] md/raid1: Handle bio_split() errors Sasha Levin
@ 2024-11-25  8:55   ` John Garry
  2024-12-10 16:21     ` Sasha Levin
  0 siblings, 1 reply; 25+ messages in thread
From: John Garry @ 2024-11-25  8:55 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Yu Kuai, Hannes Reinecke, Jens Axboe, song, linux-raid

On 24/11/2024 12:38, Sasha Levin wrote:
> From: John Garry<john.g.garry@oracle.com>
> 
> [ Upstream commit b1a7ad8b5c4fa28325ee7b369a2d545d3e16ccde ]
> 
> Add proper bio_split() error handling. For any error, call
> raid_end_bio_io() and return.
> 
> For the case of an in the write path, we need to undo the increment in
> the rdev pending count and NULLify the r1_bio->bios[] pointers.
> 
> For read path failure, we need to undo rdev pending count increment from
> the earlier read_balance() call.
> 
> Reviewed-by: Yu Kuai<yukuai3@huawei.com>
> Reviewed-by: Hannes Reinecke<hare@suse.de>
> Signed-off-by: John Garry<john.g.garry@oracle.com>
> Link: https://lore.kernel.org/r/20241111112150.3756529-6-john.g.garry@oracle.com
> Signed-off-by: Jens Axboe<axboe@kernel.dk>
> Signed-off-by: Sasha Levin<sashal@kernel.org>

I don't think that it is proper to backport this change without the
bio_split() error handling update. And I don't think that it is worth
backporting the bio_split() error handling update.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb cache miss for btree search
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb cache miss for btree search Sasha Levin
@ 2024-11-25 11:23   ` Filipe Manana
  0 siblings, 0 replies; 25+ messages in thread
From: Filipe Manana @ 2024-11-25 11:23 UTC (permalink / raw)
  To: Sasha Levin
  Cc: linux-kernel, stable, Robbie Ko, Filipe Manana, David Sterba, clm,
	josef, linux-btrfs

On Sun, Nov 24, 2024 at 12:46 PM Sasha Levin <sashal@kernel.org> wrote:
>
> From: Robbie Ko <robbieko@synology.com>
>
> [ Upstream commit 99785998ed1cea142e20f4904ced26537a37bf74 ]

Why is this being picked for stable?

It's not a bug fix or anything critical.
It's just a performance optimization, and it's not even one where we
know (AFAIK) of any workload where it would give very significant
gains to justify backporting to stable.

Thanks.

>
> When crawling btree, if an eb cache miss occurs, we change to use the eb
> read lock and release all previous locks (including the parent lock) to
> reduce lock contention.
>
> If an eb cache miss occurs in a leaf and needs to execute IO, before this
> change we released locks only from level 2 and up and we read a leaf's
> content from disk while holding a lock on its parent (level 1), causing
> the unnecessary lock contention on the parent, after this change we
> release locks from level 1 and up, but we lock level 0, and read leaf's
> content from disk.
>
> Because we have prepared the check parameters and the read lock of eb we
> hold, we can ensure that no race will occur during the check and cause
> unexpected errors.
>
> Reviewed-by: Filipe Manana <fdmanana@suse.com>
> Signed-off-by: Robbie Ko <robbieko@synology.com>
> Signed-off-by: David Sterba <dsterba@suse.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>  fs/btrfs/ctree.c | 101 ++++++++++++++++++++++++++++++++---------------
>  1 file changed, 70 insertions(+), 31 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 0cc919d15b144..dd92acd66624f 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1515,12 +1515,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>         struct btrfs_tree_parent_check check = { 0 };
>         u64 blocknr;
>         u64 gen;
> -       struct extent_buffer *tmp;
> -       int ret;
> +       struct extent_buffer *tmp = NULL;
> +       int ret = 0;
>         int parent_level;
> -       bool unlock_up;
> +       int err;
> +       bool read_tmp = false;
> +       bool tmp_locked = false;
> +       bool path_released = false;
>
> -       unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
>         blocknr = btrfs_node_blockptr(*eb_ret, slot);
>         gen = btrfs_node_ptr_generation(*eb_ret, slot);
>         parent_level = btrfs_header_level(*eb_ret);
> @@ -1551,68 +1553,105 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>                          */
>                         if (btrfs_verify_level_key(tmp,
>                                         parent_level - 1, &check.first_key, gen)) {
> -                               free_extent_buffer(tmp);
> -                               return -EUCLEAN;
> +                               ret = -EUCLEAN;
> +                               goto out;
>                         }
>                         *eb_ret = tmp;
> -                       return 0;
> +                       tmp = NULL;
> +                       ret = 0;
> +                       goto out;
>                 }
>
>                 if (p->nowait) {
> -                       free_extent_buffer(tmp);
> -                       return -EAGAIN;
> +                       ret = -EAGAIN;
> +                       goto out;
>                 }
>
> -               if (unlock_up)
> +               if (!p->skip_locking) {
>                         btrfs_unlock_up_safe(p, level + 1);
> -
> -               /* now we're allowed to do a blocking uptodate check */
> -               ret = btrfs_read_extent_buffer(tmp, &check);
> -               if (ret) {
> -                       free_extent_buffer(tmp);
> +                       tmp_locked = true;
> +                       btrfs_tree_read_lock(tmp);
>                         btrfs_release_path(p);
> -                       return ret;
> +                       ret = -EAGAIN;
> +                       path_released = true;
>                 }
>
> -               if (unlock_up)
> -                       ret = -EAGAIN;
> +               /* Now we're allowed to do a blocking uptodate check. */
> +               err = btrfs_read_extent_buffer(tmp, &check);
> +               if (err) {
> +                       ret = err;
> +                       goto out;
> +               }
>
> +               if (ret == 0) {
> +                       ASSERT(!tmp_locked);
> +                       *eb_ret = tmp;
> +                       tmp = NULL;
> +               }
>                 goto out;
>         } else if (p->nowait) {
> -               return -EAGAIN;
> +               ret = -EAGAIN;
> +               goto out;
>         }
>
> -       if (unlock_up) {
> +       if (!p->skip_locking) {
>                 btrfs_unlock_up_safe(p, level + 1);
>                 ret = -EAGAIN;
> -       } else {
> -               ret = 0;
>         }
>
>         if (p->reada != READA_NONE)
>                 reada_for_search(fs_info, p, level, slot, key->objectid);
>
> -       tmp = read_tree_block(fs_info, blocknr, &check);
> +       tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
>         if (IS_ERR(tmp)) {
> +               ret = PTR_ERR(tmp);
> +               tmp = NULL;
> +               goto out;
> +       }
> +       read_tmp = true;
> +
> +       if (!p->skip_locking) {
> +               ASSERT(ret == -EAGAIN);
> +               tmp_locked = true;
> +               btrfs_tree_read_lock(tmp);
>                 btrfs_release_path(p);
> -               return PTR_ERR(tmp);
> +               path_released = true;
> +       }
> +
> +       /* Now we're allowed to do a blocking uptodate check. */
> +       err = btrfs_read_extent_buffer(tmp, &check);
> +       if (err) {
> +               ret = err;
> +               goto out;
>         }
> +
>         /*
>          * If the read above didn't mark this buffer up to date,
>          * it will never end up being up to date.  Set ret to EIO now
>          * and give up so that our caller doesn't loop forever
>          * on our EAGAINs.
>          */
> -       if (!extent_buffer_uptodate(tmp))
> +       if (!extent_buffer_uptodate(tmp)) {
>                 ret = -EIO;
> +               goto out;
> +       }
>
> -out:
>         if (ret == 0) {
> +               ASSERT(!tmp_locked);
>                 *eb_ret = tmp;
> -       } else {
> -               free_extent_buffer(tmp);
> -               btrfs_release_path(p);
> +               tmp = NULL;
> +       }
> +out:
> +       if (tmp) {
> +               if (tmp_locked)
> +                       btrfs_tree_read_unlock(tmp);
> +               if (read_tmp && ret && ret != -EAGAIN)
> +                       free_extent_buffer_stale(tmp);
> +               else
> +                       free_extent_buffer(tmp);
>         }
> +       if (ret && !path_released)
> +               btrfs_release_path(p);
>
>         return ret;
>  }
> @@ -2198,7 +2237,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>                 }
>
>                 err = read_block_for_search(root, p, &b, level, slot, key);
> -               if (err == -EAGAIN)
> +               if (err == -EAGAIN && !p->nowait)
>                         goto again;
>                 if (err) {
>                         ret = err;
> @@ -2325,7 +2364,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
>                 }
>
>                 err = read_block_for_search(root, p, &b, level, slot, key);
> -               if (err == -EAGAIN)
> +               if (err == -EAGAIN && !p->nowait)
>                         goto again;
>                 if (err) {
>                         ret = err;
> --
> 2.43.0
>
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH AUTOSEL 6.12 09/19] btrfs: zlib: make the compression path to handle sector size < page size
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 09/19] btrfs: zlib: make the compression path to handle sector size < page size Sasha Levin
@ 2024-11-25 15:20   ` David Sterba
  2024-12-10 16:20     ` Sasha Levin
  0 siblings, 1 reply; 25+ messages in thread
From: David Sterba @ 2024-11-25 15:20 UTC (permalink / raw)
  To: Sasha Levin
  Cc: linux-kernel, stable, Qu Wenruo, David Sterba, clm, josef,
	linux-btrfs

On Sun, Nov 24, 2024 at 07:38:44AM -0500, Sasha Levin wrote:
> From: Qu Wenruo <wqu@suse.com>
> 
> [ Upstream commit f6ebedb09bb276256e084196e2322562dc4aac10 ]
> 
> Inside zlib_compress_folios(), each time we switch the input page cache,
> the @start is increased by PAGE_SIZE.
> 
> But for the incoming compression support for sector size < page size
> (previously we support compression only when the range is fully page
> aligned), this is not going to handle the following case:
> 
>     0          32K         64K          96K
>     |          |///////////||///////////|
> 
> @start has the initial value 32K, indicating the start filepos of the
> to-be-compressed range.
> 
> And when grabbing the first page as input, we always call "start +=
> PAGE_SIZE;".
> 
> But since @start is starting at 32K, it will be increased by 64K,
> resulting it to be 96K for the next range, causing incorrect input range
> and corruption for the future subpage compression.
> 
> Fix it by only increasing @start by the input size.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> Signed-off-by: David Sterba <dsterba@suse.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>

Please drop this patch from stable, it's preparatory work and has
otherwise no effect.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH AUTOSEL 6.12 10/19] btrfs: make extent_range_clear_dirty_for_io() to handle sector size < page size cases
  2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 10/19] btrfs: make extent_range_clear_dirty_for_io() to handle sector size < page size cases Sasha Levin
@ 2024-11-25 15:21   ` David Sterba
  0 siblings, 0 replies; 25+ messages in thread
From: David Sterba @ 2024-11-25 15:21 UTC (permalink / raw)
  To: Sasha Levin
  Cc: linux-kernel, stable, Qu Wenruo, David Sterba, clm, josef,
	linux-btrfs

On Sun, Nov 24, 2024 at 07:38:45AM -0500, Sasha Levin wrote:
> From: Qu Wenruo <wqu@suse.com>
> 
> [ Upstream commit a4ef54dbb576032ba31a646a5ffc8a26a83cb92c ]
> 
> For btrfs with sector size < page size (e.g. 4K sector size, 64K page
> size) and the sector perfect compression support enabled, the
> following dirty range can lead to problems:
> 
>    0     32K     64K     96K    128K
>    |     |///////||//////|    |/|
>                               124K
> 
> In above case, if we start writeback for that inode, the last dirty
> range [124K, 128K) will not be submitted and cause reserved space
> leakage:
> 
> - Start writeback for page 0
>   We find the range [32K, 96K) is suitable for compression, and queue it
>   into a workqueue to do the delayed compression and submission.
> 
> - Compression happens for range [32K, 96K)
>   Function extent_range_clear_dirty_for_io() is called, however it is
>   only doing full page handling, not considering any of the extra bitmaps
>   for subpage cases.
> 
>   That function will clear page dirty for both page 0 and page 64K.
> 
> - Writeback for the inode is done
>   Because page 64K has its dirty flag cleared, it will not be considered
>   as a writeback target.
> 
> This means the range [124K, 128K) will not be submitted, and reserved
> space for it will be leaked.
> 
> Fix this problem by using the subpage helper to clear the dirty flag.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> Signed-off-by: David Sterba <dsterba@suse.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>

Please drop this patch from stable, it's preparatory work and has
otherwise no effect.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH AUTOSEL 6.12 09/19] btrfs: zlib: make the compression path to handle sector size < page size
  2024-11-25 15:20   ` David Sterba
@ 2024-12-10 16:20     ` Sasha Levin
  0 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-12-10 16:20 UTC (permalink / raw)
  To: David Sterba
  Cc: linux-kernel, stable, Qu Wenruo, David Sterba, clm, josef,
	linux-btrfs

On Mon, Nov 25, 2024 at 04:20:59PM +0100, David Sterba wrote:
>On Sun, Nov 24, 2024 at 07:38:44AM -0500, Sasha Levin wrote:
>> From: Qu Wenruo <wqu@suse.com>
>>
>> [ Upstream commit f6ebedb09bb276256e084196e2322562dc4aac10 ]
>>
>> Inside zlib_compress_folios(), each time we switch the input page cache,
>> the @start is increased by PAGE_SIZE.
>>
>> But for the incoming compression support for sector size < page size
>> (previously we support compression only when the range is fully page
>> aligned), this is not going to handle the following case:
>>
>>     0          32K         64K          96K
>>     |          |///////////||///////////|
>>
>> @start has the initial value 32K, indicating the start filepos of the
>> to-be-compressed range.
>>
>> And when grabbing the first page as input, we always call "start +=
>> PAGE_SIZE;".
>>
>> But since @start is starting at 32K, it will be increased by 64K,
>> resulting in @start being 96K for the next range, causing incorrect input range
>> and corruption for the future subpage compression.
>>
>> Fix it by only increasing @start by the input size.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> Signed-off-by: David Sterba <dsterba@suse.com>
>> Signed-off-by: Sasha Levin <sashal@kernel.org>
>
>Please drop this patch from stable, it's preparatory work and has
>otherwise no effect.

Will do, thanks!

-- 
Thanks,
Sasha

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH AUTOSEL 6.12 16/19] md/raid1: Handle bio_split() errors
  2024-11-25  8:55   ` John Garry
@ 2024-12-10 16:21     ` Sasha Levin
  0 siblings, 0 replies; 25+ messages in thread
From: Sasha Levin @ 2024-12-10 16:21 UTC (permalink / raw)
  To: John Garry
  Cc: linux-kernel, stable, Yu Kuai, Hannes Reinecke, Jens Axboe, song,
	linux-raid

On Mon, Nov 25, 2024 at 08:55:19AM +0000, John Garry wrote:
>On 24/11/2024 12:38, Sasha Levin wrote:
>>From: John Garry<john.g.garry@oracle.com>
>>
>>[ Upstream commit b1a7ad8b5c4fa28325ee7b369a2d545d3e16ccde ]
>>
>>Add proper bio_split() error handling. For any error, call
>>raid_end_bio_io() and return.
>>
>>For the case of an error in the write path, we need to undo the increment in
>>the rdev pending count and NULLify the r1_bio->bios[] pointers.
>>
>>For read path failure, we need to undo rdev pending count increment from
>>the earlier read_balance() call.
>>
>>Reviewed-by: Yu Kuai<yukuai3@huawei.com>
>>Reviewed-by: Hannes Reinecke<hare@suse.de>
>>Signed-off-by: John Garry<john.g.garry@oracle.com>
>>Link: https://lore.kernel.org/r/20241111112150.3756529-6-john.g.garry@oracle.com
>>Signed-off-by: Jens Axboe<axboe@kernel.dk>
>>Signed-off-by: Sasha Levin<sashal@kernel.org>
>
>I don't think that it is proper to backport this change without 
>bio_split() error handling update. And I don't think that it is worth 
>backporting the bio_split() error handling update.

I'll drop it, thanks!

-- 
Thanks,
Sasha

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2024-12-10 16:21 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-11-24 12:38 [PATCH AUTOSEL 6.12 01/19] s390/pci: Sort PCI functions prior to creating virtual busses Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 02/19] s390/pci: Use topology ID for multi-function devices Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 03/19] s390/pci: Ignore RID for isolated VFs Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 04/19] epoll: annotate racy check Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 05/19] kselftest/arm64: Log fp-stress child startup errors to stdout Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 06/19] s390/cpum_sf: Handle CPU hotplug remove during sampling Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 07/19] block: RCU protect disk->conv_zones_bitmap Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 08/19] btrfs: don't take dev_replace rwsem on task already holding it Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 09/19] btrfs: zlib: make the compression path to handle sector size < page size Sasha Levin
2024-11-25 15:20   ` David Sterba
2024-12-10 16:20     ` Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 10/19] btrfs: make extent_range_clear_dirty_for_io() to handle sector size < page size cases Sasha Levin
2024-11-25 15:21   ` David Sterba
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 11/19] btrfs: avoid unnecessary device path update for the same device Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 12/19] btrfs: canonicalize the device path before adding it Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb cache miss for btree search Sasha Levin
2024-11-25 11:23   ` Filipe Manana
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 14/19] btrfs: do not clear read-only when adding sprout device Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 15/19] btrfs: fix warning on PTR_ERR() against NULL device at btrfs_control_ioctl() Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 16/19] md/raid1: Handle bio_split() errors Sasha Levin
2024-11-25  8:55   ` John Garry
2024-12-10 16:21     ` Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 17/19] kselftest/arm64: Corrupt P0 in the irritator when testing SSVE Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 18/19] kselftest/arm64: Don't leak pipe fds in pac.exec_sign_all() Sasha Levin
2024-11-24 12:38 ` [PATCH AUTOSEL 6.12 19/19] ext4: partial zero eof block on unaligned inode size extension Sasha Levin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox