From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from verein.lst.de (verein.lst.de [213.95.11.211]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 060D538BF97; Mon, 19 Jan 2026 15:56:42 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=213.95.11.211 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1768838204; cv=none; b=CGVLuijAqwFgsu+TxphUZ/qYkmNNSbYrafY5j+pvfkAkE2b+9xSOAghz6y2GXCwrMcI7RxKRZMuKlnB2njoOg7eeK1j1ZYZSJgdSQI/7begF6LLeVrlUIU/lEObjVG9MadduETIxIYzvA0DN+A7t1eKtBE8vlPV+wtqyWRirUrk= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1768838204; c=relaxed/simple; bh=D+/mNA7x3bPUPssXz3x5ZzNDARVbZcReUJg4A05wgZg=; h=Date:From:To:Cc:Subject:Message-ID:References:MIME-Version: Content-Type:Content-Disposition:In-Reply-To; b=qIA+YEAQw9TGBiyz6P54slR+ZKx2z555STiSUzj0RnGINyNb6BPlspVW4ZSS72YWeWn6BD2ZXIyq3igL7JsPMJxhRNrfh17IhjLq84Ges4tggfelD+BGwUuKckqvnJ4oKghFi0ShU65sCjFpgvQOIYGpoyHeDxE5BgFI3Ry3KXo= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=lst.de; spf=pass smtp.mailfrom=lst.de; arc=none smtp.client-ip=213.95.11.211 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=lst.de Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=lst.de Received: by verein.lst.de (Postfix, from userid 2407) id C9DE4227AA8; Mon, 19 Jan 2026 16:56:39 +0100 (CET) Date: Mon, 19 Jan 2026 16:56:39 +0100 From: Christoph Hellwig To: "Darrick J. Wong" Cc: cem@kernel.org, linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org, hch@lst.de Subject: Re: [PATCH 11/11] xfs: add media verification ioctl Message-ID: <20260119155639.GA10822@lst.de> References: <176852588473.2137143.1604994842772101197.stgit@frogsfrogsfrogs> <176852588776.2137143.7103003682733018282.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <176852588776.2137143.7103003682733018282.stgit@frogsfrogsfrogs> User-Agent: Mutt/1.5.17 (2007-11-01) On Thu, Jan 15, 2026 at 09:44:53PM -0800, Darrick J. Wong wrote: > From: Darrick J. Wong > > Add a new privileged ioctl so that xfs_scrub can ask the kernel to > verify the media of the devices backing an xfs filesystem, and have any > resulting media errors reported to fsnotify and xfs_healer. I really wish this would explain the approach (reading data into a kernel buffer, and the choices of the buffer size and I/O pattern) and their rationale a bit better here. > + > +struct xfs_group_data_lost { > + xfs_agblock_t startblock; > + xfs_extlen_t blockcount; > +}; > + > +/* Report lost file data from rmap records */ > +STATIC int > +xfs_verify_report_data_lost( > + struct xfs_btree_cur *cur, > + const struct xfs_rmap_irec *rec, > + void *data) > +{ > + struct xfs_mount *mp = cur->bc_mp; > + struct xfs_inode *ip; > + struct xfs_group_data_lost *lost = data; > + xfs_fileoff_t fileoff = rec->rm_offset; > + xfs_extlen_t blocks = rec->rm_blockcount; > + const bool is_attr = > + (rec->rm_flags & XFS_RMAP_ATTR_FORK); > + const xfs_agblock_t lost_end = > + lost->startblock + lost->blockcount; > + const xfs_agblock_t rmap_end = > + rec->rm_startblock + rec->rm_blockcount; > + int error = 0; > + > + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) > + return 0; > + > + error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip); > + if (error) > + return 0; > + > + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { > + xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK); > + goto out_rele; > + } > + > + if (is_attr) { > + xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR); > + goto out_rele; > + } > + > + if (lost->startblock > rec->rm_startblock) { > + fileoff += lost->startblock - rec->rm_startblock; > + blocks -= lost->startblock - rec->rm_startblock; > + } > + if (rmap_end > lost_end) > + blocks -= rmap_end - lost_end; > + > + fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff), > + XFS_FSB_TO_B(mp, blocks), GFP_NOFS); > + > +out_rele: > + xfs_irele(ip); > + return 0; > +} > + > +/* Walk reverse mappings to look for all file data loss */ > +STATIC int > +xfs_verify_report_losses( > + struct xfs_mount *mp, > + enum xfs_group_type type, > + xfs_daddr_t daddr, > + u64 bblen) > +{ > + struct xfs_group *xg = NULL; > + struct xfs_trans *tp; > + xfs_fsblock_t start_bno, end_bno; > + uint32_t start_gno, end_gno; > + int error; > + > + if (type == XG_TYPE_RTG) { > + start_bno = xfs_daddr_to_rtb(mp, daddr); > + end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1); > + } else { > + start_bno = XFS_DADDR_TO_FSB(mp, daddr); > + end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); > + } > + > + tp = xfs_trans_alloc_empty(mp); > + start_gno = xfs_fsb_to_gno(mp, start_bno, type); > + end_gno = xfs_fsb_to_gno(mp, end_bno, type); > + while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) { > + struct xfs_buf *agf_bp = NULL; > + struct xfs_rtgroup *rtg = NULL; > + struct xfs_btree_cur *cur; > + struct xfs_rmap_irec ri_low = { }; > + struct xfs_rmap_irec ri_high; > + struct xfs_group_data_lost lost; > + > + if (type == XG_TYPE_AG) { > + struct xfs_perag *pag = to_perag(xg); > + > + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); > + if (error) { > + xfs_perag_put(pag); > + break; > + } > + > + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); > + } else { > + rtg = to_rtg(xg); > + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); > + cur = xfs_rtrmapbt_init_cursor(tp, rtg); > + } > + > + /* > + * Set the rmap range from ri_low to ri_high, which represents > + * a [start, end] where we looking for the files or metadata. > + */ > + memset(&ri_high, 0xFF, sizeof(ri_high)); > + if (xg->xg_gno == start_gno) > + ri_low.rm_startblock = > + xfs_fsb_to_gbno(mp, start_bno, type); > + if (xg->xg_gno == end_gno) > + ri_high.rm_startblock = > + xfs_fsb_to_gbno(mp, end_bno, type); > + > + lost.startblock = ri_low.rm_startblock; > + lost.blockcount = min(xg->xg_block_count, > + ri_high.rm_startblock + 1) - > + ri_low.rm_startblock; > + > + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, > + xfs_verify_report_data_lost, &lost); > + xfs_btree_del_cursor(cur, error); > + if (agf_bp) > + xfs_trans_brelse(tp, agf_bp); > + if (rtg) > + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); > + if (error) { > + xfs_group_put(xg); > + break; > + } > + } > + > + xfs_trans_cancel(tp); > + return 0; > +} > + > +/* > + * Compute the desired verify IO size. > + * > + * To minimize command overhead, we'd like to create bios that are 1MB, though > + * we allow the user to ask for a smaller size. > + */ > +STATIC unsigned int > +xfs_verify_iosize( > + const struct xfs_verify_media *me, > + struct xfs_buftarg *btp, > + uint64_t bbcount) > +{ > + unsigned int iosize = > + min_not_zero(SZ_1M, me->me_max_io_size); > + > + BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT); > + ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev)); > + > + return clamp(iosize, bdev_logical_block_size(btp->bt_bdev), > + BBTOB(bbcount)); > +} > +/* Allocate as much memory as we can get for verification buffer. */ > +STATIC struct folio * Can we please retired STATIC already? > +STATIC void > +xfs_verify_media_error( > + struct xfs_mount *mp, > + struct xfs_verify_media *me, > + struct xfs_buftarg *btp, > + xfs_daddr_t daddr, > + unsigned int bio_bbcount, > + blk_status_t bio_status) > +{ > + trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr, > + bio_bbcount, bio_status); > + > + /* > + * Pass any I/O error up to the caller if we didn't successfully verify > + * any bytes at all. > + */ > + if (me->me_start_daddr == daddr) > + me->me_ioerror = -blk_status_to_errno(bio_status); > + > + /* > + * PI validation failures, medium errors, or general IO errors are > + * treated as indicators of data loss. Everything else are (hopefully) > + * transient errors and are not reported. > + */ But still left in me->me_ioerror. Is that intentional? > + switch (me->me_dev) { > + case XFS_DEV_DATA: > + xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount); > + break; > + case XFS_DEV_RT: > + xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount); > + break; > + } At some point we really need dev_to_group_type and vice versa helpers.