From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-alma10-1.taild15c8.ts.net [100.103.45.18]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 32941286415 for ; Fri, 26 Jun 2026 08:48:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=100.103.45.18 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1782463724; cv=none; b=uFLPQTEnIUPaBgzOHjD/0U5QsJcnCt2UszXL61ochVyPBjazOyKTHRqpMFtqA96WMFqeDIbi57WsR9LH8Viu/+Kstm/V4GrnRJ/Ul2g5GI3FIWPJ0YZ5WWcFnRjFygMf64O+AbNnASQTM2+tBrNjNOUHwGMR3F8LN6VnM/+0IDY= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1782463724; c=relaxed/simple; bh=8SKE5+w+nUC99O0PQPXOslSSjU0k/cav2IEwIOqB1o4=; h=From:Subject:To:Cc:In-Reply-To:References:Content-Type:Date: Message-Id; b=pz+jq+4qO17jzznVZs+8TIgD2vkJ7ZsF890cnxYhvOZZc3EI7r++G3Ciwun/uIOYHQeVzZ7aNDHZEMLD6gcfPutlxXX+2m9FqY52313TrkxnaPoOABJz9Nu9uBBYIO6GuyKL+OM2g7gslx0qWz3kSE5oNsKlyVsSbwmrQUHyEFI= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=Z8Jnl+m6; arc=none smtp.client-ip=100.103.45.18 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Z8Jnl+m6" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 172581F000E9; Fri, 26 Jun 2026 08:48:42 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=kernel.org; s=k20260515; t=1782463722; bh=cTIFKFfXeX2xk6Z/mOOtvMk5UiCmmucmn3EbcJ2LHY8=; h=From:Subject:Reply-To:To:Cc:In-Reply-To:References:Date; b=Z8Jnl+m6DjHc7CVmpKUhwANNj2WDS3FcduGMba1zX2VL5ET2+pzphsXsgLTMxtUCN hPO3On6q8DmtVygIHrROSY+BM73Xo6qFGiOF+wvmu4X+U+A3fcrNWiSQi/yaozfZ/D KChQDe9kayLgfBQB9IdNwDMgkBhcHU5mJfABiFmorRCY/T8t+mM0Y6UYgdRtNZXcXX lWeWuTi/D/7nlF3rHLsy/+u2Ee5MifksM9XTxZ4i16gv08V7bVRwAcf7YWOJQJMG2X jsXWgvCVAbbywPtJdb3lK0m5/irqDPiD+93GF4avRQ/pVQW/Sj6OHDbrVH/tAjZbmB 92RtoYJhNXxRw== From: sashiko-bot@kernel.org Subject: Re: [RFC PATCH] KVM: rust: add Rust reimplementation of eventfd Reply-To: sashiko-reviews@lists.linux.dev To: leixiang Cc: kvm@vger.kernel.org, ojeda@kernel.org In-Reply-To: <20260626083212.39611-1-leixiang@kylinos.cn> References: <20260626083212.39611-1-leixiang@kylinos.cn> Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: quoted-printable Date: Fri, 26 Jun 2026 08:48:41 +0000 Message-Id: <20260626084842.172581F000E9@smtp.kernel.org> Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Thank you for your contribution! Sashiko AI review found 7 potential issue(= s) to consider: - [Critical] Recursive spinlock acquisition causing guaranteed deadlock in = `irqfd_wakeup`. - [Critical] ABBA deadlock between `irqfds.lock` and waitqueue lock intenti= onally hidden from lockdep. - [High] `kvm_deassign_ioeventfd_idx` fails to free the `Ioeventfd` device,= leaking memory and eventfd references. - [High] MutexGuard dropped too late, holding `irq_lock` across `synchroniz= e_srcu_expedited`. - [High] Logic error in `irqfd_wakeup` injects spurious interrupts for deac= tivating irqfds. - [High] Unsafe RCU list traversal in `kvm_notify_irqfd_resampler` lacks `r= cu_dereference` memory barriers. - [Medium] Unaligned memory read causes undefined behavior in `ioeventfd_in= _range`. -- commit 3427a3ec37eb356c05fc7c711d26f74d692d5086 Author: ninol Subject: KVM: rust: add Rust reimplementation of eventfd This patch introduces a Rust reimplementation of KVM's eventfd support, pro= viding irqfd and ioeventfd functionality. It uses RAII guards for lock mana= gement and is selectable via a new Kconfig option. > diff --git a/rust/kernel/kvm/eventfd.rs b/rust/kernel/kvm/eventfd.rs > new file mode 100644 > index 0000000000000..d9b12c7caccb0 > --- /dev/null > +++ b/rust/kernel/kvm/eventfd.rs [ ... ] > +unsafe extern "C" fn irqfd_wakeup( > + wait: *mut bindings::wait_queue_entry, > + _mode: c_uint, > + _sync: c_int, > + key: *mut c_void, > +) -> c_int { > + // SAFETY: wait is embedded in kvm_kernel_irqfd. > + let irqfd: *mut bindings::kvm_kernel_irqfd =3D > + crate::container_of!(wait, bindings::kvm_kernel_irqfd, wait); > + let flags =3D (key as usize) as bindings::__poll_t; > + let kvm =3D (*irqfd).kvm; > + let mut ret: c_int =3D 0; > + > + if (flags & bindings::POLLIN) !=3D 0 { > + let mut cnt: u64 =3D 0; > + eventfd_ctx_do_read((*irqfd).eventfd, &raw mut cnt); [Severity: Critical] Does this cause a recursive spinlock deadlock? This irqfd_wakeup() function= is invoked from the eventfd waitqueue wakeup path where the waitqueue spinlock (wqh.lock) is already held. Calling eventfd_ctx_do_read() here will try to acquire that exact same spinlock again, resulting in an immediate deadlock. > + > + // SAFETY: kvm->irq_srcu is valid. > + let _srcu_guard =3D unsafe { SrcuGuard::new(&raw mut (*kvm).irq_= srcu) }; > + > + // Read irq_entry under seqcount protection to avoid torn reads > + // when kvm_irq_routing_update() writes concurrently on another = CPU. > + let mut irq =3D loop { > + let seq =3D unsafe { bindings::read_seqcount_begin(&raw mut = (*irqfd).irq_entry_sc) }; > + let entry =3D unsafe { core::ptr::read(&(*irqfd).irq_entry) = }; > + if unsafe { bindings::read_seqcount_retry(&raw mut (*irqfd).= irq_entry_sc, seq) } =3D=3D 0 { > + break entry; > + } > + }; > + > + let deactivating =3D !irqfd_is_active(irqfd); > + if deactivating > + || unsafe { > + bindings::kvm_arch_set_irq_inatomic( > + &raw mut irq, > + kvm, > + bindings::KVM_USERSPACE_IRQ_SOURCE_ID as i32, > + 1, > + false, > + ) > + } =3D=3D -(bindings::EWOULDBLOCK as c_int) > + { > + // SAFETY: FFI call. > + unsafe { bindings::schedule_work(&raw mut (*irqfd).inject) }; > + } [Severity: High] Does this logic mistakenly inject spurious interrupts when an irqfd is deactivating? If the deactivating condition is true, the || operator short-circuits and evaluates the entire condition to true, meaning we will schedule work for deactivating irqfds. This could lead to use-after-free issues if the work executes after the irqfd memory has been freed. > + > + // _srcu_guard is dropped here > + ret =3D 1; > + } > + > + if (flags & bindings::POLLHUP) !=3D 0 { > + // SAFETY: Taking irqfds.lock is safe here per original C commen= t. > + // SAFETY: kvm is valid, irqfds.lock is valid. > + let _spin_guard =3D unsafe { SpinLockIrqSaveGuard::new(&raw mut = (*kvm).irqfds.lock) }; [Severity: Critical] Is there an ABBA deadlock risk here? This acquires irqfds.lock while the waitqueue lock (wqh.lock) is already held by the caller. However, in kvm_irqfd_register(), irqfds.lock is acquired before calling add_wait_queue_priority_exclusive(), which in turn acquires wqh.lock. This lock inversion can trigger a deadlock when registering and closing eventfds concurrently. [ ... ] > +unsafe extern "C" fn kvm_irqfd_register( > + _file: *mut bindings::file, > + wqh: *mut bindings::wait_queue_head, > + pt: *mut bindings::poll_table, > +) { [ ... ] > + // Initialize wait queue entry with our custom wakeup handler. > + init_waitqueue_func_entry(&raw mut (*irqfd).wait, Some(irqfd_wakeup)= ); > + > + // Temporarily lie to lockdep about holding irqfds.lock to avoid a > + // false positive: add_wait_queue_priority_exclusive takes wqh->lock > + // internally, and lockdep would see irqfds.lock -> wqh->lock, which > + // matches the reverse order in irqfd_wakeup(EPOLLHUP). > + // SAFETY: `kvm` is a valid pointer, and `irqfds.lock` is held. > + unsafe { bindings::rust_helper_kvm_irqfds_spin_release(kvm) }; > + > + // Add as priority waiter on eventfd. > + (*p).ret =3D add_wait_queue_priority_exclusive(wqh, &raw mut (*irqfd= ).wait); > + > + // SAFETY: `kvm` is valid, lockdep state is restored after > + // releasing wait queue lock. > + unsafe { bindings::rust_helper_kvm_irqfds_spin_acquire(kvm) }; [Severity: Critical] This isn't a false positive; silencing lockdep here hides the real ABBA deadlock mentioned above between irqfd_wakeup() and this function. Could we restructure the locking to avoid this inversion instead of hiding it from lockdep? [ ... ] > +#[inline(always)] > +unsafe fn ioeventfd_in_range( > + p: &Ioeventfd, > + addr: bindings::gpa_t, > + len: c_int, > + val: *const c_void, > +) -> bool { [ ... ] > + if let Some(datamatch) =3D p.datamatch() { > + // KVM guarantees val is naturally aligned; assert to match C's = BUG_ON. > + debug_assert!( > + (val as usize) % (len as usize) =3D=3D 0, > + "ioeventfd_in_range: val pointer not aligned to len" > + ); > + let _val: u64 =3D match len { > + 1 =3D> unsafe { (val as *const u8).read() as u64 }, > + 2 =3D> unsafe { (val as *const u16).read() as u64 }, > + 4 =3D> unsafe { (val as *const u32).read() as u64 }, > + 8 =3D> unsafe { (val as *const u64).read() as u64 }, > + _ =3D> return false, > + }; [Severity: Medium] Could this result in undefined behavior? In release builds, debug_assert! compiles away, and ptr::read() strictly requires the input pointer to be aligned. If the pointer happens to be unaligned, this invokes undefined behavior. Using ptr::read_unaligned() would be safer here. [ ... ] > +unsafe fn kvm_deassign_ioeventfd_idx( > + kvm: *mut bindings::kvm, > + bus_idx: bindings::kvm_bus, > + args: *const bindings::kvm_ioeventfd, > +) -> c_int { [ ... ] > + // SAFETY: kvm_io_bus_unregister_dev will trigger the iodevi= ce destructor, > + // which implicitly performs list_del and kfree(p). Using `n= ext` above avoids UAF. > + unsafe { bindings::kvm_io_bus_unregister_dev(kvm, bus_idx, &= raw mut (*p).dev) }; > + let bus =3D unsafe { bindings::kvm_get_bus(kvm, bus_idx) }; > + if !bus.is_null() { > + unsafe { (*bus).ioeventfd_count -=3D 1 }; > + } > + ret =3D 0; > + break; > + } > + // _mutex_guard drops here, unlocking before eventfd_ctx_put. > + } [Severity: High] Does this code leak the Ioeventfd and eventfd references? The assumption in the comment that kvm_io_bus_unregister_dev() triggers the destructor is incorrect; it only removes the device from the bus array. The caller is sti= ll responsible for explicitly deleting the list entry and freeing the memory. [ ... ] > +pub unsafe extern "C" fn kvm_notify_irqfd_resampler( > + kvm: *mut bindings::kvm, > + irqchip: c_uint, > + pin: c_uint, > +) -> bool { > + // SAFETY: kvm->irq_srcu is valid. > + let _srcu_guard =3D unsafe { SrcuGuard::new(&raw mut (*kvm).irq_srcu= ) }; > + let gsi =3D unsafe { bindings::kvm_irq_map_chip_pin(kvm, irqchip, pi= n) }; > + > + if gsi !=3D -1 { > + let mut pos =3D (*kvm).irqfds.resampler_list.next; > + let head =3D &raw mut (*kvm).irqfds.resampler_list; > + while pos !=3D head { > + let resampler: *mut bindings::kvm_kernel_irqfd_resampler =3D > + crate::container_of!(pos, bindings::kvm_kernel_irqfd_res= ampler, link); [Severity: High] Can this plain pointer traversal cause a data race? Accessing an RCU list v= ia direct pos.next dereferences under an SRCU read lock lacks the memory barri= er provided by rcu_dereference(), which is required on weakly ordered architectures to prevent reading uninitialized state. [ ... ] > +pub unsafe extern "C" fn kvm_unregister_irq_ack_notifier( > + kvm: *mut bindings::kvm, > + kian: *mut bindings::kvm_irq_ack_notifier, > +) { > + // SAFETY: Caller guarantees kvm is valid. > + let _mutex_guard =3D unsafe { MutexGuard::new(&raw mut (*kvm).irq_lo= ck) }; > + unsafe { bindings::rust_helper_hlist_del_init_rcu(&raw mut (*kian).l= ink) }; > + // _mutex_guard dropped here > + unsafe { bindings::synchronize_srcu_expedited(&raw mut (*kvm).irq_sr= cu) }; > + unsafe { bindings::kvm_arch_post_irq_ack_notifier_list_update(kvm) }; > +} [Severity: High] Is the _mutex_guard actually dropped where the comment claims? Because _mutex_guard is not explicitly dropped (e.g., via drop(_mutex_guard) or wrapping in an inner block), its lifetime extends to the end of the function scope. This means irq_lock is held across the blocking synchronize_srcu_expedited() call, violating the locking hierarchy and potentially causing a deadlock. --=20 Sashiko AI review =C2=B7 https://sashiko.dev/#/patchset/20260626083212.3961= 1-1-leixiang@kylinos.cn?part=3D1