kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mostafa Saleh <smostafa@google.com>
To: "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org>
Cc: kvm@vger.kernel.org, Suzuki K Poulose <Suzuki.Poulose@arm.com>,
	Steven Price <steven.price@arm.com>,
	Will Deacon <will@kernel.org>,
	Julien Thierry <julien.thierry.kdev@gmail.com>
Subject: Re: [RFC PATCH kvmtool 07/10] vfio/iommufd: Add basic iommufd support
Date: Sun, 27 Jul 2025 18:31:40 +0000	[thread overview]
Message-ID: <aIZwjA-wOPDPD9Co@google.com> (raw)
In-Reply-To: <20250525074917.150332-7-aneesh.kumar@kernel.org>

On Sun, May 25, 2025 at 01:19:13PM +0530, Aneesh Kumar K.V (Arm) wrote:
> This use a stage1 translate stage2 bypass iommu config.
> 
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
>  Makefile                 |   1 +
>  builtin-run.c            |   1 +
>  include/kvm/kvm-config.h |   1 +
>  include/kvm/vfio.h       |   2 +
>  vfio/core.c              |   5 +
>  vfio/iommufd.c           | 368 +++++++++++++++++++++++++++++++++++++++
>  6 files changed, 378 insertions(+)
>  create mode 100644 vfio/iommufd.c
> 
> diff --git a/Makefile b/Makefile
> index 8b2720f73386..740b95c7c3c3 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -64,6 +64,7 @@ OBJS	+= mmio.o
>  OBJS	+= pci.o
>  OBJS	+= term.o
>  OBJS	+= vfio/core.o
> +OBJS	+= vfio/iommufd.o
>  OBJS	+= vfio/pci.o
>  OBJS	+= vfio/legacy.o
>  OBJS	+= virtio/blk.o
> diff --git a/builtin-run.c b/builtin-run.c
> index 81f255f911b3..39198f9bc0d6 100644
> --- a/builtin-run.c
> +++ b/builtin-run.c
> @@ -262,6 +262,7 @@ static int loglevel_parser(const struct option *opt, const char *arg, int unset)
>  	OPT_CALLBACK('\0', "vfio-pci", NULL, "[domain:]bus:dev.fn",	\
>  		     "Assign a PCI device to the virtual machine",	\
>  		     vfio_device_parser, kvm),				\
> +	OPT_BOOLEAN('\0', "iommufd", &(cfg)->iommufd, "Use iommufd interface"),	\
>  									\
>  	OPT_GROUP("Debug options:"),					\
>  	OPT_CALLBACK_NOOPT('\0', "debug", kvm, NULL,			\
> diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
> index 592b035785c9..632eaf84b7eb 100644
> --- a/include/kvm/kvm-config.h
> +++ b/include/kvm/kvm-config.h
> @@ -65,6 +65,7 @@ struct kvm_config {
>  	bool ioport_debug;
>  	bool mmio_debug;
>  	int virtio_transport;
> +	bool iommufd;
>  };
>  
>  #endif
> diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
> index fed692b0f265..37a2b5ac3dad 100644
> --- a/include/kvm/vfio.h
> +++ b/include/kvm/vfio.h
> @@ -128,6 +128,8 @@ void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev);
>  
>  extern int (*dma_map_mem_range)(struct kvm *kvm, __u64 host_addr, __u64 iova, __u64 size);
>  extern int (*dma_unmap_mem_range)(struct kvm *kvm, __u64 iova, __u64 size);
> +int iommufd__init(struct kvm *kvm);
> +int iommufd__exit(struct kvm *kvm);
>  
>  struct kvm_mem_bank;
>  int vfio_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data);
> diff --git a/vfio/core.c b/vfio/core.c
> index 32a8e0fe67c0..0b1796c54ffd 100644
> --- a/vfio/core.c
> +++ b/vfio/core.c
> @@ -373,6 +373,8 @@ static int vfio__init(struct kvm *kvm)
>  	}
>  	kvm_vfio_device = device.fd;
>  
> +	if (kvm->cfg.iommufd)
> +		return iommufd__init(kvm);
>  	return legacy_vfio__init(kvm);
>  }
>  dev_base_init(vfio__init);
> @@ -393,6 +395,9 @@ static int vfio__exit(struct kvm *kvm)
>  
>  	free(kvm->cfg.vfio_devices);
>  
> +	if (kvm->cfg.iommufd)
> +		return iommufd__exit(kvm);
> +
>  	return legacy_vfio__exit(kvm);
>  }
>  dev_base_exit(vfio__exit);
> diff --git a/vfio/iommufd.c b/vfio/iommufd.c
> new file mode 100644
> index 000000000000..3728a06cb318
> --- /dev/null
> +++ b/vfio/iommufd.c
> @@ -0,0 +1,368 @@
> +#include <sys/types.h>
> +#include <dirent.h>
> +
> +#include "kvm/kvm.h"
> +#include <linux/iommufd.h>
> +#include <linux/list.h>
> +
> +#define VFIO_DEV_DIR		"/dev/vfio"
This duplicates the definition in the legacy file, so maybe move it to the header?

> +#define VFIO_DEV_NODE		VFIO_DEV_DIR "/devices/"
> +#define IOMMU_DEV		"/dev/iommu"
> +
> +static int iommu_fd;
> +static int ioas_id;
> +
> +static int __iommufd_configure_device(struct kvm *kvm, struct vfio_device *vdev)
> +{
> +	int ret;
> +
> +	vdev->info.argsz = sizeof(vdev->info);
> +	if (ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &vdev->info)) {
> +		ret = -errno;
> +		vfio_dev_err(vdev, "failed to get info");
> +		goto err_close_device;
> +	}
> +
> +	if (vdev->info.flags & VFIO_DEVICE_FLAGS_RESET &&
> +	    ioctl(vdev->fd, VFIO_DEVICE_RESET) < 0)
> +		vfio_dev_warn(vdev, "failed to reset device");
> +
> +	vdev->regions = calloc(vdev->info.num_regions, sizeof(*vdev->regions));
> +	if (!vdev->regions) {
> +		ret = -ENOMEM;
> +		goto err_close_device;
> +	}
> +
> +	/* Now for the bus-specific initialization... */
> +	switch (vdev->params->type) {
> +	case VFIO_DEVICE_PCI:
> +		BUG_ON(!(vdev->info.flags & VFIO_DEVICE_FLAGS_PCI));
> +		ret = vfio_pci_setup_device(kvm, vdev);
> +		break;
> +	default:
> +		BUG_ON(1);
> +		ret = -EINVAL;
> +	}
> +
> +	if (ret)
> +		goto err_free_regions;
> +
> +	vfio_dev_info(vdev, "assigned to device number 0x%x ",
> +		      vdev->dev_hdr.dev_num) ;
> +
> +	return 0;
> +
> +err_free_regions:
> +	free(vdev->regions);
> +err_close_device:
> +	close(vdev->fd);
> +
> +	return ret;
> +}
> +
> +static int iommufd_configure_device(struct kvm *kvm, struct vfio_device *vdev)
> +{
> +	int ret;
> +	DIR *dir = NULL;
> +	struct dirent *dir_ent;
> +	bool found_dev = false;
> +	char pci_dev_path[PATH_MAX];
> +	char vfio_dev_path[PATH_MAX];
> +	struct iommu_hwpt_alloc alloc_hwpt;
> +	struct vfio_device_bind_iommufd bind;
> +	struct vfio_device_attach_iommufd_pt attach_data;
> +
> +	ret = snprintf(pci_dev_path, PATH_MAX, "%s/vfio-dev/", vdev->sysfs_path);
> +	if (ret < 0 || ret == PATH_MAX)
> +		return -EINVAL;
> +
> +	dir = opendir(pci_dev_path);
> +	if (!dir)
> +		return -EINVAL;
> +
> +	while ((dir_ent = readdir(dir))) {
> +		if (!strncmp(dir_ent->d_name, "vfio", 4)) {
> +			ret = snprintf(vfio_dev_path, PATH_MAX, VFIO_DEV_NODE "%s", dir_ent->d_name);
> +			if (ret < 0 || ret == PATH_MAX) {
> +				ret = -EINVAL;
> +				goto err_close_dir;
> +			}
> +			found_dev = true;
> +			break;
> +		}
> +	}
> +	if (!found_dev) {
> +		ret = -ENODEV;
> +		goto err_close_dir;
> +	}

At this point we have already found the device — the error path above uses
"err_close_dir" — so there is no need for the extra "found_dev" flag.

> +
> +	vdev->fd = open(vfio_dev_path, O_RDWR);
> +	if (vdev->fd == -1) {
> +		ret = errno;
> +		pr_err("Failed to open %s", vfio_dev_path);
> +		goto err_close_dir;
> +	}
> +
> +	struct kvm_device_attr attr = {
> +		.group = KVM_DEV_VFIO_FILE,
> +		.attr = KVM_DEV_VFIO_FILE_ADD,
> +		.addr = (__u64)&vdev->fd,
> +	};
> +
> +	if (ioctl(kvm_vfio_device, KVM_SET_DEVICE_ATTR, &attr)) {
> +		ret = -errno;
> +		pr_err("Failed KVM_SET_DEVICE_ATTR for KVM_DEV_VFIO_FILE");
> +		goto err_close_device;
> +	}
> +
> +	bind.argsz = sizeof(bind);
> +	bind.flags = 0;
> +	bind.iommufd = iommu_fd;
> +
> +	/* now bind the iommufd */
> +	if (ioctl(vdev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) {
> +		ret = -errno;
> +		vfio_dev_err(vdev, "failed to get info");
> +		goto err_close_device;
> +	}
> +
> +	alloc_hwpt.size = sizeof(struct iommu_hwpt_alloc);
> +	alloc_hwpt.flags = 0;
> +	alloc_hwpt.dev_id = bind.out_devid;
> +	alloc_hwpt.pt_id = ioas_id;
> +	alloc_hwpt.data_type = IOMMU_HWPT_DATA_NONE;
> +	alloc_hwpt.data_len = 0;
> +	alloc_hwpt.data_uptr = 0;
> +
> +	if (ioctl(iommu_fd, IOMMU_HWPT_ALLOC, &alloc_hwpt)) {
> +		ret = -errno;
> +		pr_err("Failed to allocate HWPT");
> +		goto err_close_device;
> +	}
> +
> +	attach_data.argsz = sizeof(attach_data);
> +	attach_data.flags = 0;
> +	attach_data.pt_id = alloc_hwpt.out_hwpt_id;
> +
> +	if (ioctl(vdev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data)) {
> +		ret = -errno;
> +		vfio_dev_err(vdev, "failed to attach to IOAS ");

Extra space.

> +		goto err_close_device;
> +	}
> +
> +	closedir(dir);
> +	return __iommufd_configure_device(kvm, vdev);
> +
> +err_close_device:
> +	close(vdev->fd);
> +err_close_dir:
> +	closedir(dir);
> +	return ret;
> +}
> +
> +static int iommufd_configure_devices(struct kvm *kvm)
> +{
> +	int i, ret;
> +
> +	for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) {
> +		ret = iommufd_configure_device(kvm, &vfio_devices[i]);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int iommufd_create_ioas(struct kvm *kvm)
> +{
> +	int ret;
> +	struct iommu_ioas_alloc alloc_data;
> +	iommu_fd = open(IOMMU_DEV, O_RDWR);
> +	if (iommu_fd == -1) {
> +		ret = errno;
> +		pr_err("Failed to open %s", IOMMU_DEV);
> +		return ret;
> +	}
> +
> +	alloc_data.size = sizeof(alloc_data);
> +	alloc_data.flags = 0;
> +
> +	if (ioctl(iommu_fd, IOMMU_IOAS_ALLOC, &alloc_data)) {
> +		ret = errno;

For all other ioctls we return -errno, but here we return errno; is there a
reason for that?

> +		pr_err("Failed to alloc IOAS ");
Also, there is an extra space at the end of the message; it would also be more
consistent with the rest of the code to use "vfio_dev_err".

> +		goto err_close_device;
> +	}
> +	ioas_id = alloc_data.out_ioas_id;
> +	return 0;
> +
> +err_close_device:
> +	close(iommu_fd);
> +	return ret;
> +}
> +
> +static int vfio_device_init(struct kvm *kvm, struct vfio_device *vdev)
> +{
> +	int ret, dirfd;
> +	char *group_name;
> +	unsigned long group_id;
> +	char dev_path[PATH_MAX];
> +	struct vfio_group *group = NULL;
> +
> +	ret = snprintf(dev_path, PATH_MAX, "/sys/bus/%s/devices/%s",
> +		       vdev->params->bus, vdev->params->name);
> +	if (ret < 0 || ret == PATH_MAX)
> +		return -EINVAL;
> +
> +	vdev->sysfs_path = strndup(dev_path, PATH_MAX);
> +	if (!vdev->sysfs_path)
> +		return -ENOMEM;
> +
> +	/* Find IOMMU group for this device */
> +	dirfd = open(vdev->sysfs_path, O_DIRECTORY | O_PATH | O_RDONLY);
> +	if (dirfd < 0) {
> +		vfio_dev_err(vdev, "failed to open '%s'", vdev->sysfs_path);
> +		return -errno;
> +	}
> +
> +	ret = readlinkat(dirfd, "iommu_group", dev_path, PATH_MAX);
> +	if (ret < 0) {
> +		vfio_dev_err(vdev, "no iommu_group");
> +		goto out_close;
> +	}
> +	if (ret == PATH_MAX) {
> +		ret =  -ENOMEM;
> +		goto out_close;
> +	}
> +
> +	dev_path[ret] = '\0';
> +	group_name = basename(dev_path);
> +	errno = 0;
> +	group_id = strtoul(group_name, NULL, 10);
> +	if (errno) {
> +		ret = -errno;
> +		goto out_close;
> +	}
> +
> +	list_for_each_entry(group, &vfio_groups, list) {
> +		if (group->id == group_id) {
> +			group->refs++;
> +			break;
> +		}
> +	}
> +	if (group->id != group_id) {
> +		group = calloc(1, sizeof(*group));
> +		if (!group) {
> +			ret = -ENOMEM;
> +			goto out_close;
> +		}
> +		group->id	= group_id;
> +		group->refs	= 1;
> +		/* no group fd for iommufd */
> +		group->fd	= -1;
> +		list_add(&group->list, &vfio_groups);
> +	}
> +	vdev->group = group;
> +	ret = 0;
> +

There is some duplication with “vfio_group_get_for_dev”, I wonder if we could
re-use some of this code in a helper.

> +out_close:
> +	close(dirfd);
> +	return ret;
> +}
> +
> +static int iommufd_map_mem_range(struct kvm *kvm, __u64 host_addr, __u64 iova, __u64 size)
> +{
> +	int ret = 0;
> +	struct iommu_ioas_map dma_map;
> +
> +	dma_map.size = sizeof(dma_map);
> +	dma_map.flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE |
> +			IOMMU_IOAS_MAP_FIXED_IOVA;
> +	dma_map.ioas_id = ioas_id;
> +	dma_map.__reserved = 0;
> +	dma_map.user_va = host_addr;
> +	dma_map.iova = iova;
> +	dma_map.length = size;
> +
> +	/* Map the guest memory for DMA (i.e. provide isolation) */
> +	if (ioctl(iommu_fd, IOMMU_IOAS_MAP, &dma_map)) {
> +		ret = -errno;
> +		pr_err("Failed to map 0x%llx -> 0x%llx (%u) for DMA",
> +		       dma_map.iova, dma_map.user_va, dma_map.size);
> +	}
> +
> +	return ret;
> +}
> +
> +static int iommufd_unmap_mem_range(struct kvm *kvm,  __u64 iova, __u64 size)
> +{
> +	int ret = 0;
> +	struct iommu_ioas_unmap dma_unmap;
> +
> +	dma_unmap.size = sizeof(dma_unmap);
> +	dma_unmap.ioas_id = ioas_id;
> +	dma_unmap.iova = iova;
> +	dma_unmap.length = size;
> +
> +	if (ioctl(iommu_fd, IOMMU_IOAS_UNMAP, &dma_unmap)) {
> +		ret = -errno;
> +		if (ret != -ENOENT)
> +			pr_err("Failed to unmap 0x%llx - size (%u) for DMA %d",
> +			       dma_unmap.iova, dma_unmap.size, ret);
> +	}
> +
> +	return ret;
> +}
> +
> +static int iommufd_map_mem_bank(struct kvm *kvm, struct kvm_mem_bank *bank, void *data)
> +{
> +	return iommufd_map_mem_range(kvm, (u64)bank->host_addr, bank->guest_phys_addr, bank->size);
> +}
> +
> +static int iommufd_configure_reserved_mem(struct kvm *kvm)
> +{
> +	int ret;
> +	struct vfio_group *group;
> +
> +	list_for_each_entry(group, &vfio_groups, list) {
> +		ret = vfio_configure_reserved_regions(kvm, group);
> +		if (ret)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
> +int iommufd__init(struct kvm *kvm)
> +{
> +	int ret, i;
> +
> +	for (i = 0; i < kvm->cfg.num_vfio_devices; ++i) {
> +		vfio_devices[i].params = &kvm->cfg.vfio_devices[i];
> +
> +		ret = vfio_device_init(kvm, &vfio_devices[i]);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	ret = iommufd_create_ioas(kvm);
> +	if (ret)
> +		return ret;
> +
> +	ret = iommufd_configure_devices(kvm);
> +	if (ret)
> +		return ret;
> +

Any failure after this point will just return, although iommufd_create_ioas()
would “close(iommu_fd)” on failure.
Also, don’t we want to close “iommu_fd” at exit similar to the VFIO container?

Thanks,
Mostafa

> +	ret = iommufd_configure_reserved_mem(kvm);
> +	if (ret)
> +		return ret;
> +
> +	dma_map_mem_range = iommufd_map_mem_range;
> +	dma_unmap_mem_range = iommufd_unmap_mem_range;
> +	/* Now map the full memory */
> +	return kvm__for_each_mem_bank(kvm, KVM_MEM_TYPE_RAM, iommufd_map_mem_bank,
> +				      NULL);
> +}
> +
> +int iommufd__exit(struct kvm *kvm)
> +{
> +	return 0;
> +}
> -- 
> 2.43.0
> 

  reply	other threads:[~2025-07-27 18:31 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-25  7:49 [RFC PATCH kvmtool 01/10] vfio: Associate vm instance with vfio fd Aneesh Kumar K.V (Arm)
2025-05-25  7:49 ` [RFC PATCH kvmtool 02/10] vfio: Rename some functions Aneesh Kumar K.V (Arm)
2025-07-27 18:20   ` Mostafa Saleh
2025-07-29  4:53     ` Aneesh Kumar K.V
2025-05-25  7:49 ` [RFC PATCH kvmtool 03/10] vfio: Create new file legacy.c Aneesh Kumar K.V (Arm)
2025-07-27 18:23   ` Mostafa Saleh
2025-07-29  4:59     ` Aneesh Kumar K.V
2025-05-25  7:49 ` [RFC PATCH kvmtool 04/10] vfio: Update vfio header from linux kernel Aneesh Kumar K.V (Arm)
2025-07-27 18:23   ` Mostafa Saleh
2025-05-25  7:49 ` [RFC PATCH kvmtool 05/10] vfio: Add dma map/unmap handlers Aneesh Kumar K.V (Arm)
2025-07-27 18:25   ` Mostafa Saleh
2025-07-29  5:03     ` Aneesh Kumar K.V
2025-05-25  7:49 ` [RFC PATCH kvmtool 06/10] vfio/iommufd: Import iommufd header from kernel Aneesh Kumar K.V (Arm)
2025-07-27 18:25   ` Mostafa Saleh
2025-05-25  7:49 ` [RFC PATCH kvmtool 07/10] vfio/iommufd: Add basic iommufd support Aneesh Kumar K.V (Arm)
2025-07-27 18:31   ` Mostafa Saleh [this message]
2025-07-29  5:12     ` Aneesh Kumar K.V
2025-07-29  9:38       ` Mostafa Saleh
2025-05-25  7:49 ` [RFC PATCH kvmtool 08/10] vfio/iommufd: Move the hwpt allocation to helper Aneesh Kumar K.V (Arm)
2025-07-27 18:32   ` Mostafa Saleh
2025-07-29  5:14     ` Aneesh Kumar K.V
2025-07-29  9:43       ` Mostafa Saleh
2025-05-25  7:49 ` [RFC PATCH kvmtool 09/10] vfio/iommufd: Add viommu and vdevice objects Aneesh Kumar K.V (Arm)
2025-07-21 12:27   ` Will Deacon
2025-07-24 14:09     ` Aneesh Kumar K.V
2025-08-04 22:33       ` Suzuki K Poulose
2025-08-08 13:00         ` Will Deacon
2025-08-11  6:16           ` Aneesh Kumar K.V
2025-07-27 18:35   ` Mostafa Saleh
2025-07-29  5:19     ` Aneesh Kumar K.V
2025-07-29  9:41       ` Mostafa Saleh
2025-07-30  8:13         ` Aneesh Kumar K.V
2025-07-30 14:15           ` Mostafa Saleh
2025-07-31  4:39             ` Aneesh Kumar K.V
2025-08-04 15:07               ` Mostafa Saleh
2025-05-25  7:49 ` [RFC PATCH kvmtool 10/10] util/update_headers: Add vfio related header files to update list Aneesh Kumar K.V (Arm)
2025-07-27 18:35   ` Mostafa Saleh
2025-07-27 18:19 ` [RFC PATCH kvmtool 01/10] vfio: Associate vm instance with vfio fd Mostafa Saleh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aIZwjA-wOPDPD9Co@google.com \
    --to=smostafa@google.com \
    --cc=Suzuki.Poulose@arm.com \
    --cc=aneesh.kumar@kernel.org \
    --cc=julien.thierry.kdev@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=steven.price@arm.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).