Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* RE: [Patch v3 1/4] RDMA/mana_ib : Rename all mana_ib_dev type variables to mib_dev
From: Long Li @ 2023-07-28 21:23 UTC (permalink / raw)
  To: sharmaajay@linuxonhyperv.com, Jason Gunthorpe, Leon Romanovsky,
	Dexuan Cui, Wei Liu, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Ajay Sharma
In-Reply-To: <1690402104-29518-2-git-send-email-sharmaajay@linuxonhyperv.com>



> -----Original Message-----
> From: sharmaajay@linuxonhyperv.com <sharmaajay@linuxonhyperv.com>
> Sent: Wednesday, July 26, 2023 1:08 PM
> To: Jason Gunthorpe <jgg@ziepe.ca>; Leon Romanovsky <leon@kernel.org>;
> Dexuan Cui <decui@microsoft.com>; Wei Liu <wei.liu@kernel.org>; David S.
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>;
> Jakub Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>
> Cc: linux-rdma@vger.kernel.org; linux-hyperv@vger.kernel.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Ajay Sharma
> <sharmaajay@microsoft.com>
> Subject: [Patch v3 1/4] RDMA/mana_ib : Rename all mana_ib_dev type
> variables to mib_dev
> 
> From: Ajay Sharma <sharmaajay@microsoft.com>
> 
> This patch does not introduce any functional changes. It creates a naming
> convention that distinguishes the device variables, especially when they are
> used in the same function. Rename all mana_ib_dev type variables to mib_dev
> to have a clean separation between eth dev and ibdev variables.
> 
> Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
> ---
>  drivers/infiniband/hw/mana/cq.c      | 12 ++--
>  drivers/infiniband/hw/mana/device.c  | 34 +++++------
>  drivers/infiniband/hw/mana/main.c    | 87 ++++++++++++++--------------
>  drivers/infiniband/hw/mana/mana_ib.h |  9 +--
>  drivers/infiniband/hw/mana/mr.c      | 29 +++++-----
>  drivers/infiniband/hw/mana/qp.c      | 82 +++++++++++++-------------
>  drivers/infiniband/hw/mana/wq.c      | 21 +++----
>  7 files changed, 140 insertions(+), 134 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mana/cq.c
> b/drivers/infiniband/hw/mana/cq.c index d141cab8a1e6..1aed4e6360ba
> 100644
> --- a/drivers/infiniband/hw/mana/cq.c
> +++ b/drivers/infiniband/hw/mana/cq.c
> @@ -11,10 +11,10 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const
> struct ib_cq_init_attr *attr,
>  	struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
>  	struct ib_device *ibdev = ibcq->device;
>  	struct mana_ib_create_cq ucmd = {};
> -	struct mana_ib_dev *mdev;
> +	struct mana_ib_dev *mib_dev;
>  	int err;
> 
> -	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> 
>  	if (udata->inlen < sizeof(ucmd))
>  		return -EINVAL;
> @@ -41,7 +41,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
> ib_cq_init_attr *attr,
>  		return err;
>  	}
> 
> -	err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq-
> >gdma_region);
> +	err = mana_ib_gd_create_dma_region(mib_dev, cq->umem,
> +&cq->gdma_region);
>  	if (err) {
>  		ibdev_dbg(ibdev,
>  			  "Failed to create dma region for create cq, %d\n",
> @@ -68,11 +68,11 @@ int mana_ib_destroy_cq(struct ib_cq *ibcq, struct
> ib_udata *udata)  {
>  	struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
>  	struct ib_device *ibdev = ibcq->device;
> -	struct mana_ib_dev *mdev;
> +	struct mana_ib_dev *mib_dev;
> 
> -	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> 
> -	mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region);
> +	mana_ib_gd_destroy_dma_region(mib_dev, cq->gdma_region);
>  	ib_umem_release(cq->umem);
> 
>  	return 0;
> diff --git a/drivers/infiniband/hw/mana/device.c
> b/drivers/infiniband/hw/mana/device.c
> index d4541b8707e4..083f27246ba8 100644
> --- a/drivers/infiniband/hw/mana/device.c
> +++ b/drivers/infiniband/hw/mana/device.c
> @@ -51,51 +51,51 @@ static int mana_ib_probe(struct auxiliary_device
> *adev,  {
>  	struct mana_adev *madev = container_of(adev, struct mana_adev,
> adev);
>  	struct gdma_dev *mdev = madev->mdev;
> +	struct mana_ib_dev *mib_dev;
>  	struct mana_context *mc;
> -	struct mana_ib_dev *dev;
>  	int ret;
> 
>  	mc = mdev->driver_data;
> 
> -	dev = ib_alloc_device(mana_ib_dev, ib_dev);
> -	if (!dev)
> +	mib_dev = ib_alloc_device(mana_ib_dev, ib_dev);
> +	if (!mib_dev)
>  		return -ENOMEM;
> 
> -	ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops);
> +	ib_set_device_ops(&mib_dev->ib_dev, &mana_ib_dev_ops);
> 
> -	dev->ib_dev.phys_port_cnt = mc->num_ports;
> +	mib_dev->ib_dev.phys_port_cnt = mc->num_ports;
> 
> -	ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n",
> mdev,
> -		  mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt);
> +	ibdev_dbg(&mib_dev->ib_dev, "mdev=%p id=%d num_ports=%d\n",
> mdev,
> +		  mdev->dev_id.as_uint32, mib_dev->ib_dev.phys_port_cnt);
> 
> -	dev->gdma_dev = mdev;
> -	dev->ib_dev.node_type = RDMA_NODE_IB_CA;
> +	mib_dev->gdma_dev = mdev;
> +	mib_dev->ib_dev.node_type = RDMA_NODE_IB_CA;
> 
>  	/*
>  	 * num_comp_vectors needs to set to the max MSIX index
>  	 * when interrupts and event queues are implemented
>  	 */
> -	dev->ib_dev.num_comp_vectors = 1;
> -	dev->ib_dev.dev.parent = mdev->gdma_context->dev;
> +	mib_dev->ib_dev.num_comp_vectors = 1;
> +	mib_dev->ib_dev.dev.parent = mdev->gdma_context->dev;
> 
> -	ret = ib_register_device(&dev->ib_dev, "mana_%d",
> +	ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
>  				 mdev->gdma_context->dev);
>  	if (ret) {
> -		ib_dealloc_device(&dev->ib_dev);
> +		ib_dealloc_device(&mib_dev->ib_dev);
>  		return ret;
>  	}
> 
> -	dev_set_drvdata(&adev->dev, dev);
> +	dev_set_drvdata(&adev->dev, mib_dev);
> 
>  	return 0;
>  }
> 
>  static void mana_ib_remove(struct auxiliary_device *adev)  {
> -	struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev);
> +	struct mana_ib_dev *mib_dev = dev_get_drvdata(&adev->dev);
> 
> -	ib_unregister_device(&dev->ib_dev);
> -	ib_dealloc_device(&dev->ib_dev);
> +	ib_unregister_device(&mib_dev->ib_dev);
> +	ib_dealloc_device(&mib_dev->ib_dev);
>  }
> 
>  static const struct auxiliary_device_id mana_id_table[] = { diff --git
> a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
> index 7be4c3adb4e2..189e774cdab6 100644
> --- a/drivers/infiniband/hw/mana/main.c
> +++ b/drivers/infiniband/hw/mana/main.c
> @@ -5,10 +5,10 @@
> 
>  #include "mana_ib.h"
> 
> -void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd
> *pd,
> +void mana_ib_uncfg_vport(struct mana_ib_dev *mib_dev, struct
> mana_ib_pd
> +*pd,
>  			 u32 port)
>  {
> -	struct gdma_dev *gd = dev->gdma_dev;
> +	struct gdma_dev *gd = mib_dev->gdma_dev;
>  	struct mana_port_context *mpc;
>  	struct net_device *ndev;
>  	struct mana_context *mc;
> @@ -28,10 +28,11 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev,
> struct mana_ib_pd *pd,
>  	mutex_unlock(&pd->vport_mutex);
>  }
> 
> -int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct
> mana_ib_pd *pd,
> +int mana_ib_cfg_vport(struct mana_ib_dev *mib_dev, u32 port,
> +		      struct mana_ib_pd *pd,
>  		      u32 doorbell_id)
>  {
> -	struct gdma_dev *mdev = dev->gdma_dev;
> +	struct gdma_dev *mdev = mib_dev->gdma_dev;
>  	struct mana_port_context *mpc;
>  	struct mana_context *mc;
>  	struct net_device *ndev;
> @@ -45,7 +46,7 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32
> port, struct mana_ib_pd *pd,
> 
>  	pd->vport_use_count++;
>  	if (pd->vport_use_count > 1) {
> -		ibdev_dbg(&dev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Skip as this PD is already configured vport\n");
>  		mutex_unlock(&pd->vport_mutex);
>  		return 0;
> @@ -56,7 +57,8 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32
> port, struct mana_ib_pd *pd,
>  		pd->vport_use_count--;
>  		mutex_unlock(&pd->vport_mutex);
> 
> -		ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n",
> err);
> +		ibdev_dbg(&mib_dev->ib_dev, "Failed to configure
> vPort %d\n",
> +			  err);
>  		return err;
>  	}
> 
> @@ -65,7 +67,7 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32
> port, struct mana_ib_pd *pd,
>  	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
>  	pd->tx_vp_offset = mpc->tx_vp_offset;
> 
> -	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x
> doorbell_id %x\n",
> +	ibdev_dbg(&mib_dev->ib_dev, "vport handle %llx pdid %x doorbell_id
> +%x\n",
>  		  mpc->port_handle, pd->pdn, doorbell_id);
> 
>  	return 0;
> @@ -78,12 +80,12 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct
> ib_udata *udata)
>  	struct gdma_create_pd_resp resp = {};
>  	struct gdma_create_pd_req req = {};
>  	enum gdma_pd_flags flags = 0;
> -	struct mana_ib_dev *dev;
> +	struct mana_ib_dev *mib_dev;
>  	struct gdma_dev *mdev;
>  	int err;
> 
> -	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> -	mdev = dev->gdma_dev;
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	mdev = mib_dev->gdma_dev;
> 
>  	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
>  			     sizeof(resp));
> @@ -93,7 +95,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct
> ib_udata *udata)
>  				   sizeof(resp), &resp);
> 
>  	if (err || resp.hdr.status) {
> -		ibdev_dbg(&dev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to get pd_id err %d status %u\n", err,
>  			  resp.hdr.status);
>  		if (!err)
> @@ -104,7 +106,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct
> ib_udata *udata)
> 
>  	pd->pd_handle = resp.pd_handle;
>  	pd->pdn = resp.pd_id;
> -	ibdev_dbg(&dev->ib_dev, "pd_handle 0x%llx pd_id %d\n",
> +	ibdev_dbg(&mib_dev->ib_dev, "pd_handle 0x%llx pd_id %d\n",
>  		  pd->pd_handle, pd->pdn);
> 
>  	mutex_init(&pd->vport_mutex);
> @@ -118,12 +120,12 @@ int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct
> ib_udata *udata)
>  	struct ib_device *ibdev = ibpd->device;
>  	struct gdma_destory_pd_resp resp = {};
>  	struct gdma_destroy_pd_req req = {};
> -	struct mana_ib_dev *dev;
> +	struct mana_ib_dev *mib_dev;
>  	struct gdma_dev *mdev;
>  	int err;
> 
> -	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> -	mdev = dev->gdma_dev;
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	mdev = mib_dev->gdma_dev;
> 
>  	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req),
>  			     sizeof(resp));
> @@ -133,7 +135,7 @@ int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct
> ib_udata *udata)
>  				   sizeof(resp), &resp);
> 
>  	if (err || resp.hdr.status) {
> -		ibdev_dbg(&dev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to destroy pd_handle 0x%llx err %d
> status %u",
>  			  pd->pd_handle, err, resp.hdr.status);
>  		if (!err)
> @@ -204,14 +206,14 @@ int mana_ib_alloc_ucontext(struct ib_ucontext
> *ibcontext,
>  	struct mana_ib_ucontext *ucontext =
>  		container_of(ibcontext, struct mana_ib_ucontext,
> ibucontext);
>  	struct ib_device *ibdev = ibcontext->device;
> -	struct mana_ib_dev *mdev;
> +	struct mana_ib_dev *mib_dev;
>  	struct gdma_context *gc;
>  	struct gdma_dev *dev;
>  	int doorbell_page;
>  	int ret;
> 
> -	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> -	dev = mdev->gdma_dev;
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	dev = mib_dev->gdma_dev;
>  	gc = dev->gdma_context;
> 
>  	/* Allocate a doorbell page index */
> @@ -233,12 +235,12 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext
> *ibcontext)
>  	struct mana_ib_ucontext *mana_ucontext =
>  		container_of(ibcontext, struct mana_ib_ucontext,
> ibucontext);
>  	struct ib_device *ibdev = ibcontext->device;
> -	struct mana_ib_dev *mdev;
> +	struct mana_ib_dev *mib_dev;
>  	struct gdma_context *gc;
>  	int ret;
> 
> -	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> -	gc = mdev->gdma_dev->gdma_context;
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	gc = mib_dev->gdma_dev->gdma_context;
> 
>  	ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext-
> >doorbell);
>  	if (ret)
> @@ -246,7 +248,7 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext
> *ibcontext)  }
> 
>  static int
> -mana_ib_gd_first_dma_region(struct mana_ib_dev *dev,
> +mana_ib_gd_first_dma_region(struct mana_ib_dev *mib_dev,
>  			    struct gdma_context *gc,
>  			    struct gdma_create_dma_region_req *create_req,
>  			    size_t num_pages, mana_handle_t *gdma_region,
> @@ -263,7 +265,7 @@ mana_ib_gd_first_dma_region(struct mana_ib_dev
> *dev,
>  	err = mana_gd_send_request(gc, create_req_msg_size, create_req,
>  				   sizeof(create_resp), &create_resp);
>  	if (err || create_resp.hdr.status != expected_status) {
> -		ibdev_dbg(&dev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to create DMA region: %d, 0x%x\n",
>  			  err, create_resp.hdr.status);
>  		if (!err)
> @@ -273,14 +275,14 @@ mana_ib_gd_first_dma_region(struct
> mana_ib_dev *dev,
>  	}
> 
>  	*gdma_region = create_resp.dma_region_handle;
> -	ibdev_dbg(&dev->ib_dev, "Created DMA region handle 0x%llx\n",
> +	ibdev_dbg(&mib_dev->ib_dev, "Created DMA region handle
> 0x%llx\n",
>  		  *gdma_region);
> 
>  	return 0;
>  }
> 
>  static int
> -mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct
> gdma_context *gc,
> +mana_ib_gd_add_dma_region(struct mana_ib_dev *mib_dev, struct
> +gdma_context *gc,
>  			  struct gdma_dma_region_add_pages_req *add_req,
>  			  unsigned int num_pages, u32 expected_status)
> { @@ -296,7 +298,7 @@ mana_ib_gd_add_dma_region(struct mana_ib_dev
> *dev, struct gdma_context *gc,
>  	err = mana_gd_send_request(gc, add_req_msg_size, add_req,
>  				   sizeof(add_resp), &add_resp);
>  	if (err || add_resp.hdr.status != expected_status) {
> -		ibdev_dbg(&dev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to create DMA region: %d, 0x%x\n",
>  			  err, add_resp.hdr.status);
> 
> @@ -309,7 +311,8 @@ mana_ib_gd_add_dma_region(struct mana_ib_dev
> *dev, struct gdma_context *gc,
>  	return 0;
>  }
> 
> -int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct
> ib_umem *umem,
> +int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
> +				 struct ib_umem *umem,
>  				 mana_handle_t *gdma_region)
>  {
>  	struct gdma_dma_region_add_pages_req *add_req = NULL; @@ -
> 329,14 +332,14 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev
> *dev, struct ib_umem *umem,
>  	void *request_buf;
>  	int err;
> 
> -	mdev = dev->gdma_dev;
> +	mdev = mib_dev->gdma_dev;
>  	gc = mdev->gdma_context;
>  	hwc = gc->hwc.driver_data;
> 
>  	/* Hardware requires dma region to align to chosen page size */
>  	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
>  	if (!page_sz) {
> -		ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
> +		ibdev_dbg(&mib_dev->ib_dev, "failed to find page size.\n");
>  		return -ENOMEM;
>  	}
>  	num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
> @@ -362,13 +365,13 @@ int mana_ib_gd_create_dma_region(struct
> mana_ib_dev *dev, struct ib_umem *umem,
>  	create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT;
>  	create_req->page_count = num_pages_total;
> 
> -	ibdev_dbg(&dev->ib_dev, "size_dma_region %lu
> num_pages_total %lu\n",
> +	ibdev_dbg(&mib_dev->ib_dev, "size_dma_region %lu
> num_pages_total
> +%lu\n",
>  		  umem->length, num_pages_total);
> 
> -	ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n",
> +	ibdev_dbg(&mib_dev->ib_dev, "page_sz %lu offset_in_page %u\n",
>  		  page_sz, create_req->offset_in_page);
> 
> -	ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu,
> gdma_page_type %u",
> +	ibdev_dbg(&mib_dev->ib_dev, "num_pages_to_handle %lu,
> gdma_page_type
> +%u",
>  		  num_pages_to_handle, create_req->gdma_page_type);
> 
>  	page_addr_list = create_req->page_addr_list; @@ -385,7 +388,7 @@
> int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct
> ib_umem *umem,
> 
>  		if (!num_pages_processed) {
>  			/* First create message */
> -			err = mana_ib_gd_first_dma_region(dev, gc,
> create_req,
> +			err = mana_ib_gd_first_dma_region(mib_dev, gc,
> create_req,
>  							  tail, gdma_region,
>  							  expected_status);
>  			if (err)
> @@ -400,7 +403,7 @@ int mana_ib_gd_create_dma_region(struct
> mana_ib_dev *dev, struct ib_umem *umem,
>  			page_addr_list = add_req->page_addr_list;
>  		} else {
>  			/* Subsequent create messages */
> -			err = mana_ib_gd_add_dma_region(dev, gc, add_req,
> tail,
> +			err = mana_ib_gd_add_dma_region(mib_dev, gc,
> add_req, tail,
>  							expected_status);
>  			if (err)
>  				break;
> @@ -417,20 +420,20 @@ int mana_ib_gd_create_dma_region(struct
> mana_ib_dev *dev, struct ib_umem *umem,
>  	}
> 
>  	if (err)
> -		mana_ib_gd_destroy_dma_region(dev, *gdma_region);
> +		mana_ib_gd_destroy_dma_region(mib_dev, *gdma_region);
> 
>  out:
>  	kfree(request_buf);
>  	return err;
>  }
> 
> -int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, u64
> gdma_region)
> +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *mib_dev, u64
> +gdma_region)
>  {
> -	struct gdma_dev *mdev = dev->gdma_dev;
> +	struct gdma_dev *mdev = mib_dev->gdma_dev;
>  	struct gdma_context *gc;
> 
>  	gc = mdev->gdma_context;
> -	ibdev_dbg(&dev->ib_dev, "destroy dma region 0x%llx\n",
> gdma_region);
> +	ibdev_dbg(&mib_dev->ib_dev, "destroy dma region 0x%llx\n",
> +gdma_region);
> 
>  	return mana_gd_destroy_dma_region(gc, gdma_region);  } @@ -
> 440,14 +443,14 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext,
> struct vm_area_struct *vma)
>  	struct mana_ib_ucontext *mana_ucontext =
>  		container_of(ibcontext, struct mana_ib_ucontext,
> ibucontext);
>  	struct ib_device *ibdev = ibcontext->device;
> -	struct mana_ib_dev *mdev;
> +	struct mana_ib_dev *mib_dev;
>  	struct gdma_context *gc;
>  	phys_addr_t pfn;
>  	pgprot_t prot;
>  	int ret;
> 
> -	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> -	gc = mdev->gdma_dev->gdma_context;
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	gc = mib_dev->gdma_dev->gdma_context;
> 
>  	if (vma->vm_pgoff != 0) {
>  		ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma-
> >vm_pgoff); diff --git a/drivers/infiniband/hw/mana/mana_ib.h
> b/drivers/infiniband/hw/mana/mana_ib.h
> index 502cc8672eef..ee4efd0af278 100644
> --- a/drivers/infiniband/hw/mana/mana_ib.h
> +++ b/drivers/infiniband/hw/mana/mana_ib.h
> @@ -92,10 +92,11 @@ struct mana_ib_rwq_ind_table {
>  	struct ib_rwq_ind_table ib_ind_table;
>  };
> 
> -int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct
> ib_umem *umem,
> +int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
> +				 struct ib_umem *umem,
>  				 mana_handle_t *gdma_region);
> 
> -int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev,
> +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *mib_dev,
>  				  mana_handle_t gdma_region);
> 
>  struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, @@ -129,9 +130,9 @@
> int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
> 
>  int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
> 
> -int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port_id,
> +int mana_ib_cfg_vport(struct mana_ib_dev *mib_dev, u32 port_id,
>  		      struct mana_ib_pd *pd, u32 doorbell_id); -void
> mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
> +void mana_ib_uncfg_vport(struct mana_ib_dev *mib_dev, struct
> mana_ib_pd
> +*pd,
>  			 u32 port);
> 
>  int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
> diff --git a/drivers/infiniband/hw/mana/mr.c
> b/drivers/infiniband/hw/mana/mr.c index 351207c60eb6..f6a53906204d
> 100644
> --- a/drivers/infiniband/hw/mana/mr.c
> +++ b/drivers/infiniband/hw/mana/mr.c
> @@ -25,12 +25,13 @@ mana_ib_verbs_to_gdma_access_flags(int
> access_flags)
>  	return flags;
>  }
> 
> -static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct
> mana_ib_mr *mr,
> +static int mana_ib_gd_create_mr(struct mana_ib_dev *mib_dev,
> +				struct mana_ib_mr *mr,
>  				struct gdma_create_mr_params *mr_params)
> {
> +	struct gdma_dev *mdev = mib_dev->gdma_dev;
>  	struct gdma_create_mr_response resp = {};
>  	struct gdma_create_mr_request req = {};
> -	struct gdma_dev *mdev = dev->gdma_dev;
>  	struct gdma_context *gc;
>  	int err;
> 
> @@ -49,7 +50,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev
> *dev, struct mana_ib_mr *mr,
>  		break;
> 
>  	default:
> -		ibdev_dbg(&dev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "invalid param (GDMA_MR_TYPE) passed,
> type %d\n",
>  			  req.mr_type);
>  		return -EINVAL;
> @@ -58,7 +59,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev
> *dev, struct mana_ib_mr *mr,
>  	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> &resp);
> 
>  	if (err || resp.hdr.status) {
> -		ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err,
> +		ibdev_dbg(&mib_dev->ib_dev, "Failed to create mr %d, %u",
> err,
>  			  resp.hdr.status);
>  		if (!err)
>  			err = -EPROTO;
> @@ -73,11 +74,11 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev
> *dev, struct mana_ib_mr *mr,
>  	return 0;
>  }
> 
> -static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle)
> +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *mib_dev, u64
> +mr_handle)
>  {
>  	struct gdma_destroy_mr_response resp = {};
> +	struct gdma_dev *mdev = mib_dev->gdma_dev;
>  	struct gdma_destroy_mr_request req = {};
> -	struct gdma_dev *mdev = dev->gdma_dev;
>  	struct gdma_context *gc;
>  	int err;
> 
> @@ -107,12 +108,12 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd
> *ibpd, u64 start, u64 length,
>  	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd,
> ibpd);
>  	struct gdma_create_mr_params mr_params = {};
>  	struct ib_device *ibdev = ibpd->device;
> -	struct mana_ib_dev *dev;
> +	struct mana_ib_dev *mib_dev;
>  	struct mana_ib_mr *mr;
>  	u64 dma_region_handle;
>  	int err;
> 
> -	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> 
>  	ibdev_dbg(ibdev,
>  		  "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x",
> @@ -133,7 +134,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd
> *ibpd, u64 start, u64 length,
>  		goto err_free;
>  	}
> 
> -	err = mana_ib_gd_create_dma_region(dev, mr->umem,
> &dma_region_handle);
> +	err = mana_ib_gd_create_dma_region(mib_dev, mr->umem,
> +&dma_region_handle);
>  	if (err) {
>  		ibdev_dbg(ibdev, "Failed create dma region for user-
> mr, %d\n",
>  			  err);
> @@ -151,7 +152,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd
> *ibpd, u64 start, u64 length,
>  	mr_params.gva.access_flags =
>  		mana_ib_verbs_to_gdma_access_flags(access_flags);
> 
> -	err = mana_ib_gd_create_mr(dev, mr, &mr_params);
> +	err = mana_ib_gd_create_mr(mib_dev, mr, &mr_params);
>  	if (err)
>  		goto err_dma_region;
> 
> @@ -164,7 +165,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd
> *ibpd, u64 start, u64 length,
>  	return &mr->ibmr;
> 
>  err_dma_region:
> -	mana_gd_destroy_dma_region(dev->gdma_dev->gdma_context,
> +	mana_gd_destroy_dma_region(mib_dev->gdma_dev-
> >gdma_context,
>  				   dma_region_handle);
> 
>  err_umem:
> @@ -179,12 +180,12 @@ int mana_ib_dereg_mr(struct ib_mr *ibmr, struct
> ib_udata *udata)  {
>  	struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr,
> ibmr);
>  	struct ib_device *ibdev = ibmr->device;
> -	struct mana_ib_dev *dev;
> +	struct mana_ib_dev *mib_dev;
>  	int err;
> 
> -	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> 
> -	err = mana_ib_gd_destroy_mr(dev, mr->mr_handle);
> +	err = mana_ib_gd_destroy_mr(mib_dev, mr->mr_handle);
>  	if (err)
>  		return err;
> 
> diff --git a/drivers/infiniband/hw/mana/qp.c
> b/drivers/infiniband/hw/mana/qp.c index 4b3b5b274e84..2e3a57123ed7
> 100644
> --- a/drivers/infiniband/hw/mana/qp.c
> +++ b/drivers/infiniband/hw/mana/qp.c
> @@ -5,7 +5,7 @@
> 
>  #include "mana_ib.h"
> 
> -static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev,
> +static int mana_ib_cfg_vport_steering(struct mana_ib_dev *mib_dev,
>  				      struct net_device *ndev,
>  				      mana_handle_t default_rxobj,
>  				      mana_handle_t ind_table[],
> @@ -21,7 +21,7 @@ static int mana_ib_cfg_vport_steering(struct
> mana_ib_dev *dev,
>  	u32 req_buf_size;
>  	int i, err;
> 
> -	mdev = dev->gdma_dev;
> +	mdev = mib_dev->gdma_dev;
>  	gc = mdev->gdma_context;
> 
>  	req_buf_size =
> @@ -55,10 +55,10 @@ static int mana_ib_cfg_vport_steering(struct
> mana_ib_dev *dev,
>  	 * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb
>  	 * ind_table to MANA_INDIRECT_TABLE_SIZE if required
>  	 */
> -	ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 <<
> log_ind_tbl_size);
> +	ibdev_dbg(&mib_dev->ib_dev, "ind table size %u\n", 1 <<
> +log_ind_tbl_size);
>  	for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) {
>  		req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)];
> -		ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i,
> +		ibdev_dbg(&mib_dev->ib_dev, "index %u handle 0x%llx\n", i,
>  			  req_indir_tab[i]);
>  	}
> 
> @@ -68,7 +68,7 @@ static int mana_ib_cfg_vport_steering(struct
> mana_ib_dev *dev,
>  	else
>  		netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE);
> 
> -	ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n",
> +	ibdev_dbg(&mib_dev->ib_dev, "vport handle %llu default_rxobj
> +0x%llx\n",
>  		  req->vport, default_rxobj);
> 
>  	err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp),
> &resp); @@ -97,12 +97,12 @@ static int mana_ib_create_qp_rss(struct ib_qp
> *ibqp, struct ib_pd *pd,
>  				 struct ib_udata *udata)
>  {
>  	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp,
> ibqp);
> -	struct mana_ib_dev *mdev =
> +	struct mana_ib_dev *mib_dev =
>  		container_of(pd->device, struct mana_ib_dev, ib_dev);
>  	struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
>  	struct mana_ib_create_qp_rss_resp resp = {};
>  	struct mana_ib_create_qp_rss ucmd = {};
> -	struct gdma_dev *gd = mdev->gdma_dev;
> +	struct gdma_dev *gd = mib_dev->gdma_dev;

These declarations need to follow the "reverse Christmas tree" style (local variable declarations ordered from longest line to shortest), like the rest of the driver.

>  	mana_handle_t *mana_ind_table;
>  	struct mana_port_context *mpc;
>  	struct mana_context *mc;
> @@ -123,21 +123,21 @@ static int mana_ib_create_qp_rss(struct ib_qp
> *ibqp, struct ib_pd *pd,
> 
>  	ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata-
> >inlen));
>  	if (ret) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed copy from udata for create rss-qp, err %d\n",
>  			  ret);
>  		return ret;
>  	}
> 
>  	if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Requested max_recv_wr %d exceeding limit\n",
>  			  attr->cap.max_recv_wr);
>  		return -EINVAL;
>  	}
> 
>  	if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Requested max_recv_sge %d exceeding limit\n",
>  			  attr->cap.max_recv_sge);
>  		return -EINVAL;
> @@ -145,14 +145,14 @@ static int mana_ib_create_qp_rss(struct ib_qp
> *ibqp, struct ib_pd *pd,
> 
>  	ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size;
>  	if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Indirect table size %d exceeding limit\n",
>  			  ind_tbl_size);
>  		return -EINVAL;
>  	}
> 
>  	if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "RX Hash function is not supported, %d\n",
>  			  ucmd.rx_hash_function);
>  		return -EINVAL;
> @@ -161,14 +161,14 @@ static int mana_ib_create_qp_rss(struct ib_qp
> *ibqp, struct ib_pd *pd,
>  	/* IB ports start with 1, MANA start with 0 */
>  	port = ucmd.port;
>  	if (port < 1 || port > mc->num_ports) {
> -		ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating
> qp\n",
> +		ibdev_dbg(&mib_dev->ib_dev, "Invalid port %u in creating
> qp\n",
>  			  port);
>  		return -EINVAL;
>  	}
>  	ndev = mc->ports[port - 1];
>  	mpc = netdev_priv(ndev);
> 
> -	ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n",
> +	ibdev_dbg(&mib_dev->ib_dev, "rx_hash_function %d port %d\n",
>  		  ucmd.rx_hash_function, port);
> 
>  	mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t), @@ -
> 210,7 +210,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct
> ib_pd *pd,
>  		wq->id = wq_spec.queue_index;
>  		cq->id = cq_spec.queue_index;
> 
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "ret %d rx_object 0x%llx wq id %llu cq id %llu\n",
>  			  ret, wq->rx_object, wq->id, cq->id);
> 
> @@ -221,7 +221,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp,
> struct ib_pd *pd,
>  	}
>  	resp.num_entries = i;
> 
> -	ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object,
> +	ret = mana_ib_cfg_vport_steering(mib_dev, ndev, wq->rx_object,
>  					 mana_ind_table,
>  					 ind_tbl->log_ind_tbl_size,
>  					 ucmd.rx_hash_key_len,
> @@ -231,7 +231,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp,
> struct ib_pd *pd,
> 
>  	ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
>  	if (ret) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to copy to udata create rss-qp, %d\n",
>  			  ret);
>  		goto fail;
> @@ -259,7 +259,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp,
> struct ib_pd *ibpd,  {
>  	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd,
> ibpd);
>  	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp,
> ibqp);
> -	struct mana_ib_dev *mdev =
> +	struct mana_ib_dev *mib_dev =
>  		container_of(ibpd->device, struct mana_ib_dev, ib_dev);
>  	struct mana_ib_cq *send_cq =
>  		container_of(attr->send_cq, struct mana_ib_cq, ibcq); @@ -
> 267,7 +267,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp,
> struct ib_pd *ibpd,
>  		rdma_udata_to_drv_context(udata, struct
> mana_ib_ucontext,
>  					  ibucontext);
>  	struct mana_ib_create_qp_resp resp = {};
> -	struct gdma_dev *gd = mdev->gdma_dev;
> +	struct gdma_dev *gd = mib_dev->gdma_dev;
>  	struct mana_ib_create_qp ucmd = {};
>  	struct mana_obj_spec wq_spec = {};
>  	struct mana_obj_spec cq_spec = {};
> @@ -285,7 +285,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp,
> struct ib_pd *ibpd,
> 
>  	err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata-
> >inlen));
>  	if (err) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to copy from udata create qp-raw, %d\n",
> err);
>  		return err;
>  	}
> @@ -296,14 +296,14 @@ static int mana_ib_create_qp_raw(struct ib_qp
> *ibqp, struct ib_pd *ibpd,
>  		return -EINVAL;
> 
>  	if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Requested max_send_wr %d exceeding limit\n",
>  			  attr->cap.max_send_wr);
>  		return -EINVAL;
>  	}
> 
>  	if (attr->cap.max_send_sge > MAX_TX_WQE_SGL_ENTRIES) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Requested max_send_sge %d exceeding limit\n",
>  			  attr->cap.max_send_sge);
>  		return -EINVAL;
> @@ -311,38 +311,38 @@ static int mana_ib_create_qp_raw(struct ib_qp
> *ibqp, struct ib_pd *ibpd,
> 
>  	ndev = mc->ports[port - 1];
>  	mpc = netdev_priv(ndev);
> -	ibdev_dbg(&mdev->ib_dev, "port %u ndev %p mpc %p\n", port,
> ndev, mpc);
> +	ibdev_dbg(&mib_dev->ib_dev, "port %u ndev %p mpc %p\n", port,
> ndev,
> +mpc);
> 
> -	err = mana_ib_cfg_vport(mdev, port - 1, pd, mana_ucontext-
> >doorbell);
> +	err = mana_ib_cfg_vport(mib_dev, port - 1, pd,
> +mana_ucontext->doorbell);
>  	if (err)
>  		return -ENODEV;
> 
>  	qp->port = port;
> 
> -	ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n",
> +	ibdev_dbg(&mib_dev->ib_dev, "ucmd sq_buf_addr 0x%llx
> port %u\n",
>  		  ucmd.sq_buf_addr, ucmd.port);
> 
>  	umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr,
> ucmd.sq_buf_size,
>  			   IB_ACCESS_LOCAL_WRITE);
>  	if (IS_ERR(umem)) {
>  		err = PTR_ERR(umem);
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to get umem for create qp-raw, err %d\n",
>  			  err);
>  		goto err_free_vport;
>  	}
>  	qp->sq_umem = umem;
> 
> -	err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem,
> +	err = mana_ib_gd_create_dma_region(mib_dev, qp->sq_umem,
>  					   &qp->sq_gdma_region);
>  	if (err) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to create dma region for create qp-
> raw, %d\n",
>  			  err);
>  		goto err_release_umem;
>  	}
> 
> -	ibdev_dbg(&mdev->ib_dev,
> +	ibdev_dbg(&mib_dev->ib_dev,
>  		  "mana_ib_gd_create_dma_region ret %d gdma_region
> 0x%llx\n",
>  		  err, qp->sq_gdma_region);
> 
> @@ -358,7 +358,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp,
> struct ib_pd *ibpd,
>  	err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ,
> &wq_spec,
>  				 &cq_spec, &qp->tx_object);
>  	if (err) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to create wq for create raw-qp, err %d\n",
>  			  err);
>  		goto err_destroy_dma_region;
> @@ -371,7 +371,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp,
> struct ib_pd *ibpd,
>  	qp->sq_id = wq_spec.queue_index;
>  	send_cq->id = cq_spec.queue_index;
> 
> -	ibdev_dbg(&mdev->ib_dev,
> +	ibdev_dbg(&mib_dev->ib_dev,
>  		  "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err,
>  		  qp->tx_object, qp->sq_id, send_cq->id);
> 
> @@ -381,7 +381,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp,
> struct ib_pd *ibpd,
> 
>  	err = ib_copy_to_udata(udata, &resp, sizeof(resp));
>  	if (err) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed copy udata for create qp-raw, %d\n",
>  			  err);
>  		goto err_destroy_wq_obj;
> @@ -393,13 +393,13 @@ static int mana_ib_create_qp_raw(struct ib_qp
> *ibqp, struct ib_pd *ibpd,
>  	mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object);
> 
>  err_destroy_dma_region:
> -	mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region);
> +	mana_ib_gd_destroy_dma_region(mib_dev, qp->sq_gdma_region);
> 
>  err_release_umem:
>  	ib_umem_release(umem);
> 
>  err_free_vport:
> -	mana_ib_uncfg_vport(mdev, pd, port - 1);
> +	mana_ib_uncfg_vport(mib_dev, pd, port - 1);
> 
>  	return err;
>  }
> @@ -435,9 +435,9 @@ static int mana_ib_destroy_qp_rss(struct
> mana_ib_qp *qp,
>  				  struct ib_rwq_ind_table *ind_tbl,
>  				  struct ib_udata *udata)
>  {
> -	struct mana_ib_dev *mdev =
> +	struct mana_ib_dev *mib_dev =
>  		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
> -	struct gdma_dev *gd = mdev->gdma_dev;
> +	struct gdma_dev *gd = mib_dev->gdma_dev;
>  	struct mana_port_context *mpc;
>  	struct mana_context *mc;
>  	struct net_device *ndev;
> @@ -452,7 +452,7 @@ static int mana_ib_destroy_qp_rss(struct
> mana_ib_qp *qp,
>  	for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
>  		ibwq = ind_tbl->ind_tbl[i];
>  		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
> -		ibdev_dbg(&mdev->ib_dev, "destroying wq-
> >rx_object %llu\n",
> +		ibdev_dbg(&mib_dev->ib_dev, "destroying wq-
> >rx_object %llu\n",
>  			  wq->rx_object);
>  		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
>  	}
> @@ -462,9 +462,9 @@ static int mana_ib_destroy_qp_rss(struct
> mana_ib_qp *qp,
> 
>  static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata
> *udata)  {
> -	struct mana_ib_dev *mdev =
> +	struct mana_ib_dev *mib_dev =
>  		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
> -	struct gdma_dev *gd = mdev->gdma_dev;
> +	struct gdma_dev *gd = mib_dev->gdma_dev;
>  	struct ib_pd *ibpd = qp->ibqp.pd;
>  	struct mana_port_context *mpc;
>  	struct mana_context *mc;
> @@ -479,11 +479,11 @@ static int mana_ib_destroy_qp_raw(struct
> mana_ib_qp *qp, struct ib_udata *udata)
>  	mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object);
> 
>  	if (qp->sq_umem) {
> -		mana_ib_gd_destroy_dma_region(mdev, qp-
> >sq_gdma_region);
> +		mana_ib_gd_destroy_dma_region(mib_dev, qp-
> >sq_gdma_region);
>  		ib_umem_release(qp->sq_umem);
>  	}
> 
> -	mana_ib_uncfg_vport(mdev, pd, qp->port - 1);
> +	mana_ib_uncfg_vport(mib_dev, pd, qp->port - 1);
> 
>  	return 0;
>  }
> diff --git a/drivers/infiniband/hw/mana/wq.c
> b/drivers/infiniband/hw/mana/wq.c index 372d361510e0..56bc2b8b6690
> 100644
> --- a/drivers/infiniband/hw/mana/wq.c
> +++ b/drivers/infiniband/hw/mana/wq.c
> @@ -9,7 +9,7 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
>  				struct ib_wq_init_attr *init_attr,
>  				struct ib_udata *udata)
>  {
> -	struct mana_ib_dev *mdev =
> +	struct mana_ib_dev *mib_dev =
>  		container_of(pd->device, struct mana_ib_dev, ib_dev);
>  	struct mana_ib_create_wq ucmd = {};
>  	struct mana_ib_wq *wq;
> @@ -21,7 +21,7 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
> 
>  	err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata-
> >inlen));
>  	if (err) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to copy from udata for create wq, %d\n", err);
>  		return ERR_PTR(err);
>  	}
> @@ -30,13 +30,14 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
>  	if (!wq)
>  		return ERR_PTR(-ENOMEM);
> 
> -	ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n",
> ucmd.wq_buf_addr);
> +	ibdev_dbg(&mib_dev->ib_dev, "ucmd wq_buf_addr 0x%llx\n",
> +		  ucmd.wq_buf_addr);
> 
>  	umem = ib_umem_get(pd->device, ucmd.wq_buf_addr,
> ucmd.wq_buf_size,
>  			   IB_ACCESS_LOCAL_WRITE);
>  	if (IS_ERR(umem)) {
>  		err = PTR_ERR(umem);
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to get umem for create wq, err %d\n", err);
>  		goto err_free_wq;
>  	}
> @@ -46,15 +47,15 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
>  	wq->wq_buf_size = ucmd.wq_buf_size;
>  	wq->rx_object = INVALID_MANA_HANDLE;
> 
> -	err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq-
> >gdma_region);
> +	err = mana_ib_gd_create_dma_region(mib_dev, wq->umem,
> +&wq->gdma_region);
>  	if (err) {
> -		ibdev_dbg(&mdev->ib_dev,
> +		ibdev_dbg(&mib_dev->ib_dev,
>  			  "Failed to create dma region for create wq, %d\n",
>  			  err);
>  		goto err_release_umem;
>  	}
> 
> -	ibdev_dbg(&mdev->ib_dev,
> +	ibdev_dbg(&mib_dev->ib_dev,
>  		  "mana_ib_gd_create_dma_region ret %d gdma_region
> 0x%llx\n",
>  		  err, wq->gdma_region);
> 
> @@ -82,11 +83,11 @@ int mana_ib_destroy_wq(struct ib_wq *ibwq, struct
> ib_udata *udata)  {
>  	struct mana_ib_wq *wq = container_of(ibwq, struct mana_ib_wq,
> ibwq);
>  	struct ib_device *ib_dev = ibwq->device;
> -	struct mana_ib_dev *mdev;
> +	struct mana_ib_dev *mib_dev;
> 
> -	mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev);
> +	mib_dev = container_of(ib_dev, struct mana_ib_dev, ib_dev);
> 
> -	mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region);
> +	mana_ib_gd_destroy_dma_region(mib_dev, wq->gdma_region);
>  	ib_umem_release(wq->umem);
> 
>  	kfree(wq);
> --
> 2.25.1


^ permalink raw reply

* RE: [Patch v3 2/4] RDMA/mana_ib : Register Mana IB  device with Management SW
From: Long Li @ 2023-07-28 21:32 UTC (permalink / raw)
  To: sharmaajay@linuxonhyperv.com, Jason Gunthorpe, Leon Romanovsky,
	Dexuan Cui, Wei Liu, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Ajay Sharma
In-Reply-To: <1690402104-29518-3-git-send-email-sharmaajay@linuxonhyperv.com>

> Subject: [Patch v3 2/4] RDMA/mana_ib : Register Mana IB device with
> Management SW
> 
> [Some people who received this message don't often get email from
> sharmaajay@linuxonhyperv.com. Learn why this is important at
> https://aka.ms/LearnAboutSenderIdentification ]
> 
> From: Ajay Sharma <sharmaajay@microsoft.com>
> 
> Each of the MANA infiniband devices must be registered with the
> management software to request services/resources.
> Register the Mana IB device with Management which would later help get an
> adapter handle.
> 
> Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
> ---
>  drivers/infiniband/hw/mana/device.c           | 20 +++++--
>  drivers/infiniband/hw/mana/main.c             | 58 ++++++-------------
>  drivers/infiniband/hw/mana/mana_ib.h          |  1 +
>  drivers/infiniband/hw/mana/mr.c               | 17 ++----
>  drivers/infiniband/hw/mana/qp.c               | 10 ++--
>  .../net/ethernet/microsoft/mana/gdma_main.c   |  5 ++
>  include/net/mana/gdma.h                       |  3 +
>  7 files changed, 55 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mana/device.c
> b/drivers/infiniband/hw/mana/device.c
> index 083f27246ba8..ea4c8c8fc10d 100644
> --- a/drivers/infiniband/hw/mana/device.c
> +++ b/drivers/infiniband/hw/mana/device.c
> @@ -78,22 +78,34 @@ static int mana_ib_probe(struct auxiliary_device
> *adev,
>         mib_dev->ib_dev.num_comp_vectors = 1;
>         mib_dev->ib_dev.dev.parent = mdev->gdma_context->dev;
> 
> -       ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
> -                                mdev->gdma_context->dev);
> +       ret = mana_gd_register_device(&mib_dev->gc->mana_ib);

Is this device implemented on all existing Azure hosts? If not, it will break existing VMs.

Long

^ permalink raw reply

* [PATCH V4,net-next] net: mana: Add page pool for RX buffers
From: Haiyang Zhang @ 2023-07-28 21:46 UTC (permalink / raw)
  To: linux-hyperv, netdev
  Cc: haiyangz, decui, kys, paulros, olaf, vkuznets, davem, wei.liu,
	edumazet, kuba, pabeni, leon, longli, ssengar, linux-rdma, daniel,
	john.fastabend, bpf, ast, sharmaajay, hawk, tglx, shradhagupta,
	linux-kernel

Add a page pool for RX buffers to speed up buffer recycling and reduce
CPU usage.

The standard page pool API is used.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
V4:
Add nid setting, remove page_pool_nid_changed(), as suggested by
Jesper Dangaard Brouer
V3:
Update xdp mem model, pool param, alloc as suggested by Jakub Kicinski
V2:
Use the standard page pool API as suggested by Jesper Dangaard Brouer

---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 90 +++++++++++++++----
 include/net/mana/mana.h                       |  3 +
 2 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ac2acc9aca9d..83f2ac132990 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1414,8 +1414,8 @@ static struct sk_buff *mana_build_skb(struct mana_rxq *rxq, void *buf_va,
 	return skb;
 }
 
-static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
-			struct mana_rxq *rxq)
+static void mana_rx_skb(void *buf_va, bool from_pool,
+			struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq)
 {
 	struct mana_stats_rx *rx_stats = &rxq->stats;
 	struct net_device *ndev = rxq->ndev;
@@ -1448,6 +1448,9 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
 	if (!skb)
 		goto drop;
 
+	if (from_pool)
+		skb_mark_for_recycle(skb);
+
 	skb->dev = napi->dev;
 
 	skb->protocol = eth_type_trans(skb, ndev);
@@ -1498,9 +1501,14 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
 	u64_stats_update_end(&rx_stats->syncp);
 
 drop:
-	WARN_ON_ONCE(rxq->xdp_save_va);
-	/* Save for reuse */
-	rxq->xdp_save_va = buf_va;
+	if (from_pool) {
+		page_pool_recycle_direct(rxq->page_pool,
+					 virt_to_head_page(buf_va));
+	} else {
+		WARN_ON_ONCE(rxq->xdp_save_va);
+		/* Save for reuse */
+		rxq->xdp_save_va = buf_va;
+	}
 
 	++ndev->stats.rx_dropped;
 
@@ -1508,11 +1516,13 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
 }
 
 static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
-			     dma_addr_t *da, bool is_napi)
+			     dma_addr_t *da, bool *from_pool, bool is_napi)
 {
 	struct page *page;
 	void *va;
 
+	*from_pool = false;
+
 	/* Reuse XDP dropped page if available */
 	if (rxq->xdp_save_va) {
 		va = rxq->xdp_save_va;
@@ -1533,17 +1543,22 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
 			return NULL;
 		}
 	} else {
-		page = dev_alloc_page();
+		page = page_pool_dev_alloc_pages(rxq->page_pool);
 		if (!page)
 			return NULL;
 
+		*from_pool = true;
 		va = page_to_virt(page);
 	}
 
 	*da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
 			     DMA_FROM_DEVICE);
 	if (dma_mapping_error(dev, *da)) {
-		put_page(virt_to_head_page(va));
+		if (*from_pool)
+			page_pool_put_full_page(rxq->page_pool, page, is_napi);
+		else
+			put_page(virt_to_head_page(va));
+
 		return NULL;
 	}
 
@@ -1552,21 +1567,25 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
 
 /* Allocate frag for rx buffer, and save the old buf */
 static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq,
-			       struct mana_recv_buf_oob *rxoob, void **old_buf)
+			       struct mana_recv_buf_oob *rxoob, void **old_buf,
+			       bool *old_fp)
 {
+	bool from_pool;
 	dma_addr_t da;
 	void *va;
 
-	va = mana_get_rxfrag(rxq, dev, &da, true);
+	va = mana_get_rxfrag(rxq, dev, &da, &from_pool, true);
 	if (!va)
 		return;
 
 	dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
 			 DMA_FROM_DEVICE);
 	*old_buf = rxoob->buf_va;
+	*old_fp = rxoob->from_pool;
 
 	rxoob->buf_va = va;
 	rxoob->sgl[0].address = da;
+	rxoob->from_pool = from_pool;
 }
 
 static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
@@ -1580,6 +1599,7 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	struct device *dev = gc->dev;
 	void *old_buf = NULL;
 	u32 curr, pktlen;
+	bool old_fp;
 
 	apc = netdev_priv(ndev);
 
@@ -1622,12 +1642,12 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	rxbuf_oob = &rxq->rx_oobs[curr];
 	WARN_ON_ONCE(rxbuf_oob->wqe_inf.wqe_size_in_bu != 1);
 
-	mana_refill_rx_oob(dev, rxq, rxbuf_oob, &old_buf);
+	mana_refill_rx_oob(dev, rxq, rxbuf_oob, &old_buf, &old_fp);
 
 	/* Unsuccessful refill will have old_buf == NULL.
 	 * In this case, mana_rx_skb() will drop the packet.
 	 */
-	mana_rx_skb(old_buf, oob, rxq);
+	mana_rx_skb(old_buf, old_fp, oob, rxq);
 
 drop:
 	mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu);
@@ -1887,6 +1907,7 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 	struct mana_recv_buf_oob *rx_oob;
 	struct device *dev = gc->dev;
 	struct napi_struct *napi;
+	struct page *page;
 	int i;
 
 	if (!rxq)
@@ -1919,10 +1940,18 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 		dma_unmap_single(dev, rx_oob->sgl[0].address,
 				 rx_oob->sgl[0].size, DMA_FROM_DEVICE);
 
-		put_page(virt_to_head_page(rx_oob->buf_va));
+		page = virt_to_head_page(rx_oob->buf_va);
+
+		if (rx_oob->from_pool)
+			page_pool_put_full_page(rxq->page_pool, page, false);
+		else
+			put_page(page);
+
 		rx_oob->buf_va = NULL;
 	}
 
+	page_pool_destroy(rxq->page_pool);
+
 	if (rxq->gdma_rq)
 		mana_gd_destroy_queue(gc, rxq->gdma_rq);
 
@@ -1933,18 +1962,20 @@ static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
 			    struct mana_rxq *rxq, struct device *dev)
 {
 	struct mana_port_context *mpc = netdev_priv(rxq->ndev);
+	bool from_pool = false;
 	dma_addr_t da;
 	void *va;
 
 	if (mpc->rxbufs_pre)
 		va = mana_get_rxbuf_pre(rxq, &da);
 	else
-		va = mana_get_rxfrag(rxq, dev, &da, false);
+		va = mana_get_rxfrag(rxq, dev, &da, &from_pool, false);
 
 	if (!va)
 		return -ENOMEM;
 
 	rx_oob->buf_va = va;
+	rx_oob->from_pool = from_pool;
 
 	rx_oob->sgl[0].address = da;
 	rx_oob->sgl[0].size = rxq->datasize;
@@ -2014,6 +2045,26 @@ static int mana_push_wqe(struct mana_rxq *rxq)
 	return 0;
 }
 
+static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc)
+{
+	struct page_pool_params pprm = {};
+	int ret;
+
+	pprm.pool_size = RX_BUFFERS_PER_QUEUE;
+	pprm.nid = gc->numa_node;
+	pprm.napi = &rxq->rx_cq.napi;
+
+	rxq->page_pool = page_pool_create(&pprm);
+
+	if (IS_ERR(rxq->page_pool)) {
+		ret = PTR_ERR(rxq->page_pool);
+		rxq->page_pool = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
 static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 					u32 rxq_idx, struct mana_eq *eq,
 					struct net_device *ndev)
@@ -2043,6 +2094,13 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size,
 			   &rxq->headroom);
 
+	/* Create page pool for RX queue */
+	err = mana_create_page_pool(rxq, gc);
+	if (err) {
+		netdev_err(ndev, "Create page pool err:%d\n", err);
+		goto out;
+	}
+
 	err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size);
 	if (err)
 		goto out;
@@ -2114,8 +2172,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 
 	WARN_ON(xdp_rxq_info_reg(&rxq->xdp_rxq, ndev, rxq_idx,
 				 cq->napi.napi_id));
-	WARN_ON(xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq,
-					   MEM_TYPE_PAGE_SHARED, NULL));
+	WARN_ON(xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL,
+					   rxq->page_pool));
 
 	napi_enable(&cq->napi);
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 024ad8ddb27e..b12859511839 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -280,6 +280,7 @@ struct mana_recv_buf_oob {
 	struct gdma_wqe_request wqe_req;
 
 	void *buf_va;
+	bool from_pool; /* allocated from a page pool */
 
 	/* SGL of the buffer going to be sent has part of the work request. */
 	u32 num_sge;
@@ -330,6 +331,8 @@ struct mana_rxq {
 	bool xdp_flush;
 	int xdp_rc; /* XDP redirect return code */
 
+	struct page_pool *page_pool;
+
 	/* MUST BE THE LAST MEMBER:
 	 * Each receive buffer has an associated mana_recv_buf_oob.
 	 */
-- 
2.25.1


^ permalink raw reply related

* RE: [Patch v3 3/4] RDMA/mana_ib : Create adapter and Add error eq
From: Long Li @ 2023-07-28 21:56 UTC (permalink / raw)
  To: sharmaajay@linuxonhyperv.com, Jason Gunthorpe, Leon Romanovsky,
	Dexuan Cui, Wei Liu, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Ajay Sharma
In-Reply-To: <1690402104-29518-4-git-send-email-sharmaajay@linuxonhyperv.com>

> Subject: [Patch v3 3/4] RDMA/mana_ib : Create adapter and Add error eq
> 
> From: Ajay Sharma <sharmaajay@microsoft.com>
> 
> Create adapter object as nice container for VF resources.
> Add error eq needed for adapter creation and later used for notification from
> Management SW. The management software uses this channel to send
> messages or error notifications back to the Client.
> 
> Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
> ---
>  drivers/infiniband/hw/mana/device.c           |  22 ++-
>  drivers/infiniband/hw/mana/main.c             |  95 ++++++++++++
>  drivers/infiniband/hw/mana/mana_ib.h          |  33 ++++
>  .../net/ethernet/microsoft/mana/gdma_main.c   | 146 ++++++++++--------
>  drivers/net/ethernet/microsoft/mana/mana_en.c |   3 +
>  include/net/mana/gdma.h                       |  13 +-
>  6 files changed, 242 insertions(+), 70 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mana/device.c
> b/drivers/infiniband/hw/mana/device.c
> index ea4c8c8fc10d..4077e440657a 100644
> --- a/drivers/infiniband/hw/mana/device.c
> +++ b/drivers/infiniband/hw/mana/device.c
> @@ -68,7 +68,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
>  	ibdev_dbg(&mib_dev->ib_dev, "mdev=%p id=%d num_ports=%d\n",
> mdev,
>  		  mdev->dev_id.as_uint32, mib_dev->ib_dev.phys_port_cnt);
> 
> -	mib_dev->gdma_dev = mdev;
> +	mib_dev->gc = mdev->gdma_context;
>  	mib_dev->ib_dev.node_type = RDMA_NODE_IB_CA;
> 
>  	/*
> @@ -85,15 +85,31 @@ static int mana_ib_probe(struct auxiliary_device
> *adev,
>  		goto free_ib_device;
>  	}
> 
> +	ret = mana_ib_create_error_eq(mib_dev);
> +	if (ret) {
> +		ibdev_err(&mib_dev->ib_dev, "Failed to allocate err eq");
> +		goto deregister_device;
> +	}
> +
> +	ret = mana_ib_create_adapter(mib_dev);
> +	if (ret) {
> +		ibdev_err(&mib_dev->ib_dev, "Failed to create adapter");
> +		goto free_error_eq;
> +	}
> +
>  	ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
>  				 mdev->gdma_context->dev);
>  	if (ret)
> -		goto deregister_device;
> +		goto destroy_adapter;
> 
>  	dev_set_drvdata(&adev->dev, mib_dev);
> 
>  	return 0;
> 
> +destroy_adapter:
> +	mana_ib_destroy_adapter(mib_dev);
> +free_error_eq:
> +	mana_gd_destroy_queue(mib_dev->gc, mib_dev->fatal_err_eq);
>  deregister_device:
>  	mana_gd_deregister_device(&mib_dev->gc->mana_ib);
>  free_ib_device:
> @@ -105,6 +121,8 @@ static void mana_ib_remove(struct auxiliary_device
> *adev)  {
>  	struct mana_ib_dev *mib_dev = dev_get_drvdata(&adev->dev);
> 
> +	mana_gd_destroy_queue(mib_dev->gc, mib_dev->fatal_err_eq);
> +	mana_ib_destroy_adapter(mib_dev);
>  	mana_gd_deregister_device(&mib_dev->gc->mana_ib);
>  	ib_unregister_device(&mib_dev->ib_dev);
>  	ib_dealloc_device(&mib_dev->ib_dev);
> diff --git a/drivers/infiniband/hw/mana/main.c
> b/drivers/infiniband/hw/mana/main.c
> index 2c4e3c496644..1b1a8670d0fa 100644
> --- a/drivers/infiniband/hw/mana/main.c
> +++ b/drivers/infiniband/hw/mana/main.c
> @@ -504,3 +504,98 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32
> port, int index,  void mana_ib_disassociate_ucontext(struct ib_ucontext
> *ibcontext)  {  }
> +
> +int mana_ib_destroy_adapter(struct mana_ib_dev *mib_dev) {
> +	struct mana_ib_destroy_adapter_resp resp = {};
> +	struct mana_ib_destroy_adapter_req req = {};
> +	struct gdma_context *gc;
> +	int err;
> +
> +	gc = mib_dev->gc;
> +
> +	mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_ADAPTER,
> sizeof(req),
> +			     sizeof(resp));
> +	req.adapter = mib_dev->adapter_handle;
> +	req.hdr.dev_id = gc->mana_ib.dev_id;
> +
> +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> +&resp);
> +
> +	if (err) {
> +		ibdev_err(&mib_dev->ib_dev, "Failed to destroy adapter
> err %d", err);
> +		return err;
> +	}
> +
> +	return 0;
> +}
> +
> +int mana_ib_create_adapter(struct mana_ib_dev *mib_dev) {
> +	struct mana_ib_create_adapter_resp resp = {};
> +	struct mana_ib_create_adapter_req req = {};
> +	struct gdma_context *gc;
> +	int err;
> +
> +	gc = mib_dev->gc;
> +
> +	mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_ADAPTER,
> sizeof(req),
> +			     sizeof(resp));
> +	req.notify_eq_id = mib_dev->fatal_err_eq->id;
> +	req.hdr.dev_id = gc->mana_ib.dev_id;
> +
> +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> +&resp);
> +
> +	if (err) {
> +		ibdev_err(&mib_dev->ib_dev, "Failed to create adapter
> err %d",
> +			  err);
> +		return err;
> +	}
> +
> +	mib_dev->adapter_handle = resp.adapter;
> +
> +	return 0;
> +}
> +
> +static void mana_ib_soc_event_handler(void *ctx, struct gdma_queue
> *queue,
> +				      struct gdma_event *event)
> +{
> +	struct mana_ib_dev *mib_dev = (struct mana_ib_dev *)ctx;
> +
> +	switch (event->type) {
> +	case GDMA_EQE_SOC_EVENT_NOTIFICATION:
> +		ibdev_info(&mib_dev->ib_dev, "Received SOC Notification");
> +		break;

Should we do something with the event?




> +	case GDMA_EQE_SOC_EVENT_TEST:
> +		ibdev_info(&mib_dev->ib_dev, "Received SoC Test");
> +		break;
> +	default:
> +		ibdev_dbg(&mib_dev->ib_dev, "Received unsolicited evt %d",
> +			  event->type);
> +	}
> +}
> +
> +int mana_ib_create_error_eq(struct mana_ib_dev *mib_dev) {
> +	struct gdma_queue_spec spec = {};
> +	int err;
> +
> +	spec.type = GDMA_EQ;
> +	spec.monitor_avl_buf = false;
> +	spec.queue_size = EQ_SIZE;
> +	spec.eq.callback = mana_ib_soc_event_handler;
> +	spec.eq.context = mib_dev;
> +	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
> +	spec.eq.msix_allocated = true;
> +	spec.eq.msix_index = 0;
> +	spec.doorbell = mib_dev->gc->mana_ib.doorbell;
> +	spec.pdid = mib_dev->gc->mana_ib.pdid;
> +
> +	err = mana_gd_create_mana_eq(&mib_dev->gc->mana_ib, &spec,
> +				     &mib_dev->fatal_err_eq);
> +	if (err)
> +		return err;
> +
> +	mib_dev->fatal_err_eq->eq.disable_needed = true;
> +
> +	return 0;
> +}
> diff --git a/drivers/infiniband/hw/mana/mana_ib.h
> b/drivers/infiniband/hw/mana/mana_ib.h
> index 3a2ba6b96f15..8a652bccd978 100644
> --- a/drivers/infiniband/hw/mana/mana_ib.h
> +++ b/drivers/infiniband/hw/mana/mana_ib.h
> @@ -31,6 +31,8 @@ struct mana_ib_dev {
>  	struct ib_device ib_dev;
>  	struct gdma_dev *gdma_dev;
>  	struct gdma_context *gc;
> +	struct gdma_queue *fatal_err_eq;
> +	mana_handle_t adapter_handle;
>  };
> 
>  struct mana_ib_wq {
> @@ -93,6 +95,31 @@ struct mana_ib_rwq_ind_table {
>  	struct ib_rwq_ind_table ib_ind_table;
>  };
> 
> +enum mana_ib_command_code {
> +	MANA_IB_CREATE_ADAPTER  = 0x30002,
> +	MANA_IB_DESTROY_ADAPTER = 0x30003,
> +};
> +
> +struct mana_ib_create_adapter_req {
> +	struct gdma_req_hdr hdr;
> +	u32 notify_eq_id;
> +	u32 reserved;
> +}; /*HW Data */
> +
> +struct mana_ib_create_adapter_resp {
> +	struct gdma_resp_hdr hdr;
> +	mana_handle_t adapter;
> +}; /* HW Data */
> +
> +struct mana_ib_destroy_adapter_req {
> +	struct gdma_req_hdr hdr;
> +	mana_handle_t adapter;
> +}; /*HW Data */
> +
> +struct mana_ib_destroy_adapter_resp {
> +	struct gdma_resp_hdr hdr;
> +}; /* HW Data */
> +
>  int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
>  				 struct ib_umem *umem,
>  				 mana_handle_t *gdma_region);
> @@ -161,4 +188,10 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32
> port, int index,
> 
>  void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext);
> 
> +int mana_ib_create_error_eq(struct mana_ib_dev *mib_dev);
> +
> +int mana_ib_create_adapter(struct mana_ib_dev *mib_dev);
> +
> +int mana_ib_destroy_adapter(struct mana_ib_dev *mib_dev);
> +
>  #endif
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 9fa7a2d6c2b2..55e194c9d84e 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -185,7 +185,8 @@ void mana_gd_free_memory(struct gdma_mem_info
> *gmi)  }
> 
>  static int mana_gd_create_hw_eq(struct gdma_context *gc,
> -				struct gdma_queue *queue)
> +				struct gdma_queue *queue,
> +				u32 doorbell, u32 pdid)
>  {
>  	struct gdma_create_queue_resp resp = {};
>  	struct gdma_create_queue_req req = {}; @@ -199,8 +200,8 @@ static
> int mana_gd_create_hw_eq(struct gdma_context *gc,
> 
>  	req.hdr.dev_id = queue->gdma_dev->dev_id;
>  	req.type = queue->type;
> -	req.pdid = queue->gdma_dev->pdid;
> -	req.doolbell_id = queue->gdma_dev->doorbell;
> +	req.pdid = pdid;
> +	req.doolbell_id = doorbell;
>  	req.gdma_region = queue->mem_info.dma_region_handle;
>  	req.queue_size = queue->queue_size;
>  	req.log2_throttle_limit = queue->eq.log2_throttle_limit; @@ -371,53
> +372,51 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
>  	}
>  }
> 
> -static void mana_gd_process_eq_events(void *arg)
> +static void mana_gd_process_eq_events(struct list_head *eq_list)
>  {
>  	u32 owner_bits, new_bits, old_bits;
>  	union gdma_eqe_info eqe_info;
>  	struct gdma_eqe *eq_eqe_ptr;
> -	struct gdma_queue *eq = arg;
> +	struct gdma_queue *eq;
>  	struct gdma_context *gc;
>  	struct gdma_eqe *eqe;
>  	u32 head, num_eqe;
>  	int i;
> 
> -	gc = eq->gdma_dev->gdma_context;
> -
> -	num_eqe = eq->queue_size / GDMA_EQE_SIZE;
> -	eq_eqe_ptr = eq->queue_mem_ptr;
> -
> -	/* Process up to 5 EQEs at a time, and update the HW head. */
> -	for (i = 0; i < 5; i++) {
> -		eqe = &eq_eqe_ptr[eq->head % num_eqe];
> -		eqe_info.as_uint32 = eqe->eqe_info;
> -		owner_bits = eqe_info.owner_bits;
> -
> -		old_bits = (eq->head / num_eqe - 1) &
> GDMA_EQE_OWNER_MASK;
> -		/* No more entries */
> -		if (owner_bits == old_bits)
> -			break;
> -
> -		new_bits = (eq->head / num_eqe) &
> GDMA_EQE_OWNER_MASK;
> -		if (owner_bits != new_bits) {
> -			dev_err(gc->dev, "EQ %d: overflow detected\n", eq-
> >id);
> -			break;
> +	list_for_each_entry_rcu(eq, eq_list, entry) {
> +		gc = eq->gdma_dev->gdma_context;
> +
> +		num_eqe = eq->queue_size / GDMA_EQE_SIZE;
> +		eq_eqe_ptr = eq->queue_mem_ptr;
> +		/* Process up to 5 EQEs at a time, and update the HW head. */
> +		for (i = 0; i < 5; i++) {
> +			eqe = &eq_eqe_ptr[eq->head % num_eqe];
> +			eqe_info.as_uint32 = eqe->eqe_info;
> +			owner_bits = eqe_info.owner_bits;
> +
> +			old_bits = (eq->head / num_eqe - 1) &
> GDMA_EQE_OWNER_MASK;
> +			/* No more entries */
> +			if (owner_bits == old_bits)
> +				break;
> +
> +			new_bits = (eq->head / num_eqe) &
> GDMA_EQE_OWNER_MASK;
> +			if (owner_bits != new_bits) {
> +				dev_err(gc->dev, "EQ %d: overflow
> detected\n",
> +					eq->id);
> +				break;
> +			}
> +			/* Per GDMA spec, rmb is necessary after checking
> owner_bits, before
> +			 * reading eqe.
> +			 */
> +			rmb();
> +			mana_gd_process_eqe(eq);
> +			eq->head++;
>  		}
> 
> -		/* Per GDMA spec, rmb is necessary after checking
> owner_bits, before
> -		 * reading eqe.
> -		 */
> -		rmb();
> -
> -		mana_gd_process_eqe(eq);
> -
> -		eq->head++;
> +		head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS);
> +		mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq-
> >type,
> +				      eq->id, head, SET_ARM_BIT);
>  	}
> -
> -	head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS);
> -
> -	mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq->type, eq-
> >id,
> -			      head, SET_ARM_BIT);
>  }
> 
>  static int mana_gd_register_irq(struct gdma_queue *queue, @@ -435,44
> +434,47 @@ static int mana_gd_register_irq(struct gdma_queue *queue,
>  	gc = gd->gdma_context;
>  	r = &gc->msix_resource;
>  	dev = gc->dev;
> +	msi_index = spec->eq.msix_index;
> 
>  	spin_lock_irqsave(&r->lock, flags);
> 
> -	msi_index = find_first_zero_bit(r->map, r->size);
> -	if (msi_index >= r->size || msi_index >= gc->num_msix_usable) {
> -		err = -ENOSPC;
> -	} else {
> -		bitmap_set(r->map, msi_index, 1);
> -		queue->eq.msix_index = msi_index;
> -	}
> -
> -	spin_unlock_irqrestore(&r->lock, flags);
> +	if (!spec->eq.msix_allocated) {
> +		msi_index = find_first_zero_bit(r->map, r->size);
> 
> -	if (err) {
> -		dev_err(dev, "Register IRQ err:%d, msi:%u rsize:%u,
> nMSI:%u",
> -			err, msi_index, r->size, gc->num_msix_usable);
> +		if (msi_index >= r->size ||
> +		    msi_index >= gc->num_msix_usable)
> +			err = -ENOSPC;
> +		else
> +			bitmap_set(r->map, msi_index, 1);
> 
> -		return err;
> +		if (err) {
> +			dev_err(dev, "Register IRQ err:%d, msi:%u rsize:%u,
> nMSI:%u",
> +				err, msi_index, r->size, gc->num_msix_usable);
> +				goto out;
> +		}
>  	}
> 
> +	queue->eq.msix_index = msi_index;
>  	gic = &gc->irq_contexts[msi_index];
> 
> -	WARN_ON(gic->handler || gic->arg);
> -
> -	gic->arg = queue;
> +	list_add_rcu(&queue->entry, &gic->eq_list);
> 
>  	gic->handler = mana_gd_process_eq_events;
> 
> -	return 0;
> +out:
> +	spin_unlock_irqrestore(&r->lock, flags);
> +	return err;
>  }
> 
> -static void mana_gd_deregiser_irq(struct gdma_queue *queue)
> +static void mana_gd_deregister_irq(struct gdma_queue *queue)
>  {
>  	struct gdma_dev *gd = queue->gdma_dev;
>  	struct gdma_irq_context *gic;
>  	struct gdma_context *gc;
>  	struct gdma_resource *r;
>  	unsigned int msix_index;
> +	struct list_head *p, *n;
> +	struct gdma_queue *eq;
>  	unsigned long flags;
> 
>  	gc = gd->gdma_context;
> @@ -483,14 +485,23 @@ static void mana_gd_deregiser_irq(struct
> gdma_queue *queue)
>  	if (WARN_ON(msix_index >= gc->num_msix_usable))
>  		return;
> 
> +	spin_lock_irqsave(&r->lock, flags);
> +
>  	gic = &gc->irq_contexts[msix_index];
> -	gic->handler = NULL;
> -	gic->arg = NULL;
> +	list_for_each_safe(p, n, &gic->eq_list) {
> +		eq = list_entry(p, struct gdma_queue, entry);
> +		if (queue == eq) {
> +			list_del(&eq->entry);

The earlier code uses list_for_each_entry_rcu() to iterate the EQ list; do we need to add synchronize_rcu() here?




> +			break;
> +		}
> +	}
> 

^ permalink raw reply

* RE: [Patch v3 4/4] RDMA/mana_ib : Query adapter capabilities
From: Long Li @ 2023-07-28 22:52 UTC (permalink / raw)
  To: sharmaajay@linuxonhyperv.com, Jason Gunthorpe, Leon Romanovsky,
	Dexuan Cui, Wei Liu, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Ajay Sharma
In-Reply-To: <1690402104-29518-5-git-send-email-sharmaajay@linuxonhyperv.com>

> Subject: [Patch v3 4/4] RDMA/mana_ib : Query adapter capabilities
> 
> From: Ajay Sharma <sharmaajay@microsoft.com>
> 
> Query the adapter capabilities to expose to other clients and VF. This checks
> against the user supplied values and protects against overflows.
> 
> Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
> ---
>  drivers/infiniband/hw/mana/device.c  |  4 ++
>  drivers/infiniband/hw/mana/main.c    | 66 +++++++++++++++++++++++++-
> --
>  drivers/infiniband/hw/mana/mana_ib.h | 53 +++++++++++++++++++++-
>  3 files changed, 115 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mana/device.c
> b/drivers/infiniband/hw/mana/device.c
> index 4077e440657a..e15da43c73a0 100644
> --- a/drivers/infiniband/hw/mana/device.c
> +++ b/drivers/infiniband/hw/mana/device.c
> @@ -97,6 +97,10 @@ static int mana_ib_probe(struct auxiliary_device *adev,
>  		goto free_error_eq;
>  	}
> 
> +	ret = mana_ib_query_adapter_caps(mib_dev);
> +	if (ret)
> +		ibdev_dbg(&mib_dev->ib_dev, "Failed to get caps, use
> defaults");

There is an ibdev_err() in mana_ib_query_adapter_caps(), how about merging this message with that?

And you can remove the return value of mana_ib_query_adapter_caps(), since the caller doesn't do anything meaningful with it.

> +
>  	ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
>  				 mdev->gdma_context->dev);
>  	if (ret)
> diff --git a/drivers/infiniband/hw/mana/main.c
> b/drivers/infiniband/hw/mana/main.c
> index 1b1a8670d0fa..512815e1e64d 100644
> --- a/drivers/infiniband/hw/mana/main.c
> +++ b/drivers/infiniband/hw/mana/main.c
> @@ -469,21 +469,27 @@ int mana_ib_get_port_immutable(struct ib_device
> *ibdev, u32 port_num,  int mana_ib_query_device(struct ib_device *ibdev,
> struct ib_device_attr *props,
>  			 struct ib_udata *uhw)
>  {
> +	struct mana_ib_dev *mib_dev = container_of(ibdev,
> +			struct mana_ib_dev, ib_dev);
> +
>  	props->max_qp = MANA_MAX_NUM_QUEUES;
>  	props->max_qp_wr = MAX_SEND_BUFFERS_PER_QUEUE;
> -
> -	/*
> -	 * max_cqe could be potentially much bigger.
> -	 * As this version of driver only support RAW QP, set it to the same
> -	 * value as max_qp_wr
> -	 */
>  	props->max_cqe = MAX_SEND_BUFFERS_PER_QUEUE;
> -
>  	props->max_mr_size = MANA_IB_MAX_MR_SIZE;
>  	props->max_mr = MANA_IB_MAX_MR;
>  	props->max_send_sge = MAX_TX_WQE_SGL_ENTRIES;
>  	props->max_recv_sge = MAX_RX_WQE_SGL_ENTRIES;
> 
> +	/* If the Management SW is updated and supports adapter creation */
> +	if (mib_dev->adapter_handle) {

Does this mean mana_ib_query_adapter_caps() was a success?


> +		props->max_qp = mib_dev->adapter_caps.max_qp_count;
> +		props->max_qp_wr = mib_dev-
> >adapter_caps.max_requester_sq_size;
> +		props->max_cqe = mib_dev-
> >adapter_caps.max_requester_sq_size;
> +		props->max_mr = mib_dev->adapter_caps.max_mr_count;
> +		props->max_send_sge = mib_dev-
> >adapter_caps.max_send_wqe_size;
> +		props->max_recv_sge = mib_dev-
> >adapter_caps.max_recv_wqe_size;
> +	}
> +
>  	return 0;
>  }
> 
> @@ -599,3 +605,49 @@ int mana_ib_create_error_eq(struct mana_ib_dev
> *mib_dev)
> 
>  	return 0;
>  }
> +
> +static void assign_caps(struct mana_ib_adapter_caps *caps,
> +			struct mana_ib_query_adapter_caps_resp *resp) {
> +	caps->max_sq_id = resp->max_sq_id;
> +	caps->max_rq_id = resp->max_rq_id;
> +	caps->max_cq_id = resp->max_cq_id;
> +	caps->max_qp_count = resp->max_qp_count;
> +	caps->max_cq_count = resp->max_cq_count;
> +	caps->max_mr_count = resp->max_mr_count;
> +	caps->max_pd_count = resp->max_pd_count;
> +	caps->max_inbound_read_limit = resp->max_inbound_read_limit;
> +	caps->max_outbound_read_limit = resp->max_outbound_read_limit;
> +	caps->mw_count = resp->mw_count;
> +	caps->max_srq_count = resp->max_srq_count;
> +	caps->max_requester_sq_size = resp->max_requester_sq_size;
> +	caps->max_responder_sq_size = resp->max_responder_sq_size;
> +	caps->max_requester_rq_size = resp->max_requester_rq_size;
> +	caps->max_responder_rq_size = resp->max_responder_rq_size;
> +	caps->max_send_wqe_size = resp->max_send_wqe_size;
> +	caps->max_recv_wqe_size = resp->max_recv_wqe_size;
> +	caps->max_inline_data_size = resp->max_inline_data_size; }
> +
> +int mana_ib_query_adapter_caps(struct mana_ib_dev *mib_dev) {
> +	struct mana_ib_query_adapter_caps_resp resp = {};
> +	struct mana_ib_query_adapter_caps_req req = {};
> +	int err;
> +
> +	mana_gd_init_req_hdr(&req.hdr, MANA_IB_GET_ADAPTER_CAP,
> sizeof(req),
> +			     sizeof(resp));
> +	req.hdr.resp.msg_version =
> MANA_IB__GET_ADAPTER_CAP_RESPONSE_V3;
> +	req.hdr.dev_id = mib_dev->gc->mana_ib.dev_id;
> +
> +	err = mana_gd_send_request(mib_dev->gc, sizeof(req), &req,
> +				   sizeof(resp), &resp);
> +
> +	if (err) {
> +		ibdev_err(&mib_dev->ib_dev, "Failed to query adapter caps
> err %d", err);
> +		return err;
> +	}
> +
> +	assign_caps(&mib_dev->adapter_caps, &resp);
> +	return 0;
> +}
> diff --git a/drivers/infiniband/hw/mana/mana_ib.h
> b/drivers/infiniband/hw/mana/mana_ib.h
> index 8a652bccd978..1044358230d3 100644
> --- a/drivers/infiniband/hw/mana/mana_ib.h
> +++ b/drivers/infiniband/hw/mana/mana_ib.h
> @@ -20,19 +20,41 @@
> 
>  /* MANA doesn't have any limit for MR size */
>  #define MANA_IB_MAX_MR_SIZE	U64_MAX
> -
> +#define MANA_IB__GET_ADAPTER_CAP_RESPONSE_V3 3

This value is used in GDMA header of the request message? If so, define GDMA_MESSAGE_V3 in "include/net/mana/gdma.h".

^ permalink raw reply

* [PATCH 1/1] scsi: storvsc: Fix handling of virtual Fibre Channel timeouts
From: Michael Kelley @ 2023-07-29  4:59 UTC (permalink / raw)
  To: kys, martin.petersen, longli, wei.liu, decui, jejb, linux-hyperv,
	linux-kernel, linux-scsi
  Cc: mikelley, stable

Hyper-V provides the ability to connect Fibre Channel LUNs to the host
system and present them in a guest VM as a SCSI device. I/O to the vFC
device is handled by the storvsc driver. The storvsc driver includes
a partial integration with the FC transport implemented in the generic
portion of the Linux SCSI subsystem so that FC attributes can be
displayed in /sys.  However, the partial integration means that some
aspects of vFC don't work properly. Unfortunately, a full and correct
integration isn't practical because of limitations in what Hyper-V
provides to the guest.

In particular, in the context of Hyper-V storvsc, the FC transport
timeout function fc_eh_timed_out() causes a kernel panic because it
can't find the rport and dereferences a NULL pointer. The original
patch that added the call from storvsc_eh_timed_out() to
fc_eh_timed_out() is faulty in this regard.

In many cases a timeout is due to a transient condition, so the
situation can be improved by just continuing to wait like with other
I/O requests issued by storvsc, and avoiding the guaranteed panic. For
a permanent failure, continuing to wait may result in a hung thread
instead of a panic, which again may be better.

So fix the panic by removing the storvsc call to fc_eh_timed_out().
This allows storvsc to keep waiting for a response.  The change has
been tested by users who experienced a panic in fc_eh_timed_out() due
to transient timeouts, and it solves their problem.

In the future we may want to deprecate the vFC functionality in storvsc
since it can't be fully fixed. But it has current users for whom it is
working well enough, so it should probably stay for a while longer.

Fixes: 3930d7309807 ("scsi: storvsc: use default I/O timeout handler for FC devices")
Cc: stable@vger.kernel.org
Signed-off-by: Michael Kelley <mikelley@microsoft.com>
---
 drivers/scsi/storvsc_drv.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 659196a..6014200 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1671,10 +1671,6 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd)
  */
 static enum scsi_timeout_action storvsc_eh_timed_out(struct scsi_cmnd *scmnd)
 {
-#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
-	if (scmnd->device->host->transportt == fc_transport_template)
-		return fc_eh_timed_out(scmnd);
-#endif
 	return SCSI_EH_RESET_TIMER;
 }

-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v9 00/36] x86: enable FRED for x86-64
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy

This patch set enables the Intel flexible return and event delivery
(FRED) architecture for x86-64.

The FRED architecture defines simple new transitions that change
privilege level (ring transitions). The FRED architecture was
designed with the following goals:

1) Improve overall performance and response time by replacing event
   delivery through the interrupt descriptor table (IDT event
   delivery) and event return by the IRET instruction with lower
   latency transitions.

2) Improve software robustness by ensuring that event delivery
   establishes the full supervisor context and that event return
   establishes the full user context.

The new transitions defined by the FRED architecture are FRED event
delivery and, for returning from events, two FRED return instructions.
FRED event delivery can effect a transition from ring 3 to ring 0, but
it is used also to deliver events incident to ring 0. One FRED
instruction (ERETU) effects a return from ring 0 to ring 3, while the
other (ERETS) returns while remaining in ring 0. Collectively, FRED
event delivery and the FRED return instructions are FRED transitions.

Search for the latest FRED spec in most search engines with this search pattern:

  site:intel.com FRED (flexible return and event delivery) specification

As of now there is no publicly available CPU supporting FRED, thus the Intel
Simics® Simulator is used as the software development and testing vehicle. It
can be downloaded from:
  https://www.intel.com/content/www/us/en/developer/articles/tool/simics-simulator.html

To enable FRED, the Simics package 8112 QSP-CPU needs to be installed with CPU
model configured as:
	$cpu_comp_class = "x86-experimental-fred"

Changes since v8:
* Move the FRED initialization patch after all required changes are in
  place (Thomas Gleixner).
* Don't do syscall early out in fred_entry_from_user() before there are
  proper performance numbers and justifications (Thomas Gleixner).
* Add the control exception handler to the FRED exception handler table
  (Thomas Gleixner).
* Introduce a macro sysvec_install() to derive the asm handler name from
  a C handler, which simplifies the code and avoids an ugly typecast
  (Thomas Gleixner).
* Remove junk code that assumes no local APIC on x86_64 (Thomas Gleixner).
* Put IDTENTRY changes in a separate patch (Thomas Gleixner).
* Use high-order 48 bits above the lowest 16 bit SS only when FRED is
  enabled (Thomas Gleixner).
* Explain why writing directly to the IA32_KERNEL_GS_BASE MSR is
  doing the right thing (Thomas Gleixner).
* Reword some patch descriptions (Thomas Gleixner).
* Add a new macro VMX_DO_FRED_EVENT_IRQOFF for FRED instead of
  refactoring VMX_DO_EVENT_IRQOFF (Sean Christopherson).
* Do NOT use a trampoline, just LEA+PUSH the return RIP, PUSH the error
  code, and jump to the FRED kernel entry point for NMI or call
  external_interrupt() for IRQs (Sean Christopherson).
* Call external_interrupt() only when FRED is enabled, and convert the
  non-FRED handling to external_interrupt() after FRED lands (Sean
  Christopherson).
* Use __packed instead of __attribute__((__packed__)) (Borislav Petkov).
* Put all comments above the members, like the rest of the file does
  (Borislav Petkov).
* Reflect the FRED spec 5.0 change that ERETS and ERETU add 8 to %rsp
  before popping the return context from the stack.
* Reflect stack frame definition changes from FRED spec 3.0 to 5.0.
* Add ENDBR to the FRED_ENTER asm macro after kernel IBT is added to
  FRED base line in FRED spec 5.0.
* Add a document which briefly introduces FRED features.
* Remove 2 patches, "allow FRED systems to use interrupt vectors
  0x10-0x1f" and "allow dynamic stack frame size", from this patch set,
  as they are "optimizations" only.
* Send 2 patches, "header file for event types" and "do not modify the
  DPL bits for a null selector", as pre-FRED patches.

Changes since v7:
* Always call external_interrupt() for VMX IRQ handling on x86_64, thus avoid
  re-entering the noinstr code.
* Create a FRED stack frame when FRED is compiled-in but not enabled, which
  uses some extra stack space but simplifies the code.
* Add a log message when FRED is enabled.

Changes since v6:
* Add a comment to explain why it is safe to write to a previous FRED stack
  frame. (Lai Jiangshan).
* Export fred_entrypoint_kernel(), required when kvm-intel built as a module.
* Reserve a REDZONE for CALL emulation and Align RSP to a 64-byte boundary
  before pushing a new FRED stack frame.
* Replace pt_regs csx flags prefix FRED_CSL_ with FRED_CSX_.

Changes since v5:
* Initialize system_interrupt_handlers with dispatch_table_spurious_interrupt()
  instead of NULL to get rid of a branch (Peter Zijlstra).
* Disallow #DB inside #MCE for robustness sake (Peter Zijlstra).
* Add a comment for FRED stack level settings (Lai Jiangshan).
* Move the NMI bit from an invalid stack frame, which caused ERETU to fault,
  to the fault handler's stack frame, thus to unblock NMI ASAP if NMI is blocked
  (Lai Jiangshan).
* Refactor VMX_DO_EVENT_IRQOFF to handle IRQ/NMI in IRQ/NMI induced VM exits
  when FRED is enabled (Sean Christopherson).

Changes since v4:
* Do NOT use the term "injection", which in the KVM context means to
  reinject an event into the guest (Sean Christopherson).
* Add the explanation of why to execute "int $2" to invoke the NMI handler
  in NMI caused VM exits (Sean Christopherson).
* Use cs/ss instead of csx/ssx when initializing the pt_regs structure
  for calling external_interrupt(), otherwise it breaks i386 build.

Changes since v3:
* Call external_interrupt() to handle IRQ in IRQ caused VM exits.
* Execute "int $2" to handle NMI in NMI caused VM exits.
* Rename csl/ssl of the pt_regs structure to csx/ssx (x for extended)
  (Andrew Cooper).

Changes since v2:
* Improve comments for changes in arch/x86/include/asm/idtentry.h.

Changes since v1:
* call irqentry_nmi_{enter,exit}() in both IDT and FRED debug fault kernel
  handler (Peter Zijlstra).
* Initialize a FRED exception handler to fred_bad_event() instead of NULL
  if no FRED handler defined for an exception vector (Peter Zijlstra).
* Push calling irqentry_{enter,exit}() and instrumentation_{begin,end}()
  down into individual FRED exception handlers, instead of in the dispatch
  framework (Peter Zijlstra).

H. Peter Anvin (Intel) (22):
  x86/fred: Add Kconfig option for FRED (CONFIG_X86_FRED)
  x86/fred: Disable FRED support if CONFIG_X86_FRED is disabled
  x86/cpufeatures: Add the cpu feature bit for FRED
  x86/opcode: Add ERETU, ERETS instructions to x86-opcode-map
  x86/objtool: Teach objtool about ERETU and ERETS
  x86/cpu: Add X86_CR4_FRED macro
  x86/cpu: Add MSR numbers for FRED configuration
  x86/fred: Make unions for the cs and ss fields in struct pt_regs
  x86/fred: Add a new header file for FRED definitions
  x86/fred: Reserve space for the FRED stack frame
  x86/fred: Update MSR_IA32_FRED_RSP0 during task switch
  x86/fred: Let ret_from_fork_asm() jmp to fred_exit_user when FRED is
    enabled
  x86/fred: Disallow the swapgs instruction when FRED is enabled
  x86/fred: No ESPFIX needed when FRED is enabled
  x86/fred: Allow single-step trap and NMI when starting a new task
  x86/fred: Add a page fault entry stub for FRED
  x86/fred: Add a debug fault entry stub for FRED
  x86/fred: Add a NMI entry stub for FRED
  x86/traps: Add a system interrupt handler table for system interrupt
    dispatch
  x86/traps: Add external_interrupt() to dispatch external interrupts
  x86/fred: FRED entry/exit and dispatch code
  x86/fred: FRED initialization code

Xin Li (14):
  Documentation/x86/64: Add documentation for FRED
  x86/fred: Define a common function type fred_handler
  x86/fred: Add a machine check entry stub for FRED
  x86/fred: Add a double fault entry stub for FRED
  x86/entry: Remove idtentry_sysvec from entry_{32,64}.S
  x86/idtentry: Incorporate definitions/declarations of the FRED
    external interrupt handler type
  x86/traps: Add sysvec_install() to install a system interrupt handler
  x86/idtentry: Incorporate declaration/definition of the FRED exception
    handler type
  x86/fred: Fixup fault on ERETU by jumping to fred_entrypoint_user
  x86/traps: Export external_interrupt() for handling IRQ in IRQ induced
    VM exits
  x86/fred: Export fred_entrypoint_kernel() for handling NMI in NMI
    induced VM exits
  KVM: VMX: Add VMX_DO_FRED_EVENT_IRQOFF for IRQ/NMI handling
  x86/syscall: Split IDT syscall setup code into idt_syscall_init()
  x86/fred: Disable FRED by default in its early stage

 .../admin-guide/kernel-parameters.txt         |   4 +
 Documentation/arch/x86/x86_64/fred.rst        | 102 ++++++++
 Documentation/arch/x86/x86_64/index.rst       |   1 +
 arch/x86/Kconfig                              |   9 +
 arch/x86/entry/Makefile                       |   5 +-
 arch/x86/entry/entry_32.S                     |   4 -
 arch/x86/entry/entry_64.S                     |  14 +-
 arch/x86/entry/entry_64_fred.S                |  58 +++++
 arch/x86/entry/entry_fred.c                   | 220 ++++++++++++++++++
 arch/x86/entry/vsyscall/vsyscall_64.c         |   2 +-
 arch/x86/include/asm/asm-prototypes.h         |   1 +
 arch/x86/include/asm/cpufeatures.h            |   1 +
 arch/x86/include/asm/disabled-features.h      |   8 +-
 arch/x86/include/asm/extable_fixup_types.h    |   4 +-
 arch/x86/include/asm/fred.h                   | 157 +++++++++++++
 arch/x86/include/asm/idtentry.h               | 115 ++++++++-
 arch/x86/include/asm/msr-index.h              |  13 +-
 arch/x86/include/asm/ptrace.h                 |  57 ++++-
 arch/x86/include/asm/switch_to.h              |  11 +-
 arch/x86/include/asm/thread_info.h            |  12 +-
 arch/x86/include/asm/traps.h                  |  23 ++
 arch/x86/include/uapi/asm/processor-flags.h   |   2 +
 arch/x86/kernel/Makefile                      |   1 +
 arch/x86/kernel/cpu/acrn.c                    |   5 +-
 arch/x86/kernel/cpu/common.c                  |  47 +++-
 arch/x86/kernel/cpu/mce/core.c                |  15 ++
 arch/x86/kernel/cpu/mshyperv.c                |  16 +-
 arch/x86/kernel/espfix_64.c                   |   8 +
 arch/x86/kernel/fred.c                        |  67 ++++++
 arch/x86/kernel/irqinit.c                     |   7 +-
 arch/x86/kernel/kvm.c                         |   2 +-
 arch/x86/kernel/nmi.c                         |  19 ++
 arch/x86/kernel/process_64.c                  |  31 ++-
 arch/x86/kernel/traps.c                       | 153 ++++++++++--
 arch/x86/kvm/vmx/vmenter.S                    |  88 +++++++
 arch/x86/kvm/vmx/vmx.c                        |  19 +-
 arch/x86/lib/x86-opcode-map.txt               |   2 +-
 arch/x86/mm/extable.c                         |  79 +++++++
 arch/x86/mm/fault.c                           |  18 +-
 drivers/xen/events/events_base.c              |   3 +-
 tools/arch/x86/include/asm/cpufeatures.h      |   1 +
 .../arch/x86/include/asm/disabled-features.h  |   8 +-
 tools/arch/x86/include/asm/msr-index.h        |  13 +-
 tools/arch/x86/lib/x86-opcode-map.txt         |   2 +-
 tools/objtool/arch/x86/decode.c               |  19 +-
 45 files changed, 1348 insertions(+), 98 deletions(-)
 create mode 100644 Documentation/arch/x86/x86_64/fred.rst
 create mode 100644 arch/x86/entry/entry_64_fred.S
 create mode 100644 arch/x86/entry/entry_fred.c
 create mode 100644 arch/x86/include/asm/fred.h
 create mode 100644 arch/x86/kernel/fred.c

-- 
2.34.1

^ permalink raw reply

* [PATCH v9 02/36] x86/fred: Add Kconfig option for FRED (CONFIG_X86_FRED)
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add the configuration option CONFIG_X86_FRED to enable FRED.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/Kconfig | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7422db409770..700d94cb8330 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -494,6 +494,15 @@ config X86_CPU_RESCTRL
 
 	  Say N if unsure.
 
+config X86_FRED
+	bool "Flexible Return and Event Delivery"
+	depends on X86_64
+	help
+	  When enabled, try to use Flexible Return and Event Delivery
+	  instead of the legacy SYSCALL/SYSENTER/IDT architecture for
+	  ring transitions and exception/interrupt handling if the
+	  system supports it.
+
 if X86_32
 config X86_BIGSMP
 	bool "Support for big SMP systems with more than 8 CPUs"
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 03/36] x86/fred: Disable FRED support if CONFIG_X86_FRED is disabled
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add CONFIG_X86_FRED to <asm/disabled-features.h> to make
cpu_feature_enabled() work correctly with FRED.

Originally-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/disabled-features.h       | 8 +++++++-
 tools/arch/x86/include/asm/disabled-features.h | 8 +++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index fafe9be7a6f4..85fd67c67ce1 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -105,6 +105,12 @@
 # define DISABLE_TDX_GUEST	(1 << (X86_FEATURE_TDX_GUEST & 31))
 #endif
 
+#ifdef CONFIG_X86_FRED
+# define DISABLE_FRED 0
+#else
+# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -122,7 +128,7 @@
 #define DISABLED_MASK11	(DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
 			 DISABLE_CALL_DEPTH_TRACKING)
 #define DISABLED_MASK12	(DISABLE_LAM)
-#define DISABLED_MASK13	0
+#define DISABLED_MASK13	(DISABLE_FRED)
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index fafe9be7a6f4..85fd67c67ce1 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -105,6 +105,12 @@
 # define DISABLE_TDX_GUEST	(1 << (X86_FEATURE_TDX_GUEST & 31))
 #endif
 
+#ifdef CONFIG_X86_FRED
+# define DISABLE_FRED 0
+#else
+# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -122,7 +128,7 @@
 #define DISABLED_MASK11	(DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
 			 DISABLE_CALL_DEPTH_TRACKING)
 #define DISABLED_MASK12	(DISABLE_LAM)
-#define DISABLED_MASK13	0
+#define DISABLED_MASK13	(DISABLE_FRED)
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 01/36] Documentation/x86/64: Add documentation for FRED
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

Briefly introduce FRED, its advantages compared to IDT, and its
Linux enabling.

Signed-off-by: Xin Li <xin3.li@intel.com>
---
 Documentation/arch/x86/x86_64/fred.rst  | 102 ++++++++++++++++++++++++
 Documentation/arch/x86/x86_64/index.rst |   1 +
 2 files changed, 103 insertions(+)
 create mode 100644 Documentation/arch/x86/x86_64/fred.rst

diff --git a/Documentation/arch/x86/x86_64/fred.rst b/Documentation/arch/x86/x86_64/fred.rst
new file mode 100644
index 000000000000..27c980e882ba
--- /dev/null
+++ b/Documentation/arch/x86/x86_64/fred.rst
@@ -0,0 +1,102 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Flexible Return and Event Delivery (FRED)
+=========================================
+
+Overview
+========
+
+The FRED architecture defines simple new transitions that change
+privilege level (ring transitions). The FRED architecture was
+designed with the following goals:
+
+1) Improve overall performance and response time by replacing event
+   delivery through the interrupt descriptor table (IDT event
+   delivery) and event return by the IRET instruction with lower
+   latency transitions.
+
+2) Improve software robustness by ensuring that event delivery
+   establishes the full supervisor context and that event return
+   establishes the full user context.
+
+The new transitions defined by the FRED architecture are FRED event
+delivery and, for returning from events, two FRED return instructions.
+FRED event delivery can effect a transition from ring 3 to ring 0, but
+it is used also to deliver events incident to ring 0. One FRED
+instruction (ERETU) effects a return from ring 0 to ring 3, while the
+other (ERETS) returns while remaining in ring 0. Collectively, FRED
+event delivery and the FRED return instructions are FRED transitions.
+
+In addition to these transitions, the FRED architecture defines a new
+instruction (LKGS) for managing the state of the GS segment register.
+The LKGS instruction can be used by 64-bit operating systems that do
+not use the new FRED transitions.
+
+Software based event dispatching
+================================
+
+FRED operates differently from IDT in terms of event handling. Instead
+of directly dispatching an event to its handler based on the event
+vector, FRED requires the software to dispatch an event to its handler
+based on both the event's type and vector. Therefore, an event
+dispatch framework must be implemented to facilitate the
+event-to-handler dispatch process. The FRED event dispatch framework
+assumes control once an event is delivered, starting from two FRED
+entry points, after which several event dispatch tables are introduced
+to facilitate the dispatching.
+
+The first level dispatching is event type based, and two tables need
+to be defined, one for ring 3 event dispatching, and the other
+for ring 0.
+
+The second level dispatching is event vector based, and
+several tables need to be defined, e.g., an exception handler table
+for exception dispatching.
+
+Full supervisor/user context
+============================
+
+FRED event delivery atomically saves and restores full supervisor/user
+context upon event delivery and return. Thus it avoids the problem of
+transient states due to %cr2 and/or %dr6, and it is no longer necessary
+to handle all the ugly corner cases caused by half baked CPU states.
+
+FRED allows explicit unblock of NMI with new event return instructions
+ERETS/ERETU, avoiding the mess caused by IRET which unconditionally
+unblocks NMI, when an exception happens during NMI handling.
+
+FRED always restores the full value of %rsp, thus ESPFIX is no longer
+needed when FRED is enabled.
+
+LKGS
+====
+
+LKGS behaves like the MOV to GS instruction except that it loads the
+base address into the IA32_KERNEL_GS_BASE MSR instead of the GS
+segment’s descriptor cache, which is exactly what Linux kernel does
+to load user level GS base. With LKGS, it ends up with avoiding
+mucking with kernel GS.
+
+Because FRED event delivery from ring 3 swaps the value of the GS base
+address and that of the IA32_KERNEL_GS_BASE MSR, and ERETU swaps the
+value of the GS base address and that of the IA32_KERNEL_GS_BASE MSR,
+plus the introduction of LKGS instruction, the SWAPGS instruction is
+no longer needed when FRED is enabled, thus is disallowed (#UD).
+
+Stack levels
+============
+
+4 stack levels 0~3 are introduced to replace the un-reentrant IST for
+handling events. Each stack level could be configured to use a
+dedicated stack.
+
+The current stack level could be unchanged or go higher upon FRED
+event delivery. If unchanged, the CPU keeps using the current event
+stack. If higher, the CPU switches to a new stack specified by the
+stack MSR of the new stack level.
+
+Only execution of a FRED return instruction ERETU or ERETS could lower
+the current stack level, causing the CPU to switch back to the stack
+it was on before a previous event delivery.
+satck.
diff --git a/Documentation/arch/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst
index a56070fc8e77..ad15e9bd623f 100644
--- a/Documentation/arch/x86/x86_64/index.rst
+++ b/Documentation/arch/x86/x86_64/index.rst
@@ -15,3 +15,4 @@ x86_64 Support
    cpu-hotplug-spec
    machinecheck
    fsgs
+   fred
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 04/36] x86/cpufeatures: Add the cpu feature bit for FRED
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add the CPU feature bit for FRED.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/cpufeatures.h       | 1 +
 tools/arch/x86/include/asm/cpufeatures.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cb8ca46213be..fd3ddd5c0283 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -317,6 +317,7 @@
 #define X86_FEATURE_FZRM		(12*32+10) /* "" Fast zero-length REP MOVSB */
 #define X86_FEATURE_FSRS		(12*32+11) /* "" Fast short REP STOSB */
 #define X86_FEATURE_FSRC		(12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_FRED		(12*32+17) /* Flexible Return and Event Delivery */
 #define X86_FEATURE_LKGS		(12*32+18) /* "" Load "kernel" (userspace) GS */
 #define X86_FEATURE_AMX_FP16		(12*32+21) /* "" AMX fp16 Support */
 #define X86_FEATURE_AVX_IFMA            (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index cb8ca46213be..fd3ddd5c0283 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -317,6 +317,7 @@
 #define X86_FEATURE_FZRM		(12*32+10) /* "" Fast zero-length REP MOVSB */
 #define X86_FEATURE_FSRS		(12*32+11) /* "" Fast short REP STOSB */
 #define X86_FEATURE_FSRC		(12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_FRED		(12*32+17) /* Flexible Return and Event Delivery */
 #define X86_FEATURE_LKGS		(12*32+18) /* "" Load "kernel" (userspace) GS */
 #define X86_FEATURE_AMX_FP16		(12*32+21) /* "" AMX fp16 Support */
 #define X86_FEATURE_AVX_IFMA            (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 05/36] x86/opcode: Add ERETU, ERETS instructions to x86-opcode-map
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add instruction opcodes used by FRED ERETU/ERETS to x86-opcode-map.

Opcode numbers are per FRED spec v5.0.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/lib/x86-opcode-map.txt       | 2 +-
 tools/arch/x86/lib/x86-opcode-map.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 5168ee0360b2..7a269e269dc0 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1052,7 +1052,7 @@ EndTable
 
 GrpTable: Grp7
 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B)
 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
 3: LIDT Ms
 4: SMSW Mw/Rv
diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt
index 5168ee0360b2..7a269e269dc0 100644
--- a/tools/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/arch/x86/lib/x86-opcode-map.txt
@@ -1052,7 +1052,7 @@ EndTable
 
 GrpTable: Grp7
 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B)
 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
 3: LIDT Ms
 4: SMSW Mw/Rv
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 06/36] x86/objtool: Teach objtool about ERETU and ERETS
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Update the objtool decoder to know about the ERETU and ERETS
instructions (type INSN_CONTEXT_SWITCH).

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 tools/objtool/arch/x86/decode.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 2e1caabecb18..a486485cff20 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -509,11 +509,20 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 
 		if (op2 == 0x01) {
 
-			if (modrm == 0xca)
-				insn->type = INSN_CLAC;
-			else if (modrm == 0xcb)
-				insn->type = INSN_STAC;
-
+			switch (insn_last_prefix_id(&ins)) {
+			case INAT_PFX_REPE:
+			case INAT_PFX_REPNE:
+				if (modrm == 0xca)
+					/* eretu/erets */
+					insn->type = INSN_CONTEXT_SWITCH;
+				break;
+			default:
+				if (modrm == 0xca)
+					insn->type = INSN_CLAC;
+				else if (modrm == 0xcb)
+					insn->type = INSN_STAC;
+				break;
+			}
 		} else if (op2 >= 0x80 && op2 <= 0x8f) {
 
 			insn->type = INSN_JUMP_CONDITIONAL;
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 07/36] x86/cpu: Add X86_CR4_FRED macro
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add X86_CR4_FRED macro for the FRED bit in %cr4. This bit should be a
pinned bit, not to be changed after initialization.

CR4 macros are defined in arch/x86/include/uapi/asm/processor-flags.h,
which is uapi, and thus cannot depend on CONFIG_X86_64.

Using _BITUL() causes build errors on 32 bits, and there is no
guarantee that user space applications (e.g. something like Qemu)
might not want to use this declaration even when building for i386 or
x32.

However, %cr4 is a machine word (unsigned long), so to avoid build
warnings on 32 bits, explicitly cast the value to unsigned long,
truncating upper 32 bits.

The other alternative would be to use CONFIG_X86_64 around the
definition of cr4_pinned_mask. It is probably not desirable to make
cr4_pinned_mask non-const.

Another option, which may be preferable, to be honest: explicitly
enumerate the CR4 bits which *may* be changed (a whitelist), instead
of the ones that may not. That would be a separate, pre-FRED, patch,
and would automatically resolve this problem as a side effect.

The following flags probably should have been in this set all along,
as they are all controls affecting the kernel runtime environment as
opposed to user space:

X86_CR4_DE, X86_CR4_PAE, X86_CR4_PSE, X86_CR4_MCE, X86_CR4_PGE,
X86_CR4_OSFXSR, X86_CR4_OSXMMEXCPT, X86_CR4_LA57, X86_CR4_PCIDE,
X86_CR4_LAM_SUP

Possibly X86_CR4_VMXE as well, which seems harmless even if KVM is
not loaded; X86_CR4_PKE can be fixed as long as the PKE configuration
registers are at least initialized to disabled.

It is relatively simple to do an audit of which flags are allowed to
be modified at runtime and whitelist only those. There is no reason
why we should allow bits in CR4 to be toggled by default.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/uapi/asm/processor-flags.h | 2 ++
 arch/x86/kernel/cpu/common.c                | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index d898432947ff..ce08c2ca70b5 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -138,6 +138,8 @@
 #define X86_CR4_CET		_BITUL(X86_CR4_CET_BIT)
 #define X86_CR4_LAM_SUP_BIT	28 /* LAM for supervisor pointers */
 #define X86_CR4_LAM_SUP		_BITUL(X86_CR4_LAM_SUP_BIT)
+#define X86_CR4_FRED_BIT	32 /* enable FRED kernel entry */
+#define X86_CR4_FRED		_BITULL(X86_CR4_FRED_BIT)

 /*
  * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0ba1067f4e5f..331b06d19f7f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -402,8 +402,9 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c)

 /* These bits should not change their value after CPU init is finished. */
 static const unsigned long cr4_pinned_mask =
-	X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
-	X86_CR4_FSGSBASE | X86_CR4_CET;
+	(unsigned long)
+	(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+	 X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED);
 static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
 static unsigned long cr4_pinned_bits __ro_after_init;

-- 
2.34.1

^ permalink raw reply related

* [PATCH v9 10/36] x86/fred: Add a new header file for FRED definitions
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add a header file for FRED prototypes and definitions.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---

Changes since v6:
* Replace pt_regs csx flags prefix FRED_CSL_ with FRED_CSX_.
---
 arch/x86/include/asm/fred.h | 104 ++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 arch/x86/include/asm/fred.h

diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
new file mode 100644
index 000000000000..d76e681a806f
--- /dev/null
+++ b/arch/x86/include/asm/fred.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Macros for Flexible Return and Event Delivery (FRED)
+ */
+
+#ifndef ASM_X86_FRED_H
+#define ASM_X86_FRED_H
+
+#include <linux/const.h>
+#include <asm/asm.h>
+
+/*
+ * FRED return instructions
+ *
+ * Replace with "ERETS"/"ERETU" once binutils supports FRED return instructions.
+ * The binutils version supporting FRED instructions is still TBD; this will
+ * be updated once that version is known.
+ */
+#define ERETS			_ASM_BYTES(0xf2,0x0f,0x01,0xca)
+#define ERETU			_ASM_BYTES(0xf3,0x0f,0x01,0xca)
+
+/*
+ * RSP is aligned to a 64-byte boundary before being used to push a new stack frame
+ */
+#define FRED_STACK_FRAME_RSP_MASK	_AT(unsigned long, (~0x3f))
+
+/*
+ * Event stack level macro for the FRED_STKLVLS MSR.
+ * Usage example: FRED_STKLVL(X86_TRAP_DF, 3)
+ * Multiple values can be ORd together.
+ */
+#define FRED_STKLVL(v,l)	(_AT(unsigned long, l) << (2*(v)))
+
+/* FRED_CONFIG MSR */
+#define FRED_CONFIG_CSL_MASK		0x3
+/*
+ * Used for the return address for call emulation during code patching,
+ * and measured in 64-byte cache lines.
+ */
+#define FRED_CONFIG_REDZONE_AMOUNT	1
+#define FRED_CONFIG_REDZONE		(_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6)
+#define FRED_CONFIG_INT_STKLVL(l)	(_AT(unsigned long, l) << 9)
+#define FRED_CONFIG_ENTRYPOINT(p)	_AT(unsigned long, (p))
+
+/*
+ * FRED event type and vector bit width and counts.
+ *
+ * There is space in the stack frame making it possible to extend event type
+ * and vector fields in the future.
+ */
+#define FRED_EVENT_TYPE_BITS		3
+#define FRED_EVENT_TYPE_COUNT		_BITUL(FRED_EVENT_TYPE_BITS)
+#define FRED_EVENT_VECTOR_BITS		8
+#define FRED_EVENT_VECTOR_COUNT		_BITUL(FRED_EVENT_VECTOR_BITS)
+
+/* FRED EVENT_TYPE_OTHER vector numbers */
+#define FRED_SYSCALL			1
+#define FRED_SYSENTER			2
+#define FRED_NUM_OTHER_VECTORS		3
+
+/* Flags above the SS selector (regs->ssx) */
+#define FRED_SSX_INTERRUPT_SHADOW_BIT	16
+#define FRED_SSX_INTERRUPT_SHADOW	_BITUL(FRED_SSX_INTERRUPT_SHADOW_BIT)
+#define FRED_SSX_SOFTWARE_INITIATED_BIT	17
+#define FRED_SSX_SOFTWARE_INITIATED	_BITUL(FRED_SSX_SOFTWARE_INITIATED_BIT)
+#define FRED_SSX_NMI_BIT		18
+#define FRED_SSX_NMI			_BITUL(FRED_SSX_NMI_BIT)
+#define FRED_SSX_64_BIT_MODE_BIT	57
+#define FRED_SSX_64_BIT_MODE		_BITUL(FRED_SSX_64_BIT_MODE_BIT)
+
+#ifdef CONFIG_X86_FRED
+
+#ifndef __ASSEMBLY__
+
+#include <linux/kernel.h>
+#include <asm/ptrace.h>
+
+struct fred_info {
+	/* Event data: CR2, DR6, ... */
+	unsigned long edata;
+	unsigned long resv;
+};
+
+/* Full format of the FRED stack frame */
+struct fred_frame {
+	struct pt_regs   regs;
+	struct fred_info info;
+};
+
+static __always_inline struct fred_info *fred_info(struct pt_regs *regs)
+{
+	return &container_of(regs, struct fred_frame, regs)->info;
+}
+
+static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
+{
+	return fred_info(regs)->edata;
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_X86_FRED */
+
+#endif /* ASM_X86_FRED_H */
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 09/36] x86/fred: Make unions for the cs and ss fields in struct pt_regs
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Make the cs and ss fields in struct pt_regs unions between the actual
selector and the unsigned long stack slot. FRED uses this space to
store additional flags.

The printk changes are simply due to the cs and ss fields being changed
to unsigned short from unsigned long.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---

Changes since v8:
* Reflect stack frame definition changes from FRED spec 3.0 to 5.0.
* Use __packed instead of __attribute__((__packed__)) (Borislav Petkov).
* Put all comments above the members, like the rest of the file does
  (Borislav Petkov).

Changes since v3:
* Rename csl/ssl of the pt_regs structure to csx/ssx (x for extended)
  (Andrew Cooper).
---
 arch/x86/entry/vsyscall/vsyscall_64.c |  2 +-
 arch/x86/include/asm/ptrace.h         | 57 +++++++++++++++++++++++++--
 arch/x86/kernel/process_64.c          |  2 +-
 3 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index e0ca8120aea8..a3c0df11d0e6 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 	if (!show_unhandled_signals)
 		return;
 
-	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
 			   level, current->comm, task_pid_nr(current),
 			   message, regs->ip, regs->cs,
 			   regs->sp, regs->ax, regs->si, regs->di);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index f4db78b09c8f..f1690beffd15 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -80,15 +80,66 @@ struct pt_regs {
 /*
  * On syscall entry, this is syscall#. On CPU exception, this is error code.
  * On hw interrupt, it's IRQ number:
+ *
+ * A FRED stack frame starts here:
+ *   1) It _always_ includes an error code;
+ *   2) The return frame for eretu/erets starts here.
  */
 	unsigned long orig_ax;
 /* Return frame for iretq */
 	unsigned long ip;
-	unsigned long cs;
+	union {
+/* CS extended: CS + any fields above it */
+		unsigned long csx;
+		struct {
+/* CS selector proper */
+			unsigned short cs;
+/* The stack level (SL) at the time the event occurred */
+			unsigned int sl		: 2;
+/* Set if the indirect branch tracker is in the WAIT_FOR_ENDBRANCH state */
+			unsigned int wfe	: 1;
+			unsigned int __csx_resv1: 13;
+			unsigned int __csx_resv2: 32;
+		} __packed;
+	};
 	unsigned long flags;
 	unsigned long sp;
-	unsigned long ss;
-/* top of stack page */
+	union {
+/* SS extended: SS + any fields above it */
+		unsigned long ssx;
+		struct {
+/* SS selector proper */
+			unsigned short ss;
+/* Set to indicate that interrupt blocking by STI was in effect */
+			unsigned int sti	: 1;
+/* For SYSCALL, SYSENTER, or INT n (for any value of n) */
+			unsigned int sys	: 1;
+			unsigned int nmi	: 1;
+			unsigned int __ssx_resv1: 13;
+/* Event information fields, ignored by the FRED return instructions */
+			unsigned int vector	: 8;
+			unsigned int __ssx_resv2: 8;
+			unsigned int type	: 4;
+			unsigned int __ssx_resv3: 4;
+/* Set to indicate that the event was incident to enclave execution */
+			unsigned int enc	: 1;
+/* Set to indicate that the logical processor had been in 64-bit mode */
+			unsigned int l		: 1;
+/*
+ * Set to indicate the event is a nested exception encountered during FRED
+ * event delivery of another event. This bit is not set if the event is
+ * double fault (#DF).
+ */
+			unsigned int nst	: 1;
+			unsigned int __ssx_resv4: 1;
+/* The length of the instruction causing the event */
+			unsigned int instr_len	: 4;
+		} __packed;
+	};
+/*
+ * Top of stack page on IDT systems, while FRED systems have extra fields
+ * defined above, see <asm/fred.h>.
+ */
 };
 
 #endif /* !__i386__ */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d181c16a2f6..265ab8fcb146 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,7 +117,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
 
 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
-	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
+	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
 		log_lvl, regs->cs, ds, es, cr0);
 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
 		log_lvl, cr2, cr3, cr4);
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 08/36] x86/cpu: Add MSR numbers for FRED configuration
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add MSR numbers for the FRED configuration registers.

Originally-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/msr-index.h       | 13 ++++++++++++-
 tools/arch/x86/include/asm/msr-index.h | 13 ++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a00a53e15ab7..111fb76f6dbe 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,19 @@
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
 #define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
 
-/* Intel MSRs. Some also available on other CPUs */
+/* FRED MSRs */
+#define MSR_IA32_FRED_RSP0	0x1cc /* Level 0 stack pointer */
+#define MSR_IA32_FRED_RSP1	0x1cd /* Level 1 stack pointer */
+#define MSR_IA32_FRED_RSP2	0x1ce /* Level 2 stack pointer */
+#define MSR_IA32_FRED_RSP3	0x1cf /* Level 3 stack pointer */
+#define MSR_IA32_FRED_STKLVLS	0x1d0 /* Exception stack levels */
+#define MSR_IA32_FRED_SSP0	MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */
+#define MSR_IA32_FRED_SSP1	0x1d1 /* Level 1 shadow stack pointer */
+#define MSR_IA32_FRED_SSP2	0x1d2 /* Level 2 shadow stack pointer */
+#define MSR_IA32_FRED_SSP3	0x1d3 /* Level 3 shadow stack pointer */
+#define MSR_IA32_FRED_CONFIG	0x1d4 /* Entrypoint and interrupt stack level */
 
+/* Intel MSRs. Some also available on other CPUs */
 #define MSR_TEST_CTRL				0x00000033
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT	29
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT		BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT)
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 3aedae61af4f..565cade0785a 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,19 @@
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
 #define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
 
-/* Intel MSRs. Some also available on other CPUs */
+/* FRED MSRs */
+#define MSR_IA32_FRED_RSP0	0x1cc /* Level 0 stack pointer */
+#define MSR_IA32_FRED_RSP1	0x1cd /* Level 1 stack pointer */
+#define MSR_IA32_FRED_RSP2	0x1ce /* Level 2 stack pointer */
+#define MSR_IA32_FRED_RSP3	0x1cf /* Level 3 stack pointer */
+#define MSR_IA32_FRED_STKLVLS	0x1d0 /* Exception stack levels */
+#define MSR_IA32_FRED_SSP0	MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */
+#define MSR_IA32_FRED_SSP1	0x1d1 /* Level 1 shadow stack pointer */
+#define MSR_IA32_FRED_SSP2	0x1d2 /* Level 2 shadow stack pointer */
+#define MSR_IA32_FRED_SSP3	0x1d3 /* Level 3 shadow stack pointer */
+#define MSR_IA32_FRED_CONFIG	0x1d4 /* Entrypoint and interrupt stack level */
 
+/* Intel MSRs. Some also available on other CPUs */
 #define MSR_TEST_CTRL				0x00000033
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT	29
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT		BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT)
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 11/36] x86/fred: Reserve space for the FRED stack frame
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

When using FRED, reserve space at the top of the stack frame, just
like i386 does.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/thread_info.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d63b02940747..089cab875cba 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -31,7 +31,9 @@
  * In vm86 mode, the hardware frame is much longer still, so add 16
  * bytes to make room for the real-mode segments.
  *
- * x86_64 has a fixed-length stack frame.
+ * x86-64 has a fixed-length stack frame, but it depends on whether
+ * or not FRED is enabled. Future versions of FRED might make this
+ * dynamic, but for now it is always 2 words longer.
  */
 #ifdef CONFIG_X86_32
 # ifdef CONFIG_VM86
@@ -39,8 +41,12 @@
 # else
 #  define TOP_OF_KERNEL_STACK_PADDING 8
 # endif
-#else
-# define TOP_OF_KERNEL_STACK_PADDING 0
+#else /* x86-64 */
+# ifdef CONFIG_X86_FRED
+#  define TOP_OF_KERNEL_STACK_PADDING (2*8)
+# else
+#  define TOP_OF_KERNEL_STACK_PADDING 0
+# endif
 #endif
 
 /*
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 12/36] x86/fred: Update MSR_IA32_FRED_RSP0 during task switch
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

MSR_IA32_FRED_RSP0 is used during ring 3 event delivery, and needs to
be updated to point to the top of next task stack during task switch.

Update MSR_IA32_FRED_RSP0 with the WRMSR instruction for now; this will
switch to WRMSRNS/WRMSRLIST for performance once they are upstreamed.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/switch_to.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index f42dbf17f52b..6c911fd400b2 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -70,9 +70,16 @@ static inline void update_task_stack(struct task_struct *task)
 #ifdef CONFIG_X86_32
 	this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-	/* Xen PV enters the kernel on the thread stack. */
-	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+		/*
+		 * Will use WRMSRNS/WRMSRLIST for performance once it's upstreamed.
+		 */
+		wrmsrl(MSR_IA32_FRED_RSP0,
+		       (unsigned long)task_stack_page(task) + THREAD_SIZE);
+	} else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
+		/* Xen PV enters the kernel on the thread stack. */
 		load_sp0(task_top_of_stack(task));
+	}
 #endif
 }
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 13/36] x86/fred: Let ret_from_fork_asm() jmp to fred_exit_user when FRED is enabled
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Let ret_from_fork_asm() jmp to fred_exit_user when FRED is enabled,
otherwise the existing IDT code is chosen.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/entry/entry_64.S | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 43606de22511..8069151176f2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -309,7 +309,13 @@ SYM_CODE_START(ret_from_fork_asm)
 	 * and unwind should work normally.
 	 */
 	UNWIND_HINT_REGS
+
+#ifdef CONFIG_X86_FRED
+	ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
+		    "jmp fred_exit_user", X86_FEATURE_FRED
+#else
 	jmp	swapgs_restore_regs_and_return_to_usermode
+#endif
 SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 14/36] x86/fred: Disallow the swapgs instruction when FRED is enabled
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

The FRED architecture establishes the full supervisor/user context through:
1) FRED event delivery from ring 3 swaps the value of the GS base
   address and that of the IA32_KERNEL_GS_BASE MSR.
2) ERETU swaps the value of the GS base address and that of the
   IA32_KERNEL_GS_BASE MSR.
3) LKGS is already upstreamed and automatically enabled with FRED to
   load the GS base address directly into the IA32_KERNEL_GS_BASE MSR
   instead of the GS segment’s descriptor cache.

As a result, there is no need to SWAPGS away from the kernel GS base,
i.e., the swapgs instruction is no longer needed when FRED is enabled,
thus is disallowed. Otherwise it causes #UD.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---

Changes since v8:
* Explain why writing directly to the IA32_KERNEL_GS_BASE MSR is
  doing the right thing (Thomas Gleixner).
---
 arch/x86/kernel/process_64.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 265ab8fcb146..6d5fed29f552 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -166,7 +166,8 @@ static noinstr unsigned long __rdgsbase_inactive(void)
 
 	lockdep_assert_irqs_disabled();
 
-	if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
+	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
 		native_swapgs();
 		gsbase = rdgsbase();
 		native_swapgs();
@@ -191,7 +192,8 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase)
 {
 	lockdep_assert_irqs_disabled();
 
-	if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
+	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
 		native_swapgs();
 		wrgsbase(gsbase);
 		native_swapgs();
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 16/36] x86/fred: Allow single-step trap and NMI when starting a new task
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Entering a new task is logically speaking a return from a system call
(exec, fork, clone, etc.). As such, if ptrace enables single stepping,
a single-step exception should be allowed to trigger immediately upon
entering user space. This is not optional.

NMI should *never* be disabled in user space. As such, this is an
optional, opportunistic way to catch errors.

Allow single-step trap and NMI when starting a new task, thus once
the new task enters user space, single-step trap and NMI are both
enabled immediately.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---

Changes since v8:
* Use high-order 48 bits above the lowest 16 bit SS only when FRED
  is enabled (Thomas Gleixner).
---
 arch/x86/kernel/process_64.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6d5fed29f552..0b47871a6141 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -56,6 +56,7 @@
 #include <asm/resctrl.h>
 #include <asm/unistd.h>
 #include <asm/fsgsbase.h>
+#include <asm/fred.h>
 #ifdef CONFIG_IA32_EMULATION
 /* Not included via unistd.h */
 #include <asm/unistd_32_ia32.h>
@@ -507,8 +508,18 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 static void
 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 		    unsigned long new_sp,
-		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
+		    u16 _cs, u16 _ss, u16 _ds)
 {
+	/*
+	 * Paranoia: High-order 48 bits above the lowest 16 bit SS are
+	 * discarded by the legacy IRET instruction on all Intel, AMD,
+	 * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
+	 * even when FRED is not enabled. But we choose the safer side
+	 * to use these bits only when FRED is enabled.
+	 */
+	const unsigned long ssx_flags = cpu_feature_enabled(X86_FEATURE_FRED) ?
+		(FRED_SSX_SOFTWARE_INITIATED | FRED_SSX_NMI) : 0;
+
 	WARN_ON_ONCE(regs != current_pt_regs());
 
 	if (static_cpu_has(X86_BUG_NULL_SEG)) {
@@ -522,11 +533,11 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	loadsegment(ds, _ds);
 	load_gs_index(0);
 
-	regs->ip		= new_ip;
-	regs->sp		= new_sp;
-	regs->cs		= _cs;
-	regs->ss		= _ss;
-	regs->flags		= X86_EFLAGS_IF;
+	regs->ip	= new_ip;
+	regs->sp	= new_sp;
+	regs->csx	= _cs;
+	regs->ssx	= _ss | ssx_flags;
+	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
 }
 
 void
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 15/36] x86/fred: No ESPFIX needed when FRED is enabled
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Because FRED always restores the full value of %rsp, ESPFIX is
no longer needed when it's enabled.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/kernel/espfix_64.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 16f9814c9be0..48d133a54f45 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -106,6 +106,10 @@ void __init init_espfix_bsp(void)
 	pgd_t *pgd;
 	p4d_t *p4d;
 
+	/* FRED systems don't need ESPFIX */
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return;
+
 	/* Install the espfix pud into the kernel page directory */
 	pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
 	p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
@@ -129,6 +133,10 @@ void init_espfix_ap(int cpu)
 	void *stack_page;
 	pteval_t ptemask;
 
+	/* FRED systems don't need ESPFIX */
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return;
+
 	/* We only have to do this once... */
 	if (likely(per_cpu(espfix_stack, cpu)))
 		return;		/* Already initialized */
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 17/36] x86/fred: Define a common function type fred_handler
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

FRED event delivery establishes a full supervisor context by saving
the essential information about an event to a FRED stack frame, e.g.,
the faulting linear address of a #PF is saved as event data of a FRED
stack frame. Thus a struct pt_regs has all the needed data to handle
an event and it's the only input argument of a FRED event handler.

Define fred_handler, a common function type used in the FRED event
dispatch framework, which makes it easier to find the entry points
(via grep), allows the prototype to change if necessary without
requiring changes everywhere, and makes sure that all the
entry points have the proper decorations (currently noinstr, but
could change in the future.)

Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/fred.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index d76e681a806f..b45c1bea5b7f 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -68,6 +68,19 @@
 #define FRED_SSX_64_BIT_MODE_BIT	57
 #define FRED_SSX_64_BIT_MODE		_BITUL(FRED_SSX_64_BIT_MODE_BIT)
 
+/*
+ * FRED event delivery establishes a full supervisor context by
+ * saving the essential information about an event to a FRED
+ * stack frame, e.g., the faulting linear address of a #PF is
+ * saved as event data of a FRED #PF stack frame. Thus a struct
+ * pt_regs has all the needed data to handle an event and it's
+ * the only input argument of a FRED event handler.
+ *
+ * FRED handlers need to be placed in the noinstr text section.
+ */
+#define DECLARE_FRED_HANDLER(f) void f (struct pt_regs *regs)
+#define DEFINE_FRED_HANDLER(f) noinstr DECLARE_FRED_HANDLER(f)
+
 #ifdef CONFIG_X86_FRED
 
 #ifndef __ASSEMBLY__
@@ -97,6 +110,8 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
 	return fred_info(regs)->edata;
 }
 
+typedef DECLARE_FRED_HANDLER((*fred_handler));
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* CONFIG_X86_FRED */
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 18/36] x86/fred: Add a page fault entry stub for FRED
From: Xin Li @ 2023-07-31  6:32 UTC (permalink / raw)
  To: linux-doc, linux-kernel, linux-edac, linux-hyperv, kvm, xen-devel
  Cc: Jonathan Corbet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, Andy Lutomirski, Oleg Nesterov,
	Tony Luck, K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	Paolo Bonzini, Wanpeng Li, Vitaly Kuznetsov, Sean Christopherson,
	Peter Zijlstra, Juergen Gross, Stefano Stabellini,
	Oleksandr Tyshchenko, Josh Poimboeuf, Paul E . McKenney,
	Catalin Marinas, Randy Dunlap, Steven Rostedt, Kim Phillips,
	Xin Li, Hyeonggon Yoo, Liam R . Howlett, Sebastian Reichel,
	Kirill A . Shutemov, Suren Baghdasaryan, Pawan Gupta, Jiaxi Chen,
	Babu Moger, Jim Mattson, Sandipan Das, Lai Jiangshan,
	Hans de Goede, Reinette Chatre, Daniel Sneddon, Breno Leitao,
	Nikunj A Dadhania, Brian Gerst, Sami Tolvanen,
	Alexander Potapenko, Andrew Morton, Arnd Bergmann,
	Eric W . Biederman, Kees Cook, Masami Hiramatsu, Masahiro Yamada,
	Ze Gao, Fei Li, Conghui, Ashok Raj, Jason A . Donenfeld,
	Mark Rutland, Jacob Pan, Jiapeng Chong, Jane Malalane,
	David Woodhouse, Boris Ostrovsky, Arnaldo Carvalho de Melo,
	Yantengsi, Christophe Leroy, Sathvika Vasireddy
In-Reply-To: <20230731063317.3720-1-xin3.li@intel.com>

From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add a page fault entry stub for FRED.

On a FRED system, the faulting address (CR2) is passed on the stack,
to avoid the problem of transient state. Thus we get the page fault
address from the stack instead of CR2.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/fred.h |  2 ++
 arch/x86/mm/fault.c         | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index b45c1bea5b7f..fb8e7b4f2d38 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -112,6 +112,8 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
 
 typedef DECLARE_FRED_HANDLER((*fred_handler));
 
+DECLARE_FRED_HANDLER(fred_exc_page_fault);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* CONFIG_X86_FRED */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e8711b2cafaf..dd3df092d0f2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -34,6 +34,7 @@
 #include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
 #include <asm/vdso.h>			/* fixup_vdso_exception()	*/
 #include <asm/irq_stack.h>
+#include <asm/fred.h>
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -1495,9 +1496,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code,
 	}
 }
 
-DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
+static __always_inline void page_fault_common(struct pt_regs *regs,
+					      unsigned int error_code,
+					      unsigned long address)
 {
-	unsigned long address = read_cr2();
 	irqentry_state_t state;
 
 	prefetchw(&current->mm->mmap_lock);
@@ -1544,3 +1546,15 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
 
 	irqentry_exit(regs, state);
 }
+
+DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
+{
+	page_fault_common(regs, error_code, read_cr2());
+}
+
+#ifdef CONFIG_X86_FRED
+DEFINE_FRED_HANDLER(fred_exc_page_fault)
+{
+	page_fault_common(regs, regs->orig_ax, fred_event_data(regs));
+}
+#endif
-- 
2.34.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox