Linux CXL
 help / color / mirror / Atom feed
From: Ira Weiny <ira.weiny@intel.com>
To: Davidlohr Bueso <dave@stgolabs.net>, <dave.jiang@intel.com>,
	<dan.j.williams@intel.com>
Cc: <jonathan.cameron@huawei.com>, <alison.schofield@intel.com>,
	<ira.weiny@intel.com>, <vishal.l.verma@intel.com>,
	<seven.yi.lee@gmail.com>, <hch@infradead.org>,
	<a.manzanares@samsung.com>, <fan.ni@samsung.com>,
	<dave@stgolabs.net>, <linux-cxl@vger.kernel.org>
Subject: Re: [PATCH RFC 1/1] cxl/pci: Support Global Persistent Flush (GPF)
Date: Fri, 20 Dec 2024 12:21:57 -0600	[thread overview]
Message-ID: <6765b5c5560e5_2e2afc29467@iweiny-mobl.notmuch> (raw)
In-Reply-To: <20241220164337.204900-2-dave@stgolabs.net>

Davidlohr Bueso wrote:
> Add support for GPF flows. The CXL specification around this is
> found to be a bit too involved from the driver side. And while
> this should really all be handled by the hardware, this patch takes
> things with a grain of salt.
> 
> Timeout detection is based on dirty Shutdown semantics. The driver
> will mark it as dirty, expecting that the device clear it upon a
> successful GPF event. The admin may consult the device Health and
> check the dirty shutdown counter to see if there was a problem
> with data integrity.
> 
> Timeout arming is done throughout the decode hierarchy, upon device
> probing and hot-remove. These timeouts can be over-specified,
> particularly T1. Set the max timeout to 20 seconds for T1, which is
> the NMI watchdog default for lockup detection. For T2, the policy
> is to use the largest device T2 available in the hierarchy.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>

[snip]

>  int cxl_set_timestamp(struct cxl_memdev_state *mds)
>  {
>  	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 9d58ab9d33c5..9b1e110817f2 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -1054,3 +1054,107 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
>  
>  	return 0;
>  }
> +
> +int cxl_pci_update_gpf_port(struct pci_dev *pdev,
> +			    struct cxl_memdev *cxlmd, bool remove)
> +{
> +	u16 ctrl;
> +	int port_t1_base, port_t1_scale;
> +	int port_t2_base, port_t2_scale;
> +	unsigned long device_tmo, port_tmo;
> +	int rc, dvsec;
> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
> +
> +	dvsec = pci_find_dvsec_capability(
> +		pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PORT_GPF);
> +	if (!dvsec) {
> +		dev_warn(&pdev->dev,
> +			 "GPF Port DVSEC not present\n");
> +		return -EINVAL;
> +	}
> +
> +	/* check for t1 */
> +	rc = pci_read_config_word(
> +		pdev,
> +		dvsec + CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET,
> +		&ctrl);
> +	if (rc)
> +		return rc;
> +
> +	port_t1_base = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK,
> +				 ctrl);
> +	port_t1_scale = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK,
> +				  ctrl);
> +	if (port_t1_scale > GPF_TIMEOUT_SCALE_MAX) {
> +		dev_warn(&pdev->dev, "GPF: invalid port phase 1 timeout\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Set max timeout such that vendors will optimize GPF flow to
> +	 * avoid the implied worst-case scenario delays.
> +	 */
> +	device_tmo = gpf_timeout_us(7, GPF_TIMEOUT_SCALE_MAX);

What is '7' here?  Seems like another define would be good.

> +	port_tmo = gpf_timeout_us(port_t1_base, port_t1_scale);
> +
> +	dev_dbg(&pdev->dev, "Port GPF phase 1 timeout: %lu us\n", port_tmo);
> +
> +	if ((remove && device_tmo != port_tmo) || device_tmo > port_tmo) {
> +		/* update the timeout in DVSEC */
> +		ctrl = FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK,
> +				   7);

And use it here...

> +		ctrl |= FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK,
> +				   GPF_TIMEOUT_SCALE_MAX);
> +		rc = pci_write_config_word(
> +			pdev,
> +			dvsec + CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET,
> +			ctrl);
> +		if (rc)
> +			return rc;
> +
> +		dev_dbg(&pdev->dev,
> +			"new GPF Port phase 1 timeout: %lu us\n", device_tmo);
> +	}
> +
> +	/* check for t2 */
> +	rc = pci_read_config_word(
> +		pdev,
> +		dvsec + CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET,
> +		&ctrl);
> +	if (rc)
> +		return rc;
> +
> +	port_t2_base = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK,
> +			    ctrl);
> +	port_t2_scale = FIELD_GET(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK,
> +			     ctrl);
> +	if (port_t2_scale > GPF_TIMEOUT_SCALE_MAX) {
> +		dev_warn(&pdev->dev, "GPF: invalid port phase 2 timeout\n");
> +		return -EINVAL;
> +	}
> +
> +	device_tmo = gpf_timeout_us(mds->gpf.t2_base, mds->gpf.t2_scale);
> +	port_tmo = gpf_timeout_us(port_t2_base, port_t2_scale);
> +
> +	dev_dbg(&pdev->dev, "Port GPF phase 2 timeout: %lu us\n", port_tmo);
> +
> +	if ((remove && device_tmo != port_tmo) || device_tmo > port_tmo) {
> +		/* update the timeout in DVSEC */
> +		ctrl = FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK,
> +				   mds->gpf.t2_base);
> +		ctrl |= FIELD_PREP(CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK,
> +				   mds->gpf.t2_scale);
> +		rc = pci_write_config_word(
> +			pdev,
> +			dvsec + CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET,
> +			ctrl);
> +		if (rc)
> +			return rc;
> +
> +		dev_dbg(&pdev->dev,
> +			"new GPF Port phase 2 timeout: %lu us\n", device_tmo);
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_pci_update_gpf_port, "CXL");
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index 78a5c2c25982..0bd09669af68 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -1370,6 +1370,73 @@ static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port,
>  	return port;
>  }
>  
> +static void delete_update_gpf(struct cxl_memdev *cxlmd)
> +{
> +	struct cxl_port *port = cxlmd->endpoint;
> +	struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
> +	struct cxl_memdev *max_cxlmd = NULL;
> +	struct cxl_memdev_state *mds;
> +	struct cxl_ep *ep;
> +	unsigned long index;
> +
> +	/* first calculate the new max T2 timeout */
> +	xa_for_each(&parent_port->endpoints, index, ep) {
> +		struct cxl_memdev *this_cxlmd;
> +		struct cxl_memdev_state *max_mds;
> +
> +		this_cxlmd = to_cxl_memdev(ep->ep);
> +		if (cxlmd == this_cxlmd) /* ignore self */
> +			continue;
> +		if (!max_cxlmd) {
> +			max_cxlmd = this_cxlmd;
> +			continue;
> +		}
> +
> +		mds = to_cxl_memdev_state(this_cxlmd->cxlds);
> +		max_mds = to_cxl_memdev_state(max_cxlmd->cxlds);
> +
> +		if (gpf_timeout_us(mds->gpf.t2_base, mds->gpf.t2_scale) >
> +		    gpf_timeout_us(max_mds->gpf.t2_base, max_mds->gpf.t2_scale))
> +			max_cxlmd = this_cxlmd;
> +	}
> +
> +	if (!max_cxlmd) /* no other devices */
> +		goto clean_shutdown;
> +
> +	while (1) {
> +		struct cxl_dport *dport;
> +
> +		parent_port = to_cxl_port(port->dev.parent);
> +		xa_for_each(&parent_port->dports, index, dport) {

Does this set the dports for all ports on a switch?  Is that the
intention?

> +			if (!dev_is_pci(dport->dport_dev))
> +				continue;
> +
> +			cxl_pci_update_gpf_port(to_pci_dev(dport->dport_dev),
> +						max_cxlmd, true);
> +		}
> +
> +		if (is_cxl_root(parent_port))
> +			break;
> +
> +		port = parent_port;
> +	}
> +
> +clean_shutdown:
> +	/*
> +	 * Device can still dirty the shutdown upon detecting any
> +	 * failed internal flush.
> +	 */
> +	if (resource_size(&cxlmd->cxlds->pmem_res)) {
> +		mds = to_cxl_memdev_state(cxlmd->cxlds);
> +
> +		if (mds->dirtied_shutdown) {
> +			if (cxl_set_shutdown_state(mds, false))
> +				dev_warn(&cxlmd->dev,
> +					 "could not clean Shutdown state");
> +		}
> +	}
> +}
> +
>  /*
>   * All users of grandparent() are using it to walk PCIe-like switch port
>   * hierarchy. A PCIe switch is comprised of a bridge device representing the
> @@ -1400,6 +1467,7 @@ static void delete_endpoint(void *data)
>  	struct device *host = endpoint_host(endpoint);
>  
>  	scoped_guard(device, host) {
> +		delete_update_gpf(cxlmd);
>  		if (host->driver && !endpoint->dead) {
>  			devm_release_action(host, cxl_unlink_parent_dport, endpoint);
>  			devm_release_action(host, cxl_unlink_uport, endpoint);

[snip]

> @@ -129,4 +140,57 @@ void read_cdat_data(struct cxl_port *port);
>  void cxl_cor_error_detected(struct pci_dev *pdev);
>  pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>  				    pci_channel_state_t state);
> +
> +#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
> +
> +static inline unsigned long gpf_timeout_us(int base, int scale)
> +{
> +	unsigned long tmo, others;
> +
> +	switch (scale) {
> +	case 0: /* 1 us */
> +		tmo = 1;
> +		break;
> +	case 1: /* 10 us */
> +		tmo = 10UL;
> +		break;
> +	case 2: /* 100 us */
> +		tmo = 100UL;
> +		break;
> +	case 3: /* 1 ms */
> +		tmo = 1000UL;
> +		break;
> +	case 4: /* 10 ms */
> +		tmo = 10000UL;
> +		break;
> +	case 5: /* 100 ms */
> +		tmo = 100000UL;
> +		break;
> +	case 6: /* 1 s */
> +		tmo = 1000000UL;
> +		break;
> +	case GPF_TIMEOUT_SCALE_MAX:
> +		tmo = 10000000UL;
> +		break;
> +	default:
> +		tmo = 0;
> +		break;
> +	}
> +
> +	tmo *= base;
> +	/*
> +	 * The spec is over involved. Do not account for any ad-hoc
> +	 * host delays. Ie: propagation delay, host-side processing
> +	 * delays, and any other host/system-specific delays.
> +	 */
> +	others = 0;
> +	tmo += others;
> +
> +	/*
> +	 * Limit max timeout to 20 seconds (per phase), which is
> +	 * already the default for the nmi watchdog.
> +	 */
> +	return min(20000000UL, tmo);
> +}
> +
>  #endif /* __CXL_PCI_H__ */
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 6d94ff4a4f1a..37d5616b6fc8 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -900,6 +900,101 @@ static struct attribute_group cxl_rcd_group = {
>  };
>  __ATTRIBUTE_GROUPS(cxl_rcd);
>  
> +
> +static int cxl_gpf_setup(struct pci_dev *pdev)
> +{
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
> +	struct cxl_memdev *cxlmd = cxlds->cxlmd;
> +	struct cxl_port *port;
> +	int rc, gpf_dvsec;
> +	u16 duration;
> +	u32 power;
> +	int device_t2_base, device_t2_scale;
> +
> +	/* get the timeouts for phase 2, given by the hardware */
> +	gpf_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
> +					      CXL_DVSEC_DEVICE_GPF);
> +	if (!gpf_dvsec) {
> +		dev_warn(&pdev->dev,
> +			 "GPF Device DVSEC not present\n");
> +		return -EINVAL;
> +	}
> +
> +	rc = pci_read_config_word(
> +		pdev,
> +		gpf_dvsec + CXL_DVSEC_DEVICE_GPF_PHASE_2_DURATION_OFFSET,
> +		&duration);
> +	if (rc)
> +		return rc;
> +
> +	device_t2_base = FIELD_GET(CXL_DVSEC_DEVICE_GPF_PHASE_2_TIME_BASE_MASK,
> +			    duration);
> +	device_t2_scale = FIELD_GET(CXL_DVSEC_DEVICE_GPF_PHASE_2_TIME_SCALE_MASK,
> +			     duration);
> +	if (device_t2_scale > GPF_TIMEOUT_SCALE_MAX) {
> +		dev_warn(&pdev->dev, "GPF: invalid device timeout\n");
> +		return -EINVAL;
> +	}
> +
> +	/* cache device GPF timeout and power consumption for phase 2 */
> +	mds->gpf.t2_base = device_t2_base;
> +	mds->gpf.t2_scale = device_t2_scale;
> +
> +	rc = pci_read_config_dword(
> +		pdev,
> +		gpf_dvsec + CXL_DVSEC_DEVICE_GPF_PHASE_2_POWER_OFFSET,
> +		&power);
> +	if (rc)
> +		return rc;
> +
> +	mds->gpf.power_mwatts = power;
> +
> +	dev_dbg(&pdev->dev, "Device GPF timeout: %lu us (power needed: %dmW)\n",
> +		gpf_timeout_us(device_t2_base, device_t2_scale),
> +		mds->gpf.power_mwatts);
> +
> +	/* iterate up the hierarchy updating max port timeouts where necessary */
> +	port = cxlmd->endpoint;
> +	while (1) {
> +		struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
> +		struct cxl_dport *dport;
> +		unsigned long index;
> +
> +		device_lock(&parent_port->dev);
> +		xa_for_each(&parent_port->dports, index, dport) {
> +			if (!dev_is_pci(dport->dport_dev))
> +				continue;
> +
> +			cxl_pci_update_gpf_port(to_pci_dev(dport->dport_dev),
> +						cxlmd, false);
> +		}
> +		device_unlock(&parent_port->dev);
> +
> +		if (is_cxl_root(parent_port))
> +			break;
> +
> +		port = parent_port;
> +	}
> +
> +	/*
> +	 * Set dirty shutdown now, with the expectation that the device
> +	 * clear it upon a successful GPF flow. The exception to this
> +	 * is upon Viral detection, per CXL 3.2 section 12.4.2.
> +	 *
> +	 * XXX: For non-fail scenarios, this is cleared by the driver
> +	 * at hot-unplug. But, what about reboot/shutdown case? Is this
> +	 * done by hw if no data integrity failure is detected?

Does this path execute on reboot/shutdown as part of delete_endpoint()?

Ira

> +	 */
> +	if (resource_size(&cxlds->pmem_res)) {
> +		rc = cxl_set_shutdown_state(mds, true);
> +		if (!rc)
> +			mds->dirtied_shutdown = true;
> +	}
> +
> +	return rc;
> +}
> +

[snip]

  reply	other threads:[~2024-12-20 18:22 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-20 16:43 [PATCH RFC v2 0/1] cxl/pci: Support Global Persistent Flush (GPF) Davidlohr Bueso
2024-12-20 16:43 ` [PATCH RFC 1/1] " Davidlohr Bueso
2024-12-20 18:21   ` Ira Weiny [this message]
2024-12-21  4:14     ` Davidlohr Bueso
2024-12-23 19:53   ` Jonathan Cameron
2025-01-17  2:28   ` Dan Williams
2025-01-17  7:17     ` Davidlohr Bueso
2025-01-17  7:33       ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6765b5c5560e5_2e2afc29467@iweiny-mobl.notmuch \
    --to=ira.weiny@intel.com \
    --cc=a.manzanares@samsung.com \
    --cc=alison.schofield@intel.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.jiang@intel.com \
    --cc=dave@stgolabs.net \
    --cc=fan.ni@samsung.com \
    --cc=hch@infradead.org \
    --cc=jonathan.cameron@huawei.com \
    --cc=linux-cxl@vger.kernel.org \
    --cc=seven.yi.lee@gmail.com \
    --cc=vishal.l.verma@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox