Linux PCI subsystem development
 help / color / mirror / Atom feed
From: Shuai Xue <xueshuai@linux.alibaba.com>
To: Yicong Yang <yang.yicong@picoheart.com>,
	renyu.zj@linux.alibaba.com, will@kernel.org,
	mark.rutland@arm.com, jic23@kernel.org, bhelgaas@google.com,
	linux-pci@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: jingoohan1@gmail.com, mani@kernel.org, juwenlong@picoheart.com,
	geshijian@picoheart.com, douyufan@picoheart.com
Subject: Re: [PATCH v2 2/3] perf/dwc_pcie: Support narrowed time-based counter for long time monitoring
Date: Tue, 30 Jun 2026 16:50:16 +0800	[thread overview]
Message-ID: <a1bbf438-c0d3-4467-9852-657fb93bc548@linux.alibaba.com> (raw)
In-Reply-To: <20260629092717.74946-3-yang.yicong@picoheart.com>



On 6/29/26 5:27 PM, Yicong Yang wrote:
> From: Yufan Dou <douyufan@picoheart.com>
> 
> The DWC PCIe Time-Based Analysis Data Register (the counter for time-based
> events) is architected as 64-bit, but some hardware implementations do not
> implement the full width.  On these implementations the counter stops after
> reaching its implemented width. This limits its use to short-duration
> monitoring only: on our platform the counter covers only ~15s when
> monitoring RX TLP payloads.
> 
> Add an optional hrtimer that fires every 2 seconds. It takes the role of
> the counter overflow interrupt: it reads the counter, updates the event
> count and resets the counter, which lifts the limit imposed by the narrow
> counter. It only applies to the time-based counter. The 2-second update
> period is half of the maximum counting period (4s) of the time-based
> counter under the hardware's period counting mode.
> 
> Because fully-implemented 64-bit counters do not need this workaround,
> enable this hrtimer on platforms known to have a narrowed counter.
> 
> Before this patch, when counting over a 10-minute fio run, the count is incorrect:
>    root@localhost:/tmp# perf stat -e dwc_rootport_20000/rx_pcie_tlp_data_payload/ -- fio --runtime=10m fio_job.config
>    [...]
>    Run status group 0 (all jobs):
>       READ: bw=5594MiB/s (5865MB/s), 5594MiB/s-5594MiB/s (5865MB/s-5865MB/s), io=3278GiB (3519GB), run=600010-600010msec
>    [...]
>    Performance counter stats for 'system wide':
>       137,438,953,456      dwc_rootport_20000/rx_pcie_tlp_data_payload/
> 
> After this patch the count is as expected:
>    root@localhost:/tmp# perf stat -e dwc_rootport_20000/rx_pcie_tlp_data_payload/ -- fio --runtime=10m fio_job.config
>    [...]
>    Run status group 0 (all jobs):
>       READ: bw=5632MiB/s (5905MB/s), 5632MiB/s-5632MiB/s (5905MB/s-5905MB/s), io=3300GiB (3543GB), run=600013-600013msec
>    [...]
>    Performance counter stats for 'system wide':
>     3,543,850,268,576      dwc_rootport_20000/rx_pcie_tlp_data_payload/
> 
> Signed-off-by: Yufan Dou <douyufan@picoheart.com>
> Signed-off-by: Yicong Yang <yang.yicong@picoheart.com>
> ---
>   drivers/perf/dwc_pcie_pmu.c | 69 ++++++++++++++++++++++++++++++++++---
>   1 file changed, 65 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
> index 5385401fa9cf..7ec8302d4090 100644
> --- a/drivers/perf/dwc_pcie_pmu.c
> +++ b/drivers/perf/dwc_pcie_pmu.c
> @@ -11,6 +11,7 @@
>   #include <linux/cpumask.h>
>   #include <linux/device.h>
>   #include <linux/errno.h>
> +#include <linux/hrtimer.h>
>   #include <linux/kernel.h>
>   #include <linux/list.h>
>   #include <linux/pcie-dwc.h>
> @@ -83,6 +84,7 @@ enum dwc_pcie_event_type {
>   
>   #define DWC_PCIE_LANE_EVENT_MAX_PERIOD		GENMASK_ULL(31, 0)
>   #define DWC_PCIE_MAX_PERIOD			GENMASK_ULL(63, 0)
> +#define DWC_PCIE_PMU_TIMER_PERIOD_NS		(2 * NSEC_PER_SEC)
>   
>   struct dwc_pcie_pmu {
>   	struct pmu		pmu;
> @@ -93,6 +95,8 @@ struct dwc_pcie_pmu {
>   	/* Groups #6 and #7 */
>   	DECLARE_BITMAP(lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP);
>   	struct perf_event	*time_based_event;
> +	bool			timer_enable;
> +	struct hrtimer		hrtimer;
>   
>   	struct hlist_node	cpuhp_node;
>   	int			on_cpu;
> @@ -354,6 +358,26 @@ static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
>   	return val;
>   }
>   
> +static void dwc_pcie_pmu_reset_time_based_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	u64 prev;
> +
> +	dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
> +
> +	/*
> +	 * The hardware counter is reset to zero when disabled. Synchronize
> +	 * prev_count so that the next event_update() computes the correct
> +	 * delta against the new counter baseline.
> +	 */
> +	do {
> +		prev = local64_read(&hwc->prev_count);
> +	} while (local64_cmpxchg(&hwc->prev_count, prev, 0) != prev);
> +
> +	dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
> +}
> +
>   static void dwc_pcie_pmu_event_update(struct perf_event *event)
>   {
>   	struct hw_perf_event *hwc = &event->hw;
> @@ -429,6 +453,26 @@ static int dwc_pcie_pmu_validate_group(struct perf_event *event)
>   	return 0;
>   }
>   
> +static enum hrtimer_restart dwc_pcie_pmu_hrtimer_callback(struct hrtimer *hrtimer)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = container_of(hrtimer, struct dwc_pcie_pmu, hrtimer);
> +	struct perf_event *event = pcie_pmu->time_based_event;
> +	struct hw_perf_event *hwc;
> +
> +	if (!event)
> +		return HRTIMER_NORESTART;
> +
> +	hwc = &event->hw;
> +	if (hwc->state & PERF_HES_STOPPED)
> +		return HRTIMER_NORESTART;
> +
> +	dwc_pcie_pmu_event_update(event);
> +	dwc_pcie_pmu_reset_time_based_counter(event);
> +	hrtimer_forward_now(hrtimer, ns_to_ktime(DWC_PCIE_PMU_TIMER_PERIOD_NS));

 From sashiko:

If the PCIe Root Port enters runtime suspend (such as D3hot or D3cold) while
this timer is active, will the hardware accesses in
dwc_pcie_pmu_event_update() cause problems?
Accessing the configuration space of a suspended device could return
corrupted data or trigger system errors like synchronous external aborts.

https://sashiko.dev/#/patchset/20260629092717.74946-1-yang.yicong%40picoheart.com

> +
> +	return HRTIMER_RESTART;
> +}
> +
>   static int dwc_pcie_pmu_event_init(struct perf_event *event)
>   {
>   	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> @@ -478,10 +522,15 @@ static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
>   	hwc->state = 0;
>   	local64_set(&hwc->prev_count, 0);
>   
> -	if (type == DWC_PCIE_LANE_EVENT)
> +	if (type == DWC_PCIE_LANE_EVENT) {
>   		dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, true);
> -	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
>   		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
> +		if (pcie_pmu->timer_enable)
> +			hrtimer_start(&pcie_pmu->hrtimer,
> +				      ns_to_ktime(DWC_PCIE_PMU_TIMER_PERIOD_NS),
> +				      HRTIMER_MODE_REL_PINNED_HARD);
> +	}
>   }
>   
>   static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
> @@ -495,10 +544,12 @@ static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
>   
>   	dwc_pcie_pmu_event_update(event);
>   
> -	if (type == DWC_PCIE_LANE_EVENT)
> +	if (type == DWC_PCIE_LANE_EVENT) {
>   		dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, false);
> -	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
>   		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
> +		hrtimer_cancel(&pcie_pmu->hrtimer);

Although calling hrtimer_cancel() on an inactive timer is a no-op, this
unconditional call is asymmetric with event_start, which only starts the
timer when timer_enable is true. Please guard it for consistency:

       if (pcie_pmu->timer_enable)
           hrtimer_cancel(&pcie_pmu->hrtimer);


Thanks.
Shuai

  parent reply	other threads:[~2026-06-30  8:50 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-29  9:27 [PATCH v2 0/3] New vendor support and optimizations for DWC PCIe PMU Yicong Yang
2026-06-29  9:27 ` [PATCH v2 1/3] perf/dwc_pcie: Add support for Picoheart vendor devices Yicong Yang
2026-06-29  9:40   ` sashiko-bot
2026-06-29 17:43   ` Bjorn Helgaas
2026-06-30  3:35   ` Shuai Xue
2026-06-29  9:27 ` [PATCH v2 2/3] perf/dwc_pcie: Support narrowed time-based counter for long time monitoring Yicong Yang
2026-06-29  9:42   ` sashiko-bot
2026-06-30  8:50   ` Shuai Xue [this message]
2026-06-29  9:27 ` [PATCH v2 3/3] perf/dwc_pcie: Convert to faux device interface Yicong Yang
2026-06-29  9:40   ` sashiko-bot
2026-06-30  9:19   ` Shuai Xue

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a1bbf438-c0d3-4467-9852-657fb93bc548@linux.alibaba.com \
    --to=xueshuai@linux.alibaba.com \
    --cc=bhelgaas@google.com \
    --cc=douyufan@picoheart.com \
    --cc=geshijian@picoheart.com \
    --cc=jic23@kernel.org \
    --cc=jingoohan1@gmail.com \
    --cc=juwenlong@picoheart.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=mani@kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=renyu.zj@linux.alibaba.com \
    --cc=will@kernel.org \
    --cc=yang.yicong@picoheart.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox