* [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
@ 2015-08-08 6:42 Kanaka Juvva
2015-08-16 17:42 ` Juvva, Kanaka D
2015-08-19 20:50 ` Thomas Gleixner
0 siblings, 2 replies; 12+ messages in thread
From: Kanaka Juvva @ 2015-08-08 6:42 UTC (permalink / raw)
To: kanaka.d.juvva, glenn.p.williamson, matt.fleming, will.auld, andi,
linux-kernel, tony.luck, peterz, tglx, tj, x86, mingo, hpa,
vikas.shivappa
This patch adds Memory Bandwidth Monitoring events to the exitsing
intel_cqm pmu in the Linux Kernel.
Intel MBM builds on Cache Monitoring Technology (CMT) infrastructure
to allow monitoring of bandwidth from one level of the cache hierarchy
to the next - in this case focusing on the L3 cache, which is typically
backed directly by system memory. As a result of this implementation,
memory bandwidth can be monitored.
MBM counters are available in Intel Xeon CPUs. The following events are
implemented in the kernel and expose two MBM counters via perf_event
interface:
- llc_local_bw: bandwidth consumption of all coress on socket
- llc_total_bw: bandwidth consumption of all cores on socket + bandwidth
for QPI accesses
At present, the MBM events are checked at one second interval provided
by the HRTIMER of the MBM event. MBM counters can overflow atmost once in
a second and thus must be read atleast once in a second. Overflow is
detected and handled. Cumulative average value is calculated for each
bandwidth type upon reading a new value from the MSR.
version2: merged MBM patches with tip
changes: 1. Incorporate HRTIMER API chanes of tip
2. added separate event MBM_LOCAL_EVENT codes for perf
and MSR reads
QOS_MBM_LOCAL_EVENT_ID_HW 0x3 used for MSR read
as per SDM
QOS_MBM_LOCAL_EVENT_ID (1 << 2) used by perf,
perf event codes are in powers of 2
version3: Improved readbility of code
Incorporated upstream comments for Patch v2
Signed-off-by: Kanaka Juvva <kanaka.d.juvva@linux.intel.com>
---
arch/x86/include/asm/cpufeature.h | 2 +
arch/x86/kernel/cpu/common.c | 4 +-
arch/x86/kernel/cpu/perf_event_intel_cqm.c | 702 +++++++++++++++++++++++++++--
3 files changed, 672 insertions(+), 36 deletions(-)
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3d6606f..c36abdf 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -251,6 +251,8 @@
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+#define X86_FEATURE_CQM_MBM_TOTAL (12*32+1) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (12*32+2) /* LLC Local MBM monitoring */
/*
* BUG word(s)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 922c5e0..7dcc88c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -643,7 +643,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[12] = edx;
- if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC))
+ || ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL))
+ || (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 63eb68b..d54543f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -12,9 +12,19 @@
#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
+#define MBM_CNTR_MAX 0xffffff
+#define MBM_SOCKET_MAX 8
+#define MBM_TIME_DELTA_MAX 1000
+#define MBM_TIME_DELTA_MIN 100
+#define MBM_SCALING_FACTOR2 1
+#define MBM_SCALING_FACTOR 1000
+#define MBM_FIFO_SIZE_MIN 10
+#define MBM_FIFO_SIZE_MAX 300
static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN;
+static bool cqm_llc_occ, cqm_mbm;
/**
* struct intel_pqr_state - State cache for the PQR MSR
@@ -42,6 +52,90 @@ struct intel_pqr_state {
* interrupts disabled, which is sufficient for the protection.
*/
static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu);
+static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
+
+/*
+ * mbm pmu is used for storing mbm_local and mbm_total
+ * events
+ */
+struct mbm_pmu {
+ /*
+ * # of active events
+ */
+ int n_active;
+
+ /*
+ * Linked list that holds the perf events
+ */
+ struct list_head active_list;
+
+ /*
+ * pmu to which event belongs
+ */
+ struct pmu *pmu;
+
+ /*
+ * time interval in ms between two consecutive samples
+ */
+ ktime_t timer_interval; /* in ktime_t unit */
+
+ /*
+ * periodic timer that triggers inte_cqm_mbm_update
+ */
+ struct hrtimer hrtimer;
+};
+
+/*
+ * struct sample defines mbm event
+ */
+struct sample {
+ /*
+ * current MSR value
+ */
+ u64 bytes;
+
+ /*
+ * running average of the bandwidth
+ */
+ u64 cum_avg;
+
+ /*
+ * time of the previous sample
+ */
+ ktime_t prev_time;
+
+ /*
+ * sample index
+ */
+ u64 index;
+
+ /*
+ * Sliding window to store previous 'n' bw values
+ */
+ u32 mbmfifo[MBM_FIFO_SIZE_MAX];
+
+ /*
+ * fifoin is the location at which current bw is saved
+ */
+ u32 fifoin;
+
+ /*
+ * fifoout is the start of the sliding window from which last 'n'
+ * bw values must be read
+ */
+ u32 fifoout; /* mbmfifo out counter for sliding window */
+};
+
+/*
+ * stats for total bw
+ */
+static struct sample *mbm_total; /* curent stats for mbm_total events */
+
+/*
+ * stats for local bw
+ */
+static struct sample *mbm_local; /* current stats for mbm_local events */
/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -66,6 +160,9 @@ static cpumask_t cqm_cpumask;
#define RMID_VAL_UNAVAIL (1ULL << 62)
#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
+#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
+#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
+#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
@@ -176,6 +273,25 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
return entry;
}
+/**
+ * __mbm_reset_stats - reset stats for a given rmid for the current cpu
+ * @rmid: rmid value
+ *
+ * vrmid: array index for current core for the given rmid
+ * mbs_total[] and mbm_local[] are linearly indexed by core * max #rmids per
+ * core
+ */
+static void __mbm_reset_stats(u32 rmid)
+{
+ u32 vrmid = topology_physical_package_id(smp_processor_id()) *
+ cqm_max_rmid + rmid;
+
+ if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
+ return;
+ memset(&mbm_local[vrmid], 0, sizeof(struct sample));
+ memset(&mbm_total[vrmid], 0, sizeof(struct sample));
+}
+
/*
* Returns < 0 on fail.
*
@@ -192,6 +308,7 @@ static u32 __get_rmid(void)
entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
list_del(&entry->list);
+ __mbm_reset_stats(entry->rmid);
return entry->rmid;
}
@@ -207,6 +324,7 @@ static void __put_rmid(u32 rmid)
entry->queue_time = jiffies;
entry->state = RMID_YOUNG;
+ __mbm_reset_stats(rmid);
list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}
@@ -232,6 +350,7 @@ static int intel_cqm_setup_rmid_cache(void)
INIT_LIST_HEAD(&entry->list);
entry->rmid = r;
+
cqm_rmid_ptrs[r] = entry;
list_add_tail(&entry->list, &cqm_rmid_free_lru);
@@ -494,6 +613,165 @@ static bool intel_cqm_sched_in_event(u32 rmid)
return false;
}
+
+static u32 bw_sum_calc(struct sample *bw_stat, int rmid)
+{
+ u32 val = 0, i, j, index;
+
+ if (++bw_stat->fifoout >= mbm_window_size)
+ bw_stat->fifoout = 0;
+ index = bw_stat->fifoout;
+ for (i = 0; i < mbm_window_size - 1; i++) {
+ if (index + i >= mbm_window_size)
+ j = index + i - mbm_window_size;
+ else
+ j = index + i;
+ val += bw_stat->mbmfifo[j];
+ }
+ return val;
+}
+
+static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw)
+{
+ if (is_localbw)
+ return bw_sum_calc(&mbm_local[rmid], rmid);
+ else
+ return bw_sum_calc(&mbm_total[rmid], rmid);
+}
+
+static void __mbm_fifo_in(struct sample *bw_stat, u32 val)
+{
+ bw_stat->mbmfifo[bw_stat->fifoin] = val;
+ if (++bw_stat->fifoin >= mbm_window_size)
+ bw_stat->fifoin = 0;
+}
+
+static void mbm_fifo_in(int rmid, u32 val, bool is_localbw)
+{
+ if (is_localbw)
+ __mbm_fifo_in(&mbm_local[rmid], val);
+ else
+ __mbm_fifo_in(&mbm_total[rmid], val);
+}
+
+/*
+ * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and reads
+ * its MSR counter. Check whether overflow occurred and handles it. Calculates
+ * currenet BW and updates running average.
+ *
+ * Overflow Handling:
+ * if (MSR current value < MSR previous value) it is an
+ * overflow. and overflow is handled.
+ *
+ * Calculation of Current BW value:
+ * If MSR is read within last 100ms, then the value is ignored;
+ * this will suppress small deltas. We don't process MBM samples that are
+ * within 100ms.
+ *
+ * Bandwidth is calculated as:
+ * bandwidth = difference of last two msr counter values/time difference.
+ *
+ * cum_avg = Running Average bandwidth of last 'n' samples that are processed
+ *
+ * Sliding window is used to save the last 'n' samples. Hence,
+ * n = sliding_window_size
+ *
+ *Scaling:
+ * cum_avg is scaled down by a factor MBM_SCALING_FACTOR2 and rounded to
+ * nearest integer. User interface reads the BW in KB/sec or MB/sec.
+ */
+static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
+{
+ u64 val, tmp, diff_time, cma, bytes, index;
+ bool overflow = false, first = false;
+ ktime_t cur_time;
+ u32 tmp32 = rmid;
+ struct sample *mbm_current;
+ u32 vrmid = topology_physical_package_id(smp_processor_id()) *
+ cqm_max_rmid + rmid;
+
+ rmid = vrmid;
+ cur_time = ktime_get();
+ if (read_mbm_local) {
+ cma = mbm_local[rmid].cum_avg;
+ diff_time = ktime_ms_delta(cur_time,
+ mbm_local[rmid].prev_time);
+ if (diff_time < 100)
+ return cma;
+ mbm_local[rmid].prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+ bytes = mbm_local[rmid].bytes;
+ index = mbm_local[rmid].index;
+ mbm_current = &mbm_local[rmid];
+ rmid = tmp32;
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID_HW, rmid);
+ } else {
+ cma = mbm_total[rmid].cum_avg;
+ diff_time = ktime_ms_delta(cur_time,
+ mbm_total[rmid].prev_time);
+ if (diff_time < 100)
+ return cma;
+ mbm_total[rmid].prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+ bytes = mbm_total[rmid].bytes;
+ index = mbm_total[rmid].index;
+ mbm_current = &mbm_total[rmid];
+ rmid = tmp32;
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID, rmid);
+ }
+ rdmsrl(MSR_IA32_QM_CTR, val);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return val;
+
+ tmp = val;
+ /* if current msr value < previous msr value , it means overflow */
+ if (val < bytes) {
+ val = MBM_CNTR_MAX - bytes + val;
+ overflow = true;
+ } else
+ val = val - bytes;
+
+ val = (val * MBM_TIME_DELTA_MAX) / diff_time;
+
+ if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
+ /* First sample */
+ first = true;
+
+ rmid = vrmid;
+ if ((diff_time <= (MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN)) ||
+ overflow || first) {
+ int bw, ret;
+
+ if (index & (index < mbm_window_size))
+ val = cma * MBM_SCALING_FACTOR2 + val / index -
+ cma / index;
+ val = DIV_ROUND_UP(val, MBM_SCALING_FACTOR2);
+ if (index >= mbm_window_size) {
+ /*
+ * Compute the sum of recent n-1 samples and slide the
+ * window by 1
+ */
+ ret = __mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
+ /* recalculate the running average with current bw */
+ ret = (ret + val) / mbm_window_size;
+ bw = val;
+ val = ret;
+ } else
+ bw = val;
+ /* save the recent bw in fifo */
+ mbm_fifo_in(rmid, bw, read_mbm_local);
+ /* save the current sample */
+ mbm_current->index++;
+ mbm_current->cum_avg = val;
+ mbm_current->bytes = tmp;
+ mbm_current->prev_time = ktime_set(0,
+ (unsigned long)ktime_to_ns(cur_time));
+
+ return val;
+ }
+ return cma;
+}
+
/*
* Initially use this constant for both the limbo queue time and the
* rotation timer interval, pmu::hrtimer_interval_ms.
@@ -875,6 +1153,40 @@ static void intel_cqm_setup_event(struct perf_event *event,
event->hw.cqm_rmid = rmid;
}
+static void intel_cqm_event_update(struct perf_event *event)
+{
+ unsigned int rmid;
+ u64 val = 0;
+
+ /*
+ * Task events are handled by intel_cqm_event_count().
+ */
+ if (event->cpu == -1)
+ return;
+
+ rmid = event->hw.cqm_rmid;
+ if (!__rmid_valid(rmid))
+ return;
+
+ switch (event->attr.config) {
+ case QOS_MBM_TOTAL_EVENT_ID:
+ val = __rmid_read_mbm(rmid, false);
+ break;
+ case QOS_MBM_LOCAL_EVENT_ID:
+ val = __rmid_read_mbm(rmid, true);
+ break;
+ default:
+ return;
+ }
+ /*
+ * Ignore this reading on error states and do not update the value.
+ */
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ local64_set(&event->count, val);
+}
+
static void intel_cqm_event_read(struct perf_event *event)
{
unsigned long flags;
@@ -887,6 +1199,13 @@ static void intel_cqm_event_read(struct perf_event *event)
if (event->cpu == -1)
return;
+ if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
+ intel_cqm_event_update(event);
+
+ if (!(event->attr.config & QOS_L3_OCCUP_EVENT_ID))
+ return;
+
raw_spin_lock_irqsave(&cache_lock, flags);
rmid = event->hw.cqm_rmid;
@@ -906,6 +1225,28 @@ out:
raw_spin_unlock_irqrestore(&cache_lock, flags);
}
+static void __intel_cqm_event_total_bw_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read_mbm(rr->rmid, false);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
+static void __intel_cqm_event_local_bw_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read_mbm(rr->rmid, true);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
static void __intel_cqm_event_count(void *info)
{
struct rmid_read *rr = info;
@@ -975,7 +1316,21 @@ static u64 intel_cqm_event_count(struct perf_event *event)
if (!__rmid_valid(rr.rmid))
goto out;
- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ switch (event->attr.config) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ break;
+ case QOS_MBM_TOTAL_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_total_bw_count,
+ &rr, 1);
+ break;
+ case QOS_MBM_LOCAL_EVENT_ID:
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_local_bw_count,
+ &rr, 1);
+ break;
+ default:
+ goto out;
+ }
raw_spin_lock_irqsave(&cache_lock, flags);
if (event->hw.cqm_rmid == rr.rmid)
@@ -985,6 +1340,39 @@ out:
return __perf_event_count(event);
}
+static void mbm_start_hrtimer(struct mbm_pmu *pmu)
+{
+ hrtimer_start_range_ns(&(pmu->hrtimer),
+ pmu->timer_interval, 0,
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void mbm_stop_hrtimer(struct mbm_pmu *pmu)
+{
+ hrtimer_cancel(&pmu->hrtimer);
+}
+
+static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
+ struct perf_event *event;
+
+ if (!pmu->n_active)
+ return HRTIMER_NORESTART;
+ list_for_each_entry(event, &pmu->active_list, active_entry)
+ intel_cqm_event_update(event);
+ hrtimer_forward_now(hrtimer, pmu->timer_interval);
+ return HRTIMER_RESTART;
+}
+
+static void mbm_hrtimer_init(struct mbm_pmu *pmu)
+{
+ struct hrtimer *hr = &pmu->hrtimer;
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = mbm_hrtimer_handle;
+}
+
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
@@ -1003,19 +1391,45 @@ static void intel_cqm_event_start(struct perf_event *event, int mode)
}
state->rmid = rmid;
+ if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
+ struct cqm_rmid_entry *entry;
+
+ entry = __rmid_entry(rmid);
+ }
wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
+
+ if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) && (cqm_mbm)) {
+ int cpu = smp_processor_id();
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
+
+ if (pmu) {
+ pmu->n_active++;
+ list_add_tail(&event->active_entry,
+ &pmu->active_list);
+ if (pmu->n_active == 1)
+ mbm_start_hrtimer(pmu);
+ intel_cqm_event_update(event);
+ }
+ }
}
static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
if (event->hw.cqm_state & PERF_HES_STOPPED)
return;
event->hw.cqm_state |= PERF_HES_STOPPED;
- intel_cqm_event_read(event);
+ if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
+ intel_cqm_event_read(event);
+
+ if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
+ (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
+ intel_cqm_event_update(event);
if (!--state->rmid_usecnt) {
state->rmid = 0;
@@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
} else {
WARN_ON_ONCE(!state->rmid);
}
+
+ if (pmu) {
+ if (pmu->n_active > 0) {
+ pmu->n_active--;
+ if (pmu->n_active == 0)
+ mbm_stop_hrtimer(pmu);
+ if (!list_empty(&event->active_entry))
+ list_del(&event->active_entry);
+ }
+ }
+
}
static int intel_cqm_event_add(struct perf_event *event, int mode)
@@ -1090,7 +1515,9 @@ static int intel_cqm_event_init(struct perf_event *event)
if (event->attr.type != intel_cqm_pmu.type)
return -ENOENT;
- if (event->attr.config & ~QOS_EVENT_MASK)
+ if (((event->attr.config & ~QOS_EVENT_MASK) &&
+ (event->attr.config & ~QOS_MBM_TOTAL_EVENT_ID)) &&
+ (event->attr.config & ~QOS_MBM_LOCAL_EVENT_ID))
return -EINVAL;
/* unsupported modes and filters */
@@ -1145,18 +1572,68 @@ EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
-static struct attribute *intel_cqm_events_attr[] = {
+EVENT_ATTR_STR(llc_total_bw, intel_cqm_llc_total_bw, "event=0x02");
+EVENT_ATTR_STR(llc_total_bw.per-pkg, intel_cqm_llc_total_bw_pkg, "1");
+#if MBM_SCALING_FACTOR2 == 1000
+EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit, "MB/sec");
+EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit, "MB/sec");
+#else
+EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit, "KB/sec");
+EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit, "KB/sec");
+#endif
+EVENT_ATTR_STR(llc_total_bw.scale, intel_cqm_llc_total_bw_scale, NULL);
+EVENT_ATTR_STR(llc_total_bw.snapshot, intel_cqm_llc_total_bw_snapshot, "1");
+
+EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x04");
+EVENT_ATTR_STR(llc_local_bw.per-pkg, intel_cqm_llc_local_bw_pkg, "1");
+EVENT_ATTR_STR(llc_local_bw.scale, intel_cqm_llc_local_bw_scale, NULL);
+EVENT_ATTR_STR(llc_local_bw.snapshot, intel_cqm_llc_local_bw_snapshot, "1");
+
+static struct attribute *intel_cmt_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
+static struct attribute *intel_mbm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc_total_bw),
+ EVENT_PTR(intel_cqm_llc_local_bw),
+ EVENT_PTR(intel_cqm_llc_total_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_local_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_total_bw_unit),
+ EVENT_PTR(intel_cqm_llc_local_bw_unit),
+ EVENT_PTR(intel_cqm_llc_total_bw_scale),
+ EVENT_PTR(intel_cqm_llc_local_bw_scale),
+ EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
+ EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
+ NULL,
+};
+
+static struct attribute *intel_cmt_mbm_events_attr[] = {
EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_total_bw),
+ EVENT_PTR(intel_cqm_llc_local_bw),
EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_total_bw_pkg),
+ EVENT_PTR(intel_cqm_llc_local_bw_pkg),
EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_total_bw_unit),
+ EVENT_PTR(intel_cqm_llc_local_bw_unit),
EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_total_bw_scale),
+ EVENT_PTR(intel_cqm_llc_local_bw_scale),
EVENT_PTR(intel_cqm_llc_snapshot),
+ EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
+ EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
NULL,
};
static struct attribute_group intel_cqm_events_group = {
.name = "events",
- .attrs = intel_cqm_events_attr,
+ .attrs = NULL,
};
PMU_FORMAT_ATTR(event, "config:0-7");
@@ -1184,6 +1661,19 @@ max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
}
static ssize_t
+sliding_window_size_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ ssize_t rv;
+
+ mutex_lock(&cache_mutex);
+ rv = snprintf(page, PAGE_SIZE-1, "%u\n", mbm_window_size);
+ mutex_unlock(&cache_mutex);
+
+ return rv;
+}
+
+static ssize_t
max_recycle_threshold_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -1211,10 +1701,32 @@ max_recycle_threshold_store(struct device *dev,
return count;
}
+static ssize_t
+sliding_window_size_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int bytes;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cache_mutex);
+ if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
+ mbm_window_size = bytes;
+ mutex_unlock(&cache_mutex);
+
+ return count;
+}
+
static DEVICE_ATTR_RW(max_recycle_threshold);
+static DEVICE_ATTR_RW(sliding_window_size);
static struct attribute *intel_cqm_attrs[] = {
&dev_attr_max_recycle_threshold.attr,
+ &dev_attr_sliding_window_size.attr,
NULL,
};
@@ -1255,10 +1767,11 @@ static inline void cqm_pick_event_reader(int cpu)
cpumask_set_cpu(cpu, &cqm_cpumask);
}
-static void intel_cqm_cpu_prepare(unsigned int cpu)
+static int intel_cqm_cpu_prepare(unsigned int cpu)
{
struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
state->rmid = 0;
state->closid = 0;
@@ -1266,12 +1779,27 @@ static void intel_cqm_cpu_prepare(unsigned int cpu)
WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+
+ if ((!pmu) && (cqm_mbm)) {
+ pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL, NUMA_NO_NODE);
+ if (!pmu)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&pmu->active_list);
+ pmu->pmu = &intel_cqm_pmu;
+ pmu->n_active = 0;
+ pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
+ per_cpu(mbm_pmu, cpu) = pmu;
+ per_cpu(mbm_pmu_to_free, cpu) = NULL;
+ mbm_hrtimer_init(pmu);
+ }
+ return 0;
}
static void intel_cqm_cpu_exit(unsigned int cpu)
{
int phys_id = topology_physical_package_id(cpu);
int i;
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
/*
* Is @cpu a designated cqm reader?
@@ -1288,6 +1816,36 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
break;
}
}
+
+ /* cancel overflow polling timer for CPU */
+ if (pmu)
+ mbm_stop_hrtimer(pmu);
+ kfree(mbm_local);
+ kfree(mbm_total);
+
+}
+
+static void mbm_cpu_kfree(int cpu)
+{
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu_to_free, cpu);
+
+ kfree(pmu);
+
+ per_cpu(mbm_pmu_to_free, cpu) = NULL;
+}
+
+static int mbm_cpu_dying(int cpu)
+{
+ struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
+
+ if (!pmu)
+ return 0;
+
+ per_cpu(mbm_pmu, cpu) = NULL;
+
+ per_cpu(mbm_pmu_to_free, cpu) = pmu;
+
+ return 0;
}
static int intel_cqm_cpu_notifier(struct notifier_block *nb,
@@ -1297,7 +1855,7 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- intel_cqm_cpu_prepare(cpu);
+ return intel_cqm_cpu_prepare(cpu);
break;
case CPU_DOWN_PREPARE:
intel_cqm_cpu_exit(cpu);
@@ -1305,6 +1863,14 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
case CPU_STARTING:
cqm_pick_event_reader(cpu);
break;
+ case CPU_UP_CANCELED:
+ case CPU_DYING:
+ mbm_cpu_dying(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ mbm_cpu_kfree(cpu);
+ break;
}
return NOTIFY_OK;
@@ -1315,15 +1881,74 @@ static const struct x86_cpu_id intel_cqm_match[] = {
{}
};
+static const struct x86_cpu_id intel_mbm_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
+ {}
+};
+
static int __init intel_cqm_init(void)
{
char *str, scale[20];
- int i, cpu, ret;
+ int i, cpu, ret, array_size;
- if (!x86_match_cpu(intel_cqm_match))
+ if (!x86_match_cpu(intel_cqm_match) &&
+ (!x86_match_cpu(intel_mbm_match)))
return -ENODEV;
cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+ cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
+
+ if (x86_match_cpu(intel_cqm_match)) {
+ cqm_llc_occ = true;
+ intel_cqm_events_group.attrs = intel_cmt_events_attr;
+ }
+
+ if (x86_match_cpu(intel_mbm_match)) {
+ u32 mbm_scale_rounded = 0;
+
+ cqm_mbm = true;
+ cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+ /*
+ * MBM counter values are in bytes. To conver this to KB/sec
+ * or MB/sec, we scale the MBM scale factor by 1000. Another
+ * MBM_SCALING_FACTOR2 factor scaling down is done
+ * after reading the counter val i.e. in the function
+ * __rmid_read_mbm()
+ */
+ mbm_scale_rounded = DIV_ROUND_UP(cqm_l3_scale,
+ MBM_SCALING_FACTOR);
+ cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
+ snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (cqm_llc_occ)
+ intel_cqm_events_group.attrs =
+ intel_cmt_mbm_events_attr;
+ else
+ intel_cqm_events_group.attrs = intel_mbm_events_attr;
+
+ event_attr_intel_cqm_llc_local_bw_scale.event_str = str;
+ event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
+
+ array_size = (cqm_max_rmid + 1) * MBM_SOCKET_MAX;
+ mbm_local = kzalloc_node(sizeof(struct sample) * array_size,
+ GFP_KERNEL, NUMA_NO_NODE);
+ if (!mbm_local) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ mbm_total = kzalloc_node(sizeof(struct sample) * array_size,
+ GFP_KERNEL, NUMA_NO_NODE);
+ if (!mbm_total) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ } else
+ cqm_mbm = false;
/*
* It's possible that not all resources support the same number
@@ -1336,44 +1961,48 @@ static int __init intel_cqm_init(void)
*/
cpu_notifier_register_begin();
- for_each_online_cpu(cpu) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
+ if (cqm_llc_occ) {
+ for_each_online_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
- if (c->x86_cache_max_rmid < cqm_max_rmid)
- cqm_max_rmid = c->x86_cache_max_rmid;
+ if (c->x86_cache_max_rmid < cqm_max_rmid)
+ cqm_max_rmid = c->x86_cache_max_rmid;
- if (c->x86_cache_occ_scale != cqm_l3_scale) {
- pr_err("Multiple LLC scale values, disabling\n");
- ret = -EINVAL;
- goto out;
+ if (c->x86_cache_occ_scale != cqm_l3_scale) {
+ pr_err("Multiple LLC scale values, disabling\n");
+ ret = -EINVAL;
+ goto out;
+ }
}
- }
- /*
- * A reasonable upper limit on the max threshold is the number
- * of lines tagged per RMID if all RMIDs have the same number of
- * lines tagged in the LLC.
- *
- * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
- */
- __intel_cqm_max_threshold =
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ __intel_cqm_max_threshold =
boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
- snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
- str = kstrdup(scale, GFP_KERNEL);
- if (!str) {
- ret = -ENOMEM;
- goto out;
- }
+ snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }
- event_attr_intel_cqm_llc_scale.event_str = str;
+ event_attr_intel_cqm_llc_scale.event_str = str;
+ }
ret = intel_cqm_setup_rmid_cache();
if (ret)
goto out;
for_each_online_cpu(i) {
- intel_cqm_cpu_prepare(i);
+ ret = intel_cqm_cpu_prepare(i);
+ if (ret)
+ goto out_free;
cqm_pick_event_reader(i);
}
@@ -1384,7 +2013,10 @@ static int __init intel_cqm_init(void)
pr_err("Intel CQM perf registration failed: %d\n", ret);
else
pr_info("Intel CQM monitoring enabled\n");
-
+ goto out;
+out_free:
+ kfree(mbm_local);
+ kfree(mbm_total);
out:
cpu_notifier_register_done();
--
2.1.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* RE: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-08-08 6:42 [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU Kanaka Juvva
@ 2015-08-16 17:42 ` Juvva, Kanaka D
2015-08-19 20:50 ` Thomas Gleixner
1 sibling, 0 replies; 12+ messages in thread
From: Juvva, Kanaka D @ 2015-08-16 17:42 UTC (permalink / raw)
To: Kanaka Juvva, Williamson, Glenn P, Fleming, Matt, Auld, Will,
andi@firstfloor.org, linux-kernel@vger.kernel.org, Luck, Tony,
peterz@infradead.org, tglx@linutronix.de, tj@kernel.org,
x86@kernel.org, mingo@redhat.com, hpa@zytor.com, Shivappa, Vikas
Hi Thomas and Peter,
I just want to follow up on this patch submission. Any thoughts. I guess this is in right time
for 4.3. Could you respond when you have some time.
Regards,
-Kanaka
> -----Original Message-----
> From: Kanaka Juvva [mailto:kanaka.d.juvva@linux.intel.com]
> Sent: Friday, August 7, 2015 11:42 PM
> To: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will;
> andi@firstfloor.org; linux-kernel@vger.kernel.org; Luck, Tony;
> peterz@infradead.org; tglx@linutronix.de; tj@kernel.org; x86@kernel.org;
> mingo@redhat.com; hpa@zytor.com; Shivappa, Vikas
> Subject: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring
> (MBM) PMU
>
> This patch adds Memory Bandwidth Monitoring events to the exitsing intel_cqm
> pmu in the Linux Kernel.
>
> Intel MBM builds on Cache Monitoring Technology (CMT) infrastructure to allow
> monitoring of bandwidth from one level of the cache hierarchy to the next - in
> this case focusing on the L3 cache, which is typically backed directly by system
> memory. As a result of this implementation, memory bandwidth can be
> monitored.
>
> MBM counters are available in Intel Xeon CPUs. The following events are
> implemented in the kernel and expose two MBM counters via perf_event
> interface:
>
> - llc_local_bw: bandwidth consumption of all coress on socket
> - llc_total_bw: bandwidth consumption of all cores on socket + bandwidth
> for QPI accesses
>
> At present, the MBM events are checked at one second interval provided by the
> HRTIMER of the MBM event. MBM counters can overflow atmost once in a
> second and thus must be read atleast once in a second. Overflow is detected
> and handled. Cumulative average value is calculated for each bandwidth type
> upon reading a new value from the MSR.
>
> version2: merged MBM patches with tip
> changes: 1. Incorporate HRTIMER API chanes of tip
> 2. added separate event MBM_LOCAL_EVENT codes for perf
> and MSR reads
> QOS_MBM_LOCAL_EVENT_ID_HW 0x3 used for MSR read
> as per SDM
> QOS_MBM_LOCAL_EVENT_ID (1 << 2) used by perf,
> perf event codes are in powers of 2
>
> version3: Improved readbility of code
> Incorporated upstream comments for Patch v2
>
> Signed-off-by: Kanaka Juvva <kanaka.d.juvva@linux.intel.com>
> ---
> arch/x86/include/asm/cpufeature.h | 2 +
> arch/x86/kernel/cpu/common.c | 4 +-
> arch/x86/kernel/cpu/perf_event_intel_cqm.c | 702
> +++++++++++++++++++++++++++--
> 3 files changed, 672 insertions(+), 36 deletions(-)
>
> diff --git a/arch/x86/include/asm/cpufeature.h
> b/arch/x86/include/asm/cpufeature.h
> index 3d6606f..c36abdf 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -251,6 +251,8 @@
>
> /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
> #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy
> monitoring if 1 */
> +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+1) /* LLC Total MBM
> monitoring
> +*/ #define X86_FEATURE_CQM_MBM_LOCAL (12*32+2) /* LLC Local MBM
> +monitoring */
>
> /*
> * BUG word(s)
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 922c5e0..7dcc88c 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -643,7 +643,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
> /* QoS sub-leaf, EAX=0Fh, ECX=1 */
> cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
> c->x86_capability[12] = edx;
> - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
> + if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC))
> + || ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL))
> + || (cpu_has(c,
> X86_FEATURE_CQM_MBM_LOCAL)))) {
> c->x86_cache_max_rmid = ecx;
> c->x86_cache_occ_scale = ebx;
> }
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> index 63eb68b..d54543f 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> @@ -12,9 +12,19 @@
> #define MSR_IA32_PQR_ASSOC 0x0c8f
> #define MSR_IA32_QM_CTR 0x0c8e
> #define MSR_IA32_QM_EVTSEL 0x0c8d
> +#define MBM_CNTR_MAX 0xffffff
> +#define MBM_SOCKET_MAX 8
> +#define MBM_TIME_DELTA_MAX 1000
> +#define MBM_TIME_DELTA_MIN 100
> +#define MBM_SCALING_FACTOR2 1
> +#define MBM_SCALING_FACTOR 1000
> +#define MBM_FIFO_SIZE_MIN 10
> +#define MBM_FIFO_SIZE_MAX 300
>
> static u32 cqm_max_rmid = -1;
> static unsigned int cqm_l3_scale; /* supposedly cacheline size */
> +static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN; static bool
> +cqm_llc_occ, cqm_mbm;
>
> /**
> * struct intel_pqr_state - State cache for the PQR MSR @@ -42,6 +52,90 @@
> struct intel_pqr_state {
> * interrupts disabled, which is sufficient for the protection.
> */
> static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
> +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu); static
> +DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
> +
> +/*
> + * mbm pmu is used for storing mbm_local and mbm_total
> + * events
> + */
> +struct mbm_pmu {
> + /*
> + * # of active events
> + */
> + int n_active;
> +
> + /*
> + * Linked list that holds the perf events
> + */
> + struct list_head active_list;
> +
> + /*
> + * pmu to which event belongs
> + */
> + struct pmu *pmu;
> +
> + /*
> + * time interval in ms between two consecutive samples
> + */
> + ktime_t timer_interval; /* in ktime_t unit */
> +
> + /*
> + * periodic timer that triggers inte_cqm_mbm_update
> + */
> + struct hrtimer hrtimer;
> +};
> +
> +/*
> + * struct sample defines mbm event
> + */
> +struct sample {
> + /*
> + * current MSR value
> + */
> + u64 bytes;
> +
> + /*
> + * running average of the bandwidth
> + */
> + u64 cum_avg;
> +
> + /*
> + * time of the previous sample
> + */
> + ktime_t prev_time;
> +
> + /*
> + * sample index
> + */
> + u64 index;
> +
> + /*
> + * Sliding window to store previous 'n' bw values
> + */
> + u32 mbmfifo[MBM_FIFO_SIZE_MAX];
> +
> + /*
> + * fifoin is the location at which current bw is saved
> + */
> + u32 fifoin;
> +
> + /*
> + * fifoout is the start of the sliding window from which last 'n'
> + * bw values must be read
> + */
> + u32 fifoout; /* mbmfifo out counter for sliding window */ };
> +
> +/*
> + * stats for total bw
> + */
> +static struct sample *mbm_total; /* curent stats for mbm_total events
> +*/
> +
> +/*
> + * stats for local bw
> + */
> +static struct sample *mbm_local; /* current stats for mbm_local events
> +*/
>
> /*
> * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
> @@ -66,6 +160,9 @@ static cpumask_t cqm_cpumask;
> #define RMID_VAL_UNAVAIL (1ULL << 62)
>
> #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> +#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
>
> #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
>
> @@ -176,6 +273,25 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32
> rmid)
> return entry;
> }
>
> +/**
> + * __mbm_reset_stats - reset stats for a given rmid for the current cpu
> + * @rmid: rmid value
> + *
> + * vrmid: array index for current core for the given rmid
> + * mbs_total[] and mbm_local[] are linearly indexed by core * max
> +#rmids per
> + * core
> + */
> +static void __mbm_reset_stats(u32 rmid) {
> + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> + cqm_max_rmid + rmid;
> +
> + if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
> + return;
> + memset(&mbm_local[vrmid], 0, sizeof(struct sample));
> + memset(&mbm_total[vrmid], 0, sizeof(struct sample)); }
> +
> /*
> * Returns < 0 on fail.
> *
> @@ -192,6 +308,7 @@ static u32 __get_rmid(void)
>
> entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry,
> list);
> list_del(&entry->list);
> + __mbm_reset_stats(entry->rmid);
>
> return entry->rmid;
> }
> @@ -207,6 +324,7 @@ static void __put_rmid(u32 rmid)
>
> entry->queue_time = jiffies;
> entry->state = RMID_YOUNG;
> + __mbm_reset_stats(rmid);
>
> list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } @@ -232,6 +350,7
> @@ static int intel_cqm_setup_rmid_cache(void)
>
> INIT_LIST_HEAD(&entry->list);
> entry->rmid = r;
> +
> cqm_rmid_ptrs[r] = entry;
>
> list_add_tail(&entry->list, &cqm_rmid_free_lru); @@ -494,6
> +613,165 @@ static bool intel_cqm_sched_in_event(u32 rmid)
> return false;
> }
>
> +
> +static u32 bw_sum_calc(struct sample *bw_stat, int rmid) {
> + u32 val = 0, i, j, index;
> +
> + if (++bw_stat->fifoout >= mbm_window_size)
> + bw_stat->fifoout = 0;
> + index = bw_stat->fifoout;
> + for (i = 0; i < mbm_window_size - 1; i++) {
> + if (index + i >= mbm_window_size)
> + j = index + i - mbm_window_size;
> + else
> + j = index + i;
> + val += bw_stat->mbmfifo[j];
> + }
> + return val;
> +}
> +
> +static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw) {
> + if (is_localbw)
> + return bw_sum_calc(&mbm_local[rmid], rmid);
> + else
> + return bw_sum_calc(&mbm_total[rmid], rmid); }
> +
> +static void __mbm_fifo_in(struct sample *bw_stat, u32 val) {
> + bw_stat->mbmfifo[bw_stat->fifoin] = val;
> + if (++bw_stat->fifoin >= mbm_window_size)
> + bw_stat->fifoin = 0;
> +}
> +
> +static void mbm_fifo_in(int rmid, u32 val, bool is_localbw) {
> + if (is_localbw)
> + __mbm_fifo_in(&mbm_local[rmid], val);
> + else
> + __mbm_fifo_in(&mbm_total[rmid], val); }
> +
> +/*
> + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and
> +reads
> + * its MSR counter. Check whether overflow occurred and handles it.
> +Calculates
> + * currenet BW and updates running average.
> + *
> + * Overflow Handling:
> + * if (MSR current value < MSR previous value) it is an
> + * overflow. and overflow is handled.
> + *
> + * Calculation of Current BW value:
> + * If MSR is read within last 100ms, then the value is ignored;
> + * this will suppress small deltas. We don't process MBM samples that
> +are
> + * within 100ms.
> + *
> + * Bandwidth is calculated as:
> + * bandwidth = difference of last two msr counter values/time difference.
> + *
> + * cum_avg = Running Average bandwidth of last 'n' samples that are
> +processed
> + *
> + * Sliding window is used to save the last 'n' samples. Hence,
> + * n = sliding_window_size
> + *
> + *Scaling:
> + * cum_avg is scaled down by a factor MBM_SCALING_FACTOR2 and rounded
> +to
> + * nearest integer. User interface reads the BW in KB/sec or MB/sec.
> + */
> +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local) {
> + u64 val, tmp, diff_time, cma, bytes, index;
> + bool overflow = false, first = false;
> + ktime_t cur_time;
> + u32 tmp32 = rmid;
> + struct sample *mbm_current;
> + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> + cqm_max_rmid + rmid;
> +
> + rmid = vrmid;
> + cur_time = ktime_get();
> + if (read_mbm_local) {
> + cma = mbm_local[rmid].cum_avg;
> + diff_time = ktime_ms_delta(cur_time,
> + mbm_local[rmid].prev_time);
> + if (diff_time < 100)
> + return cma;
> + mbm_local[rmid].prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> + bytes = mbm_local[rmid].bytes;
> + index = mbm_local[rmid].index;
> + mbm_current = &mbm_local[rmid];
> + rmid = tmp32;
> + wrmsr(MSR_IA32_QM_EVTSEL,
> QOS_MBM_LOCAL_EVENT_ID_HW, rmid);
> + } else {
> + cma = mbm_total[rmid].cum_avg;
> + diff_time = ktime_ms_delta(cur_time,
> + mbm_total[rmid].prev_time);
> + if (diff_time < 100)
> + return cma;
> + mbm_total[rmid].prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> + bytes = mbm_total[rmid].bytes;
> + index = mbm_total[rmid].index;
> + mbm_current = &mbm_total[rmid];
> + rmid = tmp32;
> + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID,
> rmid);
> + }
> + rdmsrl(MSR_IA32_QM_CTR, val);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return val;
> +
> + tmp = val;
> + /* if current msr value < previous msr value , it means overflow */
> + if (val < bytes) {
> + val = MBM_CNTR_MAX - bytes + val;
> + overflow = true;
> + } else
> + val = val - bytes;
> +
> + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
> +
> + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> + /* First sample */
> + first = true;
> +
> + rmid = vrmid;
> + if ((diff_time <= (MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN))
> ||
> + overflow || first) {
> + int bw, ret;
> +
> + if (index & (index < mbm_window_size))
> + val = cma * MBM_SCALING_FACTOR2 + val / index -
> + cma / index;
> + val = DIV_ROUND_UP(val, MBM_SCALING_FACTOR2);
> + if (index >= mbm_window_size) {
> + /*
> + * Compute the sum of recent n-1 samples and slide the
> + * window by 1
> + */
> + ret = __mbm_fifo_sum_lastn_out(rmid,
> read_mbm_local);
> + /* recalculate the running average with current bw */
> + ret = (ret + val) / mbm_window_size;
> + bw = val;
> + val = ret;
> + } else
> + bw = val;
> + /* save the recent bw in fifo */
> + mbm_fifo_in(rmid, bw, read_mbm_local);
> + /* save the current sample */
> + mbm_current->index++;
> + mbm_current->cum_avg = val;
> + mbm_current->bytes = tmp;
> + mbm_current->prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> +
> + return val;
> + }
> + return cma;
> +}
> +
> /*
> * Initially use this constant for both the limbo queue time and the
> * rotation timer interval, pmu::hrtimer_interval_ms.
> @@ -875,6 +1153,40 @@ static void intel_cqm_setup_event(struct perf_event
> *event,
> event->hw.cqm_rmid = rmid;
> }
>
> +static void intel_cqm_event_update(struct perf_event *event) {
> + unsigned int rmid;
> + u64 val = 0;
> +
> + /*
> + * Task events are handled by intel_cqm_event_count().
> + */
> + if (event->cpu == -1)
> + return;
> +
> + rmid = event->hw.cqm_rmid;
> + if (!__rmid_valid(rmid))
> + return;
> +
> + switch (event->attr.config) {
> + case QOS_MBM_TOTAL_EVENT_ID:
> + val = __rmid_read_mbm(rmid, false);
> + break;
> + case QOS_MBM_LOCAL_EVENT_ID:
> + val = __rmid_read_mbm(rmid, true);
> + break;
> + default:
> + return;
> + }
> + /*
> + * Ignore this reading on error states and do not update the value.
> + */
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return;
> +
> + local64_set(&event->count, val);
> +}
> +
> static void intel_cqm_event_read(struct perf_event *event) {
> unsigned long flags;
> @@ -887,6 +1199,13 @@ static void intel_cqm_event_read(struct perf_event
> *event)
> if (event->cpu == -1)
> return;
>
> + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> + intel_cqm_event_update(event);
> +
> + if (!(event->attr.config & QOS_L3_OCCUP_EVENT_ID))
> + return;
> +
> raw_spin_lock_irqsave(&cache_lock, flags);
> rmid = event->hw.cqm_rmid;
>
> @@ -906,6 +1225,28 @@ out:
> raw_spin_unlock_irqrestore(&cache_lock, flags); }
>
> +static void __intel_cqm_event_total_bw_count(void *info) {
> + struct rmid_read *rr = info;
> + u64 val;
> +
> + val = __rmid_read_mbm(rr->rmid, false);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return;
> + atomic64_add(val, &rr->value);
> +}
> +
> +static void __intel_cqm_event_local_bw_count(void *info) {
> + struct rmid_read *rr = info;
> + u64 val;
> +
> + val = __rmid_read_mbm(rr->rmid, true);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return;
> + atomic64_add(val, &rr->value);
> +}
> +
> static void __intel_cqm_event_count(void *info) {
> struct rmid_read *rr = info;
> @@ -975,7 +1316,21 @@ static u64 intel_cqm_event_count(struct perf_event
> *event)
> if (!__rmid_valid(rr.rmid))
> goto out;
>
> - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> + switch (event->attr.config) {
> + case QOS_L3_OCCUP_EVENT_ID:
> + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_count, &rr, 1);
> + break;
> + case QOS_MBM_TOTAL_EVENT_ID:
> + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_total_bw_count,
> + &rr, 1);
> + break;
> + case QOS_MBM_LOCAL_EVENT_ID:
> + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_local_bw_count,
> + &rr, 1);
> + break;
> + default:
> + goto out;
> + }
>
> raw_spin_lock_irqsave(&cache_lock, flags);
> if (event->hw.cqm_rmid == rr.rmid)
> @@ -985,6 +1340,39 @@ out:
> return __perf_event_count(event);
> }
>
> +static void mbm_start_hrtimer(struct mbm_pmu *pmu) {
> + hrtimer_start_range_ns(&(pmu->hrtimer),
> + pmu->timer_interval, 0,
> + HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static void mbm_stop_hrtimer(struct mbm_pmu *pmu) {
> + hrtimer_cancel(&pmu->hrtimer);
> +}
> +
> +static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
> +{
> + struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
> + struct perf_event *event;
> +
> + if (!pmu->n_active)
> + return HRTIMER_NORESTART;
> + list_for_each_entry(event, &pmu->active_list, active_entry)
> + intel_cqm_event_update(event);
> + hrtimer_forward_now(hrtimer, pmu->timer_interval);
> + return HRTIMER_RESTART;
> +}
> +
> +static void mbm_hrtimer_init(struct mbm_pmu *pmu) {
> + struct hrtimer *hr = &pmu->hrtimer;
> +
> + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + hr->function = mbm_hrtimer_handle;
> +}
> +
> static void intel_cqm_event_start(struct perf_event *event, int mode) {
> struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); @@ -1003,19
> +1391,45 @@ static void intel_cqm_event_start(struct perf_event *event, int
> mode)
> }
>
> state->rmid = rmid;
> + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
> + struct cqm_rmid_entry *entry;
> +
> + entry = __rmid_entry(rmid);
> + }
> wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
> +
> + if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) &&
> (cqm_mbm)) {
> + int cpu = smp_processor_id();
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> +
> + if (pmu) {
> + pmu->n_active++;
> + list_add_tail(&event->active_entry,
> + &pmu->active_list);
> + if (pmu->n_active == 1)
> + mbm_start_hrtimer(pmu);
> + intel_cqm_event_update(event);
> + }
> + }
> }
>
> static void intel_cqm_event_stop(struct perf_event *event, int mode) {
> struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> + struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
>
> if (event->hw.cqm_state & PERF_HES_STOPPED)
> return;
>
> event->hw.cqm_state |= PERF_HES_STOPPED;
>
> - intel_cqm_event_read(event);
> + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
> + intel_cqm_event_read(event);
> +
> + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> + intel_cqm_event_update(event);
>
> if (!--state->rmid_usecnt) {
> state->rmid = 0;
> @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct perf_event
> *event, int mode)
> } else {
> WARN_ON_ONCE(!state->rmid);
> }
> +
> + if (pmu) {
> + if (pmu->n_active > 0) {
> + pmu->n_active--;
> + if (pmu->n_active == 0)
> + mbm_stop_hrtimer(pmu);
> + if (!list_empty(&event->active_entry))
> + list_del(&event->active_entry);
> + }
> + }
> +
> }
>
> static int intel_cqm_event_add(struct perf_event *event, int mode) @@ -
> 1090,7 +1515,9 @@ static int intel_cqm_event_init(struct perf_event *event)
> if (event->attr.type != intel_cqm_pmu.type)
> return -ENOENT;
>
> - if (event->attr.config & ~QOS_EVENT_MASK)
> + if (((event->attr.config & ~QOS_EVENT_MASK) &&
> + (event->attr.config & ~QOS_MBM_TOTAL_EVENT_ID)) &&
> + (event->attr.config & ~QOS_MBM_LOCAL_EVENT_ID))
> return -EINVAL;
>
> /* unsupported modes and filters */
> @@ -1145,18 +1572,68 @@ EVENT_ATTR_STR(llc_occupancy.unit,
> intel_cqm_llc_unit, "Bytes"); EVENT_ATTR_STR(llc_occupancy.scale,
> intel_cqm_llc_scale, NULL); EVENT_ATTR_STR(llc_occupancy.snapshot,
> intel_cqm_llc_snapshot, "1");
>
> -static struct attribute *intel_cqm_events_attr[] = {
> +EVENT_ATTR_STR(llc_total_bw, intel_cqm_llc_total_bw, "event=0x02");
> +EVENT_ATTR_STR(llc_total_bw.per-pkg, intel_cqm_llc_total_bw_pkg, "1");
> +#if MBM_SCALING_FACTOR2 == 1000 EVENT_ATTR_STR(llc_total_bw.unit,
> +intel_cqm_llc_total_bw_unit, "MB/sec");
> +EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit,
> +"MB/sec"); #else EVENT_ATTR_STR(llc_total_bw.unit,
> +intel_cqm_llc_total_bw_unit, "KB/sec");
> +EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit,
> +"KB/sec"); #endif EVENT_ATTR_STR(llc_total_bw.scale,
> +intel_cqm_llc_total_bw_scale, NULL);
> +EVENT_ATTR_STR(llc_total_bw.snapshot, intel_cqm_llc_total_bw_snapshot,
> +"1");
> +
> +EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x04");
> +EVENT_ATTR_STR(llc_local_bw.per-pkg, intel_cqm_llc_local_bw_pkg, "1");
> +EVENT_ATTR_STR(llc_local_bw.scale, intel_cqm_llc_local_bw_scale, NULL);
> +EVENT_ATTR_STR(llc_local_bw.snapshot, intel_cqm_llc_local_bw_snapshot,
> +"1");
> +
> +static struct attribute *intel_cmt_events_attr[] = {
> + EVENT_PTR(intel_cqm_llc),
> + EVENT_PTR(intel_cqm_llc_pkg),
> + EVENT_PTR(intel_cqm_llc_unit),
> + EVENT_PTR(intel_cqm_llc_scale),
> + EVENT_PTR(intel_cqm_llc_snapshot),
> + NULL,
> +};
> +
> +static struct attribute *intel_mbm_events_attr[] = {
> + EVENT_PTR(intel_cqm_llc_total_bw),
> + EVENT_PTR(intel_cqm_llc_local_bw),
> + EVENT_PTR(intel_cqm_llc_total_bw_pkg),
> + EVENT_PTR(intel_cqm_llc_local_bw_pkg),
> + EVENT_PTR(intel_cqm_llc_total_bw_unit),
> + EVENT_PTR(intel_cqm_llc_local_bw_unit),
> + EVENT_PTR(intel_cqm_llc_total_bw_scale),
> + EVENT_PTR(intel_cqm_llc_local_bw_scale),
> + EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
> + EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
> + NULL,
> +};
> +
> +static struct attribute *intel_cmt_mbm_events_attr[] = {
> EVENT_PTR(intel_cqm_llc),
> + EVENT_PTR(intel_cqm_llc_total_bw),
> + EVENT_PTR(intel_cqm_llc_local_bw),
> EVENT_PTR(intel_cqm_llc_pkg),
> + EVENT_PTR(intel_cqm_llc_total_bw_pkg),
> + EVENT_PTR(intel_cqm_llc_local_bw_pkg),
> EVENT_PTR(intel_cqm_llc_unit),
> + EVENT_PTR(intel_cqm_llc_total_bw_unit),
> + EVENT_PTR(intel_cqm_llc_local_bw_unit),
> EVENT_PTR(intel_cqm_llc_scale),
> + EVENT_PTR(intel_cqm_llc_total_bw_scale),
> + EVENT_PTR(intel_cqm_llc_local_bw_scale),
> EVENT_PTR(intel_cqm_llc_snapshot),
> + EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
> + EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
> NULL,
> };
>
> static struct attribute_group intel_cqm_events_group = {
> .name = "events",
> - .attrs = intel_cqm_events_attr,
> + .attrs = NULL,
> };
>
> PMU_FORMAT_ATTR(event, "config:0-7");
> @@ -1184,6 +1661,19 @@ max_recycle_threshold_show(struct device *dev,
> struct device_attribute *attr, }
>
> static ssize_t
> +sliding_window_size_show(struct device *dev, struct device_attribute *attr,
> + char *page)
> +{
> + ssize_t rv;
> +
> + mutex_lock(&cache_mutex);
> + rv = snprintf(page, PAGE_SIZE-1, "%u\n", mbm_window_size);
> + mutex_unlock(&cache_mutex);
> +
> + return rv;
> +}
> +
> +static ssize_t
> max_recycle_threshold_store(struct device *dev,
> struct device_attribute *attr,
> const char *buf, size_t count)
> @@ -1211,10 +1701,32 @@ max_recycle_threshold_store(struct device *dev,
> return count;
> }
>
> +static ssize_t
> +sliding_window_size_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned int bytes;
> + int ret;
> +
> + ret = kstrtouint(buf, 0, &bytes);
> + if (ret)
> + return ret;
> +
> + mutex_lock(&cache_mutex);
> + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> + mbm_window_size = bytes;
> + mutex_unlock(&cache_mutex);
> +
> + return count;
> +}
> +
> static DEVICE_ATTR_RW(max_recycle_threshold);
> +static DEVICE_ATTR_RW(sliding_window_size);
>
> static struct attribute *intel_cqm_attrs[] = {
> &dev_attr_max_recycle_threshold.attr,
> + &dev_attr_sliding_window_size.attr,
> NULL,
> };
>
> @@ -1255,10 +1767,11 @@ static inline void cqm_pick_event_reader(int cpu)
> cpumask_set_cpu(cpu, &cqm_cpumask);
> }
>
> -static void intel_cqm_cpu_prepare(unsigned int cpu)
> +static int intel_cqm_cpu_prepare(unsigned int cpu)
> {
> struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
> struct cpuinfo_x86 *c = &cpu_data(cpu);
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
>
> state->rmid = 0;
> state->closid = 0;
> @@ -1266,12 +1779,27 @@ static void intel_cqm_cpu_prepare(unsigned int
> cpu)
>
> WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
> WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
> +
> + if ((!pmu) && (cqm_mbm)) {
> + pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL,
> NUMA_NO_NODE);
> + if (!pmu)
> + return -ENOMEM;
> + INIT_LIST_HEAD(&pmu->active_list);
> + pmu->pmu = &intel_cqm_pmu;
> + pmu->n_active = 0;
> + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> + per_cpu(mbm_pmu, cpu) = pmu;
> + per_cpu(mbm_pmu_to_free, cpu) = NULL;
> + mbm_hrtimer_init(pmu);
> + }
> + return 0;
> }
>
> static void intel_cqm_cpu_exit(unsigned int cpu) {
> int phys_id = topology_physical_package_id(cpu);
> int i;
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
>
> /*
> * Is @cpu a designated cqm reader?
> @@ -1288,6 +1816,36 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
> break;
> }
> }
> +
> + /* cancel overflow polling timer for CPU */
> + if (pmu)
> + mbm_stop_hrtimer(pmu);
> + kfree(mbm_local);
> + kfree(mbm_total);
> +
> +}
> +
> +static void mbm_cpu_kfree(int cpu)
> +{
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu_to_free, cpu);
> +
> + kfree(pmu);
> +
> + per_cpu(mbm_pmu_to_free, cpu) = NULL;
> +}
> +
> +static int mbm_cpu_dying(int cpu)
> +{
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> +
> + if (!pmu)
> + return 0;
> +
> + per_cpu(mbm_pmu, cpu) = NULL;
> +
> + per_cpu(mbm_pmu_to_free, cpu) = pmu;
> +
> + return 0;
> }
>
> static int intel_cqm_cpu_notifier(struct notifier_block *nb, @@ -1297,7
> +1855,7 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
>
> switch (action & ~CPU_TASKS_FROZEN) {
> case CPU_UP_PREPARE:
> - intel_cqm_cpu_prepare(cpu);
> + return intel_cqm_cpu_prepare(cpu);
> break;
> case CPU_DOWN_PREPARE:
> intel_cqm_cpu_exit(cpu);
> @@ -1305,6 +1863,14 @@ static int intel_cqm_cpu_notifier(struct
> notifier_block *nb,
> case CPU_STARTING:
> cqm_pick_event_reader(cpu);
> break;
> + case CPU_UP_CANCELED:
> + case CPU_DYING:
> + mbm_cpu_dying(cpu);
> + break;
> + case CPU_ONLINE:
> + case CPU_DEAD:
> + mbm_cpu_kfree(cpu);
> + break;
> }
>
> return NOTIFY_OK;
> @@ -1315,15 +1881,74 @@ static const struct x86_cpu_id intel_cqm_match[] =
> {
> {}
> };
>
> +static const struct x86_cpu_id intel_mbm_match[] = {
> + { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_MBM_LOCAL },
> + {}
> +};
> +
> static int __init intel_cqm_init(void)
> {
> char *str, scale[20];
> - int i, cpu, ret;
> + int i, cpu, ret, array_size;
>
> - if (!x86_match_cpu(intel_cqm_match))
> + if (!x86_match_cpu(intel_cqm_match) &&
> + (!x86_match_cpu(intel_mbm_match)))
> return -ENODEV;
>
> cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> +
> + if (x86_match_cpu(intel_cqm_match)) {
> + cqm_llc_occ = true;
> + intel_cqm_events_group.attrs = intel_cmt_events_attr;
> + }
> +
> + if (x86_match_cpu(intel_mbm_match)) {
> + u32 mbm_scale_rounded = 0;
> +
> + cqm_mbm = true;
> + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> + /*
> + * MBM counter values are in bytes. To conver this to KB/sec
> + * or MB/sec, we scale the MBM scale factor by 1000. Another
> + * MBM_SCALING_FACTOR2 factor scaling down is done
> + * after reading the counter val i.e. in the function
> + * __rmid_read_mbm()
> + */
> + mbm_scale_rounded = DIV_ROUND_UP(cqm_l3_scale,
> + MBM_SCALING_FACTOR);
> + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> + snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
> + str = kstrdup(scale, GFP_KERNEL);
> + if (!str) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + if (cqm_llc_occ)
> + intel_cqm_events_group.attrs =
> + intel_cmt_mbm_events_attr;
> + else
> + intel_cqm_events_group.attrs =
> intel_mbm_events_attr;
> +
> + event_attr_intel_cqm_llc_local_bw_scale.event_str = str;
> + event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
> +
> + array_size = (cqm_max_rmid + 1) * MBM_SOCKET_MAX;
> + mbm_local = kzalloc_node(sizeof(struct sample) * array_size,
> + GFP_KERNEL, NUMA_NO_NODE);
> + if (!mbm_local) {
> + ret = -ENOMEM;
> + goto out_free;
> + }
> + mbm_total = kzalloc_node(sizeof(struct sample) * array_size,
> + GFP_KERNEL, NUMA_NO_NODE);
> + if (!mbm_total) {
> + ret = -ENOMEM;
> + goto out_free;
> + }
> + } else
> + cqm_mbm = false;
>
> /*
> * It's possible that not all resources support the same number @@ -
> 1336,44 +1961,48 @@ static int __init intel_cqm_init(void)
> */
> cpu_notifier_register_begin();
>
> - for_each_online_cpu(cpu) {
> - struct cpuinfo_x86 *c = &cpu_data(cpu);
> + if (cqm_llc_occ) {
> + for_each_online_cpu(cpu) {
> + struct cpuinfo_x86 *c = &cpu_data(cpu);
>
> - if (c->x86_cache_max_rmid < cqm_max_rmid)
> - cqm_max_rmid = c->x86_cache_max_rmid;
> + if (c->x86_cache_max_rmid < cqm_max_rmid)
> + cqm_max_rmid = c->x86_cache_max_rmid;
>
> - if (c->x86_cache_occ_scale != cqm_l3_scale) {
> - pr_err("Multiple LLC scale values, disabling\n");
> - ret = -EINVAL;
> - goto out;
> + if (c->x86_cache_occ_scale != cqm_l3_scale) {
> + pr_err("Multiple LLC scale values, disabling\n");
> + ret = -EINVAL;
> + goto out;
> + }
> }
> - }
>
> - /*
> - * A reasonable upper limit on the max threshold is the number
> - * of lines tagged per RMID if all RMIDs have the same number of
> - * lines tagged in the LLC.
> - *
> - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> - */
> - __intel_cqm_max_threshold =
> + /*
> + * A reasonable upper limit on the max threshold is the number
> + * of lines tagged per RMID if all RMIDs have the same number
> of
> + * lines tagged in the LLC.
> + *
> + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> + */
> + __intel_cqm_max_threshold =
> boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
>
> - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> - str = kstrdup(scale, GFP_KERNEL);
> - if (!str) {
> - ret = -ENOMEM;
> - goto out;
> - }
> + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> + str = kstrdup(scale, GFP_KERNEL);
> + if (!str) {
> + ret = -ENOMEM;
> + goto out;
> + }
>
> - event_attr_intel_cqm_llc_scale.event_str = str;
> + event_attr_intel_cqm_llc_scale.event_str = str;
> + }
>
> ret = intel_cqm_setup_rmid_cache();
> if (ret)
> goto out;
>
> for_each_online_cpu(i) {
> - intel_cqm_cpu_prepare(i);
> + ret = intel_cqm_cpu_prepare(i);
> + if (ret)
> + goto out_free;
> cqm_pick_event_reader(i);
> }
>
> @@ -1384,7 +2013,10 @@ static int __init intel_cqm_init(void)
> pr_err("Intel CQM perf registration failed: %d\n", ret);
> else
> pr_info("Intel CQM monitoring enabled\n");
> -
> + goto out;
> +out_free:
> + kfree(mbm_local);
> + kfree(mbm_total);
> out:
> cpu_notifier_register_done();
>
> --
> 2.1.0
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-08-08 6:42 [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU Kanaka Juvva
2015-08-16 17:42 ` Juvva, Kanaka D
@ 2015-08-19 20:50 ` Thomas Gleixner
2015-08-20 21:54 ` Juvva, Kanaka D
[not found] ` <06033C969873E840BDE9FC9B584F66B51944F51B@ORSMSX116.amr.corp.intel.com>
1 sibling, 2 replies; 12+ messages in thread
From: Thomas Gleixner @ 2015-08-19 20:50 UTC (permalink / raw)
To: Kanaka Juvva
Cc: kanaka.d.juvva, glenn.p.williamson, matt.fleming, will.auld,
Andi Kleen, LKML, Tony Luck, Peter Zijlstra, Tejun Heo, x86,
Ingo Molnar, H. Peter Anvin, vikas.shivappa
On Fri, 7 Aug 2015, Kanaka Juvva wrote:
> +#define MBM_CNTR_MAX 0xffffff
> +#define MBM_SOCKET_MAX 8
> +#define MBM_TIME_DELTA_MAX 1000
> +#define MBM_TIME_DELTA_MIN 100
What are these constants for and how are they determined? Pulled out
of thin air?
> +#define MBM_SCALING_FACTOR2 1
> +#define MBM_SCALING_FACTOR 1000
Ditto
> +#define MBM_FIFO_SIZE_MIN 10
> +#define MBM_FIFO_SIZE_MAX 300
Ditto
> /**
> * struct intel_pqr_state - State cache for the PQR MSR
> @@ -42,6 +52,90 @@ struct intel_pqr_state {
> * interrupts disabled, which is sufficient for the protection.
> */
> static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
> +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu);
> +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
> +
> +/*
> + * mbm pmu is used for storing mbm_local and mbm_total
> + * events
> + */
> +struct mbm_pmu {
> + /*
> + * # of active events
> + */
I told you last time:
"If you want to document your struct members, please use kerneldoc
and not these hard to read tail comments."
I can't see how this documentation style is in any way related to
kerneldoc.
> + /*
> + * time interval in ms between two consecutive samples
> + */
> + ktime_t timer_interval; /* in ktime_t unit */
So you stick with your hard to read tail comments? Aside of that this
one is completely pointless. What's the value of this?
> + /*
> + * fifoout is the start of the sliding window from which last 'n'
> + * bw values must be read
> + */
> + u32 fifoout; /* mbmfifo out counter for sliding window */
Sigh
> +};
> +
> +/*
> + * stats for total bw
> + */
> +static struct sample *mbm_total; /* curent stats for mbm_total events */
Oh well...
> #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> +#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
So we have ID values which are built with (1 << X) and then this HW
variant in the middle with 0x3. Of course without any explanation what
the heck this stuff is.
Last review:
"So this wants a descriptive ID name and a comment."
> #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
>
> @@ -176,6 +273,25 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
> return entry;
> }
>
> +/**
> + * __mbm_reset_stats - reset stats for a given rmid for the current cpu
> + * @rmid: rmid value
> + *
> + * vrmid: array index for current core for the given rmid
Array index of what?
> + * mbs_total[] and mbm_local[] are linearly indexed by core * max #rmids per
> + * core
The code tells a different story.
> + */
> +static void __mbm_reset_stats(u32 rmid)
What's the point of the double underscores?
> +{
> + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> + cqm_max_rmid + rmid;
Again I asked you last time:
"Can you please explain that magic calculation? It's far from obvious
why this would be correct."
There is still no explanation.
> + if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
> + return;
> + memset(&mbm_local[vrmid], 0, sizeof(struct sample));
> + memset(&mbm_total[vrmid], 0, sizeof(struct sample));
> +}
> +
> /*
> * Returns < 0 on fail.
> *
> @@ -192,6 +308,7 @@ static u32 __get_rmid(void)
>
> entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
> list_del(&entry->list);
> + __mbm_reset_stats(entry->rmid);
>
> return entry->rmid;
> }
> @@ -207,6 +324,7 @@ static void __put_rmid(u32 rmid)
>
> entry->queue_time = jiffies;
> entry->state = RMID_YOUNG;
> + __mbm_reset_stats(rmid);
>
> list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
> }
> @@ -232,6 +350,7 @@ static int intel_cqm_setup_rmid_cache(void)
>
> INIT_LIST_HEAD(&entry->list);
> entry->rmid = r;
> +
Stray newline
> cqm_rmid_ptrs[r] = entry;
>
> list_add_tail(&entry->list, &cqm_rmid_free_lru);
> @@ -494,6 +613,165 @@ static bool intel_cqm_sched_in_event(u32 rmid)
> return false;
> }
>
> +
> +static u32 bw_sum_calc(struct sample *bw_stat, int rmid)
> +{
> + u32 val = 0, i, j, index;
> +
> + if (++bw_stat->fifoout >= mbm_window_size)
> + bw_stat->fifoout = 0;
> + index = bw_stat->fifoout;
> + for (i = 0; i < mbm_window_size - 1; i++) {
> + if (index + i >= mbm_window_size)
> + j = index + i - mbm_window_size;
> + else
> + j = index + i;
> + val += bw_stat->mbmfifo[j];
> + }
This math wants a explanatory comment.
> + return val;
> +}
> +
> +static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw)
> +{
> + if (is_localbw)
> + return bw_sum_calc(&mbm_local[rmid], rmid);
> + else
> + return bw_sum_calc(&mbm_total[rmid], rmid);
> +}
> +
> +static void __mbm_fifo_in(struct sample *bw_stat, u32 val)
> +{
> + bw_stat->mbmfifo[bw_stat->fifoin] = val;
> + if (++bw_stat->fifoin >= mbm_window_size)
How does that become greater than mbm_windowsize?
> + bw_stat->fifoin = 0;
> +}
> +/*
> + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and reads
> + * its MSR counter. Check whether overflow occurred and handles it. Calculates
> + * currenet BW and updates running average.
currenet? And please get rid of the double spaces
> + *
> + * Overflow Handling:
> + * if (MSR current value < MSR previous value) it is an
> + * overflow. and overflow is handled.
Wow. That's informative as hell!
> + *
> + * Calculation of Current BW value:
BW == Body Weight?
> + * If MSR is read within last 100ms, then the value is ignored;
> + * this will suppress small deltas. We don't process MBM samples that are
> + * within 100ms.
WHY?
> + * Bandwidth is calculated as:
> + * bandwidth = difference of last two msr counter values/time difference.
> + *
> + * cum_avg = Running Average bandwidth of last 'n' samples that are processed
> + *
> + * Sliding window is used to save the last 'n' samples. Hence,
> + * n = sliding_window_size
> + *
> + *Scaling:
> + * cum_avg is scaled down by a factor MBM_SCALING_FACTOR2 and rounded to
> + * nearest integer. User interface reads the BW in KB/sec or MB/sec.
And how is the scaling factor determined?
> + */
> +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
More pointless double underscores
> +{
> + u64 val, tmp, diff_time, cma, bytes, index;
> + bool overflow = false, first = false;
> + ktime_t cur_time;
> + u32 tmp32 = rmid;
> + struct sample *mbm_current;
> + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> + cqm_max_rmid + rmid;
> +
> + rmid = vrmid;
>From my previous review:
"This is completely backwards.
tmp32 = rmid;
rmid = vrmid;
do_stuff(rmid);
rmid = tmp32;
do_other_stuff(rmid);
Why can't you use vrmid for do_stuff() and leave rmid alone? Just
because it would make the code simpler to read?"
Still applies.
> + cur_time = ktime_get();
> + if (read_mbm_local) {
> + cma = mbm_local[rmid].cum_avg;
> + diff_time = ktime_ms_delta(cur_time,
> + mbm_local[rmid].prev_time);
> + if (diff_time < 100)
> + return cma;
> + mbm_local[rmid].prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
What's the actual purpose of this back and forth conversion? The only
purpose I can see is to wreckage the time on 32bit machines.
> + bytes = mbm_local[rmid].bytes;
> + index = mbm_local[rmid].index;
> + mbm_current = &mbm_local[rmid];
> + rmid = tmp32;
> + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID_HW, rmid);
> + } else {
> + cma = mbm_total[rmid].cum_avg;
> + diff_time = ktime_ms_delta(cur_time,
> + mbm_total[rmid].prev_time);
> + if (diff_time < 100)
> + return cma;
> + mbm_total[rmid].prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> + bytes = mbm_total[rmid].bytes;
> + index = mbm_total[rmid].index;
> + mbm_current = &mbm_total[rmid];
> + rmid = tmp32;
> + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID, rmid);
> + }
So you got rid of ONE 'if (read_mbm_local)' but this can be done
completely without this conditional and without the silly code
duplication.
> + rdmsrl(MSR_IA32_QM_CTR, val);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return val;
> +
> + tmp = val;
Again from the last round of review:
"You really have a faible for random storage. tmp gets assigned to
mbm_*[rmid].bytes further down. This makes no sense at all."
> + /* if current msr value < previous msr value , it means overflow */
> + if (val < bytes) {
> + val = MBM_CNTR_MAX - bytes + val;
> + overflow = true;
> + } else
> + val = val - bytes;
> +
> + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
> +
> + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> + /* First sample */
> + first = true;
> +
> + rmid = vrmid;
And another time:
"More obfuscation"
> + if ((diff_time <= (MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN)) ||
> + overflow || first) {
I asked last time about an explanation for this time delta
magic. There still is none.
> + int bw, ret;
> +
> + if (index & (index < mbm_window_size))
> + val = cma * MBM_SCALING_FACTOR2 + val / index -
> + cma / index;
> + val = DIV_ROUND_UP(val, MBM_SCALING_FACTOR2);
> + if (index >= mbm_window_size) {
> + /*
> + * Compute the sum of recent n-1 samples and slide the
> + * window by 1
> + */
> + ret = __mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
> + /* recalculate the running average with current bw */
> + ret = (ret + val) / mbm_window_size;
> + bw = val;
> + val = ret;
Yet another time:
"Your back and forth assignements of random variables is not making the
code any more readable."
> + } else
> + bw = val;
Is still missing parentheses
> + /* save the recent bw in fifo */
> + mbm_fifo_in(rmid, bw, read_mbm_local);
> + /* save the current sample */
> + mbm_current->index++;
> + mbm_current->cum_avg = val;
> + mbm_current->bytes = tmp;
> + mbm_current->prev_time = ktime_set(0,
> + (unsigned long)ktime_to_ns(cur_time));
> +
> + return val;
> + }
> + return cma;
> +}
> +static void __intel_cqm_event_total_bw_count(void *info)
> +{
> + struct rmid_read *rr = info;
> + u64 val;
> +
> + val = __rmid_read_mbm(rr->rmid, false);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return;
> + atomic64_add(val, &rr->value);
> +}
> +
> +static void __intel_cqm_event_local_bw_count(void *info)
> +{
> + struct rmid_read *rr = info;
> + u64 val;
> +
> + val = __rmid_read_mbm(rr->rmid, true);
> + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> + return;
> + atomic64_add(val, &rr->value);
> +}
And once more:
"You're really a fan of copy and paste."
> +
> static void __intel_cqm_event_count(void *info)
> {
> struct rmid_read *rr = info;
> @@ -975,7 +1316,21 @@ static u64 intel_cqm_event_count(struct perf_event *event)
> if (!__rmid_valid(rr.rmid))
> goto out;
>
> - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> + switch (event->attr.config) {
> + case QOS_L3_OCCUP_EVENT_ID:
> + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> + break;
> + case QOS_MBM_TOTAL_EVENT_ID:
> + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_total_bw_count,
If you make the function names a little bit longer then this might
need 3 lines and further enhance unreadability.
> static void intel_cqm_event_start(struct perf_event *event, int mode)
> {
> struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> @@ -1003,19 +1391,45 @@ static void intel_cqm_event_start(struct perf_event *event, int mode)
> }
>
> state->rmid = rmid;
> + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
> + struct cqm_rmid_entry *entry;
> +
> + entry = __rmid_entry(rmid);
> + }
> wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
> +
> + if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) && (cqm_mbm)) {
> + int cpu = smp_processor_id();
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
So you got rid of get_cpu(), but what's wrong with this_cpu_read() ?
> @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
> } else {
> WARN_ON_ONCE(!state->rmid);
> }
> +
> + if (pmu) {
> + if (pmu->n_active > 0) {
What's the purpose of this check? In the previous version there was a
WARN_ON(), which made sense. Did it trigger and you decided to "work"
around it?
> + pmu->n_active--;
> + if (pmu->n_active == 0)
> + mbm_stop_hrtimer(pmu);
> + if (!list_empty(&event->active_entry))
> + list_del(&event->active_entry);
These four lines are at the wrong indentation level.
> + }
> + }
> +
> }
> +#if MBM_SCALING_FACTOR2 == 1000
And how does MBM_SCALING_FACTOR2 change magically?
> +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit, "MB/sec");
> +EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit, "MB/sec");
> +#else
So any value != 1000 results in KB/sec. Interesting math.
> +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit, "KB/sec");
> +EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit, "KB/sec");
> +#endif
> +static ssize_t
> +sliding_window_size_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned int bytes;
> + int ret;
> +
> + ret = kstrtouint(buf, 0, &bytes);
> + if (ret)
> + return ret;
> +
> + mutex_lock(&cache_mutex);
> + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> + mbm_window_size = bytes;
So, it's valid to set the window to X where 0 < X < MBM_FIFO_SIZE_MIN.
What's the actual purpose of MBM_FIFO_SIZE_MIN?
> + mutex_unlock(&cache_mutex);
> +
> + return count;
> +}
> -static void intel_cqm_cpu_prepare(unsigned int cpu)
> +static int intel_cqm_cpu_prepare(unsigned int cpu)
> {
> struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
> struct cpuinfo_x86 *c = &cpu_data(cpu);
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
>
> state->rmid = 0;
> state->closid = 0;
> @@ -1266,12 +1779,27 @@ static void intel_cqm_cpu_prepare(unsigned int cpu)
>
> WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
> WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
> +
> + if ((!pmu) && (cqm_mbm)) {
> + pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL, NUMA_NO_NODE);
> + if (!pmu)
> + return -ENOMEM;
> + INIT_LIST_HEAD(&pmu->active_list);
> + pmu->pmu = &intel_cqm_pmu;
> + pmu->n_active = 0;
It's already zero.
> + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> + per_cpu(mbm_pmu, cpu) = pmu;
> + per_cpu(mbm_pmu_to_free, cpu) = NULL;
What's the point of this? If there is still something to be free'd its
leaked. Otherwise that's redundant.
> + mbm_hrtimer_init(pmu);
> + }
> + return 0;
s/0/NOTIFY_OK/ because you return that value directly.
> }
>
> static void intel_cqm_cpu_exit(unsigned int cpu)
> {
> int phys_id = topology_physical_package_id(cpu);
> int i;
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
>
> /*
> * Is @cpu a designated cqm reader?
> @@ -1288,6 +1816,36 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
> break;
> }
> }
> +
> + /* cancel overflow polling timer for CPU */
> + if (pmu)
> + mbm_stop_hrtimer(pmu);
> + kfree(mbm_local);
> + kfree(mbm_total);
Again you insist to ignore a review comment:
"So this frees the module global storage if one cpu is unplugged and
leaves the pointer initialized so the rest of the code can happily
dereference it."
Oh, well!
> +}
> +
> +static void mbm_cpu_kfree(int cpu)
> +{
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu_to_free, cpu);
> +
> + kfree(pmu);
> +
> + per_cpu(mbm_pmu_to_free, cpu) = NULL;
> +}
> +
> +static int mbm_cpu_dying(int cpu)
> +{
> + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
this_cpu_read(). It's called on the dying CPU.
> +static const struct x86_cpu_id intel_mbm_match[] = {
> + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
> + {}
> +};
> +
> static int __init intel_cqm_init(void)
> {
> char *str, scale[20];
> - int i, cpu, ret;
> + int i, cpu, ret, array_size;
>
> - if (!x86_match_cpu(intel_cqm_match))
> + if (!x86_match_cpu(intel_cqm_match) &&
> + (!x86_match_cpu(intel_mbm_match)))
> return -ENODEV;
>
> cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> +
> + if (x86_match_cpu(intel_cqm_match)) {
> + cqm_llc_occ = true;
> + intel_cqm_events_group.attrs = intel_cmt_events_attr;
> + }
> +
> + if (x86_match_cpu(intel_mbm_match)) {
> + u32 mbm_scale_rounded = 0;
> +
> + cqm_mbm = true;
> + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> + /*
> + * MBM counter values are in bytes. To conver this to KB/sec
> + * or MB/sec, we scale the MBM scale factor by 1000. Another
> + * MBM_SCALING_FACTOR2 factor scaling down is done
> + * after reading the counter val i.e. in the function
> + * __rmid_read_mbm()
> + */
> + mbm_scale_rounded = DIV_ROUND_UP(cqm_l3_scale,
> + MBM_SCALING_FACTOR);
> + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> + snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
> + str = kstrdup(scale, GFP_KERNEL);
> + if (!str) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + if (cqm_llc_occ)
> + intel_cqm_events_group.attrs =
> + intel_cmt_mbm_events_attr;
> + else
> + intel_cqm_events_group.attrs = intel_mbm_events_attr;
> +
> + event_attr_intel_cqm_llc_local_bw_scale.event_str = str;
> + event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
> +
> + array_size = (cqm_max_rmid + 1) * MBM_SOCKET_MAX;
> + mbm_local = kzalloc_node(sizeof(struct sample) * array_size,
> + GFP_KERNEL, NUMA_NO_NODE);
> + if (!mbm_local) {
> + ret = -ENOMEM;
Still leaks str.
> + goto out_free;
> + }
> + mbm_total = kzalloc_node(sizeof(struct sample) * array_size,
> + GFP_KERNEL, NUMA_NO_NODE);
> + if (!mbm_total) {
> +
ret = -ENOMEM;
Ditto
> + goto out_free;
> + }
I asked you to put the above into a separate function, which saves you
one indentation level and makes the init function readable, but
readability is not on your agenda, right?
> + } else
> + cqm_mbm = false;
Still a completely useless exercise.
>
> /*
> * It's possible that not all resources support the same number
> @@ -1336,44 +1961,48 @@ static int __init intel_cqm_init(void)
> */
> cpu_notifier_register_begin();
>
> - for_each_online_cpu(cpu) {
> - struct cpuinfo_x86 *c = &cpu_data(cpu);
> + if (cqm_llc_occ) {
> + for_each_online_cpu(cpu) {
> + struct cpuinfo_x86 *c = &cpu_data(cpu);
>
> - if (c->x86_cache_max_rmid < cqm_max_rmid)
> - cqm_max_rmid = c->x86_cache_max_rmid;
> + if (c->x86_cache_max_rmid < cqm_max_rmid)
> + cqm_max_rmid = c->x86_cache_max_rmid;
>
> - if (c->x86_cache_occ_scale != cqm_l3_scale) {
> - pr_err("Multiple LLC scale values, disabling\n");
> - ret = -EINVAL;
> - goto out;
> + if (c->x86_cache_occ_scale != cqm_l3_scale) {
> + pr_err("Multiple LLC scale values, disabling\n");
> + ret = -EINVAL;
> + goto out;
> + }
> }
> - }
>
> - /*
> - * A reasonable upper limit on the max threshold is the number
> - * of lines tagged per RMID if all RMIDs have the same number of
> - * lines tagged in the LLC.
> - *
> - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> - */
> - __intel_cqm_max_threshold =
> + /*
> + * A reasonable upper limit on the max threshold is the number
> + * of lines tagged per RMID if all RMIDs have the same number of
> + * lines tagged in the LLC.
> + *
> + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> + */
> + __intel_cqm_max_threshold =
> boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
>
> - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> - str = kstrdup(scale, GFP_KERNEL);
> - if (!str) {
> - ret = -ENOMEM;
> - goto out;
> - }
> + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> + str = kstrdup(scale, GFP_KERNEL);
> + if (!str) {
> + ret = -ENOMEM;
> + goto out;
> + }
>
> - event_attr_intel_cqm_llc_scale.event_str = str;
> + event_attr_intel_cqm_llc_scale.event_str = str;
Seperate function as well. And this splitout wants to be a separate
patch.
> + }
>
> ret = intel_cqm_setup_rmid_cache();
> if (ret)
> goto out;
Still leaks str and mbm_local
> for_each_online_cpu(i) {
> - intel_cqm_cpu_prepare(i);
> + ret = intel_cqm_cpu_prepare(i);
> + if (ret)
> + goto out_free;
Still leaks str and leaves more half initialized stuff around.
I.e., what happens if you prepared 5 cpus and the 6th fails? You free
mbm_local and mbm_total, and how is the other preparation of the cpus
undone?
FYI, this is not the way a review cycle works. Either you address the
comments or you reply to the comment and explain why you think that
your code is correct. Just ignoring it does not work as you might have
noticed.
I really don't care how much time you waste on this, but I pretty much
care about the time I waste.
Please take your time and address all points before you post the next
version. You have plenty of it as it's not going into 4.3 by any
means.
Yours grumpy
tglx
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-08-19 20:50 ` Thomas Gleixner
@ 2015-08-20 21:54 ` Juvva, Kanaka D
2015-08-21 12:30 ` Thomas Gleixner
[not found] ` <06033C969873E840BDE9FC9B584F66B51944F51B@ORSMSX116.amr.corp.intel.com>
1 sibling, 1 reply; 12+ messages in thread
From: Juvva, Kanaka D @ 2015-08-20 21:54 UTC (permalink / raw)
To: Thomas Gleixner, Kanaka Juvva
Cc: Williamson, Glenn P, Fleming, Matt, Auld, Will, Andi Kleen, LKML,
Luck, Tony, Peter Zijlstra, Tejun Heo, x86@kernel.org,
Ingo Molnar, H. Peter Anvin, Shivappa, Vikas
Hi Thomas,
Acknowledged. Perhaps some discussions are required in terms of your questions and our solutions.
Mostly nothing was unresolved except some new things that are brought up for v3. Seems like some
clarification are required for upstream.
Regards,
-Kanaka
> -----Original Message-----
> From: Thomas Gleixner [mailto:tglx@linutronix.de]
> Sent: Wednesday, August 19, 2015 1:50 PM
> To: Kanaka Juvva
> Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will; Andi
> Kleen; LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; x86@kernel.org; Ingo
> Molnar; H. Peter Anvin; Shivappa, Vikas
> Subject: Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth
> Monitoring (MBM) PMU
>
> On Fri, 7 Aug 2015, Kanaka Juvva wrote:
> > +#define MBM_CNTR_MAX 0xffffff
> > +#define MBM_SOCKET_MAX 8
> > +#define MBM_TIME_DELTA_MAX 1000
> > +#define MBM_TIME_DELTA_MIN 100
>
> What are these constants for and how are they determined? Pulled out of
> thin air?
>
> > +#define MBM_SCALING_FACTOR2 1
> > +#define MBM_SCALING_FACTOR 1000
>
> Ditto
>
> > +#define MBM_FIFO_SIZE_MIN 10
> > +#define MBM_FIFO_SIZE_MAX 300
>
> Ditto
>
> > /**
> > * struct intel_pqr_state - State cache for the PQR MSR @@ -42,6
> > +52,90 @@ struct intel_pqr_state {
> > * interrupts disabled, which is sufficient for the protection.
> > */
> > static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
> > +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu); static
> > +DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
> > +
> > +/*
> > + * mbm pmu is used for storing mbm_local and mbm_total
> > + * events
> > + */
> > +struct mbm_pmu {
> > + /*
> > + * # of active events
> > + */
>
> I told you last time:
>
> "If you want to document your struct members, please use kerneldoc
> and not these hard to read tail comments."
>
> I can't see how this documentation style is in any way related to kerneldoc.
>
> > + /*
> > + * time interval in ms between two consecutive samples
> > + */
> > + ktime_t timer_interval; /* in ktime_t unit */
>
>
> So you stick with your hard to read tail comments? Aside of that this one is
> completely pointless. What's the value of this?
>
> > + /*
> > + * fifoout is the start of the sliding window from which last 'n'
> > + * bw values must be read
> > + */
> > + u32 fifoout; /* mbmfifo out counter for sliding window */
>
> Sigh
>
> > +};
> > +
> > +/*
> > + * stats for total bw
> > + */
> > +static struct sample *mbm_total; /* curent stats for mbm_total
> > +events */
>
> Oh well...
>
> > #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> > +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> > +#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> > +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
>
> So we have ID values which are built with (1 << X) and then this HW variant in
> the middle with 0x3. Of course without any explanation what the heck this
> stuff is.
>
> Last review:
>
> "So this wants a descriptive ID name and a comment."
>
>
> > #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
> >
> > @@ -176,6 +273,25 @@ static inline struct cqm_rmid_entry
> *__rmid_entry(u32 rmid)
> > return entry;
> > }
> >
> > +/**
> > + * __mbm_reset_stats - reset stats for a given rmid for the current cpu
> > + * @rmid: rmid value
> > + *
> > + * vrmid: array index for current core for the given rmid
>
> Array index of what?
>
> > + * mbs_total[] and mbm_local[] are linearly indexed by core * max
> > + #rmids per
> > + * core
>
> The code tells a different story.
>
> > + */
> > +static void __mbm_reset_stats(u32 rmid)
>
> What's the point of the double underscores?
>
> > +{
> > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > + cqm_max_rmid + rmid;
>
> Again I asked you last time:
>
> "Can you please explain that magic calculation? It's far from obvious
> why this would be correct."
>
> There is still no explanation.
>
> > + if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
> > + return;
> > + memset(&mbm_local[vrmid], 0, sizeof(struct sample));
> > + memset(&mbm_total[vrmid], 0, sizeof(struct sample)); }
> > +
> > /*
> > * Returns < 0 on fail.
> > *
> > @@ -192,6 +308,7 @@ static u32 __get_rmid(void)
> >
> > entry = list_first_entry(&cqm_rmid_free_lru, struct
> cqm_rmid_entry, list);
> > list_del(&entry->list);
> > + __mbm_reset_stats(entry->rmid);
> >
> > return entry->rmid;
> > }
> > @@ -207,6 +324,7 @@ static void __put_rmid(u32 rmid)
> >
> > entry->queue_time = jiffies;
> > entry->state = RMID_YOUNG;
> > + __mbm_reset_stats(rmid);
> >
> > list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } @@ -232,6
> > +350,7 @@ static int intel_cqm_setup_rmid_cache(void)
> >
> > INIT_LIST_HEAD(&entry->list);
> > entry->rmid = r;
> > +
>
> Stray newline
>
> > cqm_rmid_ptrs[r] = entry;
> >
> > list_add_tail(&entry->list, &cqm_rmid_free_lru); @@ -494,6
> +613,165
> > @@ static bool intel_cqm_sched_in_event(u32 rmid)
> > return false;
> > }
> >
> > +
> > +static u32 bw_sum_calc(struct sample *bw_stat, int rmid) {
> > + u32 val = 0, i, j, index;
> > +
> > + if (++bw_stat->fifoout >= mbm_window_size)
> > + bw_stat->fifoout = 0;
> > + index = bw_stat->fifoout;
> > + for (i = 0; i < mbm_window_size - 1; i++) {
> > + if (index + i >= mbm_window_size)
> > + j = index + i - mbm_window_size;
> > + else
> > + j = index + i;
> > + val += bw_stat->mbmfifo[j];
> > + }
>
> This math wants a explanatory comment.
>
> > + return val;
> > +}
> > +
> > +static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw) {
> > + if (is_localbw)
> > + return bw_sum_calc(&mbm_local[rmid], rmid);
> > + else
> > + return bw_sum_calc(&mbm_total[rmid], rmid); }
> > +
> > +static void __mbm_fifo_in(struct sample *bw_stat, u32 val) {
> > + bw_stat->mbmfifo[bw_stat->fifoin] = val;
> > + if (++bw_stat->fifoin >= mbm_window_size)
>
> How does that become greater than mbm_windowsize?
>
> > + bw_stat->fifoin = 0;
> > +}
>
> > +/*
> > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event
> and
> > +reads
> > + * its MSR counter. Check whether overflow occurred and handles it.
> > +Calculates
> > + * currenet BW and updates running average.
>
> currenet? And please get rid of the double spaces
>
> > + *
> > + * Overflow Handling:
> > + * if (MSR current value < MSR previous value) it is an
> > + * overflow. and overflow is handled.
>
> Wow. That's informative as hell!
>
> > + *
> > + * Calculation of Current BW value:
>
> BW == Body Weight?
>
> > + * If MSR is read within last 100ms, then the value is ignored;
> > + * this will suppress small deltas. We don't process MBM samples that
> > + are
> > + * within 100ms.
>
> WHY?
>
> > + * Bandwidth is calculated as:
> > + * bandwidth = difference of last two msr counter values/time difference.
> > + *
> > + * cum_avg = Running Average bandwidth of last 'n' samples that are
> > + processed
> > + *
> > + * Sliding window is used to save the last 'n' samples. Hence,
> > + * n = sliding_window_size
> > + *
> > + *Scaling:
> > + * cum_avg is scaled down by a factor MBM_SCALING_FACTOR2 and
> > + rounded to
> > + * nearest integer. User interface reads the BW in KB/sec or MB/sec.
>
> And how is the scaling factor determined?
>
> > + */
> > +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
>
> More pointless double underscores
>
> > +{
> > + u64 val, tmp, diff_time, cma, bytes, index;
> > + bool overflow = false, first = false;
> > + ktime_t cur_time;
> > + u32 tmp32 = rmid;
> > + struct sample *mbm_current;
> > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > + cqm_max_rmid + rmid;
> > +
> > + rmid = vrmid;
>
> From my previous review:
>
> "This is completely backwards.
>
> tmp32 = rmid;
> rmid = vrmid;
> do_stuff(rmid);
> rmid = tmp32;
> do_other_stuff(rmid);
>
> Why can't you use vrmid for do_stuff() and leave rmid alone? Just
> because it would make the code simpler to read?"
>
> Still applies.
>
> > + cur_time = ktime_get();
> > + if (read_mbm_local) {
> > + cma = mbm_local[rmid].cum_avg;
> > + diff_time = ktime_ms_delta(cur_time,
> > + mbm_local[rmid].prev_time);
> > + if (diff_time < 100)
> > + return cma;
> > + mbm_local[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
>
> What's the actual purpose of this back and forth conversion? The only
> purpose I can see is to wreckage the time on 32bit machines.
>
> > + bytes = mbm_local[rmid].bytes;
> > + index = mbm_local[rmid].index;
> > + mbm_current = &mbm_local[rmid];
> > + rmid = tmp32;
> > + wrmsr(MSR_IA32_QM_EVTSEL,
> QOS_MBM_LOCAL_EVENT_ID_HW, rmid);
> > + } else {
> > + cma = mbm_total[rmid].cum_avg;
> > + diff_time = ktime_ms_delta(cur_time,
> > + mbm_total[rmid].prev_time);
> > + if (diff_time < 100)
> > + return cma;
> > + mbm_total[rmid].prev_time = ktime_set(0,
> > + (unsigned
> long)ktime_to_ns(cur_time));
> > + bytes = mbm_total[rmid].bytes;
> > + index = mbm_total[rmid].index;
> > + mbm_current = &mbm_total[rmid];
> > + rmid = tmp32;
> > + wrmsr(MSR_IA32_QM_EVTSEL,
> QOS_MBM_TOTAL_EVENT_ID, rmid);
> > + }
>
> So you got rid of ONE 'if (read_mbm_local)' but this can be done completely
> without this conditional and without the silly code duplication.
>
> > + rdmsrl(MSR_IA32_QM_CTR, val);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return val;
> > +
> > + tmp = val;
>
> Again from the last round of review:
>
> "You really have a faible for random storage. tmp gets assigned to
> mbm_*[rmid].bytes further down. This makes no sense at all."
>
> > + /* if current msr value < previous msr value , it means overflow */
> > + if (val < bytes) {
> > + val = MBM_CNTR_MAX - bytes + val;
> > + overflow = true;
> > + } else
> > + val = val - bytes;
> > +
> > + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
> > +
> > + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> > + /* First sample */
> > + first = true;
> > +
> > + rmid = vrmid;
>
> And another time:
>
> "More obfuscation"
>
> > + if ((diff_time <= (MBM_TIME_DELTA_MAX +
> MBM_TIME_DELTA_MIN)) ||
> > + overflow || first) {
>
> I asked last time about an explanation for this time delta magic. There still is
> none.
>
> > + int bw, ret;
> > +
> > + if (index & (index < mbm_window_size))
> > + val = cma * MBM_SCALING_FACTOR2 + val / index -
> > + cma / index;
> > + val = DIV_ROUND_UP(val, MBM_SCALING_FACTOR2);
> > + if (index >= mbm_window_size) {
> > + /*
> > + * Compute the sum of recent n-1 samples and slide
> the
> > + * window by 1
> > + */
> > + ret = __mbm_fifo_sum_lastn_out(rmid,
> read_mbm_local);
> > + /* recalculate the running average with current bw */
> > + ret = (ret + val) / mbm_window_size;
> > + bw = val;
> > + val = ret;
>
> Yet another time:
>
> "Your back and forth assignements of random variables is not making the
> code any more readable."
>
> > + } else
> > + bw = val;
>
> Is still missing parentheses
>
> > + /* save the recent bw in fifo */
> > + mbm_fifo_in(rmid, bw, read_mbm_local);
> > + /* save the current sample */
> > + mbm_current->index++;
> > + mbm_current->cum_avg = val;
> > + mbm_current->bytes = tmp;
> > + mbm_current->prev_time = ktime_set(0,
> > + (unsigned
> long)ktime_to_ns(cur_time));
> > +
> > + return val;
> > + }
> > + return cma;
> > +}
>
> > +static void __intel_cqm_event_total_bw_count(void *info) {
> > + struct rmid_read *rr = info;
> > + u64 val;
> > +
> > + val = __rmid_read_mbm(rr->rmid, false);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return;
> > + atomic64_add(val, &rr->value);
> > +}
> > +
> > +static void __intel_cqm_event_local_bw_count(void *info) {
> > + struct rmid_read *rr = info;
> > + u64 val;
> > +
> > + val = __rmid_read_mbm(rr->rmid, true);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return;
> > + atomic64_add(val, &rr->value);
> > +}
>
> And once more:
>
> "You're really a fan of copy and paste."
>
> > +
> > static void __intel_cqm_event_count(void *info) {
> > struct rmid_read *rr = info;
> > @@ -975,7 +1316,21 @@ static u64 intel_cqm_event_count(struct
> perf_event *event)
> > if (!__rmid_valid(rr.rmid))
> > goto out;
> >
> > - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
> &rr, 1);
> > + switch (event->attr.config) {
> > + case QOS_L3_OCCUP_EVENT_ID:
> > + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_count, &rr, 1);
> > + break;
> > + case QOS_MBM_TOTAL_EVENT_ID:
> > + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_total_bw_count,
>
> If you make the function names a little bit longer then this might need 3 lines
> and further enhance unreadability.
>
> > static void intel_cqm_event_start(struct perf_event *event, int mode)
> > {
> > struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); @@
> > -1003,19 +1391,45 @@ static void intel_cqm_event_start(struct perf_event
> *event, int mode)
> > }
> >
> > state->rmid = rmid;
> > + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
> > + struct cqm_rmid_entry *entry;
> > +
> > + entry = __rmid_entry(rmid);
> > + }
> > wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
> > +
> > + if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) &&
> (cqm_mbm)) {
> > + int cpu = smp_processor_id();
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
>
> So you got rid of get_cpu(), but what's wrong with this_cpu_read() ?
>
> > @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct
> perf_event *event, int mode)
> > } else {
> > WARN_ON_ONCE(!state->rmid);
> > }
> > +
> > + if (pmu) {
> > + if (pmu->n_active > 0) {
>
> What's the purpose of this check? In the previous version there was a
> WARN_ON(), which made sense. Did it trigger and you decided to "work"
> around it?
>
> > + pmu->n_active--;
>
> > + if (pmu->n_active == 0)
> > + mbm_stop_hrtimer(pmu);
> > + if (!list_empty(&event->active_entry))
> > + list_del(&event->active_entry);
>
> These four lines are at the wrong indentation level.
>
> > + }
> > + }
> > +
> > }
>
> > +#if MBM_SCALING_FACTOR2 == 1000
>
> And how does MBM_SCALING_FACTOR2 change magically?
>
> > +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit,
> > +"MB/sec"); EVENT_ATTR_STR(llc_local_bw.unit,
> > +intel_cqm_llc_local_bw_unit, "MB/sec"); #else
>
> So any value != 1000 results in KB/sec. Interesting math.
>
> > +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit,
> > +"KB/sec"); EVENT_ATTR_STR(llc_local_bw.unit,
> > +intel_cqm_llc_local_bw_unit, "KB/sec"); #endif
>
> > +static ssize_t
> > +sliding_window_size_store(struct device *dev,
> > + struct device_attribute *attr,
> > + const char *buf, size_t count)
> > +{
> > + unsigned int bytes;
> > + int ret;
> > +
> > + ret = kstrtouint(buf, 0, &bytes);
> > + if (ret)
> > + return ret;
> > +
> > + mutex_lock(&cache_mutex);
> > + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> > + mbm_window_size = bytes;
>
> So, it's valid to set the window to X where 0 < X < MBM_FIFO_SIZE_MIN.
> What's the actual purpose of MBM_FIFO_SIZE_MIN?
>
> > + mutex_unlock(&cache_mutex);
> > +
> > + return count;
> > +}
>
> > -static void intel_cqm_cpu_prepare(unsigned int cpu)
> > +static int intel_cqm_cpu_prepare(unsigned int cpu)
> > {
> > struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
> > struct cpuinfo_x86 *c = &cpu_data(cpu);
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> >
> > state->rmid = 0;
> > state->closid = 0;
> > @@ -1266,12 +1779,27 @@ static void intel_cqm_cpu_prepare(unsigned
> int
> > cpu)
> >
> > WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
> > WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
> > +
> > + if ((!pmu) && (cqm_mbm)) {
> > + pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL,
> NUMA_NO_NODE);
> > + if (!pmu)
> > + return -ENOMEM;
> > + INIT_LIST_HEAD(&pmu->active_list);
> > + pmu->pmu = &intel_cqm_pmu;
> > + pmu->n_active = 0;
>
> It's already zero.
>
> > + pmu->timer_interval =
> ms_to_ktime(MBM_TIME_DELTA_MAX);
> > + per_cpu(mbm_pmu, cpu) = pmu;
> > + per_cpu(mbm_pmu_to_free, cpu) = NULL;
>
> What's the point of this? If there is still something to be free'd its leaked.
> Otherwise that's redundant.
>
> > + mbm_hrtimer_init(pmu);
> > + }
> > + return 0;
>
> s/0/NOTIFY_OK/ because you return that value directly.
>
> > }
> >
> > static void intel_cqm_cpu_exit(unsigned int cpu) {
> > int phys_id = topology_physical_package_id(cpu);
> > int i;
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> >
> > /*
> > * Is @cpu a designated cqm reader?
> > @@ -1288,6 +1816,36 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
> > break;
> > }
> > }
> > +
> > + /* cancel overflow polling timer for CPU */
> > + if (pmu)
> > + mbm_stop_hrtimer(pmu);
> > + kfree(mbm_local);
> > + kfree(mbm_total);
>
> Again you insist to ignore a review comment:
>
> "So this frees the module global storage if one cpu is unplugged and
> leaves the pointer initialized so the rest of the code can happily
> dereference it."
>
> Oh, well!
>
> > +}
> > +
> > +static void mbm_cpu_kfree(int cpu)
> > +{
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu_to_free, cpu);
> > +
> > + kfree(pmu);
> > +
> > + per_cpu(mbm_pmu_to_free, cpu) = NULL; }
> > +
> > +static int mbm_cpu_dying(int cpu)
> > +{
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
>
> this_cpu_read(). It's called on the dying CPU.
>
> > +static const struct x86_cpu_id intel_mbm_match[] = {
> > + { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_MBM_LOCAL },
> > + {}
> > +};
> > +
> > static int __init intel_cqm_init(void) {
> > char *str, scale[20];
> > - int i, cpu, ret;
> > + int i, cpu, ret, array_size;
> >
> > - if (!x86_match_cpu(intel_cqm_match))
> > + if (!x86_match_cpu(intel_cqm_match) &&
> > + (!x86_match_cpu(intel_mbm_match)))
> > return -ENODEV;
> >
> > cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> > + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> > +
> > + if (x86_match_cpu(intel_cqm_match)) {
> > + cqm_llc_occ = true;
> > + intel_cqm_events_group.attrs = intel_cmt_events_attr;
> > + }
> > +
> > + if (x86_match_cpu(intel_mbm_match)) {
> > + u32 mbm_scale_rounded = 0;
> > +
> > + cqm_mbm = true;
> > + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> > + /*
> > + * MBM counter values are in bytes. To conver this to KB/sec
> > + * or MB/sec, we scale the MBM scale factor by 1000.
> Another
> > + * MBM_SCALING_FACTOR2 factor scaling down is done
> > + * after reading the counter val i.e. in the function
> > + * __rmid_read_mbm()
> > + */
> > + mbm_scale_rounded = DIV_ROUND_UP(cqm_l3_scale,
> > + MBM_SCALING_FACTOR);
> > + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> > + snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
> > + str = kstrdup(scale, GFP_KERNEL);
> > + if (!str) {
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > +
> > + if (cqm_llc_occ)
> > + intel_cqm_events_group.attrs =
> > + intel_cmt_mbm_events_attr;
> > + else
> > + intel_cqm_events_group.attrs =
> intel_mbm_events_attr;
> > +
> > + event_attr_intel_cqm_llc_local_bw_scale.event_str = str;
> > + event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
> > +
> > + array_size = (cqm_max_rmid + 1) * MBM_SOCKET_MAX;
> > + mbm_local = kzalloc_node(sizeof(struct sample) * array_size,
> > + GFP_KERNEL, NUMA_NO_NODE);
> > + if (!mbm_local) {
> > + ret = -ENOMEM;
>
> Still leaks str.
>
> > + goto out_free;
> > + }
> > + mbm_total = kzalloc_node(sizeof(struct sample) *
> array_size,
> > + GFP_KERNEL, NUMA_NO_NODE);
> > + if (!mbm_total) {
> > +
> ret = -ENOMEM;
> Ditto
>
> > + goto out_free;
> > + }
>
> I asked you to put the above into a separate function, which saves you one
> indentation level and makes the init function readable, but readability is not
> on your agenda, right?
>
> > + } else
> > + cqm_mbm = false;
>
> Still a completely useless exercise.
>
> >
> > /*
> > * It's possible that not all resources support the same number @@
> > -1336,44 +1961,48 @@ static int __init intel_cqm_init(void)
> > */
> > cpu_notifier_register_begin();
> >
> > - for_each_online_cpu(cpu) {
> > - struct cpuinfo_x86 *c = &cpu_data(cpu);
> > + if (cqm_llc_occ) {
> > + for_each_online_cpu(cpu) {
> > + struct cpuinfo_x86 *c = &cpu_data(cpu);
> >
> > - if (c->x86_cache_max_rmid < cqm_max_rmid)
> > - cqm_max_rmid = c->x86_cache_max_rmid;
> > + if (c->x86_cache_max_rmid < cqm_max_rmid)
> > + cqm_max_rmid = c->x86_cache_max_rmid;
> >
> > - if (c->x86_cache_occ_scale != cqm_l3_scale) {
> > - pr_err("Multiple LLC scale values, disabling\n");
> > - ret = -EINVAL;
> > - goto out;
> > + if (c->x86_cache_occ_scale != cqm_l3_scale) {
> > + pr_err("Multiple LLC scale values,
> disabling\n");
> > + ret = -EINVAL;
> > + goto out;
> > + }
> > }
> > - }
> >
> > - /*
> > - * A reasonable upper limit on the max threshold is the number
> > - * of lines tagged per RMID if all RMIDs have the same number of
> > - * lines tagged in the LLC.
> > - *
> > - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> > - */
> > - __intel_cqm_max_threshold =
> > + /*
> > + * A reasonable upper limit on the max threshold is the
> number
> > + * of lines tagged per RMID if all RMIDs have the same
> number of
> > + * lines tagged in the LLC.
> > + *
> > + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> > + */
> > + __intel_cqm_max_threshold =
> > boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid +
> 1);
> >
> > - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> > - str = kstrdup(scale, GFP_KERNEL);
> > - if (!str) {
> > - ret = -ENOMEM;
> > - goto out;
> > - }
> > + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> > + str = kstrdup(scale, GFP_KERNEL);
> > + if (!str) {
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> >
> > - event_attr_intel_cqm_llc_scale.event_str = str;
> > + event_attr_intel_cqm_llc_scale.event_str = str;
>
> Seperate function as well. And this splitout wants to be a separate patch.
>
> > + }
> >
> > ret = intel_cqm_setup_rmid_cache();
> > if (ret)
> > goto out;
>
> Still leaks str and mbm_local
>
> > for_each_online_cpu(i) {
> > - intel_cqm_cpu_prepare(i);
> > + ret = intel_cqm_cpu_prepare(i);
> > + if (ret)
> > + goto out_free;
>
> Still leaks str and leaves more half initialized stuff around.
>
> I.e., what happens if you prepared 5 cpus and the 6th fails? You free
> mbm_local and mbm_total, and how is the other preparation of the cpus
> undone?
>
> FYI, this is not the way a review cycle works. Either you address the
> comments or you reply to the comment and explain why you think that your
> code is correct. Just ignoring it does not work as you might have noticed.
>
> I really don't care how much time you waste on this, but I pretty much care
> about the time I waste.
>
> Please take your time and address all points before you post the next
> version. You have plenty of it as it's not going into 4.3 by any means.
>
> Yours grumpy
>
> tglx
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-08-20 21:54 ` Juvva, Kanaka D
@ 2015-08-21 12:30 ` Thomas Gleixner
0 siblings, 0 replies; 12+ messages in thread
From: Thomas Gleixner @ 2015-08-21 12:30 UTC (permalink / raw)
To: Juvva, Kanaka D
Cc: Kanaka Juvva, Williamson, Glenn P, Fleming, Matt, Auld, Will,
Andi Kleen, LKML, Luck, Tony, Peter Zijlstra, Tejun Heo,
x86@kernel.org, Ingo Molnar, H. Peter Anvin, Shivappa, Vikas
On Thu, 20 Aug 2015, Juvva, Kanaka D wrote:
> Hi Thomas,
Please do not top post and trim your replies proper.
> Acknowledged. Perhaps some discussions are required in terms of
> your questions and our solutions.
Fine with me.
> Mostly nothing was unresolved except some new things that are
> brought up for v3.
By some weird definition of mostly. You picked random items, you did
not answer any of the questions I asked and you ignored all requests
which involve a little bit more work. You even left the obvious bugs I
pointed out unresolved.
> Seems like some clarification are required for upstream.
It seems there is some clarification required about how (or how not to
work) with upstream.
Thanks,
tglx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
[not found] ` <06033C969873E840BDE9FC9B584F66B51944F51B@ORSMSX116.amr.corp.intel.com>
@ 2015-09-08 11:47 ` Matt Fleming
2015-09-08 16:11 ` Juvva, Kanaka D
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Matt Fleming @ 2015-09-08 11:47 UTC (permalink / raw)
To: Juvva, Kanaka D
Cc: Thomas Gleixner, Kanaka Juvva, Williamson, Glenn P, Auld, Will,
Andi Kleen, LKML, Luck, Tony, Peter Zijlstra, Tejun Heo,
x86@kernel.org, Ingo Molnar, H. Peter Anvin, Shivappa, Vikas
On Mon, 2015-09-07 at 20:22 +0100, Juvva, Kanaka D wrote:
> Hi Thomas,
>
> I'm sending updated patch(s). I have given details for each of
> these items below.
>
Kanaka, this email is HTML formatted and so has been blocked by
vger.kernel.org where the linux-kernel mailing list is hosted.
Please configure outlook not to send html email, or use a different mail
agent for working with upstream.
> Regards,
> -Kanaka
>
> > -----Original Message-----
> > From: Thomas Gleixner [mailto:tglx@linutronix.de]
> > Sent: Wednesday, August 19, 2015 1:50 PM
> > To: Kanaka Juvva
> > Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will;
> Andi Kleen;
> > LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; x86@kernel.org; Ingo
> Molnar; H.
> > Peter Anvin; Shivappa, Vikas
> > Subject: Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth
> Monitoring
> > (MBM) PMU
> >
> > On Fri, 7 Aug 2015, Kanaka Juvva wrote:
> > > +#define MBM_CNTR_MAX 0xffffff
> > > +#define MBM_SOCKET_MAX 8
> > > +#define MBM_TIME_DELTA_MAX 1000
> > > +#define MBM_TIME_DELTA_MIN 100
> >
> > What are these constants for and how are they determined? Pulled out of thin
> > air?
> >
>
> /*
> * MBM Counter is 24bits wide. MBM_CNTR_MAX defines max counter
> * value
> */
> #define MBM_CNTR_MAX 0xffffff
> /*
> * Max #sockets supported
> */
> #define MBM_SOCKET_MAX 8
This seems like a constant we could get by without. Do we really need to
know this at compile time?
> /*
> * Expected time interval between consecutive MSR reads for a given rmid
> */
> #define MBM_TIME_DELTA_MAX 1000
"max" and "expected" are not the same thing.
> > > #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> > > +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> > > +#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> > > +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
> >
> > So we have ID values which are built with (1 << X) and then this HW variant in the
> > middle with 0x3. Of course without any explanation what the heck this stuff is.
> >
> > Last review:
> >
> > "So this wants a descriptive ID name and a comment."
> >
> >
>
> /*
> * MBM Event IDs as defined in SDM section 17.14.6
> * Event IDs used to program MSRs for reading counters
> */
> #define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> #define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> /*
> * Perf needs event id to be 1 << x, hence we can't use 0x3 (HW EVENT ID)
> * for MBM_LOCAL_EVENT we use next 1 << x for MBM_LOCAL_EVENT_ID
> */
> #define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
No, perf events do not need to be of the form (1 << X), that was just a
convention we used in the cqm code before we knew what values the MBM
events would take - you can change these to be whatever format you want,
but be sure to make it consistent.
The constants are very much supposed to be programmed into the MSRs,
take a look at __rmid_read().
I would suggest (as I already did privately) that you change the format
to be 0x0x for all of these event IDs.
> > > @@ static bool intel_cqm_sched_in_event(u32 rmid)
> > > return false;
> > > }
> > >
> > > +
> > > +static u32 bw_sum_calc(struct sample *bw_stat, int rmid) {
> > > + u32 val = 0, i, j, index;
> > > +
> > > + if (++bw_stat->fifoout >= mbm_window_size)
> > > + bw_stat->fifoout = 0;
> > > + index = bw_stat->fifoout;
> > > + for (i = 0; i < mbm_window_size - 1; i++) {
> > > + if (index + i >= mbm_window_size)
> > > + j = index + i - mbm_window_size;
> > > + else
> > > + j = index + i;
> > > + val += bw_stat->mbmfifo[j];
> > > + }
> >
> > This math wants a explanatory comment.
> >
> /*
> * Slide the window by 1 and calculate the sum of the last
> * mbm_window_size-1 bandwidth values.
> * fifoout is the current position of the window.
> * Increment the fifoout by 1 to slide the window by 1.
> *
> * Calcalute the bandwidth from ++fifiout to ( ++fifoout + mbm_window_size -1)
> * e.g.fifoout =1; Bandwidth1 Bandwidth2 ..... Bandwidthn are the
> * sliding window values where n is size of the sliding window
> * bandwidth sum: val = Bandwidth2 + Bandwidth3 + .. Bandwidthn
> */
Instead of these large comment blocks please comment smaller,
logically-connected chunks of code, e.g.
/* Slide the window by one */
if (++bw_stat->fifoout >= mbm_window_size)
bw_stat->fifoout = 0;
/*
* Calculate the sum of last mbm_window_size-1 values.
*/
for (i = 0; i < mbm_window_size - 1; i++) {
/* Handle wraparound at end of window */
if (index + i >= mbm_window_size)
j = index + i - mbm_window_size;
else
j = index + i;
val += bw_stat->mbminfo[j];
}
>
>
> > > + return val;
> > > +}
> > > +
> > > +static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw) {
> > > + if (is_localbw)
> > > + return bw_sum_calc(&mbm_local[rmid], rmid);
> > > + else
> > > + return bw_sum_calc(&mbm_total[rmid], rmid); }
> > > +
> > > +static void __mbm_fifo_in(struct sample *bw_stat, u32 val) {
> > > + bw_stat->mbmfifo[bw_stat->fifoin] = val;
> > > + if (++bw_stat->fifoin >= mbm_window_size)
> >
> > How does that become greater than mbm_windowsize?
> >
>
> This is fixed by changing >= to ==
> Added a comment:
>
> /*
> * store current sample's bw value in sliding window at the
> * index fifoin. Increment fifoin. Check if fifoin has reached
> * max_window_size. If yes reset it to begining i.e. zero
> * e.g.
> * mbm_window_size = 10
> * mbmfifo is a circular fifo 0 1 2 3 4 5 6 7 8 9 10
> * ^ |
> * | |
> * | _ _ _ _ _ _ _ _ _ _|
> *
> * So when fifoin becomes 10, then it is reset to zero
> *
> */
I'm not sure that this comment adds anything of value that isn't already
understood by reading the code. I don't think you need a comment for
this function, it seems pretty straight forward and Thomas' question was
about the boundary limits of ->fifoin.
> > > + bw_stat->fifoin = 0;
> > > +}
> >
> > > +/*
> > > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and
> > > +reads
> > > + * its MSR counter. Check whether overflow occurred and handles it.
> > > +Calculates
> > > + * currenet BW and updates running average.
> >
> > currenet? And please get rid of the double spaces
> >
> This is fixed now. Here is the updated comment:
>
> /*
> * rmid_read_mbm checks whether it is LOCAL or Total MBM event and reads
> * its MSR counter. Check whether overflow occured and handles it. Calculates
> * currenet BW and updates running average.
> *
^^^ You've still misspelled current.
>
> * Overflow Handling:
> * if (MSR current value < MSR previous value) it is an
> * overflow. MSR values are increasing when bandwidth consumption for the thread
> * is non-zero; When MSR values reaches MAX_COUNTER_VALUE it overflows. After overflow,
> * MSR current value goes back to zero and starts increasing again at the rate of
> * bandwidth.
> *
You don't need to provide a definition of "overflow", most people will
be familiar with it. What is more important to document is how the
overflow is handled...
>
> * Overflow handling:
> * Detect an overflow : current read value > last read value
Isn't this inverted? Overflow occurred if current < previous.
>
> * Overflow correction: if (overflow)
> * Current value = (MAX_COUNTER_VALUE - prev read value) + current read value
> * else
> * Current value = current read value
> *
Please don't write pseudocode in the comments. Use English prose to
describe the important parts of the code.
>
> * Calculation of Current Bandwidth value:
> * If MSR is read within last 100ms, then then the smaple is ignored;
> * If the MSR was Read with in last 100ms, why incur an extra overhead
> * of doing the MSR reads again. Anyway there'll be a negligible change or zero
> * change in MSR readings in 100ms.
> *
> * Bandwidth is calculated as:
> * memory bandwidth = difference of last two msr counter values/time difference.
> *
> * cum_avg = Running Average bandwidth of last 'n' bandwidth values for
> * the samples that are processed
> *
Where 'n' is 'mbm_window_size' ? If so, please use 'mbm_window_size',
not 'n'.
>
> * Sliding window is used to save the last 'n' samples. Where,
> * n = sliding_window_size and results in sliding window duration of 'n' secs.
Hmm... this confuses me a lot. Is 'n' a size or a duration? The two are
not the same thing.
>
> * The sliding window size by default set to
> * MBM_FIFO_SIZE_MIN. User can configure it to the values in the range
> * (MBM_FIFO_SIZE_MIN,MBM_FIFO_SIZE_MAX). The range for sliding window
> * is chosen based on a general criteria for monitoring duration. Example
> * for a short lived application, 10sec monitoring period gives
> * good characterization of its bandwidth consumption. For an application
> * that runs for longer duration, 300sec monitoring period gives better
> * characterization of its bandwidth consumption. Since the running average
> * calculated for total monitoring period, user gets the most accuracate
> * average bandwidth for the each monitoring period.
> *
> * Scaling:
> * cum_avg is the raw bandwidth is Bytes/sec.
> * cum_avg is converted to MB/sec by applying MBM_CONVERSION_FACTOR and
> * rounded to nearest integer. User interface gets the Bandwidth values in MB/sec.
> *
> */
> > > + *
> > > + * Overflow Handling:
> > > + * if (MSR current value < MSR previous value) it is an
> > > + * overflow. and overflow is handled.
> >
> > Wow. That's informative as hell!
> >
> Please look at the modified comment above
> > > + *
> > > + * Calculation of Current BW value:
> >
> > BW == Body Weight?
> >
>
> It is fixed now
>
> > > + * If MSR is read within last 100ms, then the value is ignored;
> > > + * this will suppress small deltas. We don't process MBM samples that
> > > + are
> > > + * within 100ms.
> >
> > WHY?
> >
> Explained in the comment. If mbm_read is called within in 100ms for the same rmid, we don’t
> have to process the sample.
The key piece of information you're missing here is that skipping these
small deltas is an optimization, because we avoid performing costly
operations for what would likely be a very minor change in the MBM data,
right?
> > > +{
> > > + u64 val, tmp, diff_time, cma, bytes, index;
> > > + bool overflow = false, first = false;
> > > + ktime_t cur_time;
> > > + u32 tmp32 = rmid;
> > > + struct sample *mbm_current;
> > > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > > + cqm_max_rmid + rmid;
> > > +
> > > + rmid = vrmid;
> >
> > From my previous review:
> >
> > "This is completely backwards.
> >
> > tmp32 = rmid;
> > rmid = vrmid;
> > do_stuff(rmid);
> > rmid = tmp32;
> > do_other_stuff(rmid);
> >
> > Why can't you use vrmid for do_stuff() and leave rmid alone? Just
> > because it would make the code simpler to read?"
> >
> > Still applies.
> >
>
> This is now changed to
> u64 val, currentmsr, currentbw, diff_time, cma, bytes, index;
> bool overflow = false, first = false;
> ktime_t cur_time;
> u32 tmp32 = rmid, eventid;
> struct sample *mbm_current;
> u32 vrmid = rmid_2_index(rmid);
>
> rmid = vrmid;
> cur_time = ktime_get();
> if (read_mbm_local) {
> mbm_current = &mbm_local[vrmid];
> eventid = QOS_MBM_LOCAL_EVENT_ID_HW;
> wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID_HW, rmid);
You don't need to perform this wrmsr() here because it's taken care of
in the common code below.
> } else {
> mbm_current = &mbm_total[vrmid];
> eventid = QOS_MBM_TOTAL_EVENT_ID;
> }
> rmid = tmp32;
Why did you assign rmid to vrmid if you reassign it before it was used?
> > > + /* if current msr value < previous msr value , it means overflow */
> > > + if (val < bytes) {
> > > + val = MBM_CNTR_MAX - bytes + val;
> > > + overflow = true;
> > > + } else
> > > + val = val - bytes;
> > > +
> > > + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
> > > +
> > > + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> > > + /* First sample */
> > > + first = true;
> > > +
> > > + rmid = vrmid;
> >
> > And another time:
> >
> > "More obfuscation"
> >
>
> /*
> * MBM_TIME_DELTA_MAX is picked as per MBM specs. As specified in Intel Platform
> * Quality of Service Monitoring Implementer's Guide V1, Section 2.7.2. page 21,
> * overflow can occur maximum once in a second. So latest we want to read the MSR
> * counters is 1000ms. If it is less than 1000ms we can ignore the sample. Then we
> * decide since when we should ignore. If the MSR was Read with in last 100ms, why
> * process the MSR reads again. Anyway there'll be small change or zero change.
> * So ignoring MSR Reads within 100ms or less is efficient. MBM_TIME_DELTA_MIN
> * is specified as 100ms as per this guideline.
> *
> */
I suspect the document you're referring to above is only available under
NDA, which makes it unsuitable for mention in the kernel source since a
large number of people won't have access to it.
Just explain that the way the hardware is designed puts an upper limit
on how quickly the counter can overflow, which is once per second.
> > > +static void __intel_cqm_event_total_bw_count(void *info) {
> > > + struct rmid_read *rr = info;
> > > + u64 val;
> > > +
> > > + val = __rmid_read_mbm(rr->rmid, false);
> > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > > + return;
> > > + atomic64_add(val, &rr->value);
> > > +}
> > > +
> > > +static void __intel_cqm_event_local_bw_count(void *info) {
> > > + struct rmid_read *rr = info;
> > > + u64 val;
> > > +
> > > + val = __rmid_read_mbm(rr->rmid, true);
> > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > > + return;
> > > + atomic64_add(val, &rr->value);
> > > +}
> >
> > And once more:
> >
> > "You're really a fan of copy and paste."
> >
>
> These functions are invoked indirectly. They were written keeping intel_cqm_event_count in mind.
> I’ll change the arg to struct mbm_read{
> struct rmid_read *rr;
> u32 eventid;
> };
> Intel_cqm_event_*_bw_count(….) needs eventid to call for decoding
No, please do not duplicate the rmid_read structure, that is not an
improvement, we don't need two different structs for reading the read
data.
Please add the event field to the existing struct rmid_read.
> > > @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct
> > perf_event *event, int mode)
> > > } else {
> > > WARN_ON_ONCE(!state->rmid);
> > > }
> > > +
> > > + if (pmu) {
> > > + if (pmu->n_active > 0) {
> >
> > What's the purpose of this check? In the previous version there was a
> > WARN_ON(), which made sense. Did it trigger and you decided to "work"
> > around it?
> >
>
> We actually meant to check if there are active events
I don't follow this answer. Are you saying that the WARN_ON() doesn't
make sense here?
>
> > > +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit,
> > > +"KB/sec"); EVENT_ATTR_STR(llc_local_bw.unit,
> > > +intel_cqm_llc_local_bw_unit, "KB/sec"); #endif
> >
> > > +static ssize_t
> > > +sliding_window_size_store(struct device *dev,
> > > + struct device_attribute *attr,
> > > + const char *buf, size_t count)
> > > +{
> > > + unsigned int bytes;
> > > + int ret;
> > > +
> > > + ret = kstrtouint(buf, 0, &bytes);
> > > + if (ret)
> > > + return ret;
> > > +
> > > + mutex_lock(&cache_mutex);
> > > + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> > > + mbm_window_size = bytes;
> >
> > So, it's valid to set the window to X where 0 < X < MBM_FIFO_SIZE_MIN.
> > What's the actual purpose of MBM_FIFO_SIZE_MIN?
> >
> This is changed to
> if (bytes >= MBM_FIFO_SIZE_MIN && bytes <= MBM_FIFO_SIZE_MAX)
> mbm_window_size = bytes;
Note that if the user passes a value outside of this range you should be
returning -EINVAL to indicate that.
> > > + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> > > + per_cpu(mbm_pmu, cpu) = pmu;
> > > + per_cpu(mbm_pmu_to_free, cpu) = NULL;
> >
> > What's the point of this? If there is still something to be free'd its leaked.
> > Otherwise that's redundant.
> per_cpu(mbm_pmu_to_free, cpu) = NULL; is removed
>
> > > + mbm_hrtimer_init(pmu);
> > > + }
> > > + return 0;
> >
> > s/0/NOTIFY_OK/ because you return that value directly.
> >
> You mean I return the ‘return code’
?
You should be using NOTIFY_OK here so that you follow the notifier API
convention.
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-09-08 11:47 ` Matt Fleming
@ 2015-09-08 16:11 ` Juvva, Kanaka D
2015-09-08 17:06 ` Juvva, Kanaka D
2015-09-08 19:57 ` [PATCH v3 1/2] perf, x86: " kanaka.d.juvva
2 siblings, 0 replies; 12+ messages in thread
From: Juvva, Kanaka D @ 2015-09-08 16:11 UTC (permalink / raw)
To: Fleming, Matt
Cc: Thomas Gleixner, Kanaka Juvva, Williamson, Glenn P, Auld, Will,
Andi Kleen, LKML, Luck, Tony, Peter Zijlstra, Tejun Heo,
x86@kernel.org, Ingo Molnar, H. Peter Anvin, Shivappa, Vikas
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 19478 bytes --]
Hi Matt,
This is regarding Event IDs for perf event:
There are two aspects:
1) Programming MSRs
2) EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x04");
1 is used for programming MSRs
2 event attribute for perf
For MBM_LOCAL_EVENT HW ID is 0x3. We don't want to use 0x3 for EVENT ATTR.
If we use 0x3 for event_attribute
We can't clearly distinguish whether is EVENT 01 & EVENT 02 or EVENT 03 alone.
For perf event attribute it has to be 0x04. Because 0x01 and 0x02 are used for other tow
Thanks,
-Kanaka
> -----Original Message-----
> From: Fleming, Matt
> Sent: Tuesday, September 8, 2015 4:47 AM
> To: Juvva, Kanaka D
> Cc: Thomas Gleixner; Kanaka Juvva; Williamson, Glenn P; Auld, Will; Andi Kleen;
> LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; x86@kernel.org; Ingo Molnar; H.
> Peter Anvin; Shivappa, Vikas
> Subject: Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring
> (MBM) PMU
>
> On Mon, 2015-09-07 at 20:22 +0100, Juvva, Kanaka D wrote:
> > Hi Thomas,
> >
> > I'm sending updated patch(s). I have given details for each of
> > these items below.
> >
>
> Kanaka, this email is HTML formatted and so has been blocked by
> vger.kernel.org where the linux-kernel mailing list is hosted.
>
> Please configure outlook not to send html email, or use a different mail agent
> for working with upstream.
>
> > Regards,
> > -Kanaka
> >
> > > -----Original Message-----
> > > From: Thomas Gleixner [mailto:tglx@linutronix.de]
> > > Sent: Wednesday, August 19, 2015 1:50 PM
> > > To: Kanaka Juvva
> > > Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will;
> > Andi Kleen;
> > > LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; x86@kernel.org; Ingo
> > Molnar; H.
> > > Peter Anvin; Shivappa, Vikas
> > > Subject: Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth
> > Monitoring
> > > (MBM) PMU
> > >
> > > On Fri, 7 Aug 2015, Kanaka Juvva wrote:
> > > > +#define MBM_CNTR_MAX 0xffffff
> > > > +#define MBM_SOCKET_MAX 8
> > > > +#define MBM_TIME_DELTA_MAX 1000
> > > > +#define MBM_TIME_DELTA_MIN 100
> > >
> > > What are these constants for and how are they determined? Pulled out
> > > of thin air?
> > >
> >
> > /*
> > * MBM Counter is 24bits wide. MBM_CNTR_MAX defines max counter
> > * value
> > */
> > #define MBM_CNTR_MAX 0xffffff
> > /*
> > * Max #sockets supported
> > */
> > #define MBM_SOCKET_MAX 8
>
> This seems like a constant we could get by without. Do we really need to know
> this at compile time?
>
> > /*
> > * Expected time interval between consecutive MSR reads for a given rmid
> > */
> > #define MBM_TIME_DELTA_MAX 1000
>
> "max" and "expected" are not the same thing.
>
>
> > > > #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> > > > +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> > > > +#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> > > > +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
> > >
> > > So we have ID values which are built with (1 << X) and then this HW
> > > variant in the middle with 0x3. Of course without any explanation what the
> heck this stuff is.
> > >
> > > Last review:
> > >
> > > "So this wants a descriptive ID name and a comment."
> > >
> > >
> >
> > /*
> > * MBM Event IDs as defined in SDM section 17.14.6
> > * Event IDs used to program MSRs for reading counters
> > */
> > #define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> > #define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> > /*
> > * Perf needs event id to be 1 << x, hence we can't use 0x3 (HW EVENT ID)
> > * for MBM_LOCAL_EVENT we use next 1 << x for MBM_LOCAL_EVENT_ID
> > */
> > #define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
>
> No, perf events do not need to be of the form (1 << X), that was just a
> convention we used in the cqm code before we knew what values the MBM
> events would take - you can change these to be whatever format you want, but
> be sure to make it consistent.
>
> The constants are very much supposed to be programmed into the MSRs, take a
> look at __rmid_read().
>
> I would suggest (as I already did privately) that you change the format to be
> 0x0x for all of these event IDs.
>
>
> > > > @@ static bool intel_cqm_sched_in_event(u32 rmid)
> > > > return false;
> > > > }
> > > >
> > > > +
> > > > +static u32 bw_sum_calc(struct sample *bw_stat, int rmid) {
> > > > + u32 val = 0, i, j, index;
> > > > +
> > > > + if (++bw_stat->fifoout >= mbm_window_size)
> > > > + bw_stat->fifoout = 0;
> > > > + index = bw_stat->fifoout;
> > > > + for (i = 0; i < mbm_window_size - 1; i++) {
> > > > + if (index + i >= mbm_window_size)
> > > > + j = index + i - mbm_window_size;
> > > > + else
> > > > + j = index + i;
> > > > + val += bw_stat->mbmfifo[j];
> > > > + }
> > >
> > > This math wants a explanatory comment.
> > >
> > /*
> > * Slide the window by 1 and calculate the sum of the last
> > * mbm_window_size-1 bandwidth values.
> > * fifoout is the current position of the window.
> > * Increment the fifoout by 1 to slide the window by 1.
> > *
> > * Calcalute the bandwidth from ++fifiout to ( ++fifoout + mbm_window_size -
> 1)
> > * e.g.fifoout =1; Bandwidth1 Bandwidth2 ..... Bandwidthn are the
> > * sliding window values where n is size of the sliding window
> > * bandwidth sum: val = Bandwidth2 + Bandwidth3 + .. Bandwidthn
> > */
>
> Instead of these large comment blocks please comment smaller, logically-
> connected chunks of code, e.g.
>
> /* Slide the window by one */
> if (++bw_stat->fifoout >= mbm_window_size)
> bw_stat->fifoout = 0;
>
> /*
> * Calculate the sum of last mbm_window_size-1 values.
> */
> for (i = 0; i < mbm_window_size - 1; i++) {
> /* Handle wraparound at end of window */
> if (index + i >= mbm_window_size)
> j = index + i - mbm_window_size;
> else
> j = index + i;
>
> val += bw_stat->mbminfo[j];
> }
>
> >
> >
> > > > + return val;
> > > > +}
> > > > +
> > > > +static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw) {
> > > > + if (is_localbw)
> > > > + return bw_sum_calc(&mbm_local[rmid], rmid);
> > > > + else
> > > > + return bw_sum_calc(&mbm_total[rmid], rmid); }
> > > > +
> > > > +static void __mbm_fifo_in(struct sample *bw_stat, u32 val) {
> > > > + bw_stat->mbmfifo[bw_stat->fifoin] = val;
> > > > + if (++bw_stat->fifoin >= mbm_window_size)
> > >
> > > How does that become greater than mbm_windowsize?
> > >
> >
> > This is fixed by changing >= to ==
> > Added a comment:
> >
> > /*
> > * store current sample's bw value in sliding window at the
> > * index fifoin. Increment fifoin. Check if fifoin has reached
> > * max_window_size. If yes reset it to begining i.e. zero
> > * e.g.
> > * mbm_window_size = 10
> > * mbmfifo is a circular fifo 0 1 2 3 4 5 6 7 8 9 10
> > * ^ |
> > * | |
> > * | _ _ _ _ _ _ _ _ _ _|
> > *
> > * So when fifoin becomes 10, then it is reset to zero
> > *
> > */
>
> I'm not sure that this comment adds anything of value that isn't already
> understood by reading the code. I don't think you need a comment for this
> function, it seems pretty straight forward and Thomas' question was about the
> boundary limits of ->fifoin.
>
> > > > + bw_stat->fifoin = 0;
> > > > +}
> > >
> > > > +/*
> > > > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event
> > > > +and reads
> > > > + * its MSR counter. Check whether overflow occurred and handles it.
> > > > +Calculates
> > > > + * currenet BW and updates running average.
> > >
> > > currenet? And please get rid of the double spaces
> > >
> > This is fixed now. Here is the updated comment:
> >
> > /*
> > * rmid_read_mbm checks whether it is LOCAL or Total MBM event and reads
> > * its MSR counter. Check whether overflow occured and handles it. Calculates
> > * currenet BW and updates running average.
> > *
>
> ^^^ You've still misspelled current.
> >
> > * Overflow Handling:
> > * if (MSR current value < MSR previous value) it is an
> > * overflow. MSR values are increasing when bandwidth consumption for the
> thread
> > * is non-zero; When MSR values reaches MAX_COUNTER_VALUE it overflows.
> After overflow,
> > * MSR current value goes back to zero and starts increasing again at the rate
> of
> > * bandwidth.
> > *
>
> You don't need to provide a definition of "overflow", most people will be
> familiar with it. What is more important to document is how the overflow is
> handled...
> >
> > * Overflow handling:
> > * Detect an overflow : current read value > last read value
>
> Isn't this inverted? Overflow occurred if current < previous.
>
> >
> > * Overflow correction: if (overflow)
> > * Current value = (MAX_COUNTER_VALUE - prev read value) +
> current read value
> > * else
> > * Current value = current read value
> > *
>
> Please don't write pseudocode in the comments. Use English prose to describe
> the important parts of the code.
>
> >
> > * Calculation of Current Bandwidth value:
> > * If MSR is read within last 100ms, then then the smaple is ignored;
> > * If the MSR was Read with in last 100ms, why incur an extra overhead
> > * of doing the MSR reads again. Anyway there'll be a negligible change or zero
> > * change in MSR readings in 100ms.
> > *
> > * Bandwidth is calculated as:
> > * memory bandwidth = difference of last two msr counter values/time
> difference.
> > *
> > * cum_avg = Running Average bandwidth of last 'n' bandwidth values for
> > * the samples that are processed
> > *
>
> Where 'n' is 'mbm_window_size' ? If so, please use 'mbm_window_size', not 'n'.
> >
> > * Sliding window is used to save the last 'n' samples. Where,
> > * n = sliding_window_size and results in sliding window duration of 'n' secs.
>
> Hmm... this confuses me a lot. Is 'n' a size or a duration? The two are not the
> same thing.
>
> >
> > * The sliding window size by default set to
> > * MBM_FIFO_SIZE_MIN. User can configure it to the values in the range
> > * (MBM_FIFO_SIZE_MIN,MBM_FIFO_SIZE_MAX). The range for sliding window
> > * is chosen based on a general criteria for monitoring duration. Example
> > * for a short lived application, 10sec monitoring period gives
> > * good characterization of its bandwidth consumption. For an application
> > * that runs for longer duration, 300sec monitoring period gives better
> > * characterization of its bandwidth consumption. Since the running average
> > * calculated for total monitoring period, user gets the most accuracate
> > * average bandwidth for the each monitoring period.
> > *
> > * Scaling:
> > * cum_avg is the raw bandwidth is Bytes/sec.
> > * cum_avg is converted to MB/sec by applying MBM_CONVERSION_FACTOR
> and
> > * rounded to nearest integer. User interface gets the Bandwidth values in
> MB/sec.
> > *
> > */
> > > > + *
> > > > + * Overflow Handling:
> > > > + * if (MSR current value < MSR previous value) it is an
> > > > + * overflow. and overflow is handled.
> > >
> > > Wow. That's informative as hell!
> > >
> > Please look at the modified comment above
> > > > + *
> > > > + * Calculation of Current BW value:
> > >
> > > BW == Body Weight?
> > >
> >
> > It is fixed now
> >
> > > > + * If MSR is read within last 100ms, then the value is ignored;
> > > > + * this will suppress small deltas. We don't process MBM samples
> > > > + that are
> > > > + * within 100ms.
> > >
> > > WHY?
> > >
> > Explained in the comment. If mbm_read is called within in 100ms for
> > the same rmid, we donât have to process the sample.
>
> The key piece of information you're missing here is that skipping these small
> deltas is an optimization, because we avoid performing costly operations for
> what would likely be a very minor change in the MBM data, right?
>
>
> > > > +{
> > > > + u64 val, tmp, diff_time, cma, bytes, index;
> > > > + bool overflow = false, first = false;
> > > > + ktime_t cur_time;
> > > > + u32 tmp32 = rmid;
> > > > + struct sample *mbm_current;
> > > > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > > > + cqm_max_rmid + rmid;
> > > > +
> > > > + rmid = vrmid;
> > >
> > > From my previous review:
> > >
> > > "This is completely backwards.
> > >
> > > tmp32 = rmid;
> > > rmid = vrmid;
> > > do_stuff(rmid);
> > > rmid = tmp32;
> > > do_other_stuff(rmid);
> > >
> > > Why can't you use vrmid for do_stuff() and leave rmid alone? Just
> > > because it would make the code simpler to read?"
> > >
> > > Still applies.
> > >
> >
> > This is now changed to
> > u64 val, currentmsr, currentbw, diff_time, cma, bytes, index;
> > bool overflow = false, first = false;
> > ktime_t cur_time;
> > u32 tmp32 = rmid, eventid;
> > struct sample *mbm_current;
> > u32 vrmid = rmid_2_index(rmid);
> >
> > rmid = vrmid;
> > cur_time = ktime_get();
> > if (read_mbm_local) {
> > mbm_current = &mbm_local[vrmid];
> > eventid = QOS_MBM_LOCAL_EVENT_ID_HW;
> > wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID_HW,
> > rmid);
>
> You don't need to perform this wrmsr() here because it's taken care of in the
> common code below.
>
> > } else {
> > mbm_current = &mbm_total[vrmid];
> > eventid = QOS_MBM_TOTAL_EVENT_ID;
> > }
> > rmid = tmp32;
>
> Why did you assign rmid to vrmid if you reassign it before it was used?
>
>
> > > > + /* if current msr value < previous msr value , it means overflow */
> > > > + if (val < bytes) {
> > > > + val = MBM_CNTR_MAX - bytes + val;
> > > > + overflow = true;
> > > > + } else
> > > > + val = val - bytes;
> > > > +
> > > > + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
> > > > +
> > > > + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> > > > + /* First sample */
> > > > + first = true;
> > > > +
> > > > + rmid = vrmid;
> > >
> > > And another time:
> > >
> > > "More obfuscation"
> > >
> >
> > /*
> > * MBM_TIME_DELTA_MAX is picked as per MBM specs. As specified in
> Intel Platform
> > * Quality of Service Monitoring Implementer's Guide V1, Section 2.7.2.
> page 21,
> > * overflow can occur maximum once in a second. So latest we want to
> read the MSR
> > * counters is 1000ms. If it is less than 1000ms we can ignore the sample.
> Then we
> > * decide since when we should ignore. If the MSR was Read with in last
> 100ms, why
> > * process the MSR reads again. Anyway there'll be small change or zero
> change.
> > * So ignoring MSR Reads within 100ms or less is efficient.
> MBM_TIME_DELTA_MIN
> > * is specified as 100ms as per this guideline.
> > *
> > */
>
> I suspect the document you're referring to above is only available under NDA,
> which makes it unsuitable for mention in the kernel source since a large number
> of people won't have access to it.
>
> Just explain that the way the hardware is designed puts an upper limit on how
> quickly the counter can overflow, which is once per second.
>
>
> > > > +static void __intel_cqm_event_total_bw_count(void *info) {
> > > > + struct rmid_read *rr = info;
> > > > + u64 val;
> > > > +
> > > > + val = __rmid_read_mbm(rr->rmid, false);
> > > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > > > + return;
> > > > + atomic64_add(val, &rr->value); }
> > > > +
> > > > +static void __intel_cqm_event_local_bw_count(void *info) {
> > > > + struct rmid_read *rr = info;
> > > > + u64 val;
> > > > +
> > > > + val = __rmid_read_mbm(rr->rmid, true);
> > > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > > > + return;
> > > > + atomic64_add(val, &rr->value); }
> > >
> > > And once more:
> > >
> > > "You're really a fan of copy and paste."
> > >
> >
> > These functions are invoked indirectly. They were written keeping
> intel_cqm_event_count in mind.
> > Iâll change the arg to struct mbm_read{
> > struct rmid_read *rr;
> > u32 eventid;
> > };
> > Intel_cqm_event_*_bw_count(â¦.) needs eventid to call for decoding
>
> No, please do not duplicate the rmid_read structure, that is not an
> improvement, we don't need two different structs for reading the read data.
>
> Please add the event field to the existing struct rmid_read.
>
>
> > > > @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct
> > > perf_event *event, int mode)
> > > > } else {
> > > > WARN_ON_ONCE(!state->rmid);
> > > > }
> > > > +
> > > > + if (pmu) {
> > > > + if (pmu->n_active > 0) {
> > >
> > > What's the purpose of this check? In the previous version there was
> > > a WARN_ON(), which made sense. Did it trigger and you decided to "work"
> > > around it?
> > >
> >
> > We actually meant to check if there are active events
>
> I don't follow this answer. Are you saying that the WARN_ON() doesn't make
> sense here?
>
>
>
> >
> > > > +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit,
> > > > +"KB/sec"); EVENT_ATTR_STR(llc_local_bw.unit,
> > > > +intel_cqm_llc_local_bw_unit, "KB/sec"); #endif
> > >
> > > > +static ssize_t
> > > > +sliding_window_size_store(struct device *dev,
> > > > + struct device_attribute *attr,
> > > > + const char *buf, size_t count) {
> > > > + unsigned int bytes;
> > > > + int ret;
> > > > +
> > > > + ret = kstrtouint(buf, 0, &bytes);
> > > > + if (ret)
> > > > + return ret;
> > > > +
> > > > + mutex_lock(&cache_mutex);
> > > > + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> > > > + mbm_window_size = bytes;
> > >
> > > So, it's valid to set the window to X where 0 < X < MBM_FIFO_SIZE_MIN.
> > > What's the actual purpose of MBM_FIFO_SIZE_MIN?
> > >
> > This is changed to
> > if (bytes >= MBM_FIFO_SIZE_MIN && bytes <= MBM_FIFO_SIZE_MAX)
> > mbm_window_size = bytes;
>
> Note that if the user passes a value outside of this range you should be returning
> -EINVAL to indicate that.
>
>
> > > > + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> > > > + per_cpu(mbm_pmu, cpu) = pmu;
> > > > + per_cpu(mbm_pmu_to_free, cpu) = NULL;
> > >
> > > What's the point of this? If there is still something to be free'd its leaked.
> > > Otherwise that's redundant.
> > per_cpu(mbm_pmu_to_free, cpu) = NULL; is removed
> >
> > > > + mbm_hrtimer_init(pmu);
> > > > + }
> > > > + return 0;
> > >
> > > s/0/NOTIFY_OK/ because you return that value directly.
> > >
> > You mean I return the âreturn codeâ
>
> ?
>
> You should be using NOTIFY_OK here so that you follow the notifier API
> convention.
>
ÿôèº{.nÇ+·®+%Ëÿ±éݶ\x17¥wÿº{.nÇ+·¥{±þG«éÿ{ayº\x1dÊÚë,j\a¢f£¢·hïêÿêçz_è®\x03(éÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?¨èÚ&£ø§~á¶iOæ¬z·vØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?I¥
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-09-08 11:47 ` Matt Fleming
2015-09-08 16:11 ` Juvva, Kanaka D
@ 2015-09-08 17:06 ` Juvva, Kanaka D
2015-09-10 13:58 ` Matt Fleming
2015-09-08 19:57 ` [PATCH v3 1/2] perf, x86: " kanaka.d.juvva
2 siblings, 1 reply; 12+ messages in thread
From: Juvva, Kanaka D @ 2015-09-08 17:06 UTC (permalink / raw)
To: Fleming, Matt
Cc: Thomas Gleixner, Kanaka Juvva, Williamson, Glenn P, Auld, Will,
Andi Kleen, LKML, Luck, Tony, Peter Zijlstra, Tejun Heo,
x86@kernel.org, Ingo Molnar, H. Peter Anvin, Shivappa, Vikas
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 20239 bytes --]
Hi Matt,
Here is a response to all your comments down below.
Thanks,
-Kanaka
> -----Original Message-----
> From: Fleming, Matt
> Sent: Tuesday, September 8, 2015 4:47 AM
> To: Juvva, Kanaka D
> Cc: Thomas Gleixner; Kanaka Juvva; Williamson, Glenn P; Auld, Will; Andi Kleen;
> LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; x86@kernel.org; Ingo Molnar; H.
> Peter Anvin; Shivappa, Vikas
> Subject: Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring
> (MBM) PMU
>
> On Mon, 2015-09-07 at 20:22 +0100, Juvva, Kanaka D wrote:
> > Hi Thomas,
> >
> > I'm sending updated patch(s). I have given details for each of
> > these items below.
> >
>
> Kanaka, this email is HTML formatted and so has been blocked by
> vger.kernel.org where the linux-kernel mailing list is hosted.
>
> Please configure outlook not to send html email, or use a different mail agent
> for working with upstream.
>
Done.
> > Regards,
> > -Kanaka
> >
> > > -----Original Message-----
> > > From: Thomas Gleixner [mailto:tglx@linutronix.de]
> > > Sent: Wednesday, August 19, 2015 1:50 PM
> > > To: Kanaka Juvva
> > > Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will;
> > Andi Kleen;
> > > LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; x86@kernel.org; Ingo
> > Molnar; H.
> > > Peter Anvin; Shivappa, Vikas
> > > Subject: Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth
> > Monitoring
> > > (MBM) PMU
> > >
> > > On Fri, 7 Aug 2015, Kanaka Juvva wrote:
> > > > +#define MBM_CNTR_MAX 0xffffff
> > > > +#define MBM_SOCKET_MAX 8
> > > > +#define MBM_TIME_DELTA_MAX 1000
> > > > +#define MBM_TIME_DELTA_MIN 100
> > >
> > > What are these constants for and how are they determined? Pulled out
> > > of thin air?
> > >
> >
> > /*
> > * MBM Counter is 24bits wide. MBM_CNTR_MAX defines max counter
> > * value
> > */
> > #define MBM_CNTR_MAX 0xffffff
> > /*
> > * Max #sockets supported
> > */
> > #define MBM_SOCKET_MAX 8
>
> This seems like a constant we could get by without. Do we really need to know
> this at compile time?
>
I spoke to Andi Kleen regarding how many sockets. We decided we could keep 8.
> > /*
> > * Expected time interval between consecutive MSR reads for a given rmid
> > */
> > #define MBM_TIME_DELTA_MAX 1000
>
> "max" and "expected" are not the same thing.
>
>
This is being changed to #define MBM_TIME_DELTA_EXP 1000
> > > > #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> > > > +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> > > > +#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> > > > +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
> > >
> > > So we have ID values which are built with (1 << X) and then this HW
> > > variant in the middle with 0x3. Of course without any explanation what the
> heck this stuff is.
> > >
> > > Last review:
> > >
> > > "So this wants a descriptive ID name and a comment."
> > >
> > >
> >
> > /*
> > * MBM Event IDs as defined in SDM section 17.14.6
> > * Event IDs used to program MSRs for reading counters
> > */
> > #define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
> > #define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
> > /*
> > * Perf needs event id to be 1 << x, hence we can't use 0x3 (HW EVENT ID)
> > * for MBM_LOCAL_EVENT we use next 1 << x for MBM_LOCAL_EVENT_ID
> > */
> > #define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
>
> No, perf events do not need to be of the form (1 << X), that was just a
> convention we used in the cqm code before we knew what values the MBM
> events would take - you can change these to be whatever format you want, but
> be sure to make it consistent.
>
> The constants are very much supposed to be programmed into the MSRs, take a
> look at __rmid_read().
>
> I would suggest (as I already did privately) that you change the format to be
> 0x0x for all of these event IDs.
>
>
There are two aspects:
1) Programming MSRs
2) EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x04");
1 is used for programming MSRs
2 event attribute for perf
For MBM_LOCAL_EVENT HW ID is 0x3. We don't want to use 0x3 for EVENT ATTR.
If we use 0x3 for event_attribute
We can't clearly distinguish whether is EVENT 01 & EVENT 02 or EVENT 03 alone.
For perf event attribute it has to be 0x04. Because 0x01 and 0x02 are used for other two events
> > > > @@ static bool intel_cqm_sched_in_event(u32 rmid)
> > > > return false;
> > > > }
> > > >
> > > > +
> > > > +static u32 bw_sum_calc(struct sample *bw_stat, int rmid) {
> > > > + u32 val = 0, i, j, index;
> > > > +
> > > > + if (++bw_stat->fifoout >= mbm_window_size)
> > > > + bw_stat->fifoout = 0;
> > > > + index = bw_stat->fifoout;
> > > > + for (i = 0; i < mbm_window_size - 1; i++) {
> > > > + if (index + i >= mbm_window_size)
> > > > + j = index + i - mbm_window_size;
> > > > + else
> > > > + j = index + i;
> > > > + val += bw_stat->mbmfifo[j];
> > > > + }
> > >
> > > This math wants a explanatory comment.
> > >
> > /*
> > * Slide the window by 1 and calculate the sum of the last
> > * mbm_window_size-1 bandwidth values.
> > * fifoout is the current position of the window.
> > * Increment the fifoout by 1 to slide the window by 1.
> > *
> > * Calcalute the bandwidth from ++fifiout to ( ++fifoout + mbm_window_size -
> 1)
> > * e.g.fifoout =1; Bandwidth1 Bandwidth2 ..... Bandwidthn are the
> > * sliding window values where n is size of the sliding window
> > * bandwidth sum: val = Bandwidth2 + Bandwidth3 + .. Bandwidthn
> > */
>
> Instead of these large comment blocks please comment smaller, logically-
> connected chunks of code, e.g.
>
OK
> /* Slide the window by one */
> if (++bw_stat->fifoout >= mbm_window_size)
> bw_stat->fifoout = 0;
>
> /*
> * Calculate the sum of last mbm_window_size-1 values.
> */
> for (i = 0; i < mbm_window_size - 1; i++) {
> /* Handle wraparound at end of window */
> if (index + i >= mbm_window_size)
> j = index + i - mbm_window_size;
> else
> j = index + i;
>
> val += bw_stat->mbminfo[j];
> }
>
> >
> >
> > > > + return val;
> > > > +}
> > > > +
> > > > +static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw) {
> > > > + if (is_localbw)
> > > > + return bw_sum_calc(&mbm_local[rmid], rmid);
> > > > + else
> > > > + return bw_sum_calc(&mbm_total[rmid], rmid); }
> > > > +
> > > > +static void __mbm_fifo_in(struct sample *bw_stat, u32 val) {
> > > > + bw_stat->mbmfifo[bw_stat->fifoin] = val;
> > > > + if (++bw_stat->fifoin >= mbm_window_size)
> > >
> > > How does that become greater than mbm_windowsize?
> > >
> >
> > This is fixed by changing >= to ==
> > Added a comment:
> >
> > /*
> > * store current sample's bw value in sliding window at the
> > * index fifoin. Increment fifoin. Check if fifoin has reached
> > * max_window_size. If yes reset it to begining i.e. zero
> > * e.g.
> > * mbm_window_size = 10
> > * mbmfifo is a circular fifo 0 1 2 3 4 5 6 7 8 9 10
> > * ^ |
> > * | |
> > * | _ _ _ _ _ _ _ _ _ _|
> > *
> > * So when fifoin becomes 10, then it is reset to zero
> > *
> > */
>
> I'm not sure that this comment adds anything of value that isn't already
> understood by reading the code. I don't think you need a comment for this
> function, it seems pretty straight forward and Thomas' question was about the
> boundary limits of ->fifoin.
>
Thomas question was addressed by the change in if statement i.e '==''
I'll take out the comment.
> > > > + bw_stat->fifoin = 0;
> > > > +}
> > >
> > > > +/*
> > > > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event
> > > > +and reads
> > > > + * its MSR counter. Check whether overflow occurred and handles it.
> > > > +Calculates
> > > > + * currenet BW and updates running average.
> > >
> > > currenet? And please get rid of the double spaces
> > >
> > This is fixed now. Here is the updated comment:
> >
> > /*
> > * rmid_read_mbm checks whether it is LOCAL or Total MBM event and reads
> > * its MSR counter. Check whether overflow occured and handles it. Calculates
> > * currenet BW and updates running average.
> > *
>
> ^^^ You've still misspelled current.
Fixed
> >
> > * Overflow Handling:
> > * if (MSR current value < MSR previous value) it is an
> > * overflow. MSR values are increasing when bandwidth consumption for the
> thread
> > * is non-zero; When MSR values reaches MAX_COUNTER_VALUE it overflows.
> After overflow,
> > * MSR current value goes back to zero and starts increasing again at the rate
> of
> > * bandwidth.
> > *
>
How overflow handling is down below. I'll change this as per the comments
> You don't need to provide a definition of "overflow", most people will be
> familiar with it. What is more important to document is how the overflow is
> handled...
> >
> > * Overflow handling:
> > * Detect an overflow : current read value > last read value
>
> Isn't this inverted? Overflow occurred if current < previous.
>
> >
> > * Overflow correction: if (overflow)
> > * Current value = (MAX_COUNTER_VALUE - prev read value) +
> current read value
> > * else
> > * Current value = current read value
> > *
>
> Please don't write pseudocode in the comments. Use English prose to describe
> the important parts of the code.
>
OK
> >
> > * Calculation of Current Bandwidth value:
> > * If MSR is read within last 100ms, then then the smaple is ignored;
> > * If the MSR was Read with in last 100ms, why incur an extra overhead
> > * of doing the MSR reads again. Anyway there'll be a negligible change or zero
> > * change in MSR readings in 100ms.
> > *
> > * Bandwidth is calculated as:
> > * memory bandwidth = difference of last two msr counter values/time
> difference.
> > *
> > * cum_avg = Running Average bandwidth of last 'n' bandwidth values for
> > * the samples that are processed
> > *
>
> Where 'n' is 'mbm_window_size' ? If so, please use 'mbm_window_size', not 'n'.
OK
> >
> > * Sliding window is used to save the last 'n' samples. Where,
> > * n = sliding_window_size and results in sliding window duration of 'n' secs.
>
> Hmm... this confuses me a lot. Is 'n' a size or a duration? The two are not the
> same thing.
>
Actually they have same values. Anyways I'll change the comment to talk about
duration. That way it will not be a confusion.
> >
> > * The sliding window size by default set to
> > * MBM_FIFO_SIZE_MIN. User can configure it to the values in the range
> > * (MBM_FIFO_SIZE_MIN,MBM_FIFO_SIZE_MAX). The range for sliding window
> > * is chosen based on a general criteria for monitoring duration. Example
> > * for a short lived application, 10sec monitoring period gives
> > * good characterization of its bandwidth consumption. For an application
> > * that runs for longer duration, 300sec monitoring period gives better
> > * characterization of its bandwidth consumption. Since the running average
> > * calculated for total monitoring period, user gets the most accuracate
> > * average bandwidth for the each monitoring period.
> > *
> > * Scaling:
> > * cum_avg is the raw bandwidth is Bytes/sec.
> > * cum_avg is converted to MB/sec by applying MBM_CONVERSION_FACTOR
> and
> > * rounded to nearest integer. User interface gets the Bandwidth values in
> MB/sec.
> > *
> > */
> > > > + *
> > > > + * Overflow Handling:
> > > > + * if (MSR current value < MSR previous value) it is an
> > > > + * overflow. and overflow is handled.
> > >
> > > Wow. That's informative as hell!
> > >
> > Please look at the modified comment above
> > > > + *
> > > > + * Calculation of Current BW value:
> > >
> > > BW == Body Weight?
> > >
> >
> > It is fixed now
> >
> > > > + * If MSR is read within last 100ms, then the value is ignored;
> > > > + * this will suppress small deltas. We don't process MBM samples
> > > > + that are
> > > > + * within 100ms.
> > >
> > > WHY?
> > >
> > Explained in the comment. If mbm_read is called within in 100ms for
> > the same rmid, we donât have to process the sample.
>
> The key piece of information you're missing here is that skipping these small
> deltas is an optimization, because we avoid performing costly operations for
> what would likely be a very minor change in the MBM data, right?
>
>
Yes, You are correct.
> > > > +{
> > > > + u64 val, tmp, diff_time, cma, bytes, index;
> > > > + bool overflow = false, first = false;
> > > > + ktime_t cur_time;
> > > > + u32 tmp32 = rmid;
> > > > + struct sample *mbm_current;
> > > > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > > > + cqm_max_rmid + rmid;
> > > > +
> > > > + rmid = vrmid;
> > >
> > > From my previous review:
> > >
> > > "This is completely backwards.
> > >
> > > tmp32 = rmid;
> > > rmid = vrmid;
> > > do_stuff(rmid);
> > > rmid = tmp32;
> > > do_other_stuff(rmid);
> > >
> > > Why can't you use vrmid for do_stuff() and leave rmid alone? Just
> > > because it would make the code simpler to read?"
> > >
> > > Still applies.
> > >
> >
> > This is now changed to
> > u64 val, currentmsr, currentbw, diff_time, cma, bytes, index;
> > bool overflow = false, first = false;
> > ktime_t cur_time;
> > u32 tmp32 = rmid, eventid;
> > struct sample *mbm_current;
> > u32 vrmid = rmid_2_index(rmid);
> >
> > rmid = vrmid;
> > cur_time = ktime_get();
> > if (read_mbm_local) {
> > mbm_current = &mbm_local[vrmid];
> > eventid = QOS_MBM_LOCAL_EVENT_ID_HW;
> > wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID_HW,
> > rmid);
>
> You don't need to perform this wrmsr() here because it's taken care of in the
> common code below.
>
OK
> > } else {
> > mbm_current = &mbm_total[vrmid];
> > eventid = QOS_MBM_TOTAL_EVENT_ID;
> > }
> > rmid = tmp32;
>
> Why did you assign rmid to vrmid if you reassign it before it was used?
>
>
For MSR writes we use rmid value and for mbm_* arrary we use vrmid which is actual
index.
> > > > + /* if current msr value < previous msr value , it means overflow */
> > > > + if (val < bytes) {
> > > > + val = MBM_CNTR_MAX - bytes + val;
> > > > + overflow = true;
> > > > + } else
> > > > + val = val - bytes;
> > > > +
> > > > + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
> > > > +
> > > > + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> > > > + /* First sample */
> > > > + first = true;
> > > > +
> > > > + rmid = vrmid;
> > >
> > > And another time:
> > >
> > > "More obfuscation"
> > >
> >
> > /*
> > * MBM_TIME_DELTA_MAX is picked as per MBM specs. As specified in
> Intel Platform
> > * Quality of Service Monitoring Implementer's Guide V1, Section 2.7.2.
> page 21,
> > * overflow can occur maximum once in a second. So latest we want to
> read the MSR
> > * counters is 1000ms. If it is less than 1000ms we can ignore the sample.
> Then we
> > * decide since when we should ignore. If the MSR was Read with in last
> 100ms, why
> > * process the MSR reads again. Anyway there'll be small change or zero
> change.
> > * So ignoring MSR Reads within 100ms or less is efficient.
> MBM_TIME_DELTA_MIN
> > * is specified as 100ms as per this guideline.
> > *
> > */
>
> I suspect the document you're referring to above is only available under NDA,
> which makes it unsuitable for mention in the kernel source since a large number
> of people won't have access to it.
>
> Just explain that the way the hardware is designed puts an upper limit on how
> quickly the counter can overflow, which is once per second.
>
>
OK. I'll change this to "as per hardware functionality"
> > > > +static void __intel_cqm_event_total_bw_count(void *info) {
> > > > + struct rmid_read *rr = info;
> > > > + u64 val;
> > > > +
> > > > + val = __rmid_read_mbm(rr->rmid, false);
> > > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > > > + return;
> > > > + atomic64_add(val, &rr->value); }
> > > > +
> > > > +static void __intel_cqm_event_local_bw_count(void *info) {
> > > > + struct rmid_read *rr = info;
> > > > + u64 val;
> > > > +
> > > > + val = __rmid_read_mbm(rr->rmid, true);
> > > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > > > + return;
> > > > + atomic64_add(val, &rr->value); }
> > >
> > > And once more:
> > >
> > > "You're really a fan of copy and paste."
> > >
> >
> > These functions are invoked indirectly. They were written keeping
> intel_cqm_event_count in mind.
> > Iâll change the arg to struct mbm_read{
> > struct rmid_read *rr;
> > u32 eventid;
> > };
> > Intel_cqm_event_*_bw_count(â¦.) needs eventid to call for decoding
>
> No, please do not duplicate the rmid_read structure, that is not an
> improvement, we don't need two different structs for reading the read data.
>
> Please add the event field to the existing struct rmid_read.
>
>
OK
> > > > @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct
> > > perf_event *event, int mode)
> > > > } else {
> > > > WARN_ON_ONCE(!state->rmid);
> > > > }
> > > > +
> > > > + if (pmu) {
> > > > + if (pmu->n_active > 0) {
> > >
> > > What's the purpose of this check? In the previous version there was
> > > a WARN_ON(), which made sense. Did it trigger and you decided to "work"
> > > around it?
> > >
> >
> > We actually meant to check if there are active events
>
> I don't follow this answer. Are you saying that the WARN_ON() doesn't make
> sense here?
>
>
>
I can add WARN_ON. But this will always hit if there are no events.
> >
> > > > +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit,
> > > > +"KB/sec"); EVENT_ATTR_STR(llc_local_bw.unit,
> > > > +intel_cqm_llc_local_bw_unit, "KB/sec"); #endif
> > >
> > > > +static ssize_t
> > > > +sliding_window_size_store(struct device *dev,
> > > > + struct device_attribute *attr,
> > > > + const char *buf, size_t count) {
> > > > + unsigned int bytes;
> > > > + int ret;
> > > > +
> > > > + ret = kstrtouint(buf, 0, &bytes);
> > > > + if (ret)
> > > > + return ret;
> > > > +
> > > > + mutex_lock(&cache_mutex);
> > > > + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> > > > + mbm_window_size = bytes;
> > >
> > > So, it's valid to set the window to X where 0 < X < MBM_FIFO_SIZE_MIN.
> > > What's the actual purpose of MBM_FIFO_SIZE_MIN?
> > >
> > This is changed to
> > if (bytes >= MBM_FIFO_SIZE_MIN && bytes <= MBM_FIFO_SIZE_MAX)
> > mbm_window_size = bytes;
>
> Note that if the user passes a value outside of this range you should be returning
> -EINVAL to indicate that.
>
>
OK
> > > > + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> > > > + per_cpu(mbm_pmu, cpu) = pmu;
> > > > + per_cpu(mbm_pmu_to_free, cpu) = NULL;
> > >
> > > What's the point of this? If there is still something to be free'd its leaked.
> > > Otherwise that's redundant.
> > per_cpu(mbm_pmu_to_free, cpu) = NULL; is removed
> >
> > > > + mbm_hrtimer_init(pmu);
> > > > + }
> > > > + return 0;
> > >
> > > s/0/NOTIFY_OK/ because you return that value directly.
> > >
> > You mean I return the âreturn codeâ
>
> ?
>
> You should be using NOTIFY_OK here so that you follow the notifier API
> convention.
>
OK
ÿôèº{.nÇ+·®+%Ëÿ±éݶ\x17¥wÿº{.nÇ+·¥{±þG«éÿ{ayº\x1dÊÚë,j\a¢f£¢·hïêÿêçz_è®\x03(éÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?¨èÚ&£ø§~á¶iOæ¬z·vØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?I¥
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 1/2] perf, x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-09-08 11:47 ` Matt Fleming
2015-09-08 16:11 ` Juvva, Kanaka D
2015-09-08 17:06 ` Juvva, Kanaka D
@ 2015-09-08 19:57 ` kanaka.d.juvva
2 siblings, 0 replies; 12+ messages in thread
From: kanaka.d.juvva @ 2015-09-08 19:57 UTC (permalink / raw)
To: Matt Fleming
Cc: Juvva, Kanaka D, Thomas Gleixner, Kanaka Juvva,
Williamson, Glenn P, Auld, Will, Andi Kleen, LKML, Luck, Tony,
Peter Zijlstra, Tejun Heo, x86@kernel.org, Ingo Molnar,
H. Peter Anvin, Shivappa, Vikas
Hi All,
I have switched to Linux based email client (SquirrelMail) right now.
If required I'll resend the emails I just sent now. Sorry about the
repetition.
We have some of our team members planning for travel. I was in a rush.
Regards,
-Kanaka
> On Mon, 2015-09-07 at 20:22 +0100, Juvva, Kanaka D wrote:
>> Hi Thomas,
>>
>> I'm sending updated patch(s). I have given details for each of
>> these items below.
>>
>
> Kanaka, this email is HTML formatted and so has been blocked by
> vger.kernel.org where the linux-kernel mailing list is hosted.
>
> Please configure outlook not to send html email, or use a different mail
> agent for working with upstream.
>
>> Regards,
>> -Kanaka
>>
>> > -----Original Message-----
>> > From: Thomas Gleixner [mailto:tglx@linutronix.de]
>> > Sent: Wednesday, August 19, 2015 1:50 PM
>> > To: Kanaka Juvva
>> > Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will;
>> Andi Kleen;
>> > LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; x86@kernel.org; Ingo
>> Molnar; H.
>> > Peter Anvin; Shivappa, Vikas
>> > Subject: Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth
>> Monitoring
>> > (MBM) PMU
>> >
>> > On Fri, 7 Aug 2015, Kanaka Juvva wrote:
>> > > +#define MBM_CNTR_MAX 0xffffff
>> > > +#define MBM_SOCKET_MAX 8
>> > > +#define MBM_TIME_DELTA_MAX 1000
>> > > +#define MBM_TIME_DELTA_MIN 100
>> >
>> > What are these constants for and how are they determined? Pulled out
>> of thin
>> > air?
>> >
>>
>> /*
>> * MBM Counter is 24bits wide. MBM_CNTR_MAX defines max counter
>> * value
>> */
>> #define MBM_CNTR_MAX 0xffffff
>> /*
>> * Max #sockets supported
>> */
>> #define MBM_SOCKET_MAX 8
>
> This seems like a constant we could get by without. Do we really need to
> know this at compile time?
>
>> /*
>> * Expected time interval between consecutive MSR reads for a given rmid
>> */
>> #define MBM_TIME_DELTA_MAX 1000
>
> "max" and "expected" are not the same thing.
>
>
>> > > #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
>> > > +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
>> > > +#define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
>> > > +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
>> >
>> > So we have ID values which are built with (1 << X) and then this HW
>> variant in the
>> > middle with 0x3. Of course without any explanation what the heck this
>> stuff is.
>> >
>> > Last review:
>> >
>> > "So this wants a descriptive ID name and a comment."
>> >
>> >
>>
>> /*
>> * MBM Event IDs as defined in SDM section 17.14.6
>> * Event IDs used to program MSRs for reading counters
>> */
>> #define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
>> #define QOS_MBM_LOCAL_EVENT_ID_HW 0x3
>> /*
>> * Perf needs event id to be 1 << x, hence we can't use 0x3 (HW EVENT ID)
>> * for MBM_LOCAL_EVENT we use next 1 << x for MBM_LOCAL_EVENT_ID
>> */
>> #define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
>
> No, perf events do not need to be of the form (1 << X), that was just a
> convention we used in the cqm code before we knew what values the MBM
> events would take - you can change these to be whatever format you want,
> but be sure to make it consistent.
>
> The constants are very much supposed to be programmed into the MSRs,
> take a look at __rmid_read().
>
> I would suggest (as I already did privately) that you change the format
> to be 0x0x for all of these event IDs.
>
>
>> > > @@ static bool intel_cqm_sched_in_event(u32 rmid)
>> > > return false;
>> > > }
>> > >
>> > > +
>> > > +static u32 bw_sum_calc(struct sample *bw_stat, int rmid) {
>> > > + u32 val = 0, i, j, index;
>> > > +
>> > > + if (++bw_stat->fifoout >= mbm_window_size)
>> > > + bw_stat->fifoout = 0;
>> > > + index = bw_stat->fifoout;
>> > > + for (i = 0; i < mbm_window_size - 1; i++) {
>> > > + if (index + i >= mbm_window_size)
>> > > + j = index + i - mbm_window_size;
>> > > + else
>> > > + j = index + i;
>> > > + val += bw_stat->mbmfifo[j];
>> > > + }
>> >
>> > This math wants a explanatory comment.
>> >
>> /*
>> * Slide the window by 1 and calculate the sum of the last
>> * mbm_window_size-1 bandwidth values.
>> * fifoout is the current position of the window.
>> * Increment the fifoout by 1 to slide the window by 1.
>> *
>> * Calcalute the bandwidth from ++fifiout to ( ++fifoout +
>> mbm_window_size -1)
>> * e.g.fifoout =1; Bandwidth1 Bandwidth2 ..... Bandwidthn are the
>> * sliding window values where n is size of the sliding window
>> * bandwidth sum: val = Bandwidth2 + Bandwidth3 + .. Bandwidthn
>> */
>
> Instead of these large comment blocks please comment smaller,
> logically-connected chunks of code, e.g.
>
> /* Slide the window by one */
> if (++bw_stat->fifoout >= mbm_window_size)
> bw_stat->fifoout = 0;
>
> /*
> * Calculate the sum of last mbm_window_size-1 values.
> */
> for (i = 0; i < mbm_window_size - 1; i++) {
> /* Handle wraparound at end of window */
> if (index + i >= mbm_window_size)
> j = index + i - mbm_window_size;
> else
> j = index + i;
>
> val += bw_stat->mbminfo[j];
> }
>
>>
>>
>> > > + return val;
>> > > +}
>> > > +
>> > > +static u32 __mbm_fifo_sum_lastn_out(int rmid, bool is_localbw) {
>> > > + if (is_localbw)
>> > > + return bw_sum_calc(&mbm_local[rmid], rmid);
>> > > + else
>> > > + return bw_sum_calc(&mbm_total[rmid], rmid); }
>> > > +
>> > > +static void __mbm_fifo_in(struct sample *bw_stat, u32 val) {
>> > > + bw_stat->mbmfifo[bw_stat->fifoin] = val;
>> > > + if (++bw_stat->fifoin >= mbm_window_size)
>> >
>> > How does that become greater than mbm_windowsize?
>> >
>>
>> This is fixed by changing >= to ==
>> Added a comment:
>>
>> /*
>> * store current sample's bw value in sliding window at the
>> * index fifoin. Increment fifoin. Check if fifoin has reached
>> * max_window_size. If yes reset it to begining i.e. zero
>> * e.g.
>> * mbm_window_size = 10
>> * mbmfifo is a circular fifo 0 1 2 3 4 5 6 7 8 9 10
>> * ^
>> |
>> * |
>> |
>> * | _ _ _ _ _ _ _ _ _ _|
>> *
>> * So when fifoin becomes 10, then it is reset to zero
>> *
>> */
>
> I'm not sure that this comment adds anything of value that isn't already
> understood by reading the code. I don't think you need a comment for
> this function, it seems pretty straight forward and Thomas' question was
> about the boundary limits of ->fifoin.
>
>> > > + bw_stat->fifoin = 0;
>> > > +}
>> >
>> > > +/*
>> > > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event
>> and
>> > > +reads
>> > > + * its MSR counter. Check whether overflow occurred and handles it.
>> > > +Calculates
>> > > + * currenet BW and updates running average.
>> >
>> > currenet? And please get rid of the double spaces
>> >
>> This is fixed now. Here is the updated comment:
>>
>> /*
>> * rmid_read_mbm checks whether it is LOCAL or Total MBM event and reads
>> * its MSR counter. Check whether overflow occured and handles it.
>> Calculates
>> * currenet BW and updates running average.
>> *
>
> ^^^ You've still misspelled current.
>>
>> * Overflow Handling:
>> * if (MSR current value < MSR previous value) it is an
>> * overflow. MSR values are increasing when bandwidth consumption for the
>> thread
>> * is non-zero; When MSR values reaches MAX_COUNTER_VALUE it overflows.
>> After overflow,
>> * MSR current value goes back to zero and starts increasing again at the
>> rate of
>> * bandwidth.
>> *
>
> You don't need to provide a definition of "overflow", most people will
> be familiar with it. What is more important to document is how the
> overflow is handled...
>>
>> * Overflow handling:
>> * Detect an overflow : current read value > last read value
>
> Isn't this inverted? Overflow occurred if current < previous.
>
>>
>> * Overflow correction: if (overflow)
>> * Current value = (MAX_COUNTER_VALUE - prev
>> read value) + current read value
>> * else
>> * Current value = current read value
>> *
>
> Please don't write pseudocode in the comments. Use English prose to
> describe the important parts of the code.
>
>>
>> * Calculation of Current Bandwidth value:
>> * If MSR is read within last 100ms, then then the smaple is ignored;
>> * If the MSR was Read with in last 100ms, why incur an extra overhead
>> * of doing the MSR reads again. Anyway there'll be a negligible change
>> or zero
>> * change in MSR readings in 100ms.
>> *
>> * Bandwidth is calculated as:
>> * memory bandwidth = difference of last two msr counter values/time
>> difference.
>> *
>> * cum_avg = Running Average bandwidth of last 'n' bandwidth values for
>> * the samples that are processed
>> *
>
> Where 'n' is 'mbm_window_size' ? If so, please use 'mbm_window_size',
> not 'n'.
>>
>> * Sliding window is used to save the last 'n' samples. Where,
>> * n = sliding_window_size and results in sliding window duration of 'n'
>> secs.
>
> Hmm... this confuses me a lot. Is 'n' a size or a duration? The two are
> not the same thing.
>
>>
>> * The sliding window size by default set to
>> * MBM_FIFO_SIZE_MIN. User can configure it to the values in the range
>> * (MBM_FIFO_SIZE_MIN,MBM_FIFO_SIZE_MAX). The range for sliding window
>> * is chosen based on a general criteria for monitoring duration. Example
>> * for a short lived application, 10sec monitoring period gives
>> * good characterization of its bandwidth consumption. For an application
>> * that runs for longer duration, 300sec monitoring period gives better
>> * characterization of its bandwidth consumption. Since the running
>> average
>> * calculated for total monitoring period, user gets the most accuracate
>> * average bandwidth for the each monitoring period.
>> *
>> * Scaling:
>> * cum_avg is the raw bandwidth is Bytes/sec.
>> * cum_avg is converted to MB/sec by applying MBM_CONVERSION_FACTOR and
>> * rounded to nearest integer. User interface gets the Bandwidth values
>> in MB/sec.
>> *
>> */
>> > > + *
>> > > + * Overflow Handling:
>> > > + * if (MSR current value < MSR previous value) it is an
>> > > + * overflow. and overflow is handled.
>> >
>> > Wow. That's informative as hell!
>> >
>> Please look at the modified comment above
>> > > + *
>> > > + * Calculation of Current BW value:
>> >
>> > BW == Body Weight?
>> >
>>
>> It is fixed now
>>
>> > > + * If MSR is read within last 100ms, then the value is ignored;
>> > > + * this will suppress small deltas. We don't process MBM samples
>> that
>> > > + are
>> > > + * within 100ms.
>> >
>> > WHY?
>> >
>> Explained in the comment. If mbm_read is called within in 100ms for the
>> same rmid, we donât
>> have to process the sample.
>
> The key piece of information you're missing here is that skipping these
> small deltas is an optimization, because we avoid performing costly
> operations for what would likely be a very minor change in the MBM data,
> right?
>
>
>> > > +{
>> > > + u64 val, tmp, diff_time, cma, bytes, index;
>> > > + bool overflow = false, first = false;
>> > > + ktime_t cur_time;
>> > > + u32 tmp32 = rmid;
>> > > + struct sample *mbm_current;
>> > > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
>> > > + cqm_max_rmid + rmid;
>> > > +
>> > > + rmid = vrmid;
>> >
>> > From my previous review:
>> >
>> > "This is completely backwards.
>> >
>> > tmp32 = rmid;
>> > rmid = vrmid;
>> > do_stuff(rmid);
>> > rmid = tmp32;
>> > do_other_stuff(rmid);
>> >
>> > Why can't you use vrmid for do_stuff() and leave rmid alone? Just
>> > because it would make the code simpler to read?"
>> >
>> > Still applies.
>> >
>>
>> This is now changed to
>> u64 val, currentmsr, currentbw, diff_time, cma, bytes, index;
>> bool overflow = false, first = false;
>> ktime_t cur_time;
>> u32 tmp32 = rmid, eventid;
>> struct sample *mbm_current;
>> u32 vrmid = rmid_2_index(rmid);
>>
>> rmid = vrmid;
>> cur_time = ktime_get();
>> if (read_mbm_local) {
>> mbm_current = &mbm_local[vrmid];
>> eventid = QOS_MBM_LOCAL_EVENT_ID_HW;
>> wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID_HW,
>> rmid);
>
> You don't need to perform this wrmsr() here because it's taken care of
> in the common code below.
>
>> } else {
>> mbm_current = &mbm_total[vrmid];
>> eventid = QOS_MBM_TOTAL_EVENT_ID;
>> }
>> rmid = tmp32;
>
> Why did you assign rmid to vrmid if you reassign it before it was used?
>
>
>> > > + /* if current msr value < previous msr value , it means
>> overflow */
>> > > + if (val < bytes) {
>> > > + val = MBM_CNTR_MAX - bytes + val;
>> > > + overflow = true;
>> > > + } else
>> > > + val = val - bytes;
>> > > +
>> > > + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
>> > > +
>> > > + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
>> > > + /* First sample */
>> > > + first = true;
>> > > +
>> > > + rmid = vrmid;
>> >
>> > And another time:
>> >
>> > "More obfuscation"
>> >
>>
>> /*
>> * MBM_TIME_DELTA_MAX is picked as per MBM specs. As specified
>> in Intel Platform
>> * Quality of Service Monitoring Implementer's Guide V1, Section
>> 2.7.2. page 21,
>> * overflow can occur maximum once in a second. So latest we
>> want to read the MSR
>> * counters is 1000ms. If it is less than 1000ms we can ignore
>> the sample. Then we
>> * decide since when we should ignore. If the MSR was Read with
>> in last 100ms, why
>> * process the MSR reads again. Anyway there'll be small change
>> or zero change.
>> * So ignoring MSR Reads within 100ms or less is efficient.
>> MBM_TIME_DELTA_MIN
>> * is specified as 100ms as per this guideline.
>> *
>> */
>
> I suspect the document you're referring to above is only available under
> NDA, which makes it unsuitable for mention in the kernel source since a
> large number of people won't have access to it.
>
> Just explain that the way the hardware is designed puts an upper limit
> on how quickly the counter can overflow, which is once per second.
>
>
>> > > +static void __intel_cqm_event_total_bw_count(void *info) {
>> > > + struct rmid_read *rr = info;
>> > > + u64 val;
>> > > +
>> > > + val = __rmid_read_mbm(rr->rmid, false);
>> > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
>> > > + return;
>> > > + atomic64_add(val, &rr->value);
>> > > +}
>> > > +
>> > > +static void __intel_cqm_event_local_bw_count(void *info) {
>> > > + struct rmid_read *rr = info;
>> > > + u64 val;
>> > > +
>> > > + val = __rmid_read_mbm(rr->rmid, true);
>> > > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
>> > > + return;
>> > > + atomic64_add(val, &rr->value);
>> > > +}
>> >
>> > And once more:
>> >
>> > "You're really a fan of copy and paste."
>> >
>>
>> These functions are invoked indirectly. They were written keeping
>> intel_cqm_event_count in mind.
>> Iâll change the arg to struct mbm_read{
>> struct rmid_read
>> *rr;
>> u32 eventid;
>> };
>> Intel_cqm_event_*_bw_count(â¦.) needs eventid to call for decoding
>
> No, please do not duplicate the rmid_read structure, that is not an
> improvement, we don't need two different structs for reading the read
> data.
>
> Please add the event field to the existing struct rmid_read.
>
>
>> > > @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct
>> > perf_event *event, int mode)
>> > > } else {
>> > > WARN_ON_ONCE(!state->rmid);
>> > > }
>> > > +
>> > > + if (pmu) {
>> > > + if (pmu->n_active > 0) {
>> >
>> > What's the purpose of this check? In the previous version there was a
>> > WARN_ON(), which made sense. Did it trigger and you decided to "work"
>> > around it?
>> >
>>
>> We actually meant to check if there are active events
>
> I don't follow this answer. Are you saying that the WARN_ON() doesn't
> make sense here?
>
>
>
>>
>> > > +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit,
>> > > +"KB/sec"); EVENT_ATTR_STR(llc_local_bw.unit,
>> > > +intel_cqm_llc_local_bw_unit, "KB/sec"); #endif
>> >
>> > > +static ssize_t
>> > > +sliding_window_size_store(struct device *dev,
>> > > + struct device_attribute *attr,
>> > > + const char *buf, size_t count)
>> > > +{
>> > > + unsigned int bytes;
>> > > + int ret;
>> > > +
>> > > + ret = kstrtouint(buf, 0, &bytes);
>> > > + if (ret)
>> > > + return ret;
>> > > +
>> > > + mutex_lock(&cache_mutex);
>> > > + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
>> > > + mbm_window_size = bytes;
>> >
>> > So, it's valid to set the window to X where 0 < X < MBM_FIFO_SIZE_MIN.
>> > What's the actual purpose of MBM_FIFO_SIZE_MIN?
>> >
>> This is changed to
>> if (bytes >= MBM_FIFO_SIZE_MIN && bytes <= MBM_FIFO_SIZE_MAX)
>> mbm_window_size = bytes;
>
> Note that if the user passes a value outside of this range you should be
> returning -EINVAL to indicate that.
>
>
>> > > + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
>> > > + per_cpu(mbm_pmu, cpu) = pmu;
>> > > + per_cpu(mbm_pmu_to_free, cpu) = NULL;
>> >
>> > What's the point of this? If there is still something to be free'd its
>> leaked.
>> > Otherwise that's redundant.
>> per_cpu(mbm_pmu_to_free, cpu) = NULL; is removed
>>
>> > > + mbm_hrtimer_init(pmu);
>> > > + }
>> > > + return 0;
>> >
>> > s/0/NOTIFY_OK/ because you return that value directly.
>> >
>> You mean I return the âreturn codeâ
>
> ?
>
> You should be using NOTIFY_OK here so that you follow the notifier API
> convention.
>
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-09-08 17:06 ` Juvva, Kanaka D
@ 2015-09-10 13:58 ` Matt Fleming
2015-09-10 21:18 ` Kanaka Juvva
0 siblings, 1 reply; 12+ messages in thread
From: Matt Fleming @ 2015-09-10 13:58 UTC (permalink / raw)
To: Juvva, Kanaka D
Cc: Thomas Gleixner, Kanaka Juvva, Williamson, Glenn P, Auld, Will,
Andi Kleen, LKML, Luck, Tony, Peter Zijlstra, Tejun Heo,
x86@kernel.org, Ingo Molnar, H. Peter Anvin, Shivappa, Vikas
On Tue, 2015-09-08 at 18:06 +0100, Juvva, Kanaka D wrote:
>
> There are two aspects:
>
> 1) Programming MSRs
> 2) EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x04");
>
> 1 is used for programming MSRs
> 2 event attribute for perf
>
>
> For MBM_LOCAL_EVENT HW ID is 0x3. We don't want to use 0x3 for EVENT ATTR.
>
> If we use 0x3 for event_attribute
>
> We can't clearly distinguish whether is EVENT 01 & EVENT 02 or EVENT 03 alone.
> For perf event attribute it has to be 0x04. Because 0x01 and 0x02 are used for other two events
You cannot combine events like this, the perf events are not a bitmask
so having MBM_LOCAL_EVENT_ID as 0x3 is fine.
Just look at the Intel RAPL code, it does the same thing. 0x3 is a
perfectly valid perf event attr value and it does not mean "combine perf
event attr 0x1 and 0x2".
If you're concerned about QOS_EVENT_MASK, you can probably just delete
that and replace the code that uses it with a switch statement or
equivalent.
> > > Explained in the comment. If mbm_read is called within in 100ms for
> > > the same rmid, we don’t have to process the sample.
> >
> > The key piece of information you're missing here is that skipping these small
> > deltas is an optimization, because we avoid performing costly operations for
> > what would likely be a very minor change in the MBM data, right?
> >
> >
>
> Yes, You are correct.
OK, thanks. Please include that point in your comment.
>
> > > } else {
> > > mbm_current = &mbm_total[vrmid];
> > > eventid = QOS_MBM_TOTAL_EVENT_ID;
> > > }
> > > rmid = tmp32;
> >
> > Why did you assign rmid to vrmid if you reassign it before it was used?
> >
> >
>
> For MSR writes we use rmid value and for mbm_* arrary we use vrmid which is actual
> index.
What I'm saying is that the assignment rmid = vrmid looks unnecessary in
this piece of code.
> >
> > I suspect the document you're referring to above is only available under NDA,
> > which makes it unsuitable for mention in the kernel source since a large number
> > of people won't have access to it.
> >
> > Just explain that the way the hardware is designed puts an upper limit on how
> > quickly the counter can overflow, which is once per second.
> >
> >
>
> OK. I'll change this to "as per hardware functionality"
I don't think that provides enough detail. Instead, how about "The
hardware architectures assure us that the counter will overflow at most
once a second", because that at least tells us where this assertion came
from.
> > > > > @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct
> > > > perf_event *event, int mode)
> > > > > } else {
> > > > > WARN_ON_ONCE(!state->rmid);
> > > > > }
> > > > > +
> > > > > + if (pmu) {
> > > > > + if (pmu->n_active > 0) {
> > > >
> > > > What's the purpose of this check? In the previous version there was
> > > > a WARN_ON(), which made sense. Did it trigger and you decided to "work"
> > > > around it?
> > > >
> > >
> > > We actually meant to check if there are active events
> >
> > I don't follow this answer. Are you saying that the WARN_ON() doesn't make
> > sense here?
> >
> >
> >
> I can add WARN_ON. But this will always hit if there are no events.
OK, it sounds like the original WARN_ON() didn't make any sense. In
which case you don't need to re-add it.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-09-10 13:58 ` Matt Fleming
@ 2015-09-10 21:18 ` Kanaka Juvva
2015-09-11 21:11 ` Matt Fleming
0 siblings, 1 reply; 12+ messages in thread
From: Kanaka Juvva @ 2015-09-10 21:18 UTC (permalink / raw)
To: Matt Fleming
Cc: Thomas Gleixner, Kanaka Juvva, Williamson, Glenn P, Auld, Will,
Andi Kleen, LKML, Luck, Tony, Peter Zijlstra, Tejun Heo,
x86@kernel.org, Ingo Molnar, H. Peter Anvin, Shivappa, Vikas
On Thu, 2015-09-10 at 14:58 +0100, Matt Fleming wrote:
> On Tue, 2015-09-08 at 18:06 +0100, Juvva, Kanaka D wrote:
> >
> > There are two aspects:
> >
> > 1) Programming MSRs
> > 2) EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x04");
> >
> > 1 is used for programming MSRs
> > 2 event attribute for perf
> >
> >
> > For MBM_LOCAL_EVENT HW ID is 0x3. We don't want to use 0x3 for EVENT ATTR.
> >
> > If we use 0x3 for event_attribute
> >
> > We can't clearly distinguish whether is EVENT 01 & EVENT 02 or EVENT 03 alone.
> > For perf event attribute it has to be 0x04. Because 0x01 and 0x02 are used for other two events
>
> You cannot combine events like this, the perf events are not a bitmask
> so having MBM_LOCAL_EVENT_ID as 0x3 is fine.
>
> Just look at the Intel RAPL code, it does the same thing. 0x3 is a
> perfectly valid perf event attr value and it does not mean "combine perf
> event attr 0x1 and 0x2".
>
> If you're concerned about QOS_EVENT_MASK, you can probably just delete
> that and replace the code that uses it with a switch statement or
> equivalent.
>
OK. QOS_EVENT_MASK can't be applied. Code changes will be done as per
this.
> > > > Explained in the comment. If mbm_read is called within in 100ms for
> > > > the same rmid, we don’t have to process the sample.
> > >
> > > The key piece of information you're missing here is that skipping these small
> > > deltas is an optimization, because we avoid performing costly operations for
> > > what would likely be a very minor change in the MBM data, right?
> > >
> > >
> >
> > Yes, You are correct.
>
> OK, thanks. Please include that point in your comment.
>
>
OK
> >
> > > > } else {
> > > > mbm_current = &mbm_total[vrmid];
> > > > eventid = QOS_MBM_TOTAL_EVENT_ID;
> > > > }
> > > > rmid = tmp32;
> > >
> > > Why did you assign rmid to vrmid if you reassign it before it was used?
> > >
> > >
> >
> > For MSR writes we use rmid value and for mbm_* arrary we use vrmid which is actual
> > index.
>
> What I'm saying is that the assignment rmid = vrmid looks unnecessary in
> this piece of code.
>
>From my previous review:
"This is completely backwards.
tmp32 = rmid;
rmid = vrmid;
do_stuff(rmid);
rmid = tmp32;
do_other_stuff(rmid);
Why can't you use vrmid for do_stuff() and leave rmid alone? Just
because it would make the code simpler to read?"
I have included Thomas comment inline above.
and also I meant the following logic:
writemsr(..,rmid,...)
mbm_*[vrmid]
So new patch will use this logic.
> > >
> > > I suspect the document you're referring to above is only available under NDA,
> > > which makes it unsuitable for mention in the kernel source since a large number
> > > of people won't have access to it.
> > >
> > > Just explain that the way the hardware is designed puts an upper limit on how
> > > quickly the counter can overflow, which is once per second.
> > >
> > >
> >
> > OK. I'll change this to "as per hardware functionality"
>
> I don't think that provides enough detail. Instead, how about "The
> hardware architectures assure us that the counter will overflow at most
> once a second", because that at least tells us where this assertion came
> from.
>
OK
> > > > > > @@ -1023,6 +1437,17 @@ static void intel_cqm_event_stop(struct
> > > > > perf_event *event, int mode)
> > > > > > } else {
> > > > > > WARN_ON_ONCE(!state->rmid);
> > > > > > }
> > > > > > +
> > > > > > + if (pmu) {
> > > > > > + if (pmu->n_active > 0) {
> > > > >
> > > > > What's the purpose of this check? In the previous version there was
> > > > > a WARN_ON(), which made sense. Did it trigger and you decided to "work"
> > > > > around it?
> > > > >
> > > >
> > > > We actually meant to check if there are active events
> > >
> > > I don't follow this answer. Are you saying that the WARN_ON() doesn't make
> > > sense here?
> > >
> > >
> > >
> > I can add WARN_ON. But this will always hit if there are no events.
>
> OK, it sounds like the original WARN_ON() didn't make any sense. In
> which case you don't need to re-add it.
>
>
OK, WARN_ON I used for debugging.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU
2015-09-10 21:18 ` Kanaka Juvva
@ 2015-09-11 21:11 ` Matt Fleming
0 siblings, 0 replies; 12+ messages in thread
From: Matt Fleming @ 2015-09-11 21:11 UTC (permalink / raw)
To: Kanaka Juvva
Cc: Matt Fleming, Thomas Gleixner, Kanaka Juvva, Williamson, Glenn P,
Auld, Will, Andi Kleen, LKML, Luck, Tony, Peter Zijlstra,
Tejun Heo, x86@kernel.org, Ingo Molnar, H. Peter Anvin,
Shivappa, Vikas
On Thu, 10 Sep, at 02:18:49PM, Kanaka Juvva wrote:
> > >
> > > > > } else {
> > > > > mbm_current = &mbm_total[vrmid];
> > > > > eventid = QOS_MBM_TOTAL_EVENT_ID;
> > > > > }
> > > > > rmid = tmp32;
> > > >
> > > > Why did you assign rmid to vrmid if you reassign it before it was used?
> > > >
> > > >
> > >
> > > For MSR writes we use rmid value and for mbm_* arrary we use vrmid which is actual
> > > index.
> >
> > What I'm saying is that the assignment rmid = vrmid looks unnecessary in
> > this piece of code.
> >
>
> From my previous review:
>
> "This is completely backwards.
>
> tmp32 = rmid;
> rmid = vrmid;
> do_stuff(rmid);
> rmid = tmp32;
> do_other_stuff(rmid);
>
> Why can't you use vrmid for do_stuff() and leave rmid alone? Just
> because it would make the code simpler to read?"
>
> I have included Thomas comment inline above.
>
> and also I meant the following logic:
>
> writemsr(..,rmid,...)
> mbm_*[vrmid]
>
> So new patch will use this logic.
OK, let's pull the code in and discuss this with some context,
u64 val, currentmsr, currentbw, diff_time, cma, bytes, index;
bool overflow = false, first = false;
ktime_t cur_time;
u32 tmp32 = rmid, eventid;
struct sample *mbm_current;
u32 vrmid = rmid_2_index(rmid);
rmid = vrmid; <--------- This looks wrong
cur_time = ktime_get();
if (read_mbm_local) {
mbm_current = &mbm_local[vrmid];
eventid = QOS_MBM_LOCAL_EVENT_ID_HW;
wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID_HW, rmid); <---- Unneccesary because..
} else {
mbm_current = &mbm_total[vrmid];
eventid = QOS_MBM_TOTAL_EVENT_ID;
}
rmid = tmp32;
wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); <----- ... you write here
So you don't actually use 'rmid' within that if/else block. You can
probably get away with deleting 'tmp32' now that you've refactored
things.
--
Matt Fleming, Intel Open Source Technology Center
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2015-09-11 21:11 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-08-08 6:42 [PATCH v3 1/2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU Kanaka Juvva
2015-08-16 17:42 ` Juvva, Kanaka D
2015-08-19 20:50 ` Thomas Gleixner
2015-08-20 21:54 ` Juvva, Kanaka D
2015-08-21 12:30 ` Thomas Gleixner
[not found] ` <06033C969873E840BDE9FC9B584F66B51944F51B@ORSMSX116.amr.corp.intel.com>
2015-09-08 11:47 ` Matt Fleming
2015-09-08 16:11 ` Juvva, Kanaka D
2015-09-08 17:06 ` Juvva, Kanaka D
2015-09-10 13:58 ` Matt Fleming
2015-09-10 21:18 ` Kanaka Juvva
2015-09-11 21:11 ` Matt Fleming
2015-09-08 19:57 ` [PATCH v3 1/2] perf, x86: " kanaka.d.juvva
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox