* [PATCH v3 03/13] x86: intel-iommu: add vt-d init test
From: Peter Xu @ 2016-11-14 22:28 UTC (permalink / raw)
To: kvm; +Cc: drjones, agordeev, jan.kiszka, rkrcmar, pbonzini, peterx
In-Reply-To: <1479162491-20764-1-git-send-email-peterx@redhat.com>
Add a fundamental init test for Intel IOMMU. This includes basic
initialization of the Intel IOMMU device, like DMAR (DMA Remapping),
IR (Interrupt Remapping), QI (Queue Invalidation), etc.
Further tests can use vtd_init() to initialize the Intel IOMMU environment.
x86/unittests is updated to add this test.
Signed-off-by: Peter Xu <peterx@redhat.com>
---
lib/x86/intel-iommu.c | 88 +++++++++++++++++++++++++++++++++++++
lib/x86/intel-iommu.h | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++
x86/Makefile.x86_64 | 2 +
x86/intel-iommu.c | 27 ++++++++++++
x86/unittests.cfg | 7 +++
5 files changed, 242 insertions(+)
create mode 100644 lib/x86/intel-iommu.c
create mode 100644 lib/x86/intel-iommu.h
create mode 100644 x86/intel-iommu.c
diff --git a/lib/x86/intel-iommu.c b/lib/x86/intel-iommu.c
new file mode 100644
index 0000000..9890f34
--- /dev/null
+++ b/lib/x86/intel-iommu.c
@@ -0,0 +1,88 @@
+/*
+ * Intel IOMMU APIs
+ *
+ * Copyright (C) 2016 Red Hat, Inc.
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or
+ * later.
+ */
+
+#include "intel-iommu.h"
+
+#define VTD_RTA_MASK (PAGE_MASK) /* applied to DMAR_RTADDR_REG reads in vtd_root_table() */
+#define VTD_IRTA_MASK (PAGE_MASK) /* applied to DMAR_IRTA_REG reads in vtd_ir_table() */
+
+static uint64_t vtd_root_table(void)
+{
+	uint64_t rtaddr = vtd_readq(DMAR_RTADDR_REG); /* raw root table register */
+	return rtaddr & VTD_RTA_MASK; /* extended root tables are not supported yet */
+}
+
+static uint64_t vtd_ir_table(void)
+{
+ return vtd_readq(DMAR_IRTA_REG) & VTD_IRTA_MASK; /* IR table register value with VTD_IRTA_MASK applied */
+}
+
+static void vtd_gcmd_or(uint32_t cmd)
+{
+	uint32_t status;
+
+	/* Exactly one command bit may be set per call */
+	assert(is_power_of_2(cmd));
+
+	/* Mask stale one-shot status bits so they are not re-issued as commands */
+	status = vtd_readl(DMAR_GSTS_REG) & ~VTD_GCMD_ONE_SHOT_BITS;
+	vtd_writel(DMAR_GCMD_REG, status | cmd);
+
+	/* One-shot commands take effect immediately; nothing to wait for */
+	if (cmd & VTD_GCMD_ONE_SHOT_BITS)
+		return;
+
+	/* Spin until the IOMMU reflects the requested bit in DMAR_GSTS_REG */
+	while (!(vtd_readl(DMAR_GSTS_REG) & cmd))
+		cpu_relax();
+}
+
+static void vtd_dump_init_info(void)
+{
+ printf("VT-d version: 0x%x\n", vtd_readl(DMAR_VER_REG)); /* 32-bit version register */
+ printf(" cap: 0x%016lx\n", vtd_readq(DMAR_CAP_REG)); /* 64-bit capability register */
+ printf(" ecap: 0x%016lx\n", vtd_readq(DMAR_ECAP_REG)); /* 64-bit extended capability register */
+}
+
+static void vtd_setup_root_table(void)
+{
+	/* Zeroed page as root table: program its address, then issue VTD_GCMD_ROOT */
+	void *table = memset(alloc_page(), 0, PAGE_SIZE);
+
+	vtd_writeq(DMAR_RTADDR_REG, virt_to_phys(table));
+	vtd_gcmd_or(VTD_GCMD_ROOT);
+	printf("DMAR table address: 0x%016lx\n", vtd_root_table());
+}
+
+static void vtd_setup_ir_table(void)
+{
+	void *root = alloc_page();
+
+	memset(root, 0, PAGE_SIZE);
+	/* One 4K page holds 256 16-byte IRTEs: size field 0x7, 2^(0x7+1) == 256 */
+	vtd_writeq(DMAR_IRTA_REG, virt_to_phys(root) | 0x7);
+	vtd_gcmd_or(VTD_GCMD_IR_TABLE);
+	printf("IR table address: 0x%016lx\n", vtd_ir_table());
+}
+
+void vtd_init(void)
+{
+ setup_vm();
+ smp_init();
+
+ vtd_dump_init_info(); /* Print version, cap and ecap registers */
+ vtd_gcmd_or(VTD_GCMD_QI); /* Enable QI (Queue Invalidation) */
+ vtd_setup_root_table(); /* Install a zeroed DMAR root table */
+ vtd_setup_ir_table(); /* Install a zeroed IR table */
+ vtd_gcmd_or(VTD_GCMD_DMAR); /* Enable DMAR, after the root table is in place */
+ vtd_gcmd_or(VTD_GCMD_IR); /* Enable IR, after the IR table is in place */
+}
diff --git a/lib/x86/intel-iommu.h b/lib/x86/intel-iommu.h
new file mode 100644
index 0000000..fae9ae5
--- /dev/null
+++ b/lib/x86/intel-iommu.h
@@ -0,0 +1,118 @@
+/*
+ * Intel IOMMU header
+ *
+ * Copyright (C) 2016 Red Hat, Inc.
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or
+ * later.
+ *
+ * (From include/linux/intel-iommu.h)
+ */
+
+#ifndef __INTEL_IOMMU_H__
+#define __INTEL_IOMMU_H__
+
+#include "libcflat.h"
+#include "vm.h"
+#include "isr.h"
+#include "smp.h"
+#include "desc.h"
+#include "asm/io.h"
+
+#define Q35_HOST_BRIDGE_IOMMU_ADDR 0xfed90000ULL
+
+/*
+ * Intel IOMMU register specification
+ */
+#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */
+#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */
+#define DMAR_CAP_REG_HI 0xc /* High 32-bit of DMAR_CAP_REG */
+#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */
+#define DMAR_ECAP_REG_HI 0x14
+#define DMAR_GCMD_REG 0x18 /* Global command */
+#define DMAR_GSTS_REG 0x1c /* Global status */
+#define DMAR_RTADDR_REG 0x20 /* Root entry table */
+#define DMAR_RTADDR_REG_HI 0x24
+#define DMAR_CCMD_REG 0x28 /* Context command */
+#define DMAR_CCMD_REG_HI 0x2c
+#define DMAR_FSTS_REG 0x34 /* Fault status */
+#define DMAR_FECTL_REG 0x38 /* Fault control */
+#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data */
+#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr */
+#define DMAR_FEUADDR_REG 0x44 /* Upper address */
+#define DMAR_AFLOG_REG 0x58 /* Advanced fault control */
+#define DMAR_AFLOG_REG_HI 0x5c
+#define DMAR_PMEN_REG 0x64 /* Enable protected memory region */
+#define DMAR_PLMBASE_REG 0x68 /* PMRR low addr */
+#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */
+#define DMAR_PHMBASE_REG 0x70 /* PMRR high base addr */
+#define DMAR_PHMBASE_REG_HI 0x74
+#define DMAR_PHMLIMIT_REG 0x78 /* PMRR high limit */
+#define DMAR_PHMLIMIT_REG_HI 0x7c
+#define DMAR_IQH_REG 0x80 /* Invalidation queue head */
+#define DMAR_IQH_REG_HI 0x84
+#define DMAR_IQT_REG 0x88 /* Invalidation queue tail */
+#define DMAR_IQT_REG_HI 0x8c
+#define DMAR_IQA_REG 0x90 /* Invalidation queue addr */
+#define DMAR_IQA_REG_HI 0x94
+#define DMAR_ICS_REG 0x9c /* Invalidation complete status */
+#define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr */
+#define DMAR_IRTA_REG_HI 0xbc
+#define DMAR_IECTL_REG 0xa0 /* Invalidation event control */
+#define DMAR_IEDATA_REG 0xa4 /* Invalidation event data */
+#define DMAR_IEADDR_REG 0xa8 /* Invalidation event address */
+#define DMAR_IEUADDR_REG 0xac /* Invalidation event upper address */
+#define DMAR_PQH_REG 0xc0 /* Page request queue head */
+#define DMAR_PQH_REG_HI 0xc4
+#define DMAR_PQT_REG 0xc8 /* Page request queue tail */
+#define DMAR_PQT_REG_HI 0xcc
+#define DMAR_PQA_REG 0xd0 /* Page request queue address */
+#define DMAR_PQA_REG_HI 0xd4
+#define DMAR_PRS_REG 0xdc /* Page request status */
+#define DMAR_PECTL_REG 0xe0 /* Page request event control */
+#define DMAR_PEDATA_REG 0xe4 /* Page request event data */
+#define DMAR_PEADDR_REG 0xe8 /* Page request event address */
+#define DMAR_PEUADDR_REG 0xec /* Page event upper address */
+#define DMAR_MTRRCAP_REG 0x100 /* MTRR capability */
+#define DMAR_MTRRCAP_REG_HI 0x104
+#define DMAR_MTRRDEF_REG 0x108 /* MTRR default type */
+#define DMAR_MTRRDEF_REG_HI 0x10c
+
+#define VTD_GCMD_IR_TABLE 0x1000000 /* Set IR table pointer (one-shot) */
+#define VTD_GCMD_IR 0x2000000 /* Enable IR */
+#define VTD_GCMD_QI 0x4000000 /* Enable QI */
+#define VTD_GCMD_WBF 0x8000000 /* Write Buffer Flush */
+#define VTD_GCMD_SFL 0x20000000 /* Set Fault Log */
+#define VTD_GCMD_ROOT 0x40000000 /* Set root table pointer (one-shot) */
+#define VTD_GCMD_DMAR 0x80000000 /* Enable DMAR */
+#define VTD_GCMD_ONE_SHOT_BITS (VTD_GCMD_IR_TABLE | VTD_GCMD_WBF | \
+ VTD_GCMD_SFL | VTD_GCMD_ROOT)
+
+#define vtd_reg(reg) ((volatile void *)(Q35_HOST_BRIDGE_IOMMU_ADDR + (reg))) /* MMIO address of the VT-d register at byte offset reg */
+
+static inline void vtd_writel(unsigned int reg, uint32_t value)
+{
+ __raw_writel(value, vtd_reg(reg)); /* 32-bit write to the VT-d register at byte offset reg */
+}
+
+static inline void vtd_writeq(unsigned int reg, uint64_t value)
+{
+ __raw_writeq(value, vtd_reg(reg)); /* 64-bit write to the VT-d register at byte offset reg */
+}
+
+static inline uint32_t vtd_readl(unsigned int reg)
+{
+ return __raw_readl(vtd_reg(reg)); /* 32-bit read of the VT-d register at byte offset reg */
+}
+
+static inline uint64_t vtd_readq(unsigned int reg)
+{
+ return __raw_readq(vtd_reg(reg)); /* 64-bit read of the VT-d register at byte offset reg */
+}
+
+void vtd_init(void);
+
+#endif
diff --git a/x86/Makefile.x86_64 b/x86/Makefile.x86_64
index f82492b..3e2821e 100644
--- a/x86/Makefile.x86_64
+++ b/x86/Makefile.x86_64
@@ -4,6 +4,7 @@ ldarch = elf64-x86-64
CFLAGS += -mno-red-zone
cflatobjs += lib/x86/setjmp64.o
+cflatobjs += lib/x86/intel-iommu.o
tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
$(TEST_DIR)/emulator.flat $(TEST_DIR)/idt_test.flat \
@@ -14,6 +15,7 @@ tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
tests += $(TEST_DIR)/svm.flat
tests += $(TEST_DIR)/vmx.flat
tests += $(TEST_DIR)/tscdeadline_latency.flat
+tests += $(TEST_DIR)/intel-iommu.flat
include $(TEST_DIR)/Makefile.common
diff --git a/x86/intel-iommu.c b/x86/intel-iommu.c
new file mode 100644
index 0000000..f247913
--- /dev/null
+++ b/x86/intel-iommu.c
@@ -0,0 +1,27 @@
+/*
+ * Intel IOMMU unit test.
+ *
+ * Copyright (C) 2016 Red Hat, Inc.
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or
+ * later.
+ */
+
+#include "intel-iommu.h"
+
+int main(int argc, char *argv[])
+{
+ vtd_init(); /* dumps VT-d info, enables QI, installs root/IR tables, enables DMAR and IR */
+
+ report("fault status check", vtd_readl(DMAR_FSTS_REG) == 0);
+ report("QI enablement", vtd_readl(DMAR_GSTS_REG) & VTD_GCMD_QI); /* this and the checks below test one DMAR_GSTS_REG bit each */
+ report("DMAR table setup", vtd_readl(DMAR_GSTS_REG) & VTD_GCMD_ROOT);
+ report("IR table setup", vtd_readl(DMAR_GSTS_REG) & VTD_GCMD_IR_TABLE);
+ report("DMAR enablement", vtd_readl(DMAR_GSTS_REG) & VTD_GCMD_DMAR);
+ report("IR enablement", vtd_readl(DMAR_GSTS_REG) & VTD_GCMD_IR);
+
+ return report_summary();
+}
diff --git a/x86/unittests.cfg b/x86/unittests.cfg
index 23395c6..5413838 100644
--- a/x86/unittests.cfg
+++ b/x86/unittests.cfg
@@ -217,3 +217,10 @@ extra_params = -cpu kvm64,hv_time,hv_synic,hv_stimer -device hyperv-testdev
file = hyperv_clock.flat
smp = 2
extra_params = -cpu kvm64,hv_time
+
+[intel_iommu]
+file = intel-iommu.flat
+arch = x86_64
+timeout = 30
+smp = 4
+extra_params = -M q35,kernel-irqchip=split -device intel-iommu,intremap=on,eim=off -device edu
--
2.7.4
^ permalink raw reply related
* [PATCH v3 02/13] libcflat: introduce is_power_of_2()
From: Peter Xu @ 2016-11-14 22:28 UTC (permalink / raw)
To: kvm; +Cc: drjones, agordeev, jan.kiszka, rkrcmar, pbonzini, peterx
In-Reply-To: <1479162491-20764-1-git-send-email-peterx@redhat.com>
Suggested-by: Andrew Jones <drjones@redhat.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
lib/libcflat.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/lib/libcflat.h b/lib/libcflat.h
index 72b1bf9..038ea1d 100644
--- a/lib/libcflat.h
+++ b/lib/libcflat.h
@@ -103,4 +103,9 @@ do { \
} \
} while (0)
+static inline bool is_power_of_2(unsigned long n)
+{
+	return n != 0 && (n & (n - 1)) == 0; /* true iff n is nonzero with exactly one bit set */
+}
+
#endif
--
2.7.4
^ permalink raw reply related
* [PATCH v3 01/13] x86/asm: add cpu_relax()
From: Peter Xu @ 2016-11-14 22:27 UTC (permalink / raw)
To: kvm; +Cc: drjones, agordeev, jan.kiszka, rkrcmar, pbonzini, peterx
In-Reply-To: <1479162491-20764-1-git-send-email-peterx@redhat.com>
This is useful inside busy-wait loops.
Suggested-by: Andrew Jones <drjones@redhat.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
lib/x86/asm/barrier.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/lib/x86/asm/barrier.h b/lib/x86/asm/barrier.h
index 7c108bd..193fb4c 100644
--- a/lib/x86/asm/barrier.h
+++ b/lib/x86/asm/barrier.h
@@ -13,4 +13,15 @@
#define smp_rmb() barrier()
#define smp_wmb() barrier()
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ asm volatile("rep; nop" ::: "memory"); /* "memory" clobber: the compiler must not cache memory values across this point */
+}
+
+static inline void cpu_relax(void)
+{
+ rep_nop(); /* PAUSE hint for busy-wait loops; see rep_nop() */
+}
+
#endif
--
2.7.4
^ permalink raw reply related
* [PATCH v3 00/13] VT-d unit test
From: Peter Xu @ 2016-11-14 22:27 UTC (permalink / raw)
To: kvm; +Cc: drjones, agordeev, jan.kiszka, rkrcmar, pbonzini, peterx
(Please ignore the previous v3 posting and use this one. It was my fault for not
configuring the base branch before publishing :-( Sorry for the noise.)
This is v4 of vt-d unit test series.
Patch "libcflat: add IS_ALIGNED() macro, and page sizes" is picked up
by Drew in the ARM GIC framework series, so please feel free to drop
it when needed.
Online repo:
https://github.com/xzpeter/kvm-unit-tests.git iommu-ut-v4
Please review. Thanks,
v4:
(same as v3, but with correct patches sent out)
v3:
- is_power_of_2(): remove outer "()" [Drew]
- fix indent for vtd_root_table() [Drew]
- pci_find_dev(): keep {} to avoid touching more codes [Drew]
- pci_scan_bars(): check whether bar is 64bits [Drew]
- pci_enable_defaults(): make it return nothing [Drew]
- fix strange blanks in lib/x86/intel-iommu.h [Drew]
- pci_setup_msi(): fix correct assert() inside [Drew]
- squash patches: 03+17 08+09 10+11 12+14 [Drew]
- remove all "()" for raw defines (numbers) in macros [Drew]
v2:
- move cpu_relax patch to the beginning, and use them in all places
[Drew]
- replace all corresponding 256 into PCI_DEVFN_MAX, as well for
PCI_BAR_NUM [Drew]
- adding is_power_of_2() to replace ONE_BIT_ONLY() [Drew]
- add SZ_64K macro [Drew]
- declare pci_config_write[wb] in lib/asm-generic/pci-host-bridge.h [Alex]
- edu_reg_read/write() add "l" in func name [Drew]
- drop pci_set_master(), instead, provide pci_cmd_set_clr() [Drew]
- change return code into bool (always) for functions that apply
[Drew]
- keep old pci_find_dev() interface [Drew/Alex]
- use __raw_{read|write}*() for both vt-d and edu register read/writes
[Alex]
- remove pci_ prefix for all pci_dev fields [Drew]
- replace 0xff in cap_handlers[0xff] into (PCI_CAP_ID_MAX + 1) [Drew]
- make x86/unittest.cfg simpler by using q35 directly with eim=off
[Drew]
RFC -> v1:
- when init edu device fail, report_skip() rather than return error
[Radim]
- use asserts rather than "static bool inited" to avoid multiple init
of components (affects patch 1/2) [Drew]
- moving the first two patches out of the series [Drew]
- int vtd_init(), do not setup_idt() since smp_init() did it [Drew]
- when edu do not have MSI enabled, skip interrupt test [Radim]
- rename vtd_reg_*() into vtd_{read|write}[lq](), and move them to
header file [Drew]
- use PAGE_MASK when able [Drew]
- use "&" instead of "|" in intel-iommu init test (three places)
[Drew]
- use "vtd_init()" in unit test [Drew]
- mention that where intel-iommu.h comes from [Drew]
- re-written vtd_gcmd_or(), make it also work on even hardware [Drew]
- remove most of the oneline wrapper for VT-d registers, instead, use
vtd_{read|write}* with register names [Drew]
- remove useless BDF helpers [Drew]
- move edu device macros into header file [Drew]
- make edu_check_alive static inline [Drew]
- remove all useless wrappers in pci-edu.c [Drew]
- remove pci_dma_dir_t and all its users, instead, use "bool
from_device" [Drew]
- not use typedef for structs, to follow Linux/kvm-unit-tests coding
style [Drew]
- let pci_dev_init() clean and simple, then provide
pci_enable_defaults() for more complicated things [Drew]
- add one more patch to add intel-iommu test into x86/unittest [Radim]
- use 0x60 intr request instead of factorial to trigger edu device
interrupt [Drew]
- ...and some other changes I just forgot to note down...
Currently only a very small test scope is covered:
* VT-d init
* DMAR: 4 bytes copy
* IR: MSI
However this series could be a base point to add more test cases for
VT-d. The problem is, there are many IOMMU error conditions which are
very hard to trigger in a real guest (the IOMMU has almost no
interface for guest users, and it runs entirely in the background).
This piece of work can be a start point if we want to do more
complicated things and play around with Intel IOMMU devices (also for
IOMMU regression tests).
Please review. Thanks,
=================
To run the test:
./x86/run ./x86/intel-iommu.flat \
-M q35,kernel-irqchip=split -global ioapic.version=0x20 \
-device intel-iommu,intremap=on -device edu
Sample output:
pxdev:kvm-unit-tests [new-iommu-ut]# ./iommu_run.sh
/root/git/qemu/bin/x86_64-softmmu/qemu-system-x86_64 -enable-kvm -device pc-testdev -device isa-debug-exit,iobase=0xf4,iosize=0x4 -vnc none -serial stdio
+-device pci-testdev -kernel ./x86/intel-iommu.flat -M q35,kernel-irqchip=split -global ioapic.version=0x20 -device intel-iommu,intremap=on -device edu
enabling apic
paging enabled
cr0 = 80010011
cr3 = 7fff000
cr4 = 20
VT-d version: 0x10
cap: 0x0012008c22260206
ecap: 0x0000000000f00f1a
PASS: init status check
PASS: fault status check
PASS: QI enablement
DMAR table address: 0x0000000007ff9000
PASS: DMAR table setup
IR table address: 0x0000000007ff8000
PASS: IR table setup
PASS: DMAR enablement
PASS: IR enablement
PASS: DMAR support 39 bits address width
PASS: DMAR support huge pages
PCI: init dev 0x0020 BAR 0 [MEM] addr 0xfea00000
PCI detected cap 0x5
Detected MSI for device 0x20 offset 0x40
allocated vt-d root entry for PCI bus 0
allocated vt-d context entry for devfn 0x20
map 4K page IOVA 0x0 to 0x7ff7000 (sid=0x0020)
edu device DMA start TO addr 0x0 size 0x4 off 0x0
edu device DMA start FROM addr 0x4 size 0x4 off 0x0
PASS: DMAR 4B memcpy test
INTR: setup IRTE index 0
MSI: dev 0x20 init 64bit address: addr=0xfee00010, data=0x0
PASS: EDU factorial INTR test
Peter Xu (13):
x86/asm: add cpu_relax()
libcflat: introduce is_power_of_2()
x86: intel-iommu: add vt-d init test
libcflat: add IS_ALIGNED() macro, and page sizes
libcflat: moving MIN/MAX here
vm/page: provide PGDIR_OFFSET() macro
pci: introduce struct pci_dev
pci: provide pci_scan_bars()
pci: provide pci_enable_defaults()
pci: edu: introduce pci-edu helpers
x86: intel-iommu: add dmar test
pci: add msi support for 32/64bit address
x86: intel-iommu: add IR MSI test
lib/alloc.c | 3 -
lib/libcflat.h | 14 +++
lib/pci-edu.c | 73 ++++++++++++
lib/pci-edu.h | 83 +++++++++++++
lib/pci-host-generic.c | 9 +-
lib/pci-testdev.c | 10 +-
lib/pci.c | 157 +++++++++++++++++++++----
lib/pci.h | 39 ++++--
lib/x86/asm/barrier.h | 11 ++
lib/x86/asm/page.h | 3 +
lib/x86/intel-iommu.c | 313 +++++++++++++++++++++++++++++++++++++++++++++++++
lib/x86/intel-iommu.h | 142 ++++++++++++++++++++++
lib/x86/vm.c | 4 +-
x86/Makefile.common | 1 +
x86/Makefile.x86_64 | 2 +
x86/intel-iommu.c | 120 +++++++++++++++++++
x86/unittests.cfg | 7 ++
x86/vmexit.c | 27 ++---
18 files changed, 960 insertions(+), 58 deletions(-)
create mode 100644 lib/pci-edu.c
create mode 100644 lib/pci-edu.h
create mode 100644 lib/x86/intel-iommu.c
create mode 100644 lib/x86/intel-iommu.h
create mode 100644 x86/intel-iommu.c
--
2.7.4
^ permalink raw reply
* Re: [PATCHv7 07/11] i2c: match dt-style device names from sysfs interface
From: Wolfram Sang @ 2016-11-14 22:27 UTC (permalink / raw)
To: Kieran Bingham
Cc: Lee Jones, linux-i2c, linux-kernel, Javier Martinez Canillas,
sameo
In-Reply-To: <1478522866-29620-8-git-send-email-kieran@bingham.xyz>
[-- Attachment #1: Type: text/plain, Size: 1231 bytes --]
On Mon, Nov 07, 2016 at 12:47:42PM +0000, Kieran Bingham wrote:
> A user can choose to instantiate a device on an i2c bus using the sysfs
> interface by providing a string and address to match and communicate
> with the device on the bus. Presently this string is only matched
> against the old i2c device id style strings, even in the presence of
> full device tree compatible strings with vendor prefixes.
>
> Providing a vendor-prefixed string to the sysfs interface will not match
> against the device tree of_match_device() calls as there is no device
> tree node to parse from the sysfs interface.
>
> Convert i2c_of_match_device_strip_vendor() such that it can match both
The function name here is the old one...
> vendor prefixed and stripped compatible strings on the sysfs interface.
>
> Signed-off-by: Kieran Bingham <kieran@bingham.xyz>
... and in patch 2, the sentence "remove this function if all drivers
are converted" is obsolete, too, since we need this function always for
sysfs.
This makes me wonder if we shouldn't also squash this patch into patch
2 (like I suggested for the next one), and create a best-of-all-worlds
commit message from these three patches?
Opinions?
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]
^ permalink raw reply
* Re: [PATCHv7 08/11] i2c: match vendorless strings on the internal string length
From: Wolfram Sang @ 2016-11-14 22:27 UTC (permalink / raw)
To: Kieran Bingham
Cc: Lee Jones, linux-i2c, linux-kernel, Javier Martinez Canillas,
sameo
In-Reply-To: <1478522866-29620-9-git-send-email-kieran@bingham.xyz>
[-- Attachment #1: Type: text/plain, Size: 576 bytes --]
On Mon, Nov 07, 2016 at 12:47:43PM +0000, Kieran Bingham wrote:
> If a user provides a shortened string to match a device to the sysfs i2c
> interface it will match on the first string that contains that string
> prefix.
>
> for example:
> echo a 0x68 > /sys/bus/i2c/devices/i2c-2/new_device
> will match as3711, as3722, and ak8975 incorrectly.
>
> Correct this by using sysfs_streq to match the string exactly
>
> Signed-off-by: Kieran Bingham <kieran@bingham.xyz>
Any objections if I squash this directly into patch 2? It is more like
a bugfix, no?
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]
^ permalink raw reply
* Re: [RESEND PATCH 1/2] PCI: rockchip: cleanup bit definition for PCIE_RC_CONFIG_LCS
From: Bjorn Helgaas @ 2016-11-14 22:26 UTC (permalink / raw)
To: Shawn Lin
Cc: Bjorn Helgaas, Rob Herring, linux-pci, linux-rockchip, Wenrui Li,
Brian Norris, Jeffy Chen, devicetree
In-Reply-To: <1479096666-112668-1-git-send-email-shawn.lin@rock-chips.com>
On Mon, Nov 14, 2016 at 12:11:05PM +0800, Shawn Lin wrote:
> PCIE_RC_CONFIG_LCS contains control and status bits specific
> to the PCIe link. The layout for this register looks the same
> as the existed PCI_EXP_LNKCTL and PCI_EXP_LNKSTA. So let's
> reuse them.
>
> Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
Did something change since the version you posted yesterday?
Resending a patch with no changes or with no hint about what changed
doesn't speed things up; in fact, it slows things down.
> ---
>
> drivers/pci/host/pcie-rockchip.c | 14 ++++----------
> 1 file changed, 4 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/pci/host/pcie-rockchip.c b/drivers/pci/host/pcie-rockchip.c
> index 7f238af..1dba698 100644
> --- a/drivers/pci/host/pcie-rockchip.c
> +++ b/drivers/pci/host/pcie-rockchip.c
> @@ -141,12 +141,6 @@
> #define PCIE_RC_CONFIG_DCR_CSPL_LIMIT 0xff
> #define PCIE_RC_CONFIG_DCR_CPLS_SHIFT 26
> #define PCIE_RC_CONFIG_LCS (PCIE_RC_CONFIG_BASE + 0xd0)
> -#define PCIE_RC_CONFIG_LCS_RETRAIN_LINK BIT(5)
> -#define PCIE_RC_CONFIG_LCS_CCC BIT(6)
> -#define PCIE_RC_CONFIG_LCS_LBMIE BIT(10)
> -#define PCIE_RC_CONFIG_LCS_LABIE BIT(11)
> -#define PCIE_RC_CONFIG_LCS_LBMS BIT(30)
> -#define PCIE_RC_CONFIG_LCS_LAMS BIT(31)
> #define PCIE_RC_CONFIG_L1_SUBSTATE_CTRL2 (PCIE_RC_CONFIG_BASE + 0x90c)
> #define PCIE_RC_CONFIG_THP_CAP (PCIE_RC_CONFIG_BASE + 0x274)
> #define PCIE_RC_CONFIG_THP_CAP_NEXT_MASK GENMASK(31, 20)
> @@ -229,7 +223,7 @@ static void rockchip_pcie_enable_bw_int(struct rockchip_pcie *rockchip)
> u32 status;
>
> status = rockchip_pcie_read(rockchip, PCIE_RC_CONFIG_LCS);
> - status |= (PCIE_RC_CONFIG_LCS_LBMIE | PCIE_RC_CONFIG_LCS_LABIE);
> + status |= (PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
> rockchip_pcie_write(rockchip, status, PCIE_RC_CONFIG_LCS);
> }
>
> @@ -238,7 +232,7 @@ static void rockchip_pcie_clr_bw_int(struct rockchip_pcie *rockchip)
> u32 status;
>
> status = rockchip_pcie_read(rockchip, PCIE_RC_CONFIG_LCS);
> - status |= (PCIE_RC_CONFIG_LCS_LBMS | PCIE_RC_CONFIG_LCS_LAMS);
> + status |= (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS) << 16;
> rockchip_pcie_write(rockchip, status, PCIE_RC_CONFIG_LCS);
> }
>
> @@ -540,7 +534,7 @@ static int rockchip_pcie_init_port(struct rockchip_pcie *rockchip)
>
> /* Set RC's clock architecture as common clock */
> status = rockchip_pcie_read(rockchip, PCIE_RC_CONFIG_LCS);
> - status |= PCIE_RC_CONFIG_LCS_CCC;
> + status |= PCI_EXP_LNKCTL_CCC;
> rockchip_pcie_write(rockchip, status, PCIE_RC_CONFIG_LCS);
>
> /* Enable Gen1 training */
> @@ -575,7 +569,7 @@ static int rockchip_pcie_init_port(struct rockchip_pcie *rockchip)
> * gen1 finished.
> */
> status = rockchip_pcie_read(rockchip, PCIE_RC_CONFIG_LCS);
> - status |= PCIE_RC_CONFIG_LCS_RETRAIN_LINK;
> + status |= PCI_EXP_LNKCTL_RL;
> rockchip_pcie_write(rockchip, status, PCIE_RC_CONFIG_LCS);
>
> timeout = jiffies + msecs_to_jiffies(500);
> --
> 1.9.1
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [RESEND PATCH 1/2] PCI: rockchip: cleanup bit definition for PCIE_RC_CONFIG_LCS
From: Bjorn Helgaas @ 2016-11-14 22:26 UTC (permalink / raw)
To: Shawn Lin
Cc: Bjorn Helgaas, Rob Herring, linux-pci-u79uwXL29TY76Z2rM5mHXA,
linux-rockchip-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Wenrui Li,
Brian Norris, Jeffy Chen, devicetree-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1479096666-112668-1-git-send-email-shawn.lin-TNX95d0MmH7DzftRWevZcw@public.gmane.org>
On Mon, Nov 14, 2016 at 12:11:05PM +0800, Shawn Lin wrote:
> PCIE_RC_CONFIG_LCS contains control and status bits specific
> to the PCIe link. The layout for this register looks the same
> as the existed PCI_EXP_LNKCTL and PCI_EXP_LNKSTA. So let's
> reuse them.
>
> Signed-off-by: Shawn Lin <shawn.lin-TNX95d0MmH7DzftRWevZcw@public.gmane.org>
Did something change since the version you posted yesterday?
Resending a patch with no changes or with no hint about what changed
doesn't speed things up; in fact, it slows things down.
> ---
>
> drivers/pci/host/pcie-rockchip.c | 14 ++++----------
> 1 file changed, 4 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/pci/host/pcie-rockchip.c b/drivers/pci/host/pcie-rockchip.c
> index 7f238af..1dba698 100644
> --- a/drivers/pci/host/pcie-rockchip.c
> +++ b/drivers/pci/host/pcie-rockchip.c
> @@ -141,12 +141,6 @@
> #define PCIE_RC_CONFIG_DCR_CSPL_LIMIT 0xff
> #define PCIE_RC_CONFIG_DCR_CPLS_SHIFT 26
> #define PCIE_RC_CONFIG_LCS (PCIE_RC_CONFIG_BASE + 0xd0)
> -#define PCIE_RC_CONFIG_LCS_RETRAIN_LINK BIT(5)
> -#define PCIE_RC_CONFIG_LCS_CCC BIT(6)
> -#define PCIE_RC_CONFIG_LCS_LBMIE BIT(10)
> -#define PCIE_RC_CONFIG_LCS_LABIE BIT(11)
> -#define PCIE_RC_CONFIG_LCS_LBMS BIT(30)
> -#define PCIE_RC_CONFIG_LCS_LAMS BIT(31)
> #define PCIE_RC_CONFIG_L1_SUBSTATE_CTRL2 (PCIE_RC_CONFIG_BASE + 0x90c)
> #define PCIE_RC_CONFIG_THP_CAP (PCIE_RC_CONFIG_BASE + 0x274)
> #define PCIE_RC_CONFIG_THP_CAP_NEXT_MASK GENMASK(31, 20)
> @@ -229,7 +223,7 @@ static void rockchip_pcie_enable_bw_int(struct rockchip_pcie *rockchip)
> u32 status;
>
> status = rockchip_pcie_read(rockchip, PCIE_RC_CONFIG_LCS);
> - status |= (PCIE_RC_CONFIG_LCS_LBMIE | PCIE_RC_CONFIG_LCS_LABIE);
> + status |= (PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
> rockchip_pcie_write(rockchip, status, PCIE_RC_CONFIG_LCS);
> }
>
> @@ -238,7 +232,7 @@ static void rockchip_pcie_clr_bw_int(struct rockchip_pcie *rockchip)
> u32 status;
>
> status = rockchip_pcie_read(rockchip, PCIE_RC_CONFIG_LCS);
> - status |= (PCIE_RC_CONFIG_LCS_LBMS | PCIE_RC_CONFIG_LCS_LAMS);
> + status |= (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS) << 16;
> rockchip_pcie_write(rockchip, status, PCIE_RC_CONFIG_LCS);
> }
>
> @@ -540,7 +534,7 @@ static int rockchip_pcie_init_port(struct rockchip_pcie *rockchip)
>
> /* Set RC's clock architecture as common clock */
> status = rockchip_pcie_read(rockchip, PCIE_RC_CONFIG_LCS);
> - status |= PCIE_RC_CONFIG_LCS_CCC;
> + status |= PCI_EXP_LNKCTL_CCC;
> rockchip_pcie_write(rockchip, status, PCIE_RC_CONFIG_LCS);
>
> /* Enable Gen1 training */
> @@ -575,7 +569,7 @@ static int rockchip_pcie_init_port(struct rockchip_pcie *rockchip)
> * gen1 finished.
> */
> status = rockchip_pcie_read(rockchip, PCIE_RC_CONFIG_LCS);
> - status |= PCIE_RC_CONFIG_LCS_RETRAIN_LINK;
> + status |= PCI_EXP_LNKCTL_RL;
> rockchip_pcie_write(rockchip, status, PCIE_RC_CONFIG_LCS);
>
> timeout = jiffies + msecs_to_jiffies(500);
> --
> 1.9.1
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH v3 00/25] VT-d unit test
From: Peter Xu @ 2016-11-14 22:25 UTC (permalink / raw)
To: kvm; +Cc: drjones, agordeev, jan.kiszka, rkrcmar, pbonzini
In-Reply-To: <1479161961-20304-1-git-send-email-peterx@redhat.com>
Please ignore this v3 series.
This series was configured with an incorrect base branch, so the wrong
patches were sent. I stopped it as soon as I found the problem, but
lots of patches had already been sent.
Will repost later. Sorry for the noise!
On Mon, Nov 14, 2016 at 05:18:56PM -0500, Peter Xu wrote:
> This is v3 of vt-d unit test series.
>
> Patch "libcflat: add IS_ALIGNED() macro, and page sizes" is picked up
> by Drew in the ARM GIC framework series, so please feel free to drop
> it when needed.
>
> Online repo:
>
> https://github.com/xzpeter/kvm-unit-tests.git iommu-ut-v3
>
> Please review. Thanks,
>
> v3:
> - is_power_of_2(): remove outer "()" [Drew]
> - fix indent for vtd_root_table() [Drew]
> - pci_find_dev(): keep {} to avoid touching more codes [Drew]
> - pci_scan_bars(): check whether bar is 64bits [Drew]
> - pci_enable_defaults(): make it return nothing [Drew]
> - fix strange blanks in lib/x86/intel-iommu.h [Drew]
> - pci_setup_msi(): fix correct assert() inside [Drew]
> - squash patches: 03+17 08+09 10+11 12+14 [Drew]
> - remove all "()" for raw defines (numbers) in macros [Drew]
>
> v2:
> - move cpu_relax patch to the beginning, and use them in all places
> [Drew]
> - replace all corresponding 256 into PCI_DEVFN_MAX, as well for
> PCI_BAR_NUM [Drew]
> - adding is_power_of_2() to replace ONE_BIT_ONLY() [Drew]
> - add SZ_64K macro [Drew]
> - declare pci_config_write[wb] in lib/asm-generic/pci-host-bridge.h [Alex]
> - edu_reg_read/write() add "l" in func name [Drew]
> - drop pci_set_master(), instead, provide pci_cmd_set_clr() [Drew]
> - change return code into bool (always) for functions that apply
> [Drew]
> - keep old pci_find_dev() interface [Drew/Alex]
> - use __raw_{read|write}*() for both vt-d and edu register read/writes
> [Alex]
> - remove pci_ prefix for all pci_dev fields [Drew]
> - replace 0xff in cap_handlers[0xff] into (PCI_CAP_ID_MAX + 1) [Drew]
> - make x86/unittest.cfg simpler by using q35 directly with eim=off
> [Drew]
>
> RFC -> v1:
> - when init edu device fail, report_skip() rather than return error
> [Radim]
> - use asserts rather than "static bool inited" to avoid multiple init
> of components (affects patch 1/2) [Drew]
> - moving the first two patches out of the series [Drew]
> - int vtd_init(), do not setup_idt() since smp_init() did it [Drew]
> - when edu do not have MSI enabled, skip interrupt test [Radim]
> - rename vtd_reg_*() into vtd_{read|write}[lq](), and move them to
> header file [Drew]
> - use PAGE_MASK when able [Drew]
> - use "&" instead of "|" in intel-iommu init test (three places)
> [Drew]
> - use "vtd_init()" in unit test [Drew]
> - mention that where intel-iommu.h comes from [Drew]
> - re-written vtd_gcmd_or(), make it also work on even hardware [Drew]
> - remove most of the oneline wrapper for VT-d registers, instead, use
> vtd_{read|write}* with register names [Drew]
> - remove useless BDF helpers [Drew]
> - move edu device macros into header file [Drew]
> - make edu_check_alive static inline [Drew]
> - remove all useless wrappers in pci-edu.c [Drew]
> - remove pci_dma_dir_t and all its users, instead, use "bool
> from_device" [Drew]
> - not use typedef for structs, to follow Linux/kvm-unit-tests coding
> style [Drew]
> - let pci_dev_init() clean and simple, then provide
> pci_enable_defaults() for more complicated things [Drew]
> - add one more patch to add intel-iommu test into x86/unittest [Radim]
> - use 0x60 intr request instead of factorial to trigger edu device
> interrupt [Drew]
> - ...and some other changes I just forgot to note down...
>
> Currently only a very small test scope is covered:
>
> * VT-d init
> * DMAR: 4 bytes copy
> * IR: MSI
>
> However, this series could be a starting point for adding more test cases
> for VT-d. The problem is that many IOMMU error conditions are very hard
> to trigger in a real guest (the IOMMU has almost no interface for guest
> users, and it runs entirely in the background). This piece of work can
> be a starting point if we want to do more complicated things and play
> around with Intel IOMMU devices (also for IOMMU regression tests).
>
> Please review. Thanks,
>
> =================
>
> To run the test:
>
> ./x86/run ./x86/intel-iommu.flat \
> -M q35,kernel-irqchip=split -global ioapic.version=0x20 \
> -device intel-iommu,intremap=on -device edu
>
> Sample output:
>
> pxdev:kvm-unit-tests [new-iommu-ut]# ./iommu_run.sh
> /root/git/qemu/bin/x86_64-softmmu/qemu-system-x86_64 -enable-kvm -device pc-testdev -device isa-debug-exit,iobase=0xf4,iosize=0x4 -vnc none -serial stdio
> +-device pci-testdev -kernel ./x86/intel-iommu.flat -M q35,kernel-irqchip=split -global ioapic.version=0x20 -device intel-iommu,intremap=on -device edu
> enabling apic
> paging enabled
> cr0 = 80010011
> cr3 = 7fff000
> cr4 = 20
> VT-d version: 0x10
> cap: 0x0012008c22260206
> ecap: 0x0000000000f00f1a
> PASS: init status check
> PASS: fault status check
> PASS: QI enablement
> DMAR table address: 0x0000000007ff9000
> PASS: DMAR table setup
> IR table address: 0x0000000007ff8000
> PASS: IR table setup
> PASS: DMAR enablement
> PASS: IR enablement
> PASS: DMAR support 39 bits address width
> PASS: DMAR support huge pages
> PCI: init dev 0x0020 BAR 0 [MEM] addr 0xfea00000
> PCI detected cap 0x5
> Detected MSI for device 0x20 offset 0x40
> allocated vt-d root entry for PCI bus 0
> allocated vt-d context entry for devfn 0x20
> map 4K page IOVA 0x0 to 0x7ff7000 (sid=0x0020)
> edu device DMA start TO addr 0x0 size 0x4 off 0x0
> edu device DMA start FROM addr 0x4 size 0x4 off 0x0
> PASS: DMAR 4B memcpy test
> INTR: setup IRTE index 0
> MSI: dev 0x20 init 64bit address: addr=0xfee00010, data=0x0
> PASS: EDU factorial INTR test
>
> Alexander Gordeev (12):
> pci: Fix coding style in generic PCI files
> pci: x86: Rename pci_config_read() to pci_config_readl()
> pci: Add 'extern' to public function declarations
> pci: x86: Add remaining PCI configuration space accessors
> pci: Factor out pci_bar_get()
> pci: Rework pci_bar_addr()
> pci: Add pci_bar_set_addr()
> pci: Add pci_dev_exists()
> pci: Add pci_print()
> pci: Add generic ECAM host support
> pci: Add pci-testdev PCI bus test device
> arm/arm64: pci: Add pci-testdev PCI device operation test
>
> Peter Xu (13):
> x86/asm: add cpu_relax()
> libcflat: introduce is_power_of_2()
> x86: intel-iommu: add vt-d init test
> libcflat: add IS_ALIGNED() macro, and page sizes
> libcflat: moving MIN/MAX here
> vm/page: provide PGDIR_OFFSET() macro
> pci: introduce struct pci_dev
> pci: provide pci_scan_bars()
> pci: provide pci_enable_defaults()
> pci: edu: introduce pci-edu helpers
> x86: intel-iommu: add dmar test
> pci: add msi support for 32/64bit address
> x86: intel-iommu: add IR MSI test
>
> arm/Makefile.common | 6 +-
> arm/pci-test.c | 27 ++++
> arm/run | 7 +-
> arm/unittests.cfg | 4 +
> lib/alloc.c | 3 -
> lib/arm/asm/pci.h | 1 +
> lib/arm64/asm/pci.h | 1 +
> lib/asm-generic/pci-host-bridge.h | 28 ++++
> lib/libcflat.h | 14 ++
> lib/pci-edu.c | 73 +++++++++
> lib/pci-edu.h | 83 ++++++++++
> lib/pci-host-generic.c | 321 +++++++++++++++++++++++++++++++++++++
> lib/pci-host-generic.h | 46 ++++++
> lib/pci-testdev.c | 194 ++++++++++++++++++++++
> lib/pci.c | 327 +++++++++++++++++++++++++++++++++++---
> lib/pci.h | 66 +++++++-
> lib/x86/asm/barrier.h | 11 ++
> lib/x86/asm/page.h | 3 +
> lib/x86/asm/pci.h | 46 +++++-
> lib/x86/intel-iommu.c | 313 ++++++++++++++++++++++++++++++++++++
> lib/x86/intel-iommu.h | 142 +++++++++++++++++
> lib/x86/vm.c | 4 +-
> x86/Makefile.common | 1 +
> x86/Makefile.x86_64 | 2 +
> x86/intel-iommu.c | 120 ++++++++++++++
> x86/unittests.cfg | 7 +
> x86/vmexit.c | 27 ++--
> 27 files changed, 1826 insertions(+), 51 deletions(-)
> create mode 100644 arm/pci-test.c
> create mode 100644 lib/arm/asm/pci.h
> create mode 100644 lib/arm64/asm/pci.h
> create mode 100644 lib/asm-generic/pci-host-bridge.h
> create mode 100644 lib/pci-edu.c
> create mode 100644 lib/pci-edu.h
> create mode 100644 lib/pci-host-generic.c
> create mode 100644 lib/pci-host-generic.h
> create mode 100644 lib/pci-testdev.c
> create mode 100644 lib/x86/intel-iommu.c
> create mode 100644 lib/x86/intel-iommu.h
> create mode 100644 x86/intel-iommu.c
>
> --
> 2.7.4
>
-- peterx
^ permalink raw reply
* Re: [PATCH net] ipv4: fix cloning issues in fib_trie_unmerge()
From: Alexander Duyck @ 2016-11-14 22:25 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, Alexander Duyck
In-Reply-To: <1479159274.8455.82.camel@edumazet-glaptop3.roam.corp.google.com>
On Mon, Nov 14, 2016 at 1:34 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> I had crashes in a DEBUG_PAGEALLOC kernels in fib_table_flush() or
> fib_table_lookup() that I back tracked to a refcounting issue
> happening when we clone struct fib_alias in fib_trie_unmerge()
>
> While fixing this issue, I also noticed a mem leak happening
> if fib_insert_alias() fails.
>
> Fixes: 0ddcf43d5d4a0 ("ipv4: FIB Local/MAIN table collapse")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
> net/ipv4/fib_trie.c | 7 ++++++-
> 1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> index 4cff74d4133f..ebf49ab889e8 100644
> --- a/net/ipv4/fib_trie.c
> +++ b/net/ipv4/fib_trie.c
> @@ -1737,14 +1737,19 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
> goto out;
>
> memcpy(new_fa, fa, sizeof(*fa));
> + if (fa->fa_info)
> + fa->fa_info->fib_treeref++;
>
> /* insert clone into table */
> if (!local_l)
> local_l = fib_find_node(lt, &local_tp, l->key);
>
> if (fib_insert_alias(lt, local_tp, local_l, new_fa,
> - NULL, l->key))
> + NULL, l->key)) {
> + kmem_cache_free(fn_alias_kmem, new_fa);
> + fib_release_info(fa->fa_info);
> goto out;
> + }
> }
>
> /* stop loop if key wrapped back to 0 */
>
>
Actually, I think this creates a reference leak. If you look, the call
to fib_table_flush_external is skipping the call to fib_release_info.
If you add this, then you would probably need to update
fib_table_flush_external so that we call fib_release_info like we do
for fib_table_flush.
^ permalink raw reply
* Re: [PATCH] scsi: megaraid_sas: add in missing white spaces in error messages text
From: Bart Van Assche @ 2016-11-14 22:24 UTC (permalink / raw)
To: Colin King, Kashyap Desai, Sumit Saxena, Shivasharan S,
James E . J . Bottomley, Martin K . Petersen, megaraidlinux.pdl,
linux-scsi
Cc: linux-kernel
In-Reply-To: <20161112162524.4585-1-colin.king@canonical.com>
On 11/12/2016 08:25 AM, Colin King wrote:
> A couple of dev_printk messages span two lines and the literal string
> is missing a white space between words. Add the white space.
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
^ permalink raw reply
* Re: [PATCH v6 9/9] tpm: cleanup of printk error messages
From: Jarkko Sakkinen @ 2016-11-14 22:24 UTC (permalink / raw)
To: Nayna Jain
Cc: tpmdd-devel, peterhuewe, tpmdd, jgunthorpe, linux-kernel,
linux-security-module
In-Reply-To: <1479117656-12403-10-git-send-email-nayna@linux.vnet.ibm.com>
On Mon, Nov 14, 2016 at 05:00:56AM -0500, Nayna Jain wrote:
> This patch removes the unnecessary error messages on failing to
> allocate memory and replaces pr_err/printk with dev_dbg/dev_info
> as applicable.
>
> Suggested-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> Signed-off-by: Nayna Jain <nayna@linux.vnet.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
/Jarkko
> ---
> drivers/char/tpm/tpm_acpi.c | 16 ++++------------
> drivers/char/tpm/tpm_of.c | 29 +++++++++--------------------
> 2 files changed, 13 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/char/tpm/tpm_acpi.c b/drivers/char/tpm/tpm_acpi.c
> index fa30c969..ddbaef2 100644
> --- a/drivers/char/tpm/tpm_acpi.c
> +++ b/drivers/char/tpm/tpm_acpi.c
> @@ -60,11 +60,8 @@ int read_log_acpi(struct tpm_chip *chip)
> status = acpi_get_table(ACPI_SIG_TCPA, 1,
> (struct acpi_table_header **)&buff);
>
> - if (ACPI_FAILURE(status)) {
> - printk(KERN_ERR "%s: ERROR - Could not get TCPA table\n",
> - __func__);
> + if (ACPI_FAILURE(status))
> return -EIO;
> - }
>
> switch(buff->platform_class) {
> case BIOS_SERVER:
> @@ -78,25 +75,20 @@ int read_log_acpi(struct tpm_chip *chip)
> break;
> }
> if (!len) {
> - printk(KERN_ERR "%s: ERROR - TCPA log area empty\n", __func__);
> + dev_warn(&chip->dev, "%s: TCPA log area empty\n", __func__);
> return -EIO;
> }
>
> /* malloc EventLog space */
> log->bios_event_log = kmalloc(len, GFP_KERNEL);
> - if (!log->bios_event_log) {
> - printk("%s: ERROR - Not enough Memory for BIOS measurements\n",
> - __func__);
> + if (!log->bios_event_log)
> return -ENOMEM;
> - }
>
> log->bios_event_log_end = log->bios_event_log + len;
>
> virt = acpi_os_map_iomem(start, len);
> - if (!virt) {
> - printk("%s: ERROR - Unable to map memory\n", __func__);
> + if (!virt)
> goto err;
> - }
>
> memcpy_fromio(log->bios_event_log, virt, len);
>
> diff --git a/drivers/char/tpm/tpm_of.c b/drivers/char/tpm/tpm_of.c
> index 22b8f81..3af829f 100644
> --- a/drivers/char/tpm/tpm_of.c
> +++ b/drivers/char/tpm/tpm_of.c
> @@ -31,40 +31,29 @@ int read_log_of(struct tpm_chip *chip)
> log = &chip->log;
> if (chip->dev.parent->of_node)
> np = chip->dev.parent->of_node;
> - if (!np) {
> - pr_err("%s: ERROR - IBMVTPM not supported\n", __func__);
> + if (!np)
> return -ENODEV;
> - }
>
> sizep = of_get_property(np, "linux,sml-size", NULL);
> - if (sizep == NULL) {
> - pr_err("%s: ERROR - SML size not found\n", __func__);
> - goto cleanup_eio;
> - }
> + if (sizep == NULL)
> + return -EIO;
> +
> if (*sizep == 0) {
> - pr_err("%s: ERROR - event log area empty\n", __func__);
> - goto cleanup_eio;
> + dev_warn(&chip->dev, "%s: Event log area empty\n", __func__);
> + return -EIO;
> }
>
> basep = of_get_property(np, "linux,sml-base", NULL);
> - if (basep == NULL) {
> - pr_err("%s: ERROR - SML not found\n", __func__);
> - goto cleanup_eio;
> - }
> + if (basep == NULL)
> + return -EIO;
>
> log->bios_event_log = kmalloc(*sizep, GFP_KERNEL);
> - if (!log->bios_event_log) {
> - pr_err("%s: ERROR - Not enough memory for BIOS measurements\n",
> - __func__);
> + if (!log->bios_event_log)
> return -ENOMEM;
> - }
>
> log->bios_event_log_end = log->bios_event_log + *sizep;
>
> memcpy(log->bios_event_log, __va(*basep), *sizep);
>
> return 0;
> -
> -cleanup_eio:
> - return -EIO;
> }
> --
> 2.5.0
>
^ permalink raw reply
* Re: [PATCH] isci: fix typo in deg_dbg message
From: Bart Van Assche @ 2016-11-14 22:23 UTC (permalink / raw)
To: Colin King, Intel SCU Linux support, Artur Paszkiewicz,
James E . J . Bottomley, Martin K . Petersen, linux-scsi
Cc: linux-kernel
In-Reply-To: <20161112183026.9626-1-colin.king@canonical.com>
On 11/12/2016 10:30 AM, Colin King wrote:
> Trivial fix to typo "repsonse" to "response" in dev_dbg message.
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
^ permalink raw reply
* Re: [PATCH] iscsi: fix spelling mistakes in dev_warn messages
From: Bart Van Assche @ 2016-11-14 22:24 UTC (permalink / raw)
To: Colin King, Intel SCU Linux support, Artur Paszkiewicz,
James E . J . Bottomley, Martin K . Petersen, linux-scsi
Cc: linux-kernel
In-Reply-To: <20161112164950.5605-1-colin.king@canonical.com>
On 11/12/2016 08:49 AM, Colin King wrote:
> Trivial fix to spelling mistake "suspeneded" to "suspended" in
> dev_warn messages
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
^ permalink raw reply
* Re: [PATCH v6 9/9] tpm: cleanup of printk error messages
From: Jarkko Sakkinen @ 2016-11-14 22:24 UTC (permalink / raw)
To: Nayna Jain
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-security-module-u79uwXL29TY76Z2rM5mHXA,
tpmdd-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <1479117656-12403-10-git-send-email-nayna-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
On Mon, Nov 14, 2016 at 05:00:56AM -0500, Nayna Jain wrote:
> This patch removes the unnecessary error messages on failing to
> allocate memory and replaces pr_err/printk with dev_dbg/dev_info
> as applicable.
>
> Suggested-by: Jason Gunthorpe <jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
> Signed-off-by: Nayna Jain <nayna-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
/Jarkko
> ---
> drivers/char/tpm/tpm_acpi.c | 16 ++++------------
> drivers/char/tpm/tpm_of.c | 29 +++++++++--------------------
> 2 files changed, 13 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/char/tpm/tpm_acpi.c b/drivers/char/tpm/tpm_acpi.c
> index fa30c969..ddbaef2 100644
> --- a/drivers/char/tpm/tpm_acpi.c
> +++ b/drivers/char/tpm/tpm_acpi.c
> @@ -60,11 +60,8 @@ int read_log_acpi(struct tpm_chip *chip)
> status = acpi_get_table(ACPI_SIG_TCPA, 1,
> (struct acpi_table_header **)&buff);
>
> - if (ACPI_FAILURE(status)) {
> - printk(KERN_ERR "%s: ERROR - Could not get TCPA table\n",
> - __func__);
> + if (ACPI_FAILURE(status))
> return -EIO;
> - }
>
> switch(buff->platform_class) {
> case BIOS_SERVER:
> @@ -78,25 +75,20 @@ int read_log_acpi(struct tpm_chip *chip)
> break;
> }
> if (!len) {
> - printk(KERN_ERR "%s: ERROR - TCPA log area empty\n", __func__);
> + dev_warn(&chip->dev, "%s: TCPA log area empty\n", __func__);
> return -EIO;
> }
>
> /* malloc EventLog space */
> log->bios_event_log = kmalloc(len, GFP_KERNEL);
> - if (!log->bios_event_log) {
> - printk("%s: ERROR - Not enough Memory for BIOS measurements\n",
> - __func__);
> + if (!log->bios_event_log)
> return -ENOMEM;
> - }
>
> log->bios_event_log_end = log->bios_event_log + len;
>
> virt = acpi_os_map_iomem(start, len);
> - if (!virt) {
> - printk("%s: ERROR - Unable to map memory\n", __func__);
> + if (!virt)
> goto err;
> - }
>
> memcpy_fromio(log->bios_event_log, virt, len);
>
> diff --git a/drivers/char/tpm/tpm_of.c b/drivers/char/tpm/tpm_of.c
> index 22b8f81..3af829f 100644
> --- a/drivers/char/tpm/tpm_of.c
> +++ b/drivers/char/tpm/tpm_of.c
> @@ -31,40 +31,29 @@ int read_log_of(struct tpm_chip *chip)
> log = &chip->log;
> if (chip->dev.parent->of_node)
> np = chip->dev.parent->of_node;
> - if (!np) {
> - pr_err("%s: ERROR - IBMVTPM not supported\n", __func__);
> + if (!np)
> return -ENODEV;
> - }
>
> sizep = of_get_property(np, "linux,sml-size", NULL);
> - if (sizep == NULL) {
> - pr_err("%s: ERROR - SML size not found\n", __func__);
> - goto cleanup_eio;
> - }
> + if (sizep == NULL)
> + return -EIO;
> +
> if (*sizep == 0) {
> - pr_err("%s: ERROR - event log area empty\n", __func__);
> - goto cleanup_eio;
> + dev_warn(&chip->dev, "%s: Event log area empty\n", __func__);
> + return -EIO;
> }
>
> basep = of_get_property(np, "linux,sml-base", NULL);
> - if (basep == NULL) {
> - pr_err("%s: ERROR - SML not found\n", __func__);
> - goto cleanup_eio;
> - }
> + if (basep == NULL)
> + return -EIO;
>
> log->bios_event_log = kmalloc(*sizep, GFP_KERNEL);
> - if (!log->bios_event_log) {
> - pr_err("%s: ERROR - Not enough memory for BIOS measurements\n",
> - __func__);
> + if (!log->bios_event_log)
> return -ENOMEM;
> - }
>
> log->bios_event_log_end = log->bios_event_log + *sizep;
>
> memcpy(log->bios_event_log, __va(*basep), *sizep);
>
> return 0;
> -
> -cleanup_eio:
> - return -EIO;
> }
> --
> 2.5.0
>
------------------------------------------------------------------------------
^ permalink raw reply
* Re: [PATCH] isci: fix typo in deg_dbg message
From: Bart Van Assche @ 2016-11-14 22:23 UTC (permalink / raw)
To: Colin King, Intel SCU Linux support, Artur Paszkiewicz,
James E . J . Bottomley, Martin K . Petersen, linux-scsi
Cc: linux-kernel
In-Reply-To: <20161112183026.9626-1-colin.king@canonical.com>
On 11/12/2016 10:30 AM, Colin King wrote:
> Trivial fix to typo "repsonse" to "response" in dev_dbg message.
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
^ permalink raw reply
* Re: [PATCH v4 2/3] PCI: qcom: add support to msm8996 PCIE controller
From: Bjorn Helgaas @ 2016-11-14 22:23 UTC (permalink / raw)
To: Srinivas Kandagatla
Cc: svarbanov, linux-pci, bhelgaas, robh+dt, linux-arm-msm,
devicetree
In-Reply-To: <1479122155-13393-3-git-send-email-srinivas.kandagatla@linaro.org>
On Mon, Nov 14, 2016 at 11:15:54AM +0000, Srinivas Kandagatla wrote:
> This patch adds support to msm8996/apq8096 pcie, MSM8996 supports
> Gen 1/2, One lane, 3 pcie root-complex with support to MSI and
> legacy interrupts and it conforms to PCI Express Base 2.1 specification.
>
> This patch adds a post_init callback to qcom_pcie_ops, as the pcie
> pipe clocks are only set up after the phy is powered on.
> It also adds ltssm_enable callback as it is very much different to other
> supported SOCs in the driver.
>
> Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Will need ack from Stanimir before I can apply it.
> ---
> .../devicetree/bindings/pci/qcom,pcie.txt | 67 +++++++-
> drivers/pci/host/pcie-qcom.c | 177 ++++++++++++++++++++-
> 2 files changed, 238 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt
> index 4059a6f..141d8c3 100644
> --- a/Documentation/devicetree/bindings/pci/qcom,pcie.txt
> +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt
> @@ -7,6 +7,7 @@
> - "qcom,pcie-ipq8064" for ipq8064
> - "qcom,pcie-apq8064" for apq8064
> - "qcom,pcie-apq8084" for apq8084
> + - "qcom,pcie-msm8996" for msm8996 or apq8096
>
> - reg:
> Usage: required
> @@ -92,6 +93,17 @@
> - "aux" Auxiliary (AUX) clock
> - "bus_master" Master AXI clock
> - "bus_slave" Slave AXI clock
> +
> +- clock-names:
> + Usage: required for msm8996/apq8096
> + Value type: <stringlist>
> + Definition: Should contain the following entries
> + - "pipe" Pipe Clock driving internal logic.
> + - "aux" Auxiliary (AUX) clock.
> + - "cfg" Configuration clk.
> + - "bus_master" Master AXI clock.
> + - "bus_slave" Slave AXI clock.
> +
> - resets:
> Usage: required
> Value type: <prop-encoded-array>
> @@ -115,7 +127,7 @@
> - "core" Core reset
>
> - power-domains:
> - Usage: required for apq8084
> + Usage: required for apq8084 and msm8996/apq8096
> Value type: <prop-encoded-array>
> Definition: A phandle and power domain specifier pair to the
> power domain which is responsible for collapsing
> @@ -231,3 +243,56 @@
> pinctrl-0 = <&pcie0_pins_default>;
> pinctrl-names = "default";
> };
> +
> +* Example for apq8096:
> +
> + pcie@608000{
> + compatible = "qcom,pcie-msm8996", "snps,dw-pcie";
> + power-domains = <&gcc PCIE1_GDSC>;
> + bus-range = <0x00 0xff>;
> + num-lanes = <1>;
> +
> + reg = <0x00608000 0x2000>,
> + <0x0d000000 0xf1d>,
> + <0x0d000f20 0xa8>,
> + <0x0d100000 0x100000>;
> +
> + reg-names = "parf", "dbi", "elbi", "config";
> +
> + phys = <&pcie_phy 1>;
> + phy-names = "pciephy";
> +
> + #address-cells = <3>;
> + #size-cells = <2>;
> + ranges = <0x01000000 0x0 0x0d200000 0x0d200000 0x0 0x100000>,
> + <0x02000000 0x0 0x0d300000 0x0d300000 0x0 0xd00000>;
> +
> + interrupts = <GIC_SPI 413 IRQ_TYPE_NONE>;
> + interrupt-names = "msi";
> + #interrupt-cells = <1>;
> + interrupt-map-mask = <0 0 0 0x7>;
> + interrupt-map = <0 0 0 1 &intc 0 272 IRQ_TYPE_LEVEL_HIGH>, /* int_a */
> + <0 0 0 2 &intc 0 273 IRQ_TYPE_LEVEL_HIGH>, /* int_b */
> + <0 0 0 3 &intc 0 274 IRQ_TYPE_LEVEL_HIGH>, /* int_c */
> + <0 0 0 4 &intc 0 275 IRQ_TYPE_LEVEL_HIGH>; /* int_d */
> +
> + pinctrl-names = "default", "sleep";
> + pinctrl-0 = <&pcie1_clkreq_default &pcie1_perst_default &pcie1_wake_default>;
> + pinctrl-1 = <&pcie1_clkreq_sleep &pcie1_perst_default &pcie1_wake_sleep>;
> +
> + vdda-1p8-supply = <&pm8994_l12>;
> + vdda-supply = <&pm8994_l28>;
> + linux,pci-domain = <1>;
> +
> + clocks = <&gcc GCC_PCIE_1_PIPE_CLK>,
> + <&gcc GCC_PCIE_1_AUX_CLK>,
> + <&gcc GCC_PCIE_1_CFG_AHB_CLK>,
> + <&gcc GCC_PCIE_1_MSTR_AXI_CLK>,
> + <&gcc GCC_PCIE_1_SLV_AXI_CLK>;
> +
> + clock-names = "pipe",
> + "aux",
> + "cfg",
> + "bus_master",
> + "bus_slave";
> + };
> diff --git a/drivers/pci/host/pcie-qcom.c b/drivers/pci/host/pcie-qcom.c
> index 3593640..03ba6b1 100644
> --- a/drivers/pci/host/pcie-qcom.c
> +++ b/drivers/pci/host/pcie-qcom.c
> @@ -36,11 +36,19 @@
>
> #include "pcie-designware.h"
>
> +#define PCIE20_PARF_DBI_BASE_ADDR 0x168
> +
> +#define PCIE20_PARF_SYS_CTRL 0x00
> #define PCIE20_PARF_PHY_CTRL 0x40
> #define PCIE20_PARF_PHY_REFCLK 0x4C
> #define PCIE20_PARF_DBI_BASE_ADDR 0x168
> #define PCIE20_PARF_SLV_ADDR_SPACE_SIZE 0x16c
> +#define PCIE20_PARF_MHI_CLOCK_RESET_CTRL 0x174
> #define PCIE20_PARF_AXI_MSTR_WR_ADDR_HALT 0x178
> +#define MSM8996_PCIE20_PARF_AXI_MSTR_WR_ADDR_HALT 0x1A8
> +#define PCIE20_PARF_LTSSM 0x1B0
> +#define PCIE20_PARF_SID_OFFSET 0x234
> +#define PCIE20_PARF_BDF_TRANSLATE_CFG 0x24C
>
> #define PCIE20_ELBI_SYS_CTRL 0x04
> #define PCIE20_ELBI_SYS_CTRL_LT_ENABLE BIT(0)
> @@ -72,9 +80,18 @@ struct qcom_pcie_resources_v1 {
> struct regulator *vdda;
> };
>
> +struct qcom_pcie_resources_v2 {
> + struct clk *aux_clk;
> + struct clk *master_clk;
> + struct clk *slave_clk;
> + struct clk *cfg_clk;
> + struct clk *pipe_clk;
> +};
> +
> union qcom_pcie_resources {
> struct qcom_pcie_resources_v0 v0;
> struct qcom_pcie_resources_v1 v1;
> + struct qcom_pcie_resources_v2 v2;
> };
>
> struct qcom_pcie;
> @@ -82,7 +99,9 @@ struct qcom_pcie;
> struct qcom_pcie_ops {
> int (*get_resources)(struct qcom_pcie *pcie);
> int (*init)(struct qcom_pcie *pcie);
> + int (*post_init)(struct qcom_pcie *pcie);
> void (*deinit)(struct qcom_pcie *pcie);
> + void (*ltssm_enable)(struct qcom_pcie *pcie);
> };
>
> struct qcom_pcie {
> @@ -116,17 +135,33 @@ static irqreturn_t qcom_pcie_msi_irq_handler(int irq, void *arg)
> return dw_handle_msi_irq(pp);
> }
>
> -static int qcom_pcie_establish_link(struct qcom_pcie *pcie)
> +static void qcom_pcie_v0_v1_ltssm_enable(struct qcom_pcie *pcie)
> {
> u32 val;
> -
> - if (dw_pcie_link_up(&pcie->pp))
> - return 0;
> -
> /* enable link training */
> val = readl(pcie->elbi + PCIE20_ELBI_SYS_CTRL);
> val |= PCIE20_ELBI_SYS_CTRL_LT_ENABLE;
> writel(val, pcie->elbi + PCIE20_ELBI_SYS_CTRL);
> +}
> +
> +static void qcom_pcie_v2_ltssm_enable(struct qcom_pcie *pcie)
> +{
> + u32 val;
> + /* enable link training */
> + val = readl(pcie->parf + PCIE20_PARF_LTSSM);
> + val |= BIT(8);
> + writel(val, pcie->parf + PCIE20_PARF_LTSSM);
> +}
> +
> +static int qcom_pcie_establish_link(struct qcom_pcie *pcie)
> +{
> +
> + if (dw_pcie_link_up(&pcie->pp))
> + return 0;
> +
> + /* Enable Link Training state machine */
> + if (pcie->ops->ltssm_enable)
> + pcie->ops->ltssm_enable(pcie);
>
> return dw_pcie_wait_for_link(&pcie->pp);
> }
> @@ -421,6 +456,113 @@ static int qcom_pcie_init_v1(struct qcom_pcie *pcie)
> return ret;
> }
>
> +static int qcom_pcie_get_resources_v2(struct qcom_pcie *pcie)
> +{
> + struct qcom_pcie_resources_v2 *res = &pcie->res.v2;
> + struct device *dev = pcie->pp.dev;
> +
> + res->aux_clk = devm_clk_get(dev, "aux");
> + if (IS_ERR(res->aux_clk))
> + return PTR_ERR(res->aux_clk);
> +
> + res->cfg_clk = devm_clk_get(dev, "cfg");
> + if (IS_ERR(res->cfg_clk))
> + return PTR_ERR(res->cfg_clk);
> +
> + res->master_clk = devm_clk_get(dev, "bus_master");
> + if (IS_ERR(res->master_clk))
> + return PTR_ERR(res->master_clk);
> +
> + res->slave_clk = devm_clk_get(dev, "bus_slave");
> + if (IS_ERR(res->slave_clk))
> + return PTR_ERR(res->slave_clk);
> +
> + res->pipe_clk = devm_clk_get(dev, "pipe");
> + if (IS_ERR(res->pipe_clk))
> + return PTR_ERR(res->pipe_clk);
> +
> + return 0;
> +}
> +
> +static int qcom_pcie_init_v2(struct qcom_pcie *pcie)
> +{
> + struct qcom_pcie_resources_v2 *res = &pcie->res.v2;
> + struct device *dev = pcie->pp.dev;
> + u32 val;
> + int ret;
> +
> + ret = clk_prepare_enable(res->aux_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable aux clock\n");
> + return ret;
> + }
> +
> + ret = clk_prepare_enable(res->cfg_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable cfg clock\n");
> + goto err_cfg_clk;
> + }
> +
> + ret = clk_prepare_enable(res->master_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable master clock\n");
> + goto err_master_clk;
> + }
> +
> + ret = clk_prepare_enable(res->slave_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable slave clock\n");
> + goto err_slave_clk;
> + }
> +
> + /* enable PCIe clocks and resets */
> + val = readl(pcie->parf + PCIE20_PARF_PHY_CTRL);
> + val &= ~BIT(0);
> + writel(val, pcie->parf + PCIE20_PARF_PHY_CTRL);
> +
> + /* change DBI base address */
> + writel(0, pcie->parf + PCIE20_PARF_DBI_BASE_ADDR);
> +
> + /* MAC PHY_POWERDOWN MUX DISABLE */
> + val = readl(pcie->parf + PCIE20_PARF_SYS_CTRL);
> + val &= ~BIT(29);
> + writel(val, pcie->parf + PCIE20_PARF_SYS_CTRL);
> +
> + val = readl(pcie->parf + PCIE20_PARF_MHI_CLOCK_RESET_CTRL);
> + val |= BIT(4);
> + writel(val, pcie->parf + PCIE20_PARF_MHI_CLOCK_RESET_CTRL);
> +
> + val = readl(pcie->parf + MSM8996_PCIE20_PARF_AXI_MSTR_WR_ADDR_HALT);
> + val |= BIT(31);
> + writel(val, pcie->parf + MSM8996_PCIE20_PARF_AXI_MSTR_WR_ADDR_HALT);
> +
> + return 0;
> +
> +err_slave_clk:
> + clk_disable_unprepare(res->master_clk);
> +err_master_clk:
> + clk_disable_unprepare(res->cfg_clk);
> +err_cfg_clk:
> + clk_disable_unprepare(res->aux_clk);
> +
> + return ret;
> +}
> +
> +static int qcom_pcie_post_init_v2(struct qcom_pcie *pcie)
> +{
> + struct qcom_pcie_resources_v2 *res = &pcie->res.v2;
> + struct device *dev = pcie->pp.dev;
> + int ret;
> +
> + ret = clk_prepare_enable(res->pipe_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable pipe clock\n");
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> static int qcom_pcie_link_up(struct pcie_port *pp)
> {
> struct qcom_pcie *pcie = to_qcom_pcie(pp);
> @@ -429,6 +571,17 @@ static int qcom_pcie_link_up(struct pcie_port *pp)
> return !!(val & PCI_EXP_LNKSTA_DLLLA);
> }
>
> +static void qcom_pcie_deinit_v2(struct qcom_pcie *pcie)
> +{
> + struct qcom_pcie_resources_v2 *res = &pcie->res.v2;
> +
> + clk_disable_unprepare(res->pipe_clk);
> + clk_disable_unprepare(res->slave_clk);
> + clk_disable_unprepare(res->master_clk);
> + clk_disable_unprepare(res->cfg_clk);
> + clk_disable_unprepare(res->aux_clk);
> +}
> +
> static void qcom_pcie_host_init(struct pcie_port *pp)
> {
> struct qcom_pcie *pcie = to_qcom_pcie(pp);
> @@ -444,6 +597,9 @@ static void qcom_pcie_host_init(struct pcie_port *pp)
> if (ret)
> goto err_deinit;
>
> + if (pcie->ops->post_init)
> + pcie->ops->post_init(pcie);
> +
> dw_pcie_setup_rc(pp);
>
> if (IS_ENABLED(CONFIG_PCI_MSI))
> @@ -487,12 +643,22 @@ static const struct qcom_pcie_ops ops_v0 = {
> .get_resources = qcom_pcie_get_resources_v0,
> .init = qcom_pcie_init_v0,
> .deinit = qcom_pcie_deinit_v0,
> + .ltssm_enable = qcom_pcie_v0_v1_ltssm_enable,
> };
>
> static const struct qcom_pcie_ops ops_v1 = {
> .get_resources = qcom_pcie_get_resources_v1,
> .init = qcom_pcie_init_v1,
> .deinit = qcom_pcie_deinit_v1,
> + .ltssm_enable = qcom_pcie_v0_v1_ltssm_enable,
> +};
> +
> +static const struct qcom_pcie_ops ops_v2 = {
> + .get_resources = qcom_pcie_get_resources_v2,
> + .init = qcom_pcie_init_v2,
> + .post_init = qcom_pcie_post_init_v2,
> + .deinit = qcom_pcie_deinit_v2,
> + .ltssm_enable = qcom_pcie_v2_ltssm_enable,
> };
>
> static int qcom_pcie_probe(struct platform_device *pdev)
> @@ -572,6 +738,7 @@ static const struct of_device_id qcom_pcie_match[] = {
> { .compatible = "qcom,pcie-ipq8064", .data = &ops_v0 },
> { .compatible = "qcom,pcie-apq8064", .data = &ops_v0 },
> { .compatible = "qcom,pcie-apq8084", .data = &ops_v1 },
> + { .compatible = "qcom,pcie-msm8996", .data = &ops_v2 },
> { }
> };
>
> --
> 2.10.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH v4 2/3] PCI: qcom: add support to msm8996 PCIE controller
From: Bjorn Helgaas @ 2016-11-14 22:23 UTC (permalink / raw)
To: Srinivas Kandagatla
Cc: svarbanov-NEYub+7Iv8PQT0dZR+AlfA,
linux-pci-u79uwXL29TY76Z2rM5mHXA, bhelgaas-hpIqsD4AKlfQT0dZR+AlfA,
robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
linux-arm-msm-u79uwXL29TY76Z2rM5mHXA,
devicetree-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1479122155-13393-3-git-send-email-srinivas.kandagatla-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
On Mon, Nov 14, 2016 at 11:15:54AM +0000, Srinivas Kandagatla wrote:
> This patch adds support to msm8996/apq8096 pcie, MSM8996 supports
> Gen 1/2, One lane, 3 pcie root-complex with support to MSI and
> legacy interrupts and it conforms to PCI Express Base 2.1 specification.
>
> This patch adds a post_init callback to qcom_pcie_ops, as the pcie
> pipe clocks are only set up after the phy is powered on.
> It also adds ltssm_enable callback as it is very much different to other
> supported SOCs in the driver.
>
> Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
Will need ack from Stanimir before I can apply it.
> ---
> .../devicetree/bindings/pci/qcom,pcie.txt | 67 +++++++-
> drivers/pci/host/pcie-qcom.c | 177 ++++++++++++++++++++-
> 2 files changed, 238 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt
> index 4059a6f..141d8c3 100644
> --- a/Documentation/devicetree/bindings/pci/qcom,pcie.txt
> +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt
> @@ -7,6 +7,7 @@
> - "qcom,pcie-ipq8064" for ipq8064
> - "qcom,pcie-apq8064" for apq8064
> - "qcom,pcie-apq8084" for apq8084
> + - "qcom,pcie-msm8996" for msm8996 or apq8096
>
> - reg:
> Usage: required
> @@ -92,6 +93,17 @@
> - "aux" Auxiliary (AUX) clock
> - "bus_master" Master AXI clock
> - "bus_slave" Slave AXI clock
> +
> +- clock-names:
> + Usage: required for msm8996/apq8096
> + Value type: <stringlist>
> + Definition: Should contain the following entries
> + - "pipe" Pipe Clock driving internal logic.
> + - "aux" Auxiliary (AUX) clock.
> + - "cfg" Configuration clk.
> + - "bus_master" Master AXI clock.
> + - "bus_slave" Slave AXI clock.
> +
> - resets:
> Usage: required
> Value type: <prop-encoded-array>
> @@ -115,7 +127,7 @@
> - "core" Core reset
>
> - power-domains:
> - Usage: required for apq8084
> + Usage: required for apq8084 and msm8996/apq8096
> Value type: <prop-encoded-array>
> Definition: A phandle and power domain specifier pair to the
> power domain which is responsible for collapsing
> @@ -231,3 +243,56 @@
> pinctrl-0 = <&pcie0_pins_default>;
> pinctrl-names = "default";
> };
> +
> +* Example for apq8096:
> +
> + pcie@608000{
> + compatible = "qcom,pcie-msm8996", "snps,dw-pcie";
> + power-domains = <&gcc PCIE1_GDSC>;
> + bus-range = <0x00 0xff>;
> + num-lanes = <1>;
> +
> + reg = <0x00608000 0x2000>,
> + <0x0d000000 0xf1d>,
> + <0x0d000f20 0xa8>,
> + <0x0d100000 0x100000>;
> +
> + reg-names = "parf", "dbi", "elbi", "config";
> +
> + phys = <&pcie_phy 1>;
> + phy-names = "pciephy";
> +
> + #address-cells = <3>;
> + #size-cells = <2>;
> + ranges = <0x01000000 0x0 0x0d200000 0x0d200000 0x0 0x100000>,
> + <0x02000000 0x0 0x0d300000 0x0d300000 0x0 0xd00000>;
> +
> + interrupts = <GIC_SPI 413 IRQ_TYPE_NONE>;
> + interrupt-names = "msi";
> + #interrupt-cells = <1>;
> + interrupt-map-mask = <0 0 0 0x7>;
> + interrupt-map = <0 0 0 1 &intc 0 272 IRQ_TYPE_LEVEL_HIGH>, /* int_a */
> + <0 0 0 2 &intc 0 273 IRQ_TYPE_LEVEL_HIGH>, /* int_b */
> + <0 0 0 3 &intc 0 274 IRQ_TYPE_LEVEL_HIGH>, /* int_c */
> + <0 0 0 4 &intc 0 275 IRQ_TYPE_LEVEL_HIGH>; /* int_d */
> +
> + pinctrl-names = "default", "sleep";
> + pinctrl-0 = <&pcie1_clkreq_default &pcie1_perst_default &pcie1_wake_default>;
> + pinctrl-1 = <&pcie1_clkreq_sleep &pcie1_perst_default &pcie1_wake_sleep>;
> +
> + vdda-1p8-supply = <&pm8994_l12>;
> + vdda-supply = <&pm8994_l28>;
> + linux,pci-domain = <1>;
> +
> + clocks = <&gcc GCC_PCIE_1_PIPE_CLK>,
> + <&gcc GCC_PCIE_1_AUX_CLK>,
> + <&gcc GCC_PCIE_1_CFG_AHB_CLK>,
> + <&gcc GCC_PCIE_1_MSTR_AXI_CLK>,
> + <&gcc GCC_PCIE_1_SLV_AXI_CLK>;
> +
> + clock-names = "pipe",
> + "aux",
> + "cfg",
> + "bus_master",
> + "bus_slave";
> + };
> diff --git a/drivers/pci/host/pcie-qcom.c b/drivers/pci/host/pcie-qcom.c
> index 3593640..03ba6b1 100644
> --- a/drivers/pci/host/pcie-qcom.c
> +++ b/drivers/pci/host/pcie-qcom.c
> @@ -36,11 +36,19 @@
>
> #include "pcie-designware.h"
>
> +#define PCIE20_PARF_DBI_BASE_ADDR 0x168
> +
> +#define PCIE20_PARF_SYS_CTRL 0x00
> #define PCIE20_PARF_PHY_CTRL 0x40
> #define PCIE20_PARF_PHY_REFCLK 0x4C
> #define PCIE20_PARF_DBI_BASE_ADDR 0x168
> #define PCIE20_PARF_SLV_ADDR_SPACE_SIZE 0x16c
> +#define PCIE20_PARF_MHI_CLOCK_RESET_CTRL 0x174
> #define PCIE20_PARF_AXI_MSTR_WR_ADDR_HALT 0x178
> +#define MSM8996_PCIE20_PARF_AXI_MSTR_WR_ADDR_HALT 0x1A8
> +#define PCIE20_PARF_LTSSM 0x1B0
> +#define PCIE20_PARF_SID_OFFSET 0x234
> +#define PCIE20_PARF_BDF_TRANSLATE_CFG 0x24C
>
> #define PCIE20_ELBI_SYS_CTRL 0x04
> #define PCIE20_ELBI_SYS_CTRL_LT_ENABLE BIT(0)
> @@ -72,9 +80,18 @@ struct qcom_pcie_resources_v1 {
> struct regulator *vdda;
> };
>
> +struct qcom_pcie_resources_v2 {
> + struct clk *aux_clk;
> + struct clk *master_clk;
> + struct clk *slave_clk;
> + struct clk *cfg_clk;
> + struct clk *pipe_clk;
> +};
> +
> union qcom_pcie_resources {
> struct qcom_pcie_resources_v0 v0;
> struct qcom_pcie_resources_v1 v1;
> + struct qcom_pcie_resources_v2 v2;
> };
>
> struct qcom_pcie;
> @@ -82,7 +99,9 @@ struct qcom_pcie;
> struct qcom_pcie_ops {
> int (*get_resources)(struct qcom_pcie *pcie);
> int (*init)(struct qcom_pcie *pcie);
> + int (*post_init)(struct qcom_pcie *pcie);
> void (*deinit)(struct qcom_pcie *pcie);
> + void (*ltssm_enable)(struct qcom_pcie *pcie);
> };
>
> struct qcom_pcie {
> @@ -116,17 +135,33 @@ static irqreturn_t qcom_pcie_msi_irq_handler(int irq, void *arg)
> return dw_handle_msi_irq(pp);
> }
>
> -static int qcom_pcie_establish_link(struct qcom_pcie *pcie)
> +static void qcom_pcie_v0_v1_ltssm_enable(struct qcom_pcie *pcie)
> {
> u32 val;
> -
> - if (dw_pcie_link_up(&pcie->pp))
> - return 0;
> -
> /* enable link training */
> val = readl(pcie->elbi + PCIE20_ELBI_SYS_CTRL);
> val |= PCIE20_ELBI_SYS_CTRL_LT_ENABLE;
> writel(val, pcie->elbi + PCIE20_ELBI_SYS_CTRL);
> +}
> +
> +static void qcom_pcie_v2_ltssm_enable(struct qcom_pcie *pcie)
> +{
> + u32 val;
> + /* enable link training */
> + val = readl(pcie->parf + PCIE20_PARF_LTSSM);
> + val |= BIT(8);
> + writel(val, pcie->parf + PCIE20_PARF_LTSSM);
> +}
> +
> +static int qcom_pcie_establish_link(struct qcom_pcie *pcie)
> +{
> +
> + if (dw_pcie_link_up(&pcie->pp))
> + return 0;
> +
> + /* Enable Link Training state machine */
> + if (pcie->ops->ltssm_enable)
> + pcie->ops->ltssm_enable(pcie);
>
> return dw_pcie_wait_for_link(&pcie->pp);
> }
> @@ -421,6 +456,113 @@ static int qcom_pcie_init_v1(struct qcom_pcie *pcie)
> return ret;
> }
>
> +static int qcom_pcie_get_resources_v2(struct qcom_pcie *pcie)
> +{
> + struct qcom_pcie_resources_v2 *res = &pcie->res.v2;
> + struct device *dev = pcie->pp.dev;
> +
> + res->aux_clk = devm_clk_get(dev, "aux");
> + if (IS_ERR(res->aux_clk))
> + return PTR_ERR(res->aux_clk);
> +
> + res->cfg_clk = devm_clk_get(dev, "cfg");
> + if (IS_ERR(res->cfg_clk))
> + return PTR_ERR(res->cfg_clk);
> +
> + res->master_clk = devm_clk_get(dev, "bus_master");
> + if (IS_ERR(res->master_clk))
> + return PTR_ERR(res->master_clk);
> +
> + res->slave_clk = devm_clk_get(dev, "bus_slave");
> + if (IS_ERR(res->slave_clk))
> + return PTR_ERR(res->slave_clk);
> +
> + res->pipe_clk = devm_clk_get(dev, "pipe");
> + if (IS_ERR(res->pipe_clk))
> + return PTR_ERR(res->pipe_clk);
> +
> + return 0;
> +}
> +
> +static int qcom_pcie_init_v2(struct qcom_pcie *pcie)
> +{
> + struct qcom_pcie_resources_v2 *res = &pcie->res.v2;
> + struct device *dev = pcie->pp.dev;
> + u32 val;
> + int ret;
> +
> + ret = clk_prepare_enable(res->aux_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable aux clock\n");
> + return ret;
> + }
> +
> + ret = clk_prepare_enable(res->cfg_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable cfg clock\n");
> + goto err_cfg_clk;
> + }
> +
> + ret = clk_prepare_enable(res->master_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable master clock\n");
> + goto err_master_clk;
> + }
> +
> + ret = clk_prepare_enable(res->slave_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable slave clock\n");
> + goto err_slave_clk;
> + }
> +
> + /* enable PCIe clocks and resets */
> + val = readl(pcie->parf + PCIE20_PARF_PHY_CTRL);
> + val &= ~BIT(0);
> + writel(val, pcie->parf + PCIE20_PARF_PHY_CTRL);
> +
> + /* change DBI base address */
> + writel(0, pcie->parf + PCIE20_PARF_DBI_BASE_ADDR);
> +
> + /* MAC PHY_POWERDOWN MUX DISABLE */
> + val = readl(pcie->parf + PCIE20_PARF_SYS_CTRL);
> + val &= ~BIT(29);
> + writel(val, pcie->parf + PCIE20_PARF_SYS_CTRL);
> +
> + val = readl(pcie->parf + PCIE20_PARF_MHI_CLOCK_RESET_CTRL);
> + val |= BIT(4);
> + writel(val, pcie->parf + PCIE20_PARF_MHI_CLOCK_RESET_CTRL);
> +
> + val = readl(pcie->parf + MSM8996_PCIE20_PARF_AXI_MSTR_WR_ADDR_HALT);
> + val |= BIT(31);
> + writel(val, pcie->parf + MSM8996_PCIE20_PARF_AXI_MSTR_WR_ADDR_HALT);
> +
> + return 0;
> +
> +err_slave_clk:
> + clk_disable_unprepare(res->master_clk);
> +err_master_clk:
> + clk_disable_unprepare(res->cfg_clk);
> +err_cfg_clk:
> + clk_disable_unprepare(res->aux_clk);
> +
> + return ret;
> +}
> +
> +static int qcom_pcie_post_init_v2(struct qcom_pcie *pcie)
> +{
> + struct qcom_pcie_resources_v2 *res = &pcie->res.v2;
> + struct device *dev = pcie->pp.dev;
> + int ret;
> +
> + ret = clk_prepare_enable(res->pipe_clk);
> + if (ret) {
> + dev_err(dev, "cannot prepare/enable pipe clock\n");
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> static int qcom_pcie_link_up(struct pcie_port *pp)
> {
> struct qcom_pcie *pcie = to_qcom_pcie(pp);
> @@ -429,6 +571,17 @@ static int qcom_pcie_link_up(struct pcie_port *pp)
> return !!(val & PCI_EXP_LNKSTA_DLLLA);
> }
>
> +static void qcom_pcie_deinit_v2(struct qcom_pcie *pcie)
> +{
> + struct qcom_pcie_resources_v2 *res = &pcie->res.v2;
> +
> + clk_disable_unprepare(res->pipe_clk);
> + clk_disable_unprepare(res->slave_clk);
> + clk_disable_unprepare(res->master_clk);
> + clk_disable_unprepare(res->cfg_clk);
> + clk_disable_unprepare(res->aux_clk);
> +}
> +
> static void qcom_pcie_host_init(struct pcie_port *pp)
> {
> struct qcom_pcie *pcie = to_qcom_pcie(pp);
> @@ -444,6 +597,9 @@ static void qcom_pcie_host_init(struct pcie_port *pp)
> if (ret)
> goto err_deinit;
>
> + if (pcie->ops->post_init)
> + pcie->ops->post_init(pcie);
> +
> dw_pcie_setup_rc(pp);
>
> if (IS_ENABLED(CONFIG_PCI_MSI))
> @@ -487,12 +643,22 @@ static const struct qcom_pcie_ops ops_v0 = {
> .get_resources = qcom_pcie_get_resources_v0,
> .init = qcom_pcie_init_v0,
> .deinit = qcom_pcie_deinit_v0,
> + .ltssm_enable = qcom_pcie_v0_v1_ltssm_enable,
> };
>
> static const struct qcom_pcie_ops ops_v1 = {
> .get_resources = qcom_pcie_get_resources_v1,
> .init = qcom_pcie_init_v1,
> .deinit = qcom_pcie_deinit_v1,
> + .ltssm_enable = qcom_pcie_v0_v1_ltssm_enable,
> +};
> +
> +static const struct qcom_pcie_ops ops_v2 = {
> + .get_resources = qcom_pcie_get_resources_v2,
> + .init = qcom_pcie_init_v2,
> + .post_init = qcom_pcie_post_init_v2,
> + .deinit = qcom_pcie_deinit_v2,
> + .ltssm_enable = qcom_pcie_v2_ltssm_enable,
> };
>
> static int qcom_pcie_probe(struct platform_device *pdev)
> @@ -572,6 +738,7 @@ static const struct of_device_id qcom_pcie_match[] = {
> { .compatible = "qcom,pcie-ipq8064", .data = &ops_v0 },
> { .compatible = "qcom,pcie-apq8064", .data = &ops_v0 },
> { .compatible = "qcom,pcie-apq8084", .data = &ops_v1 },
> + { .compatible = "qcom,pcie-msm8996", .data = &ops_v2 },
> { }
> };
>
> --
> 2.10.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH V4 15/15] blk-throttle: add latency target support
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>
One hard problem adding .high limit is to detect idle cgroup. If one
cgroup doesn't dispatch enough IO against its high limit, we must have a
mechanism to determine if other cgroups dispatch more IO. We added the
think time detection mechanism before, but it doesn't work for all
workloads. Here we add a latency based approach.
We calculate the average request size and average latency of a cgroup.
Then we can calculate the target latency for the cgroup with the average
request size and the equation. In queue LIMIT_HIGH state, if a cgroup
doesn't dispatch enough IO against high limit but its average latency is
lower than its target latency, we treat the cgroup idle. In this case
other cgroups can dispatch more IO, eg, across their high limit.
Similarly in queue LIMIT_MAX state, if a cgroup doesn't dispatch enough
IO but its average latency is higher than its target latency, we treat
the cgroup busy. In this case, we should throttle other cgroups to make
the first cgroup's latency lower.
If cgroup's average request size is big (currently set to 128k), we
always treat the cgroup busy (the think time check is still effective
though).
Currently this latency target check is only for SSD as we can't
calculate the latency target for hard disk. And this is only for cgroup
leaf node so far.
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-throttle.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---
include/linux/blk_types.h | 1 +
2 files changed, 56 insertions(+), 3 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ac4d9ea..d07f332 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -156,6 +156,12 @@ struct throtl_grp {
u64 last_finish_time;
u64 checked_last_finish_time;
u64 avg_ttime;
+
+ unsigned int bio_batch;
+ u64 total_latency;
+ u64 avg_latency;
+ u64 total_size;
+ u64 avg_size;
};
/* We measure latency for request size from 4k to 4k * ( 1 << 4) */
@@ -1734,12 +1740,30 @@ static unsigned long tg_last_high_overflow_time(struct throtl_grp *tg)
return ret;
}
+static u64 throtl_target_latency(struct throtl_data *td,
+ struct throtl_grp *tg)
+{
+ if (td->line_slope == 0 || tg->latency_target == 0)
+ return 0;
+
+ /* latency_target + f(avg_size) - f(4k) */
+ return td->line_slope * ((tg->avg_size >> 10) - 4) +
+ tg->latency_target;
+}
+
static bool throtl_tg_is_idle(struct throtl_grp *tg)
{
- /* cgroup is idle if average think time is more than threshold */
- return ktime_get_ns() - tg->last_finish_time >
+ /*
+ * cgroup is idle if:
+ * 1. average think time is higher than threshold
+ * 2. average request size is small and average latency is lower
+ * than target
+ */
+ return (ktime_get_ns() - tg->last_finish_time >
4 * tg->td->idle_ttime_threshold ||
- tg->avg_ttime > tg->td->idle_ttime_threshold;
+ tg->avg_ttime > tg->td->idle_ttime_threshold) ||
+ (tg->avg_latency && tg->avg_size && tg->avg_size <= 128 * 1024 &&
+ tg->avg_latency < throtl_target_latency(tg->td, tg));
}
static bool throtl_upgrade_check_one(struct throtl_grp *tg)
@@ -2123,6 +2147,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
bio_associate_current(bio);
bio->bi_cg_private = q;
bio->bi_cg_size = bio_sectors(bio);
+ bio->bi_cg_enter_time = ktime_get_ns();
blk_throtl_update_ttime(tg);
@@ -2264,6 +2289,33 @@ void blk_throtl_bio_endio(struct bio *bio)
}
}
+ if (bio->bi_cg_enter_time && finish_time > bio->bi_cg_enter_time &&
+ tg->latency_target) {
+ lat = finish_time - bio->bi_cg_enter_time;
+ tg->total_latency += lat;
+ tg->total_size += bio->bi_cg_size << 9;
+ tg->bio_batch++;
+ }
+
+ if (tg->bio_batch >= 8) {
+ int batch = tg->bio_batch;
+ u64 size = tg->total_size;
+
+ lat = tg->total_latency;
+
+ tg->bio_batch = 0;
+ tg->total_latency = 0;
+ tg->total_size = 0;
+
+ if (batch) {
+ do_div(lat, batch);
+ tg->avg_latency = (tg->avg_latency * 7 +
+ lat) >> 3;
+ do_div(size, batch);
+ tg->avg_size = (tg->avg_size * 7 + size) >> 3;
+ }
+ }
+
end:
rcu_read_unlock();
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 45bb437..fe87a20 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -61,6 +61,7 @@ struct bio {
struct cgroup_subsys_state *bi_css;
void *bi_cg_private;
u64 bi_cg_issue_time;
+ u64 bi_cg_enter_time;
sector_t bi_cg_size;
#endif
union {
--
2.9.3
^ permalink raw reply related
* [PATCH V4 13/15] blk-throttle: add a mechanism to estimate IO latency
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>
We try to set a latency target for each cgroup. The problem is latency
highly depends on request size, users can't configure the target for
every request size. The idea is users configure latency target for 4k
IO, we estimate the target latency for other request size IO.
To do this, we sample some data, eg, average latency for request size
4k, 8k, 16k, 32k, 64k. We then use an equation f(x) = a * x + b to fit
the data (x is request size in KB, f(x) is the latency). Then we can use
the equation to estimate IO target latency for any request.
To increase the chance of sampling, we actually collect data for any IO
size less than 64k, then calculate an average latency/size. This is ok
for line fit because the equation should work for average request
size/latency too.
But we shouldn't sample data at any time. If disk is congested, the
calculated data will not represent the disk's capability. Hence we only
do the sampling when block throttling is in the HIGH limit, with
assumption disk isn't congested in such state. If the assumption isn't
true, eg, high limit is too high, calculated latency target will be
higher.
How does the equation fit to actual data? I collected data from 4
different SSDs (one SATA, 3 NVMe). The error range is quite small. The
big difference between measured latency and calculated latency generally
comes from 4k IO. The biggest one has around 30% difference, which isn't
terrible as we don't need accurate latency target. We don't know if line
fit works for other SSDs though. For big request size latency, the error
range seems big. But this mechanism is to determine if we should
throttle IO (eg, if cgroup is idle). If a cgroup's average request size is
big, we can simply treat it as busy, hence we don't need the mechanism.
Hard disk is completely different. Latency depends on spindle seek
instead of request size. So this latency target feature is for SSD only.
The patch uses below algorithm to calculate the equation:
https://en.wikipedia.org/wiki/Simple_linear_regression
TODO: the latency sampling is better moving to request layer
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-throttle.c | 191 +++++++++++++++++++++++++++++++++++++++++++++-
include/linux/blk_types.h | 2 +
2 files changed, 190 insertions(+), 3 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 01b494d..a05d351 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -156,6 +156,20 @@ struct throtl_grp {
u64 avg_ttime;
};
+/* We measure latency for request size from 4k to 4k * ( 1 << 4) */
+#define LATENCY_BUCKET_SIZE 5
+
+struct latency_bucket {
+ u64 total_latency;
+ u64 total_size;
+ int samples;
+};
+
+struct avg_latency_bucket {
+ u64 latency;
+ u64 size;
+};
+
struct throtl_data
{
/* service tree for active throtl groups */
@@ -179,6 +193,12 @@ struct throtl_data
unsigned int scale;
u64 idle_ttime_threshold;
+
+ struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+ struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+ struct latency_bucket __percpu *latency_buckets;
+ s64 line_slope;
+ unsigned long last_calculate_time;
};
static void throtl_pending_timer_fn(unsigned long arg);
@@ -288,6 +308,19 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
return ret;
}
+static int request_bucket_index(sector_t sectors)
+{
+ int i;
+
+ for (i = LATENCY_BUCKET_SIZE - 1; i >= 0; i--) {
+ if (sectors > (1 << (i + 3)))
+ break;
+ }
+ if (i == LATENCY_BUCKET_SIZE - 1)
+ return -1;
+ return i + 1;
+}
+
/**
* throtl_log - log debug message via blktrace
* @sq: the service_queue being reported
@@ -1877,6 +1910,120 @@ static void blk_throtl_update_ttime(struct throtl_grp *tg)
tg->checked_last_finish_time = last_finish_time;
}
+static void throtl_calculate_line_slope(struct throtl_data *td)
+{
+ struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+ s64 sumX;
+ s64 sumY;
+ s64 sumXY;
+ s64 sumX2;
+ s64 xMean;
+ s64 yMean;
+ s64 denominator;
+ s64 slope;
+ int i, cpu;
+ int valid_lat;
+ u64 last_latency = 0;
+
+ if (!blk_queue_nonrot(td->queue))
+ return;
+ if (time_before(jiffies, td->last_calculate_time + HZ))
+ return;
+ td->last_calculate_time = jiffies;
+
+ memset(avg_latency, 0, sizeof(avg_latency));
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+ for_each_possible_cpu(cpu) {
+ struct latency_bucket *bucket;
+
+ bucket = per_cpu_ptr(td->latency_buckets, cpu);
+ tmp->total_latency += bucket[i].total_latency;
+ tmp->total_size += bucket[i].total_size;
+ tmp->samples += bucket[i].samples;
+ bucket[i].total_latency = 0;
+ bucket[i].total_size = 0;
+ bucket[i].samples = 0;
+ }
+
+ if (tmp->samples >= 32) {
+ u64 latency = tmp->total_latency;
+ u64 size = tmp->total_size;
+ int samples = tmp->samples;
+
+ tmp->total_latency = 0;
+ tmp->total_size = 0;
+ tmp->samples = 0;
+ do_div(size, samples);
+ if (size == 0 || size > (1 << (i + 12)))
+ continue;
+ avg_latency[i].size = size;
+ do_div(latency, samples);
+ if (latency == 0)
+ continue;
+ avg_latency[i].latency = latency;
+ }
+ }
+
+ valid_lat = 0;
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ if (!td->avg_buckets[i].latency && !avg_latency[i].latency)
+ continue;
+ valid_lat++;
+ if (!td->avg_buckets[i].latency) {
+ td->avg_buckets[i].latency = avg_latency[i].latency;
+ td->avg_buckets[i].size = avg_latency[i].size;
+ continue;
+ }
+ if (!avg_latency[i].latency)
+ continue;
+ /* make it smooth */
+ td->avg_buckets[i].latency = (td->avg_buckets[i].latency * 7 +
+ avg_latency[i].latency) >> 3;
+ td->avg_buckets[i].size = (td->avg_buckets[i].size * 7 +
+ avg_latency[i].size) >> 3;
+ /* filter out abnormal latency */
+ if (td->avg_buckets[i].latency <= last_latency) {
+ td->avg_buckets[i].latency = 0;
+ valid_lat--;
+ } else
+ last_latency = td->avg_buckets[i].latency;
+ }
+
+ if (valid_lat < 2)
+ return;
+
+ sumX = 0;
+ sumY = 0;
+ sumXY = 0;
+ sumX2 = 0;
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ u64 x, y;
+
+ if (td->avg_buckets[i].latency == 0)
+ continue;
+
+ x = td->avg_buckets[i].size >> 10;
+ y = td->avg_buckets[i].latency;
+ sumX += x;
+ sumY += y;
+
+ sumXY += x * y;
+ sumX2 += x * x;
+ }
+
+ xMean = sumX;
+ do_div(xMean, valid_lat);
+ yMean = sumY;
+ do_div(yMean, valid_lat);
+ denominator = sumX2 - sumX * xMean;
+
+ slope = sumXY - sumX * yMean;
+ do_div(slope, denominator);
+ td->line_slope = slope;
+}
+
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio)
{
@@ -1901,11 +2048,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
spin_lock_irq(q->queue_lock);
+ throtl_calculate_line_slope(tg->td);
+
if (unlikely(blk_queue_bypass(q)))
goto out_unlock;
bio_associate_current(bio);
bio->bi_cg_private = q;
+ bio->bi_cg_size = bio_sectors(bio);
blk_throtl_update_ttime(tg);
@@ -1992,8 +2142,11 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
* don't want bios to leave with the flag set. Clear the flag if
* being issued.
*/
- if (!throttled)
+ if (!throttled) {
+ if (blk_queue_nonrot(q))
+ bio->bi_cg_issue_time = ktime_get_ns();
bio->bi_opf &= ~REQ_THROTTLED;
+ }
return throttled;
}
@@ -2003,6 +2156,9 @@ void blk_throtl_bio_endio(struct bio *bio)
struct blkcg_gq *blkg;
struct throtl_grp *tg;
struct request_queue *q;
+ struct throtl_data *td;
+ u64 finish_time;
+ u64 lat;
q = bio->bi_cg_private;
if (!q)
@@ -2019,7 +2175,27 @@ void blk_throtl_bio_endio(struct bio *bio)
tg = blkg_to_tg(blkg ?: q->root_blkg);
- tg->last_finish_time = ktime_get_ns();
+ finish_time = ktime_get_ns();
+ tg->last_finish_time = finish_time;
+
+ td = tg->td;
+
+ if (bio->bi_cg_issue_time && finish_time > bio->bi_cg_issue_time) {
+ int index;
+
+ lat = finish_time - bio->bi_cg_issue_time;
+ index = request_bucket_index(bio->bi_cg_size);
+ if (index >= 0 && bio_op(bio) == REQ_OP_READ &&
+ td->limit_index == LIMIT_HIGH) {
+ struct latency_bucket *latency;
+
+ latency = get_cpu_ptr(td->latency_buckets);
+ latency[index].total_latency += lat;
+ latency[index].total_size += bio->bi_cg_size << 9;
+ latency[index].samples++;
+ put_cpu_ptr(td->latency_buckets);
+ }
+ }
end:
rcu_read_unlock();
@@ -2097,6 +2273,12 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
+ td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+ LATENCY_BUCKET_SIZE, __alignof__(u64));
+ if (!td->latency_buckets) {
+ kfree(td);
+ return -ENOMEM;
+ }
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
@@ -2113,8 +2295,10 @@ int blk_throtl_init(struct request_queue *q)
td->idle_ttime_threshold = -1;
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
- if (ret)
+ if (ret) {
+ free_percpu(td->latency_buckets);
kfree(td);
+ }
return ret;
}
@@ -2123,6 +2307,7 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+ free_percpu(q->td->latency_buckets);
kfree(q->td);
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ff8dd24..45bb437 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -60,6 +60,8 @@ struct bio {
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
void *bi_cg_private;
+ u64 bi_cg_issue_time;
+ sector_t bi_cg_size;
#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
--
2.9.3
^ permalink raw reply related
* [PATCH V4 09/15] blk-throttle: make bandwidth change smooth
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>
When cgroups all reach high limit, cgroups can dispatch more IO. This
could make some cgroups dispatch more IO but others not, and even some
cgroups could dispatch less IO than their high limit. For example, cg1
high limit 10MB/s, cg2 limit 80MB/s, assume disk maximum bandwidth is
120M/s for the workload. Their bps could be something like this:
cg1/cg2 bps: T1: 10/80 -> T2: 60/60 -> T3: 10/80
At T1, all cgroups reach high limit, so they can dispatch more IO later.
Then cg1 dispatch more IO and cg2 has no room to dispatch enough IO. At
T2, cg2 only dispatches 60M/s. Since we detect cg2 dispatches less IO
than its high limit 80M/s, we downgrade the queue from LIMIT_MAX to
LIMIT_HIGH, then all cgroups are throttled to their high limit (T3). cg2
will have bandwidth below its high limit at most time.
The big problem here is we don't know the maximum bandwidth of the
workload, so we can't make smart decision to avoid the situation. This
patch makes cgroup bandwidth change smooth. After disk upgrades from
LIMIT_HIGH to LIMIT_MAX, we don't allow cgroups to use all bandwidth up to
their max limit immediately. Their bandwidth limit will be increased
gradually to avoid above situation. So above example will become
something like:
cg1/cg2 bps: 10/80 -> 15/105 -> 20/100 -> 25/95 -> 30/90 -> 35/85 -> 40/80
-> 45/75 -> 10/80
In this way cgroups bandwidth will be above their limit in majority
time, this still doesn't fully utilize disk bandwidth, but that's
something we pay for sharing.
Note this doesn't completely avoid cgroup running under its high limit.
The best way to guarantee cgroup doesn't run under its limit is to set
max limit. For example, if we set cg1 max limit to 40, cg2 will never
run under its high limit.
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-throttle.c | 42 ++++++++++++++++++++++++++++++++++++++----
1 file changed, 38 insertions(+), 4 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 32cc6ec..45a28c4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -170,6 +170,8 @@ struct throtl_data
unsigned long high_upgrade_time;
unsigned long high_downgrade_time;
+
+ unsigned int scale;
};
static void throtl_pending_timer_fn(unsigned long arg);
@@ -224,12 +226,27 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
{
struct blkcg_gq *blkg = tg_to_blkg(tg);
+ struct throtl_data *td;
uint64_t ret;
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return -1;
- ret = tg->bps[rw][tg->td->limit_index];
- if (ret == -1 && tg->td->limit_index == LIMIT_HIGH)
+ td = tg->td;
+ ret = tg->bps[rw][td->limit_index];
+ if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_HIGH] != -1) {
+ uint64_t increase;
+
+ if (td->scale < 4096 && time_after_eq(jiffies,
+ td->high_upgrade_time + td->scale * td->throtl_slice)) {
+ unsigned int time = jiffies - td->high_upgrade_time;
+
+ td->scale = time / td->throtl_slice;
+ }
+ increase = (tg->bps[rw][LIMIT_HIGH] >> 1) * td->scale;
+ ret = min(tg->bps[rw][LIMIT_MAX],
+ tg->bps[rw][LIMIT_HIGH] + increase);
+ }
+ if (ret == -1 && td->limit_index == LIMIT_HIGH)
return tg->bps[rw][LIMIT_MAX];
return ret;
@@ -238,12 +255,28 @@ static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
{
struct blkcg_gq *blkg = tg_to_blkg(tg);
+ struct throtl_data *td;
unsigned int ret;
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return -1;
- ret = tg->iops[rw][tg->td->limit_index];
- if (ret == -1 && tg->td->limit_index == LIMIT_HIGH)
+ td = tg->td;
+ ret = tg->iops[rw][td->limit_index];
+ if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_HIGH] != -1) {
+ uint64_t increase;
+
+ if (td->scale < 4096 && time_after_eq(jiffies,
+ td->high_upgrade_time + td->scale * td->throtl_slice)) {
+ unsigned int time = jiffies - td->high_upgrade_time;
+
+ td->scale = time / td->throtl_slice;
+ }
+
+ increase = (tg->iops[rw][LIMIT_HIGH] >> 1) * td->scale;
+ ret = min(tg->iops[rw][LIMIT_MAX],
+ tg->iops[rw][LIMIT_HIGH] + (unsigned int)increase);
+ }
+ if (ret == -1 && td->limit_index == LIMIT_HIGH)
return tg->iops[rw][LIMIT_MAX];
return ret;
}
@@ -1676,6 +1709,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
td->limit_index = LIMIT_MAX;
td->high_upgrade_time = jiffies;
+ td->scale = 0;
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
--
2.9.3
^ permalink raw reply related
* [PATCH V4 08/15] blk-throttle: detect completed idle cgroup
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>
cgroup could be assigned a limit, but doesn't dispatch enough IO, eg the
cgroup is idle. When this happens, the cgroup doesn't hit its limit, so
we can't move the state machine to higher level and all cgroups will be
throttled to their lower limit, so we waste bandwidth. Detecting idle
cgroup is hard. This patch handles a simple case, a cgroup doesn't
dispatch any IO. We ignore such cgroup's limit, so other cgroups can use
the bandwidth.
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-throttle.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e85b2b6..32cc6ec 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -144,6 +144,8 @@ struct throtl_grp {
unsigned long last_check_time;
+ unsigned long last_dispatch_time[2];
+
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
@@ -438,11 +440,14 @@ static void tg_update_has_rules(struct throtl_grp *tg)
static void throtl_pd_online(struct blkg_policy_data *pd)
{
+ struct throtl_grp *tg = pd_to_tg(pd);
/*
* We don't want new groups to escape the limits of its ancestors.
* Update has_rules[] after a new group is brought online.
*/
- tg_update_has_rules(pd_to_tg(pd));
+ tg_update_has_rules(tg);
+ tg->last_dispatch_time[READ] = jiffies;
+ tg->last_dispatch_time[WRITE] = jiffies;
}
static void blk_throtl_update_valid_limit(struct throtl_data *td)
@@ -1611,6 +1616,12 @@ static bool throtl_upgrade_check_one(struct throtl_grp *tg)
if (write_limit && sq->nr_queued[WRITE] &&
(!read_limit || sq->nr_queued[READ]))
return true;
+
+ if (time_after_eq(jiffies,
+ tg->last_dispatch_time[READ] + tg->td->throtl_slice) &&
+ time_after_eq(jiffies,
+ tg->last_dispatch_time[WRITE] + tg->td->throtl_slice))
+ return true;
return false;
}
@@ -1691,6 +1702,11 @@ static bool throtl_downgrade_check_one(struct throtl_grp *tg)
struct throtl_data *td = tg->td;
unsigned long now = jiffies;
+ if (time_after_eq(now, tg->last_dispatch_time[READ] +
+ td->throtl_slice) &&
+ time_after_eq(now, tg->last_dispatch_time[WRITE] +
+ td->throtl_slice))
+ return false;
/*
* If cgroup is below high limit, consider downgrade and throttle other
* cgroups
@@ -1811,6 +1827,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
again:
while (true) {
+ tg->last_dispatch_time[rw] = jiffies;
if (tg->last_high_overflow_time[rw] == 0)
tg->last_high_overflow_time[rw] = jiffies;
throtl_downgrade_check(tg);
--
2.9.3
^ permalink raw reply related
* [PATCH V4 14/15] blk-throttle: add interface for per-cgroup target latency
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>
Add interface for per-cgroup target latency. This latency is for 4k
request.
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-throttle.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 67 insertions(+)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a05d351..ac4d9ea 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -147,6 +147,8 @@ struct throtl_grp {
unsigned long last_check_time;
int upgrade_check_batch;
+
+ u64 latency_target;
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
@@ -463,6 +465,7 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
tg->iops[rw][index] = -1;
}
}
+ /* target latency default 0, eg, always not meet */
return &tg->pd;
}
@@ -1572,6 +1575,64 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
return ret ?: nbytes;
}
+/*
+ * Per-blkg print callback for the latency target file.
+ *
+ * Emits one "<device> 4k_lat=<value>" line for the group behind @pd.
+ * Nothing is printed when the blkg has no device name or when the
+ * group's latency_target is 0.  @off is not used.  Always returns 0.
+ */
+static u64 tg_prfill_latency_target(struct seq_file *sf,
+				    struct blkg_policy_data *pd, int off)
+{
+	struct throtl_grp *grp = pd_to_tg(pd);
+	const char *name = blkg_dev_name(pd->blkg);
+
+	if (name && grp->latency_target != 0)
+		seq_printf(sf, "%s 4k_lat=%llu\n", name, grp->latency_target);
+
+	return 0;
+}
+
+/*
+ * seq_show handler for the "latency_target" cgroup file.
+ *
+ * Resolves the blkcg behind @sf and hands tg_prfill_latency_target() to
+ * blkcg_print_blkgs() for the throttle policy.  @v is not used.
+ * Always returns 0.
+ */
+static int tg_print_latency_target(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+	blkcg_print_blkgs(sf, blkcg, tg_prfill_latency_target,
+			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
+
+	return 0;
+}
+
+/*
+ * Write handler for the "latency_target" cgroup file.
+ *
+ * After blkg_conf_prep() has resolved the target blkg, the remainder of
+ * the input (ctx.body) must be a single "4k_lat=<decimal u64>" token.
+ *
+ * Returns @nbytes on success, the error from blkg_conf_prep() if that
+ * call fails, or -EINVAL when the token is missing, has no '=', uses a
+ * key other than "4k_lat", or carries a value kstrtou64() rejects.
+ */
+static ssize_t tg_set_latency_target(struct kernfs_open_file *of,
+				     char *buf, size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
+	char tok[27];	/* 26 characters allowed by "%26s" plus the NUL */
+	char *p;
+	u64 val;
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+	if (ret)
+		return ret;
+
+	tg = blkg_to_tg(ctx.blkg);
+
+	/*
+	 * blkg_conf_prep() left ret at 0.  Re-arm the error code so that
+	 * the parse failures below are reported to the writer instead of
+	 * falling through to "return nbytes".
+	 */
+	ret = -EINVAL;
+
+	if (sscanf(ctx.body, "%26s", tok) != 1)
+		goto out_finish;
+
+	/* Split "key=value" in place: tok becomes the key, p the value. */
+	p = tok;
+	strsep(&p, "=");
+	if (!p || kstrtou64(p, 10, &val))
+		goto out_finish;
+
+	if (strcmp(tok, "4k_lat"))
+		goto out_finish;
+
+	tg->latency_target = val;
+	ret = 0;
+out_finish:
+	blkg_conf_finish(&ctx);
+	return ret ?: nbytes;
+}
+
static struct cftype throtl_files[] = {
{
.name = "high",
@@ -1587,6 +1648,12 @@ static struct cftype throtl_files[] = {
.write = tg_set_limit,
.private = LIMIT_MAX,
},
+ {
+ .name = "latency_target",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = tg_print_latency_target,
+ .write = tg_set_latency_target,
+ },
{ } /* terminate */
};
--
2.9.3
^ permalink raw reply related
* [PATCH V4 12/15] blk-throttle: ignore idle cgroup limit
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>
The last patch introduced a way to detect idle cgroups. Use it to make
upgrade/downgrade decisions. The new algorithm can also detect completely
idle cgroups, so we can delete the corresponding code.
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-throttle.c | 39 +++++++++++++++++++++++++--------------
1 file changed, 25 insertions(+), 14 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e403e88..01b494d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -146,8 +146,7 @@ struct throtl_grp {
unsigned long last_check_time;
- unsigned long last_dispatch_time[2];
-
+ int upgrade_check_batch;
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
@@ -487,8 +486,6 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
* Update has_rules[] after a new group is brought online.
*/
tg_update_has_rules(tg);
- tg->last_dispatch_time[READ] = jiffies;
- tg->last_dispatch_time[WRITE] = jiffies;
}
static void blk_throtl_update_valid_limit(struct throtl_data *td)
@@ -1667,9 +1664,8 @@ static bool throtl_upgrade_check_one(struct throtl_grp *tg)
return true;
if (time_after_eq(jiffies,
- tg->last_dispatch_time[READ] + tg->td->throtl_slice) &&
- time_after_eq(jiffies,
- tg->last_dispatch_time[WRITE] + tg->td->throtl_slice))
+ tg_last_high_overflow_time(tg) + tg->td->throtl_slice) &&
+ throtl_tg_is_idle(tg))
return true;
return false;
}
@@ -1718,6 +1714,24 @@ static bool throtl_can_upgrade(struct throtl_data *td,
return true;
}
+/*
+ * Batched upgrade check.
+ *
+ * Does nothing unless the throttle data is at LIMIT_HIGH and at least
+ * one throtl_slice has elapsed since __tg_last_high_overflow_time(tg).
+ * Qualifying calls are counted in tg->upgrade_check_batch; only every
+ * 16th one resets the counter and asks throtl_can_upgrade() whether
+ * throtl_upgrade_state() should be called.
+ */
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+	struct throtl_data *td = tg->td;
+
+	if (td->limit_index != LIMIT_HIGH)
+		return;
+
+	if (!time_after_eq(jiffies,
+			   __tg_last_high_overflow_time(tg) + td->throtl_slice))
+		return;
+
+	/* Only run the upgrade test once per 16 qualifying calls. */
+	if (++tg->upgrade_check_batch < 16)
+		return;
+	tg->upgrade_check_batch = 0;
+
+	if (throtl_can_upgrade(td, NULL))
+		throtl_upgrade_state(td);
+}
+
static void throtl_upgrade_state(struct throtl_data *td)
{
struct cgroup_subsys_state *pos_css;
@@ -1752,18 +1766,15 @@ static bool throtl_downgrade_check_one(struct throtl_grp *tg)
struct throtl_data *td = tg->td;
unsigned long now = jiffies;
- if (time_after_eq(now, tg->last_dispatch_time[READ] +
- td->throtl_slice) &&
- time_after_eq(now, tg->last_dispatch_time[WRITE] +
- td->throtl_slice))
- return false;
/*
* If cgroup is below high limit, consider downgrade and throttle other
* cgroups
*/
if (time_after_eq(now, td->high_upgrade_time + td->throtl_slice) &&
time_after_eq(now, tg_last_high_overflow_time(tg) +
- td->throtl_slice))
+ td->throtl_slice) &&
+ (!throtl_tg_is_idle(tg) ||
+ !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
return true;
return false;
}
@@ -1902,10 +1913,10 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
again:
while (true) {
- tg->last_dispatch_time[rw] = jiffies;
if (tg->last_high_overflow_time[rw] == 0)
tg->last_high_overflow_time[rw] = jiffies;
throtl_downgrade_check(tg);
+ throtl_upgrade_check(tg);
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;
--
2.9.3
^ permalink raw reply related
* [PATCH V4 11/15] blk-throttle: add interface to configure think time threshold
From: Shaohua Li @ 2016-11-14 22:22 UTC (permalink / raw)
To: linux-block, linux-kernel; +Cc: Kernel-team, axboe, tj, vgoyal
In-Reply-To: <cover.1479161136.git.shli@fb.com>
Add an interface to configure the think time threshold.
Signed-off-by: Shaohua Li <shli@fb.com>
---
block/blk-sysfs.c | 7 +++++++
block/blk-throttle.c | 25 +++++++++++++++++++++++++
block/blk.h | 4 ++++
3 files changed, 36 insertions(+)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3e284e4..f15aeed 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -532,6 +532,12 @@ static struct queue_sysfs_entry throtl_slice_entry = {
.show = blk_throtl_slice_show,
.store = blk_throtl_slice_store,
};
+
+/*
+ * Queue sysfs attribute "throttling_idle_threshold" (mode
+ * S_IRUGO | S_IWUSR), served by the blk_throtl_idle_threshold_show()
+ * and blk_throtl_idle_threshold_store() handlers.
+ */
+static struct queue_sysfs_entry throtl_idle_threshold_entry = {
+ .attr = {.name = "throttling_idle_threshold", .mode = S_IRUGO | S_IWUSR },
+ .show = blk_throtl_idle_threshold_show,
+ .store = blk_throtl_idle_threshold_store,
+};
#endif
static struct attribute *default_attrs[] = {
@@ -563,6 +569,7 @@ static struct attribute *default_attrs[] = {
&queue_dax_entry.attr,
#ifdef CONFIG_BLK_DEV_THROTTLING
&throtl_slice_entry.attr,
+ &throtl_idle_threshold_entry.attr,
#endif
NULL,
};
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index cb5fd85..e403e88 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2139,6 +2139,31 @@ ssize_t blk_throtl_slice_store(struct request_queue *q,
return count;
}
+/*
+ * sysfs show handler for "throttling_idle_threshold".
+ *
+ * Writes q->td->idle_ttime_threshold divided by 1000 into @page,
+ * followed by a "us" suffix and a newline.  Returns the sprintf()
+ * length, or -EINVAL when the queue has no throttle data.
+ */
+ssize_t blk_throtl_idle_threshold_show(struct request_queue *q, char *page)
+{
+	u64 threshold;
+
+	/* q->td must be checked before it is dereferenced. */
+	if (!q->td)
+		return -EINVAL;
+
+	threshold = q->td->idle_ttime_threshold;
+	do_div(threshold, 1000);
+	return sprintf(page, "%lluus\n", threshold);
+}
+
+/*
+ * sysfs store handler for "throttling_idle_threshold".
+ *
+ * Parses @page as a non-zero decimal unsigned long and stores it,
+ * multiplied by 1000, in q->td->idle_ttime_threshold.  Returns @count
+ * on success and -EINVAL when the queue has no throttle data, the
+ * input does not parse, or the parsed value is 0.
+ */
+ssize_t blk_throtl_idle_threshold_store(struct request_queue *q,
+					const char *page, size_t count)
+{
+	unsigned long val;
+
+	if (!q->td || kstrtoul(page, 10, &val) || val == 0)
+		return -EINVAL;
+
+	q->td->idle_ttime_threshold = val * 1000;
+	return count;
+}
+
static int __init throtl_init(void)
{
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
diff --git a/block/blk.h b/block/blk.h
index b433f35..2ebde12 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -292,6 +292,10 @@ extern void blk_throtl_exit(struct request_queue *q);
extern ssize_t blk_throtl_slice_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_slice_store(struct request_queue *q,
const char *page, size_t count);
+extern ssize_t blk_throtl_idle_threshold_show(struct request_queue *q,
+ char *page);
+extern ssize_t blk_throtl_idle_threshold_store(struct request_queue *q,
+ const char *page, size_t count);
extern void blk_throtl_bio_endio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline void blk_throtl_drain(struct request_queue *q) { }
--
2.9.3
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.