From: Khalid Aziz <khalid@gonehiking.org>
To: Takao Indoh <indou.takao@jp.fujitsu.com>
Cc: martin.wilck@ts.fujitsu.com, linux-pci@vger.kernel.org,
x86@kernel.org, kexec@lists.infradead.org,
linux-kernel@vger.kernel.org, hbabu@us.ibm.com,
andi@firstfloor.org, ddutile@redhat.com,
ishii.hironobu@jp.fujitsu.com, hpa@zytor.com,
bhelgaas@google.com, tglx@linutronix.de, mingo@redhat.com,
vgoyal@redhat.com
Subject: Re: [PATCH v3 1/2] x86, pci: Reset PCIe devices at boot time
Date: Wed, 10 Oct 2012 14:08:24 -0600 [thread overview]
Message-ID: <1349899704.25679.14.camel@rhapsody> (raw)
In-Reply-To: <20121010074603.1084.92389.sendpatchset@indoh>
Please see comments inline:
On Wed, 2012-10-10 at 16:51 +0900, Takao Indoh wrote:
> This patch resets PCIe devices at boot time by hot reset when
> "reset_devices" is specified.
>
>
> Signed-off-by: Takao Indoh <indou.takao@jp.fujitsu.com>
> ---
> arch/x86/include/asm/pci-direct.h | 1
> arch/x86/kernel/setup.c | 3
> arch/x86/pci/early.c | 299 ++++++++++++++++++++++++++++
> drivers/pci/pci.c | 18 -
> include/linux/pci.h | 18 +
> init/main.c | 4
> 6 files changed, 323 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
> index b1e7a45..de30db2 100644
> --- a/arch/x86/include/asm/pci-direct.h
> +++ b/arch/x86/include/asm/pci-direct.h
> @@ -18,4 +18,5 @@ extern int early_pci_allowed(void);
> extern unsigned int pci_early_dump_regs;
> extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
> extern void early_dump_pci_devices(void);
> +extern void early_reset_pcie_devices(void);
> #endif /* _ASM_X86_PCI_DIRECT_H */
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index f4b9b80..24b011c 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -988,6 +988,9 @@ void __init setup_arch(char **cmdline_p)
> generic_apic_probe();
>
> early_quirks();
> +#ifdef CONFIG_PCI
> + early_reset_pcie_devices();
> +#endif
>
> /*
> * Read APIC and some other early information from ACPI tables.
> diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
> index d1067d5..584f16b 100644
> --- a/arch/x86/pci/early.c
> +++ b/arch/x86/pci/early.c
> @@ -1,5 +1,6 @@
> #include <linux/kernel.h>
> #include <linux/pci.h>
> +#include <linux/bootmem.h>
> #include <asm/pci-direct.h>
> #include <asm/io.h>
> #include <asm/pci_x86.h>
> @@ -109,3 +110,301 @@ void early_dump_pci_devices(void)
> }
> }
> }
> +
> +struct save_config {
> + u32 pci[16];
> + u16 pcie[PCI_EXP_SAVE_REGS];
> +};
> +
> +struct devinfo {
> + int pcie_pos; /* position of PCI Express capability */
> + int pcie_flags; /* PCI_EXP_FLAGS */
> + struct save_config *save;
> +};
> +
> +static struct save_config *save_cfg;
> +static void __init pci_udelay(int loops)
> +{
> + while (loops--) {
> + /* Approximately 1 us */
> + native_io_delay();
> + }
> +}
> +
> +/* Derived from drivers/pci/pci.c */
> +#define PCI_FIND_CAP_TTL 48
> +static int __init __pci_find_next_cap_ttl(u8 bus, u8 slot, u8 func,
> + u8 pos, int cap, int *ttl)
> +{
> + u8 id;
> +
> + while ((*ttl)--) {
> + pos = read_pci_config_byte(bus, slot, func, pos);
> + if (pos < 0x40)
> + break;
> + pos &= ~3;
> + id = read_pci_config_byte(bus, slot, func,
> + pos + PCI_CAP_LIST_ID);
> + if (id == 0xff)
> + break;
> + if (id == cap)
> + return pos;
> + pos += PCI_CAP_LIST_NEXT;
> + }
> + return 0;
> +}
> +
> +static int __init __pci_find_next_cap(u8 bus, u8 slot, u8 func, u8 pos, int cap)
> +{
> + int ttl = PCI_FIND_CAP_TTL;
> +
> + return __pci_find_next_cap_ttl(bus, slot, func, pos, cap, &ttl);
> +}
> +
> +static int __init __pci_bus_find_cap_start(u8 bus, u8 slot, u8 func,
> + u8 hdr_type)
> +{
> + u16 status;
> +
> + status = read_pci_config_16(bus, slot, func, PCI_STATUS);
> + if (!(status & PCI_STATUS_CAP_LIST))
> + return 0;
> +
> + switch (hdr_type) {
> + case PCI_HEADER_TYPE_NORMAL:
> + case PCI_HEADER_TYPE_BRIDGE:
> + return PCI_CAPABILITY_LIST;
> + case PCI_HEADER_TYPE_CARDBUS:
> + return PCI_CB_CAPABILITY_LIST;
> + default:
> + return 0;
> + }
> +
> + return 0;
> +}
> +
> +static int __init early_pci_find_capability(u8 bus, u8 slot, u8 func, int cap)
> +{
> + int pos;
> + u8 type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
> +
> + pos = __pci_bus_find_cap_start(bus, slot, func, type & 0x7f);
> + if (pos)
> + pos = __pci_find_next_cap(bus, slot, func, pos, cap);
> +
> + return pos;
> +}
> +
> +static void __init do_reset(u8 bus, u8 slot, u8 func)
> +{
> + u16 ctrl;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func);
> +
> + /* Assert Secondary Bus Reset */
> + ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL);
> + ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> +
> + pci_udelay(5000);
> +
> + /* De-assert Secondary Bus Reset */
> + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> +
> + pci_udelay(500000);
This is 0.5 seconds. This will add up quickly on larger servers with
multiple buses. Is 0.5 seconds required by the spec?
aer_do_secondary_bus_reset() holds PCI_BRIDGE_CTL_BUS_RESET for 2 ms and
then waits another 200 ms after de-asserting it. That is still long, but
less than half of the delay in the above code.
> +}
> +
> +static void __init save_state(unsigned bus, unsigned slot, unsigned func,
> + struct devinfo *info)
> +{
> + int i;
> + int pcie, flags, pcie_type;
> + struct save_config *save;
> +
> + pcie = info->pcie_pos;
> + flags = info->pcie_flags;
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + save = info->save;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d save state\n", bus, slot, func);
> +
> + for (i = 0; i < 16; i++)
> + save->pci[i] = read_pci_config(bus, slot, func, i * 4);
> + i = 0;
> + if (pcie_cap_has_devctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL);
> + if (pcie_cap_has_lnkctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL);
> + if (pcie_cap_has_sltctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL);
> + if (pcie_cap_has_rtctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_RTCTL);
> +
> + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL2);
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL2);
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL2);
> + }
> +}
> +
> +static void __init restore_state(unsigned bus, unsigned slot, unsigned func,
> + struct devinfo *info)
> +{
> + int i = 0;
> + int pcie, flags, pcie_type;
> + struct save_config *save;
> +
> + pcie = info->pcie_pos;
> + flags = info->pcie_flags;
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + save = info->save;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d restore state\n",
> + bus, slot, func);
> +
> + if (pcie_cap_has_devctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL, save->pcie[i++]);
> + if (pcie_cap_has_lnkctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL, save->pcie[i++]);
> + if (pcie_cap_has_sltctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL, save->pcie[i++]);
> + if (pcie_cap_has_rtctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_RTCTL, save->pcie[i++]);
> +
> + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL2, save->pcie[i++]);
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL2, save->pcie[i++]);
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL2, save->pcie[i++]);
> + }
> +
> + for (i = 15; i >= 0; i--)
> + write_pci_config(bus, slot, func, i * 4, save->pci[i]);
> +}
> +
> +static void __init reset_pcie_device(unsigned bus, unsigned slot, unsigned func)
> +{
> + int f, count;
> + int pcie, pcie_type;
> + u8 type;
> + u16 vendor, flags;
> + u32 class;
> + int secondary;
> + struct devinfo child[8];
> +
> + pcie = early_pci_find_capability(bus, slot, func, PCI_CAP_ID_EXP);
> + if (!pcie)
> + return;
> +
> + flags = read_pci_config_16(bus, slot, func, pcie + PCI_EXP_FLAGS);
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + if ((pcie_type != PCI_EXP_TYPE_ROOT_PORT) &&
> + (pcie_type != PCI_EXP_TYPE_DOWNSTREAM))
> + return;
> +
> + type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
> + if ((type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
> + return;
> + secondary = read_pci_config_byte(bus, slot, func, PCI_SECONDARY_BUS);
> + memset(child, 0, sizeof(child));
> + for (count = 0, f = 0; f < 8; f++) {
Can we use a constant instead of "8" in the loop here? There are a few
other places in kernel code with very similar loops enumerating over PCI
functions that again use "8" instead of a constant like
PCI_MAX_FUNCTIONS. I would suggest we use a constant at least in the new
code.
> + vendor = read_pci_config_16(secondary, 0, f, PCI_VENDOR_ID);
> + if (vendor == 0xffff)
> + continue;
> +
> + pcie = early_pci_find_capability(secondary, 0, f,
> + PCI_CAP_ID_EXP);
> + if (!pcie)
> + continue;
> +
> + flags = read_pci_config_16(secondary, 0, f,
> + pcie + PCI_EXP_FLAGS);
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + if ((pcie_type == PCI_EXP_TYPE_UPSTREAM) ||
> + (pcie_type == PCI_EXP_TYPE_PCI_BRIDGE))
> + /* Don't reset switch, bridge */
> + return;
> +
> + class = read_pci_config(secondary, 0, f, PCI_CLASS_REVISION);
> + if ((class >> 24) == PCI_BASE_CLASS_DISPLAY)
> + /* Don't reset VGA device */
> + return;
> +
> + count++;
> + child[f].pcie_pos = pcie;
> + child[f].pcie_flags = flags;
> + child[f].save = save_cfg + f;
> + }
> +
> + if (!count)
> + return;
> +
> + /* save */
> + for (f = 0; f < 8; f++)
> + if (child[f].pcie_pos)
> + save_state(secondary, 0, f, &child[f]);
> +
> + do_reset(bus, slot, func);
> +
> + /* restore */
> + for (f = 0; f < 8; f++)
> + if (child[f].pcie_pos)
> + restore_state(secondary, 0, f, &child[f]);
> +}
> +
> +void __init early_reset_pcie_devices(void)
> +{
> + unsigned bus, slot, func;
> + int size;
> +
> + if (!early_pci_allowed() || !reset_devices)
> + return;
> +
> + /* alloc space to save config */
> + size = sizeof(struct save_config)*8;
Use a constant instead of "8", please.
> + save_cfg = (struct save_config *)alloc_bootmem(size);
> + if (save_cfg == NULL) {
> + printk(KERN_ERR "reset_pcie: alloc_bootmem failed\n");
> + return;
> + }
> +
> + for (bus = 0; bus < 256; bus++) {
> + for (slot = 0; slot < 32; slot++) {
> + for (func = 0; func < 8; func++) {
> + u16 vendor;
> + u8 type;
> + vendor = read_pci_config_16(bus, slot, func,
> + PCI_VENDOR_ID);
> +
> + if (vendor == 0xffff)
> + continue;
> +
> + reset_pcie_device(bus, slot, func);
> +
> + if (func == 0) {
> + type = read_pci_config_byte(bus, slot,
> + func,
> + PCI_HEADER_TYPE);
> + if (!(type & 0x80))
> + break;
> + }
> + }
> + }
> + }
> +
> + free_bootmem(__pa(save_cfg), size);
> +}
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index ab4bf5a..a7a4125 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -852,24 +852,6 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state)
>
> EXPORT_SYMBOL(pci_choose_state);
>
> -#define PCI_EXP_SAVE_REGS 7
> -
> -#define pcie_cap_has_devctl(type, flags) 1
> -#define pcie_cap_has_lnkctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - (type == PCI_EXP_TYPE_ROOT_PORT || \
> - type == PCI_EXP_TYPE_ENDPOINT || \
> - type == PCI_EXP_TYPE_LEG_END))
> -#define pcie_cap_has_sltctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - ((type == PCI_EXP_TYPE_ROOT_PORT) || \
> - (type == PCI_EXP_TYPE_DOWNSTREAM && \
> - (flags & PCI_EXP_FLAGS_SLOT))))
> -#define pcie_cap_has_rtctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - (type == PCI_EXP_TYPE_ROOT_PORT || \
> - type == PCI_EXP_TYPE_RC_EC))
> -
> static struct pci_cap_saved_state *pci_find_saved_cap(
> struct pci_dev *pci_dev, char cap)
> {
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 5faa831..8e10401 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1790,5 +1790,23 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
> */
> struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
>
> +#define PCI_EXP_SAVE_REGS 7
> +
> +#define pcie_cap_has_devctl(type, flags) 1
> +#define pcie_cap_has_lnkctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + (type == PCI_EXP_TYPE_ROOT_PORT || \
> + type == PCI_EXP_TYPE_ENDPOINT || \
> + type == PCI_EXP_TYPE_LEG_END))
> +#define pcie_cap_has_sltctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + ((type == PCI_EXP_TYPE_ROOT_PORT) || \
> + (type == PCI_EXP_TYPE_DOWNSTREAM && \
> + (flags & PCI_EXP_FLAGS_SLOT))))
> +#define pcie_cap_has_rtctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + (type == PCI_EXP_TYPE_ROOT_PORT || \
> + type == PCI_EXP_TYPE_RC_EC))
> +
> #endif /* __KERNEL__ */
> #endif /* LINUX_PCI_H */
> diff --git a/init/main.c b/init/main.c
> index b286730..ebaf067 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -144,10 +144,10 @@ EXPORT_SYMBOL(reset_devices);
> static int __init set_reset_devices(char *str)
> {
> reset_devices = 1;
> - return 1;
> + return 0;
> }
>
> -__setup("reset_devices", set_reset_devices);
> +early_param("reset_devices", set_reset_devices);
>
> static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
> const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
We have been seeing problems with the kexec/kdump kernel for quite some
time that are related to I/O devices not being quiesced before kexec. I had
added code to clear the Bus Master bit to help stop runaway DMAs, which
helped in many cases, but obviously not all. If resetting downstream ports
helps stop runaway I/O from PCIe devices, I am all for this approach.
This patch still doesn't do anything for old PCI devices though.
--
Khalid
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
WARNING: multiple messages have this Message-ID (diff)
From: Khalid Aziz <khalid@gonehiking.org>
To: Takao Indoh <indou.takao@jp.fujitsu.com>
Cc: linux-pci@vger.kernel.org, x86@kernel.org,
linux-kernel@vger.kernel.org, martin.wilck@ts.fujitsu.com,
kexec@lists.infradead.org, hbabu@us.ibm.com, andi@firstfloor.org,
ddutile@redhat.com, ishii.hironobu@jp.fujitsu.com, hpa@zytor.com,
bhelgaas@google.com, tglx@linutronix.de, mingo@redhat.com,
vgoyal@redhat.com
Subject: Re: [PATCH v3 1/2] x86, pci: Reset PCIe devices at boot time
Date: Wed, 10 Oct 2012 14:08:24 -0600 [thread overview]
Message-ID: <1349899704.25679.14.camel@rhapsody> (raw)
In-Reply-To: <20121010074603.1084.92389.sendpatchset@indoh>
Please see comments inline:
On Wed, 2012-10-10 at 16:51 +0900, Takao Indoh wrote:
> This patch resets PCIe devices at boot time by hot reset when
> "reset_devices" is specified.
>
>
> Signed-off-by: Takao Indoh <indou.takao@jp.fujitsu.com>
> ---
> arch/x86/include/asm/pci-direct.h | 1
> arch/x86/kernel/setup.c | 3
> arch/x86/pci/early.c | 299 ++++++++++++++++++++++++++++
> drivers/pci/pci.c | 18 -
> include/linux/pci.h | 18 +
> init/main.c | 4
> 6 files changed, 323 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
> index b1e7a45..de30db2 100644
> --- a/arch/x86/include/asm/pci-direct.h
> +++ b/arch/x86/include/asm/pci-direct.h
> @@ -18,4 +18,5 @@ extern int early_pci_allowed(void);
> extern unsigned int pci_early_dump_regs;
> extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
> extern void early_dump_pci_devices(void);
> +extern void early_reset_pcie_devices(void);
> #endif /* _ASM_X86_PCI_DIRECT_H */
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index f4b9b80..24b011c 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -988,6 +988,9 @@ void __init setup_arch(char **cmdline_p)
> generic_apic_probe();
>
> early_quirks();
> +#ifdef CONFIG_PCI
> + early_reset_pcie_devices();
> +#endif
>
> /*
> * Read APIC and some other early information from ACPI tables.
> diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
> index d1067d5..584f16b 100644
> --- a/arch/x86/pci/early.c
> +++ b/arch/x86/pci/early.c
> @@ -1,5 +1,6 @@
> #include <linux/kernel.h>
> #include <linux/pci.h>
> +#include <linux/bootmem.h>
> #include <asm/pci-direct.h>
> #include <asm/io.h>
> #include <asm/pci_x86.h>
> @@ -109,3 +110,301 @@ void early_dump_pci_devices(void)
> }
> }
> }
> +
> +struct save_config {
> + u32 pci[16];
> + u16 pcie[PCI_EXP_SAVE_REGS];
> +};
> +
> +struct devinfo {
> + int pcie_pos; /* position of PCI Express capability */
> + int pcie_flags; /* PCI_EXP_FLAGS */
> + struct save_config *save;
> +};
> +
> +static struct save_config *save_cfg;
> +static void __init pci_udelay(int loops)
> +{
> + while (loops--) {
> + /* Approximately 1 us */
> + native_io_delay();
> + }
> +}
> +
> +/* Derived from drivers/pci/pci.c */
> +#define PCI_FIND_CAP_TTL 48
> +static int __init __pci_find_next_cap_ttl(u8 bus, u8 slot, u8 func,
> + u8 pos, int cap, int *ttl)
> +{
> + u8 id;
> +
> + while ((*ttl)--) {
> + pos = read_pci_config_byte(bus, slot, func, pos);
> + if (pos < 0x40)
> + break;
> + pos &= ~3;
> + id = read_pci_config_byte(bus, slot, func,
> + pos + PCI_CAP_LIST_ID);
> + if (id == 0xff)
> + break;
> + if (id == cap)
> + return pos;
> + pos += PCI_CAP_LIST_NEXT;
> + }
> + return 0;
> +}
> +
> +static int __init __pci_find_next_cap(u8 bus, u8 slot, u8 func, u8 pos, int cap)
> +{
> + int ttl = PCI_FIND_CAP_TTL;
> +
> + return __pci_find_next_cap_ttl(bus, slot, func, pos, cap, &ttl);
> +}
> +
> +static int __init __pci_bus_find_cap_start(u8 bus, u8 slot, u8 func,
> + u8 hdr_type)
> +{
> + u16 status;
> +
> + status = read_pci_config_16(bus, slot, func, PCI_STATUS);
> + if (!(status & PCI_STATUS_CAP_LIST))
> + return 0;
> +
> + switch (hdr_type) {
> + case PCI_HEADER_TYPE_NORMAL:
> + case PCI_HEADER_TYPE_BRIDGE:
> + return PCI_CAPABILITY_LIST;
> + case PCI_HEADER_TYPE_CARDBUS:
> + return PCI_CB_CAPABILITY_LIST;
> + default:
> + return 0;
> + }
> +
> + return 0;
> +}
> +
> +static int __init early_pci_find_capability(u8 bus, u8 slot, u8 func, int cap)
> +{
> + int pos;
> + u8 type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
> +
> + pos = __pci_bus_find_cap_start(bus, slot, func, type & 0x7f);
> + if (pos)
> + pos = __pci_find_next_cap(bus, slot, func, pos, cap);
> +
> + return pos;
> +}
> +
> +static void __init do_reset(u8 bus, u8 slot, u8 func)
> +{
> + u16 ctrl;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func);
> +
> + /* Assert Secondary Bus Reset */
> + ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL);
> + ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> +
> + pci_udelay(5000);
> +
> + /* De-assert Secondary Bus Reset */
> + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
> + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> +
> + pci_udelay(500000);
This is 0.5 seconds. This will add up quickly on larger servers with
multiple buses. Is 0.5 seconds required by the spec?
aer_do_secondary_bus_reset() holds PCI_BRIDGE_CTL_BUS_RESET for 2 ms and
then waits another 200 ms after de-asserting it. That is still long, but
less than half of the delay in the above code.
> +}
> +
> +static void __init save_state(unsigned bus, unsigned slot, unsigned func,
> + struct devinfo *info)
> +{
> + int i;
> + int pcie, flags, pcie_type;
> + struct save_config *save;
> +
> + pcie = info->pcie_pos;
> + flags = info->pcie_flags;
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + save = info->save;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d save state\n", bus, slot, func);
> +
> + for (i = 0; i < 16; i++)
> + save->pci[i] = read_pci_config(bus, slot, func, i * 4);
> + i = 0;
> + if (pcie_cap_has_devctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL);
> + if (pcie_cap_has_lnkctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL);
> + if (pcie_cap_has_sltctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL);
> + if (pcie_cap_has_rtctl(pcie_type, flags))
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_RTCTL);
> +
> + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL2);
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL2);
> + save->pcie[i++] = read_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL2);
> + }
> +}
> +
> +static void __init restore_state(unsigned bus, unsigned slot, unsigned func,
> + struct devinfo *info)
> +{
> + int i = 0;
> + int pcie, flags, pcie_type;
> + struct save_config *save;
> +
> + pcie = info->pcie_pos;
> + flags = info->pcie_flags;
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + save = info->save;
> +
> + printk(KERN_INFO "pci 0000:%02x:%02x.%d restore state\n",
> + bus, slot, func);
> +
> + if (pcie_cap_has_devctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL, save->pcie[i++]);
> + if (pcie_cap_has_lnkctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL, save->pcie[i++]);
> + if (pcie_cap_has_sltctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL, save->pcie[i++]);
> + if (pcie_cap_has_rtctl(pcie_type, flags))
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_RTCTL, save->pcie[i++]);
> +
> + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_DEVCTL2, save->pcie[i++]);
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_LNKCTL2, save->pcie[i++]);
> + write_pci_config_16(bus, slot, func,
> + pcie + PCI_EXP_SLTCTL2, save->pcie[i++]);
> + }
> +
> + for (i = 15; i >= 0; i--)
> + write_pci_config(bus, slot, func, i * 4, save->pci[i]);
> +}
> +
> +static void __init reset_pcie_device(unsigned bus, unsigned slot, unsigned func)
> +{
> + int f, count;
> + int pcie, pcie_type;
> + u8 type;
> + u16 vendor, flags;
> + u32 class;
> + int secondary;
> + struct devinfo child[8];
> +
> + pcie = early_pci_find_capability(bus, slot, func, PCI_CAP_ID_EXP);
> + if (!pcie)
> + return;
> +
> + flags = read_pci_config_16(bus, slot, func, pcie + PCI_EXP_FLAGS);
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + if ((pcie_type != PCI_EXP_TYPE_ROOT_PORT) &&
> + (pcie_type != PCI_EXP_TYPE_DOWNSTREAM))
> + return;
> +
> + type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
> + if ((type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
> + return;
> + secondary = read_pci_config_byte(bus, slot, func, PCI_SECONDARY_BUS);
> + memset(child, 0, sizeof(child));
> + for (count = 0, f = 0; f < 8; f++) {
Can we use a constant instead of "8" in the loop here? There are a few
other places in kernel code with very similar loops enumerating over PCI
functions that again use "8" instead of a constant like
PCI_MAX_FUNCTIONS. I would suggest we use a constant at least in the new
code.
> + vendor = read_pci_config_16(secondary, 0, f, PCI_VENDOR_ID);
> + if (vendor == 0xffff)
> + continue;
> +
> + pcie = early_pci_find_capability(secondary, 0, f,
> + PCI_CAP_ID_EXP);
> + if (!pcie)
> + continue;
> +
> + flags = read_pci_config_16(secondary, 0, f,
> + pcie + PCI_EXP_FLAGS);
> + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> + if ((pcie_type == PCI_EXP_TYPE_UPSTREAM) ||
> + (pcie_type == PCI_EXP_TYPE_PCI_BRIDGE))
> + /* Don't reset switch, bridge */
> + return;
> +
> + class = read_pci_config(secondary, 0, f, PCI_CLASS_REVISION);
> + if ((class >> 24) == PCI_BASE_CLASS_DISPLAY)
> + /* Don't reset VGA device */
> + return;
> +
> + count++;
> + child[f].pcie_pos = pcie;
> + child[f].pcie_flags = flags;
> + child[f].save = save_cfg + f;
> + }
> +
> + if (!count)
> + return;
> +
> + /* save */
> + for (f = 0; f < 8; f++)
> + if (child[f].pcie_pos)
> + save_state(secondary, 0, f, &child[f]);
> +
> + do_reset(bus, slot, func);
> +
> + /* restore */
> + for (f = 0; f < 8; f++)
> + if (child[f].pcie_pos)
> + restore_state(secondary, 0, f, &child[f]);
> +}
> +
> +void __init early_reset_pcie_devices(void)
> +{
> + unsigned bus, slot, func;
> + int size;
> +
> + if (!early_pci_allowed() || !reset_devices)
> + return;
> +
> + /* alloc space to save config */
> + size = sizeof(struct save_config)*8;
Use a constant instead of "8", please.
> + save_cfg = (struct save_config *)alloc_bootmem(size);
> + if (save_cfg == NULL) {
> + printk(KERN_ERR "reset_pcie: alloc_bootmem failed\n");
> + return;
> + }
> +
> + for (bus = 0; bus < 256; bus++) {
> + for (slot = 0; slot < 32; slot++) {
> + for (func = 0; func < 8; func++) {
> + u16 vendor;
> + u8 type;
> + vendor = read_pci_config_16(bus, slot, func,
> + PCI_VENDOR_ID);
> +
> + if (vendor == 0xffff)
> + continue;
> +
> + reset_pcie_device(bus, slot, func);
> +
> + if (func == 0) {
> + type = read_pci_config_byte(bus, slot,
> + func,
> + PCI_HEADER_TYPE);
> + if (!(type & 0x80))
> + break;
> + }
> + }
> + }
> + }
> +
> + free_bootmem(__pa(save_cfg), size);
> +}
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index ab4bf5a..a7a4125 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -852,24 +852,6 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state)
>
> EXPORT_SYMBOL(pci_choose_state);
>
> -#define PCI_EXP_SAVE_REGS 7
> -
> -#define pcie_cap_has_devctl(type, flags) 1
> -#define pcie_cap_has_lnkctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - (type == PCI_EXP_TYPE_ROOT_PORT || \
> - type == PCI_EXP_TYPE_ENDPOINT || \
> - type == PCI_EXP_TYPE_LEG_END))
> -#define pcie_cap_has_sltctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - ((type == PCI_EXP_TYPE_ROOT_PORT) || \
> - (type == PCI_EXP_TYPE_DOWNSTREAM && \
> - (flags & PCI_EXP_FLAGS_SLOT))))
> -#define pcie_cap_has_rtctl(type, flags) \
> - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> - (type == PCI_EXP_TYPE_ROOT_PORT || \
> - type == PCI_EXP_TYPE_RC_EC))
> -
> static struct pci_cap_saved_state *pci_find_saved_cap(
> struct pci_dev *pci_dev, char cap)
> {
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 5faa831..8e10401 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1790,5 +1790,23 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
> */
> struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
>
> +#define PCI_EXP_SAVE_REGS 7
> +
> +#define pcie_cap_has_devctl(type, flags) 1
> +#define pcie_cap_has_lnkctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + (type == PCI_EXP_TYPE_ROOT_PORT || \
> + type == PCI_EXP_TYPE_ENDPOINT || \
> + type == PCI_EXP_TYPE_LEG_END))
> +#define pcie_cap_has_sltctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + ((type == PCI_EXP_TYPE_ROOT_PORT) || \
> + (type == PCI_EXP_TYPE_DOWNSTREAM && \
> + (flags & PCI_EXP_FLAGS_SLOT))))
> +#define pcie_cap_has_rtctl(type, flags) \
> + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \
> + (type == PCI_EXP_TYPE_ROOT_PORT || \
> + type == PCI_EXP_TYPE_RC_EC))
> +
> #endif /* __KERNEL__ */
> #endif /* LINUX_PCI_H */
> diff --git a/init/main.c b/init/main.c
> index b286730..ebaf067 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -144,10 +144,10 @@ EXPORT_SYMBOL(reset_devices);
> static int __init set_reset_devices(char *str)
> {
> reset_devices = 1;
> - return 1;
> + return 0;
> }
>
> -__setup("reset_devices", set_reset_devices);
> +early_param("reset_devices", set_reset_devices);
>
> static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
> const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
We have been seeing problems with the kexec/kdump kernel for quite some
time that are related to I/O devices not being quiesced before kexec. I had
added code to clear the Bus Master bit to help stop runaway DMAs, which
helped in many cases, but obviously not all. If resetting downstream ports
helps stop runaway I/O from PCIe devices, I am all for this approach.
This patch still doesn't do anything for old PCI devices though.
--
Khalid
next prev parent reply other threads:[~2012-10-10 20:13 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-10-10 7:50 [PATCH v3 0/2] Reset PCIe devices to address DMA problem on kdump with iommu Takao Indoh
2012-10-10 7:50 ` Takao Indoh
2012-10-10 7:51 ` [PATCH v3 1/2] x86, pci: Reset PCIe devices at boot time Takao Indoh
2012-10-10 7:51 ` Takao Indoh
2012-10-10 20:08 ` Khalid Aziz [this message]
2012-10-10 20:08 ` Khalid Aziz
2012-10-11 6:16 ` Takao Indoh
2012-10-11 6:16 ` Takao Indoh
2012-10-11 17:28 ` Khalid Aziz
2012-10-11 17:28 ` Khalid Aziz
2012-10-12 11:28 ` Takao Indoh
2012-10-12 11:28 ` Takao Indoh
2012-10-10 7:51 ` [PATCH v3 2/2] x86, pci: Enable PCI INTx when MSI is disabled Takao Indoh
2012-10-10 7:51 ` Takao Indoh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1349899704.25679.14.camel@rhapsody \
--to=khalid@gonehiking.org \
--cc=andi@firstfloor.org \
--cc=bhelgaas@google.com \
--cc=ddutile@redhat.com \
--cc=hbabu@us.ibm.com \
--cc=hpa@zytor.com \
--cc=indou.takao@jp.fujitsu.com \
--cc=ishii.hironobu@jp.fujitsu.com \
--cc=kexec@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pci@vger.kernel.org \
--cc=martin.wilck@ts.fujitsu.com \
--cc=mingo@redhat.com \
--cc=tglx@linutronix.de \
--cc=vgoyal@redhat.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.