From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pa0-f42.google.com ([209.85.220.42]:36762 "EHLO mail-pa0-f42.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751378AbbEKHCP (ORCPT ); Mon, 11 May 2015 03:02:15 -0400 Received: by pabsx10 with SMTP id sx10so102828744pab.3 for ; Mon, 11 May 2015 00:02:15 -0700 (PDT) Message-ID: <555053F0.1000409@ozlabs.ru> Date: Mon, 11 May 2015 17:02:08 +1000 From: Alexey Kardashevskiy MIME-Version: 1.0 To: Gavin Shan CC: linuxppc-dev@lists.ozlabs.org, linux-pci@vger.kernel.org, benh@kernel.crashing.org, bhelgaas@google.com Subject: Re: [PATCH v4 07/21] powerpc/powernv: Release PEs dynamically References: <1430460188-31343-1-git-send-email-gwshan@linux.vnet.ibm.com> <1430460188-31343-8-git-send-email-gwshan@linux.vnet.ibm.com> <554E00EB.5070802@ozlabs.ru> <20150511062553.GB28073@gwshan> In-Reply-To: <20150511062553.GB28073@gwshan> Content-Type: text/plain; charset=koi8-r; format=flowed Sender: linux-pci-owner@vger.kernel.org List-ID: On 05/11/2015 04:25 PM, Gavin Shan wrote: > On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote: >> On 05/01/2015 04:02 PM, Gavin Shan wrote: >>> The original code doesn't support releasing PEs dynamically, meaning >>> that PE and the associated resources (IO, M32, M64 and DMA) can't >>> be released when unplugging a PCI adapter from one hotpluggable slot. >>> >>> The patch takes object oriented methodology, introducs reference >>> count to PE, which is initialized to 1 and increased with 1 when a >>> new PCI device joins the PE. Once the last PCI device leaves the >>> PE, the PE is going to be release together with its associated >>> (IO, M32, M64, DMA) resources. >> >> >> Too little commit log for non-trivial non-cut-n-paste 30KB patch... >> > > Ok. I'll add more details in next revision. > >>> >>> Signed-off-by: Gavin Shan >>> --- >>> arch/powerpc/include/asm/pci-bridge.h | 3 + >>> arch/powerpc/kernel/pci-hotplug.c | 5 + >>> arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++----------- >>> arch/powerpc/platforms/powernv/pci.h | 4 +- >>> 4 files changed, 432 insertions(+), 238 deletions(-) >>> >>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h >>> index 5367eb3..a6ad4b1 100644 >>> --- a/arch/powerpc/include/asm/pci-bridge.h >>> +++ b/arch/powerpc/include/asm/pci-bridge.h >>> @@ -31,6 +31,9 @@ struct pci_controller_ops { >>> resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type); >>> void (*setup_bridge)(struct pci_bus *, unsigned long); >>> void (*reset_secondary_bus)(struct pci_dev *dev); >>> + >>> + /* Called when PCI device is released */ >>> + void (*release_device)(struct pci_dev *); >>> }; >>> >>> /* >>> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c >>> index 7ed85a6..0040343 100644 >>> --- a/arch/powerpc/kernel/pci-hotplug.c >>> +++ b/arch/powerpc/kernel/pci-hotplug.c >>> @@ -29,6 +29,11 @@ >>> */ >>> void pcibios_release_device(struct pci_dev *dev) >>> { >>> + struct pci_controller *hose = pci_bus_to_host(dev->bus); >>> + >>> + if (hose->controller_ops.release_device) >>> + hose->controller_ops.release_device(dev); >>> + >>> eeh_remove_device(dev); >>> } >>> >>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c >>> index 910fb67..ef8c216 100644 >>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c >>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>> @@ -12,6 +12,8 @@ >>> #undef DEBUG >>> >>> #include >>> +#include >>> +#include >>> #include >>> #include >>> #include >>> @@ -47,6 +49,8 @@ >>> /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ >>> #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) >>> >>> +static void pnv_ioda_release_pe(struct kref *kref); >>> + >>> static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, >>> const char *fmt, ...) >>> { >>> @@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) >>> (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); >>> } >>> >>> -static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) >>> +static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe) >>> { >>> - if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { >>> - pr_warn("%s: Invalid PE %d on PHB#%x\n", >>> - __func__, pe_no, phb->hose->global_number); >>> + if (!pe) >>> + return; >>> + >>> + kref_get(&pe->kref); >>> +} >>> + >>> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe) >>> +{ >>> + unsigned int count; >>> + >>> + if (!pe) >>> return; >>> + >>> + /* >>> + * The count is initialized to 1 and increased with 1 when >>> + * a new PCI device is bound with the PE. Once the last PCI >>> + * device is leaving from the PE, the PE is going to be >>> + * released. >>> + */ >>> + count = atomic_read(&pe->kref.refcount); >>> + if (count == 2) >>> + kref_sub(&pe->kref, 2, pnv_ioda_release_pe); >>> + else >>> + kref_put(&pe->kref, pnv_ioda_release_pe); >> >> >> What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()? >> > > Yeah, that would have problem. But it shouldn't happen because the > PCI devices are joining the parent PE# in strictly serialized mode. > Same thing happens when detaching PCI devices from its parent PE. oookay. Another thing then - why is this kref counter initialized to 1? It would make sense if you did something special when the counter becomes 1 after decrement but you do not. Also, this kref thing makes sense if you do kref_put() in multiple places and do not know which one will be the last one so you pass the callback to all of them. Here you do kref_put/sub in one place and you read the counter - so you can call pnv_ioda_release_pe() directly. And it feels like a simple atomic_t would do the job just fine. If you still feel that the counter should start from 1, there are atomic_dec_if_positive() and atomic_inc_not_zero() and others. >>> +} >>> + >>> +static void pnv_pci_release_device(struct pci_dev *pdev) >>> +{ >>> + struct pci_controller *hose = pci_bus_to_host(pdev->bus); >>> + struct pnv_phb *phb = hose->private_data; >>> + struct pci_dn *pdn = pci_get_pdn(pdev); >>> + struct pnv_ioda_pe *pe; >>> + >>> + if (pdn && pdn->pe_number != IODA_INVALID_PE) { >>> + pe = &phb->ioda.pe_array[pdn->pe_number]; >>> + pnv_ioda_pe_put(pe); >>> + pdn->pe_number = IODA_INVALID_PE; >>> } >>> +} >>> >>> - if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) { >>> - pr_warn("%s: PE %d was assigned on PHB#%x\n", >>> - __func__, pe_no, phb->hose->global_number); >>> +static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe) >>> +{ >>> + struct pnv_phb *phb = pe->phb; >>> + int index, count; >>> + unsigned long tbl_addr, tbl_size; >>> + >>> + /* No DMA capability for slave PEs */ >>> + if (pe->flags & PNV_IODA_PE_SLAVE) >>> + return; >>> + >>> + /* Bypass DMA window */ >>> + if (phb->type == PNV_PHB_IODA2 && >>> + pe->tce_bypass_enabled && >>> + pe->tce32_table && >>> + pe->tce32_table->set_bypass) >>> + pe->tce32_table->set_bypass(pe->tce32_table, false); >>> + >>> + /* 32-bits DMA window */ >>> + count = pe->tce32_seg_end - pe->tce32_seg_start; >>> + tbl_addr = pe->tce32_table->it_base; >>> + if (!count) >>> return; >>> + >>> + /* Free IOMMU table */ >>> + iommu_free_table(pe->tce32_table, >>> + of_node_full_name(phb->hose->dn)); >>> + >>> + /* Deconfigure TCE table */ >>> + switch (phb->type) { >>> + case PNV_PHB_IODA1: >>> + for (index = 0; index < count; index++) >>> + opal_pci_map_pe_dma_window(phb->opal_id, >>> + pe->pe_number, >>> + pe->tce32_seg_start + index, >>> + 1, >>> + __pa(tbl_addr) + >>> + index * TCE32_TABLE_SIZE, >>> + 0, >>> + 0x1000); >>> + bitmap_clear(phb->ioda.tce32_segmap, >>> + pe->tce32_seg_start, >>> + count); >>> + tbl_size = TCE32_TABLE_SIZE * count; >>> + break; >>> + case PNV_PHB_IODA2: >>> + opal_pci_map_pe_dma_window(phb->opal_id, >>> + pe->pe_number, >>> + pe->pe_number << 1, >>> + 1, >>> + __pa(tbl_addr), >>> + 0, >>> + 0x1000); >>> + tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base)); >>> + tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8; >>> + break; >>> + default: >>> + pe_warn(pe, "Unsupported PHB type %d\n", phb->type); >>> + return; >>> + } >>> + >>> + /* Free memory of IOMMU table */ >>> + free_pages(tbl_addr, get_order(tbl_size)); >> >> >> You just programmed the table address to TVT and then you are releasing the >> pages. It does not seem right, it will leave garbage in TVT. Also, I am >> adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits >>from there (I'll post v10 soon, you'll be in copy and you'll have to review >> that ;) ). >> > > I assume you're talking about TVE. I don't understand how garbage will be left > in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE > with zero'ed "tce_table_size". The pages previously allocated for TCE table is > released to buddy system, which can be allocated by somebody else (from buddy > or slab). opal_pci_map_pe_dma_window() takes __pa(tbl_addr) which points to some memory which is still allocated. This value goes to a table (which has 2 entries per PE, one for 32bit DMA window and one for bypass/hugewindow) which PHB uses to get the actual TCE table address. What is the name of this table? :) Anyway, you write an address there and then you call free_pages() so after free_pages(), the value in that TVE/TVT/whatever table is a garbage. > > Ok. Please put me into the cc list. I guess the whole series of patches is > better to rebased on your DDW patchset, which is to be merged first, I believe. > >> >>> + pe->tce32_table = NULL; >>> + pe->tce32_seg_start = 0; >>> + pe->tce32_seg_end = 0; >>> +} >>> + >>> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe) >>> +{ >>> + struct pnv_phb *phb = pe->phb; >>> + unsigned long *segmap = NULL, *pe_segmap = NULL; >>> + int i; >>> + uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE, >>> + OPAL_M32_WINDOW_TYPE, >>> + OPAL_M64_WINDOW_TYPE }; >>> + >>> + for (win = 0; win < ARRAY_SIZE(win_type); win++) { >>> + switch (win_type[win]) { >>> + case OPAL_IO_WINDOW_TYPE: >>> + segmap = phb->ioda.io_segmap; >>> + pe_segmap = pe->io_segmap; >>> + break; >>> + case OPAL_M32_WINDOW_TYPE: >>> + segmap = phb->ioda.m32_segmap; >>> + pe_segmap = pe->m32_segmap; >>> + break; >>> + case OPAL_M64_WINDOW_TYPE: >>> + segmap = phb->ioda.m64_segmap; >>> + pe_segmap = pe->m64_segmap; >>> + break; >>> + } >>> + i = -1; >>> + while ((i = find_next_bit(pe_segmap, >>> + phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) { >>> + if (win_type[win] == OPAL_IO_WINDOW_TYPE || >>> + win_type[win] == OPAL_M32_WINDOW_TYPE) >>> + opal_pci_map_pe_mmio_window(phb->opal_id, >>> + phb->ioda.reserved_pe, >>> + win_type[win], 0, i); >>> + else if (phb->type == PNV_PHB_IODA1) >>> + opal_pci_map_pe_mmio_window(phb->opal_id, >>> + phb->ioda.reserved_pe, >>> + win_type[win], >>> + i / 8, i % 8); >> >> The function is called ""release" but it programs something what looks like >> reasonable values, is it correct? >> > > It's out of problem, When the segment is deallocated, it's mapped to the > reserved PE#. > >> >> >>> + >>> + clear_bit(i, pe_segmap); >>> + clear_bit(i, segmap); >>> + } >>> + } >>> +} >>> + >>> +static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, >>> + struct pnv_ioda_pe *parent, >>> + struct pnv_ioda_pe *child, >>> + bool is_add) >>> +{ >>> + const char *desc = is_add ? "adding" : "removing"; >>> + uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN : >>> + OPAL_REMOVE_PE_FROM_DOMAIN; >>> + struct pnv_ioda_pe *slave; >>> + long rc; >>> + >>> + /* Parent PE affects child PE */ >>> + rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, >>> + child->pe_number, op); >>> + if (rc != OPAL_SUCCESS) { >>> + pe_warn(child, "OPAL error %ld %s to parent PELTV\n", >>> + rc, desc); >>> + return -ENXIO; >>> + } >>> + >>> + if (!(child->flags & PNV_IODA_PE_MASTER)) >>> + return 0; >>> + >>> + /* Compound case: parent PE affects slave PEs */ >>> + list_for_each_entry(slave, &child->slaves, list) { >>> + rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, >>> + slave->pe_number, op); >>> + if (rc != OPAL_SUCCESS) { >>> + pe_warn(slave, "OPAL error %ld %s to parent PELTV\n", >>> + rc, desc); >>> + return -ENXIO; >>> + } >>> + } >>> + >>> + return 0; >>> +} >>> + >>> +static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add) >>> +{ >>> + struct pnv_phb *phb = pe->phb; >>> + struct pnv_ioda_pe *slave; >>> + struct pci_dev *pdev = NULL; >>> + int ret; >>> + >>> + /* >>> + * Clear PE frozen state. If it's master PE, we need >>> + * clear slave PE frozen state as well. >>> + */ >>> + opal_pci_eeh_freeze_clear(phb->opal_id, >>> + pe->pe_number, >>> + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> + if (pe->flags & PNV_IODA_PE_MASTER) { >>> + list_for_each_entry(slave, &pe->slaves, list) { >>> + opal_pci_eeh_freeze_clear(phb->opal_id, >>> + slave->pe_number, >>> + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> + } >>> + } >>> + >>> + /* >>> + * Associate PE in PELT. We need add the PE into the >>> + * corresponding PELT-V as well. Otherwise, the error >>> + * originated from the PE might contribute to other >>> + * PEs. >>> + */ >>> + ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add); >>> + if (ret) >>> + return ret; >>> + >>> + /* For compound PEs, any one affects all of them */ >>> + if (pe->flags & PNV_IODA_PE_MASTER) { >>> + list_for_each_entry(slave, &pe->slaves, list) { >>> + ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add); >>> + if (ret) >>> + return ret; >>> + } >>> + } >>> + >>> + if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) >>> + pdev = pe->pbus->self; >>> + else if (pe->flags & PNV_IODA_PE_DEV) >>> + pdev = pe->pdev->bus->self; >>> +#ifdef CONFIG_PCI_IOV >>> + else if (pe->flags & PNV_IODA_PE_VF) >>> + pdev = pe->parent_dev->bus->self; >>> +#endif /* CONFIG_PCI_IOV */ >>> + >>> + while (pdev) { >>> + struct pci_dn *pdn = pci_get_pdn(pdev); >>> + struct pnv_ioda_pe *parent; >>> + >>> + if (pdn && pdn->pe_number != IODA_INVALID_PE) { >>> + parent = &phb->ioda.pe_array[pdn->pe_number]; >>> + ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); >>> + if (ret) >>> + return ret; >>> + } >>> + >>> + pdev = pdev->bus->self; >>> + } >>> + >>> + return 0; >>> +} >>> + >>> +static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe) >> >> >> It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just >> moving of this function to a different place deserves a separate patch with a >> comment why ("it is going to be used now for non-SRIOV case too" may be?). >> > > Yeah, it makes sense to me. Will fix it up. > >> >>> +{ >>> + struct pnv_phb *phb = pe->phb; >>> + struct pci_dev *parent; >>> + uint8_t bcomp, dcomp, fcomp; >>> + long rid_end, rid; >>> + int64_t rc; >>> + >>> + /* Tear down MVE */ >>> + if (phb->type == PNV_PHB_IODA1 && >>> + pe->mve_number != -1) { >>> + rc = opal_pci_set_mve(phb->opal_id, >>> + pe->mve_number, >>> + phb->ioda.reserved_pe); >>> + if (rc != OPAL_SUCCESS) >>> + pe_warn(pe, "Error %lld unmapping MVE#%d\n", >>> + rc, pe->mve_number); >>> + rc = opal_pci_set_mve_enable(phb->opal_id, >>> + pe->mve_number, >>> + OPAL_DISABLE_MVE); >>> + if (rc != OPAL_SUCCESS) >>> + pe_warn(pe, "Error %lld disabling MVE#%d\n", >>> + rc, pe->mve_number); >>> + pe->mve_number = -1; >>> + } >>> + >>> + /* Unmapping PELTV */ >>> + pnv_ioda_set_peltv(pe, false); >>> + >>> + /* To unmap PELTM */ >>> + if (pe->pbus) { >>> + int count; >>> + >>> + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; >>> + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; >>> + parent = pe->pbus->self; >>> + if (pe->flags & PNV_IODA_PE_BUS_ALL) >>> + count = pe->pbus->busn_res.end - >>> + pe->pbus->busn_res.start + 1; >>> + else >>> + count = 1; >>> + >>> + switch(count) { >>> + case 1: bcomp = OpalPciBusAll; break; >>> + case 2: bcomp = OpalPciBus7Bits; break; >>> + case 4: bcomp = OpalPciBus6Bits; break; >>> + case 8: bcomp = OpalPciBus5Bits; break; >>> + case 16: bcomp = OpalPciBus4Bits; break; >>> + case 32: bcomp = OpalPciBus3Bits; break; >>> + default: >>> + /* Fail back to case of one bus */ >>> + pe_warn(pe, "Cannot support %d buses\n", count); >>> + bcomp = OpalPciBusAll; >>> + } >>> + rid_end = pe->rid + (count << 8); >>> + } else { >>> +#ifdef CONFIG_PCI_IOV >>> + if (pe->flags & PNV_IODA_PE_VF) >>> + parent = pe->parent_dev; >>> + else >>> +#endif >>> + parent = pe->pdev->bus->self; >>> + bcomp = OpalPciBusAll; >>> + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; >>> + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; >>> + rid_end = pe->rid + 1; >>> + } >>> + >>> + /* Clear RID mapping */ >>> + for (rid = pe->rid; rid < rid_end; rid++) >>> + phb->ioda.pe_rmap[rid] = IODA_INVALID_PE; >>> + >>> + /* Unmapping PELTM */ >>> + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, >>> + bcomp, dcomp, fcomp, OPAL_UNMAP_PE); >>> + if (rc) >>> + pe_warn(pe, "Error %ld unmapping PELTM\n", rc); >>> +} >>> + >>> +static void pnv_ioda_release_pe(struct kref *kref) >>> +{ >>> + struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref); >>> + struct pnv_ioda_pe *tmp, *slave; >>> + struct pnv_phb *phb = pe->phb; >>> + >>> + pnv_ioda_release_pe_dma(pe); >>> + pnv_ioda_release_pe_seg(pe); >>> + pnv_ioda_deconfigure_pe(pe); >>> + >>> + /* Release slave PEs for compound PE */ >>> + if (pe->flags & PNV_IODA_PE_MASTER) { >>> + list_for_each_entry_safe(slave, tmp, &pe->slaves, list) >>> + pnv_ioda_pe_put(slave); >>> + } >>> + >>> + /* Remove the PE from various list. We need remove slave >>> + * PE from master's list. >>> + */ >>> + list_del(&pe->dma_link); >>> + list_del(&pe->list); >>> + >>> + /* Free PE number */ >>> + clear_bit(pe->pe_number, phb->ioda.pe_alloc); >>> +} >>> + >>> +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, >>> + int pe_no) >>> +{ >>> + struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no]; >>> + >>> + kref_init(&pe->kref); >>> + pe->phb = phb; >>> + pe->pe_number = pe_no; >>> + INIT_LIST_HEAD(&pe->dma_link); >>> + INIT_LIST_HEAD(&pe->list); >>> + >>> + return pe; >>> +} >>> + >>> +static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb, >>> + int pe_no) >>> +{ >>> + if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { >>> + pr_warn("%s: Invalid PE %d on PHB#%x\n", >>> + __func__, pe_no, phb->hose->global_number); >>> + return NULL; >>> } >>> >>> - phb->ioda.pe_array[pe_no].phb = phb; >>> - phb->ioda.pe_array[pe_no].pe_number = pe_no; >>> + /* >>> + * Same PE might be reserved for multiple times, which >>> + * is out of problem actually. >>> + */ >>> + set_bit(pe_no, phb->ioda.pe_alloc); >>> + return pnv_ioda_init_pe(phb, pe_no); >>> } >>> >>> -static int pnv_ioda_alloc_pe(struct pnv_phb *phb) >>> +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) >>> { >>> unsigned long pe_no; >>> unsigned long limit = phb->ioda.total_pe - 1; >>> @@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb) >>> break; >>> >>> if (--limit >= phb->ioda.total_pe) >>> - return IODA_INVALID_PE; >>> + return NULL; >>> } while(1); >>> >>> - phb->ioda.pe_array[pe_no].phb = phb; >>> - phb->ioda.pe_array[pe_no].pe_number = pe_no; >>> - return pe_no; >>> -} >>> - >>> -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) >>> -{ >>> - WARN_ON(phb->ioda.pe_array[pe].pdev); >>> - >>> - memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); >>> - clear_bit(pe, phb->ioda.pe_alloc); >>> + return pnv_ioda_init_pe(phb, pe_no); >>> } >>> >>> static int pnv_ioda1_init_m64(struct pnv_phb *phb) >>> @@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb, >>> } >>> } >>> >>> -static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> - struct pci_bus *bus, int all) >>> +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> + struct pci_bus *bus, >>> + int all) >> >> >> Mechanic changes like this could easily go to a separate patch. >> > > Indeed. I'll see how I can split the patches up in next revision. > Thanks for the suggestion. > >>> { >>> resource_size_t segsz = phb->ioda.m64_segsize; >>> struct pci_dev *pdev; >>> @@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> int i; >>> >>> if (!pnv_ioda_need_m64_pe(phb, bus)) >>> - return IODA_INVALID_PE; >>> + return NULL; >>> >>> /* Allocate bitmap */ >>> size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); >>> pe_bitsmap = kzalloc(size, GFP_KERNEL); >>> if (!pe_bitsmap) { >>> pr_warn("%s: Out of memory !\n", __func__); >>> - return IODA_INVALID_PE; >>> + return NULL; >>> } >>> >>> /* The bridge's M64 window might be extended to PHB's M64 >>> @@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> /* No M64 window found ? */ >>> if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) { >>> kfree(pe_bitsmap); >>> - return IODA_INVALID_PE; >>> + return NULL; >>> } >>> >>> /* Figure out the master PE and put all slave PEs >>> @@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> } >>> >>> kfree(pe_bitsmap); >>> - return master_pe->pe_number; >>> + return master_pe; >>> } >>> >>> static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) >>> @@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) >>> * but in the meantime, we need to protect them to avoid warnings >>> */ >>> #ifdef CONFIG_PCI_MSI >>> -static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) >>> +static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev) >>> { >>> struct pci_controller *hose = pci_bus_to_host(dev->bus); >>> struct pnv_phb *phb = hose->private_data; >>> @@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) >>> } >>> #endif /* CONFIG_PCI_MSI */ >>> >>> -static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, >>> - struct pnv_ioda_pe *parent, >>> - struct pnv_ioda_pe *child, >>> - bool is_add) >>> -{ >>> - const char *desc = is_add ? "adding" : "removing"; >>> - uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN : >>> - OPAL_REMOVE_PE_FROM_DOMAIN; >>> - struct pnv_ioda_pe *slave; >>> - long rc; >>> - >>> - /* Parent PE affects child PE */ >>> - rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, >>> - child->pe_number, op); >>> - if (rc != OPAL_SUCCESS) { >>> - pe_warn(child, "OPAL error %ld %s to parent PELTV\n", >>> - rc, desc); >>> - return -ENXIO; >>> - } >>> - >>> - if (!(child->flags & PNV_IODA_PE_MASTER)) >>> - return 0; >>> - >>> - /* Compound case: parent PE affects slave PEs */ >>> - list_for_each_entry(slave, &child->slaves, list) { >>> - rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, >>> - slave->pe_number, op); >>> - if (rc != OPAL_SUCCESS) { >>> - pe_warn(slave, "OPAL error %ld %s to parent PELTV\n", >>> - rc, desc); >>> - return -ENXIO; >>> - } >>> - } >>> - >>> - return 0; >>> -} >>> - >>> -static int pnv_ioda_set_peltv(struct pnv_phb *phb, >>> - struct pnv_ioda_pe *pe, >>> - bool is_add) >>> -{ >>> - struct pnv_ioda_pe *slave; >>> - struct pci_dev *pdev = NULL; >>> - int ret; >>> - >>> - /* >>> - * Clear PE frozen state. If it's master PE, we need >>> - * clear slave PE frozen state as well. >>> - */ >>> - if (is_add) { >>> - opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, >>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> - if (pe->flags & PNV_IODA_PE_MASTER) { >>> - list_for_each_entry(slave, &pe->slaves, list) >>> - opal_pci_eeh_freeze_clear(phb->opal_id, >>> - slave->pe_number, >>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> - } >>> - } >>> - >>> - /* >>> - * Associate PE in PELT. We need add the PE into the >>> - * corresponding PELT-V as well. Otherwise, the error >>> - * originated from the PE might contribute to other >>> - * PEs. >>> - */ >>> - ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add); >>> - if (ret) >>> - return ret; >>> - >>> - /* For compound PEs, any one affects all of them */ >>> - if (pe->flags & PNV_IODA_PE_MASTER) { >>> - list_for_each_entry(slave, &pe->slaves, list) { >>> - ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add); >>> - if (ret) >>> - return ret; >>> - } >>> - } >>> - >>> - if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) >>> - pdev = pe->pbus->self; >>> - else if (pe->flags & PNV_IODA_PE_DEV) >>> - pdev = pe->pdev->bus->self; >>> -#ifdef CONFIG_PCI_IOV >>> - else if (pe->flags & PNV_IODA_PE_VF) >>> - pdev = pe->parent_dev->bus->self; >>> -#endif /* CONFIG_PCI_IOV */ >>> - while (pdev) { >>> - struct pci_dn *pdn = pci_get_pdn(pdev); >>> - struct pnv_ioda_pe *parent; >>> - >>> - if (pdn && pdn->pe_number != IODA_INVALID_PE) { >>> - parent = &phb->ioda.pe_array[pdn->pe_number]; >>> - ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); >>> - if (ret) >>> - return ret; >>> - } >>> - >>> - pdev = pdev->bus->self; >>> - } >>> - >>> - return 0; >>> -} >>> - >>> -#ifdef CONFIG_PCI_IOV >>> -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) >>> -{ >>> - struct pci_dev *parent; >>> - uint8_t bcomp, dcomp, fcomp; >>> - int64_t rc; >>> - long rid_end, rid; >>> - >>> - /* Currently, we just deconfigure VF PE. Bus PE will always there.*/ >>> - if (pe->pbus) { >>> - int count; >>> - >>> - dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; >>> - fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; >>> - parent = pe->pbus->self; >>> - if (pe->flags & PNV_IODA_PE_BUS_ALL) >>> - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; >>> - else >>> - count = 1; >>> - >>> - switch(count) { >>> - case 1: bcomp = OpalPciBusAll; break; >>> - case 2: bcomp = OpalPciBus7Bits; break; >>> - case 4: bcomp = OpalPciBus6Bits; break; >>> - case 8: bcomp = OpalPciBus5Bits; break; >>> - case 16: bcomp = OpalPciBus4Bits; break; >>> - case 32: bcomp = OpalPciBus3Bits; break; >>> - default: >>> - dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", >>> - count); >>> - /* Do an exact match only */ >>> - bcomp = OpalPciBusAll; >>> - } >>> - rid_end = pe->rid + (count << 8); >>> - } else { >>> - if (pe->flags & PNV_IODA_PE_VF) >>> - parent = pe->parent_dev; >>> - else >>> - parent = pe->pdev->bus->self; >>> - bcomp = OpalPciBusAll; >>> - dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; >>> - fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; >>> - rid_end = pe->rid + 1; >>> - } >>> - >>> - /* Clear the reverse map */ >>> - for (rid = pe->rid; rid < rid_end; rid++) >>> - phb->ioda.pe_rmap[rid] = IODA_INVALID_PE; >>> - >>> - /* Release from all parents PELT-V */ >>> - while (parent) { >>> - struct pci_dn *pdn = pci_get_pdn(parent); >>> - if (pdn && pdn->pe_number != IODA_INVALID_PE) { >>> - rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, >>> - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); >>> - /* XXX What to do in case of error ? */ >> >> >> Not much :) Free associated memory and mark it "dead" so it won't be used >> again till reboot. In what circumstance can this opal_pci_set_peltv() fail at >> all? >> > > Yeah, maybe. Until now, I didn't see this failure since the code is there > from the day. Note the code has been there for almost 4 years since the > day Ben wrote it. Sure. But if it starts failing, we won't even notice it - there is no even pr_err() or WARN_ON. > >> >>> - } >>> - parent = parent->bus->self; >>> - } >>> - >>> - opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number, >>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> - >>> - /* Disassociate PE in PELT */ >>> - rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, >>> - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); >>> - if (rc) >>> - pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); >>> - rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, >>> - bcomp, dcomp, fcomp, OPAL_UNMAP_PE); >>> - if (rc) >>> - pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); >>> - >>> - pe->pbus = NULL; >>> - pe->pdev = NULL; >>> - pe->parent_dev = NULL; >>> - >>> - return 0; >>> -} >>> -#endif /* CONFIG_PCI_IOV */ >>> - >>> static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) >>> { >>> struct pci_dev *parent; >>> @@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) >>> } >>> >>> /* Configure PELTV */ >>> - pnv_ioda_set_peltv(phb, pe, true); >>> + pnv_ioda_set_peltv(pe, true); >>> >>> /* Setup reverse map */ >>> for (rid = pe->rid; rid < rid_end; rid++) >>> @@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) >>> if (pdn->pe_number != IODA_INVALID_PE) >>> continue; >>> >>> + /* Increase reference count of the parent PE */ >> >> When you comment like this, I read it as the comment belongs to the whole >> next chunk till the first empty line, i.e. to all 5 lines below, which is not >> the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() name >> suggests incrementing the reference counter 2) "pe" is always parent in this >> function. I do not insist though. >> > > Agree on your explaining. I'll remove this unuseful comments. > >> >>> + pnv_ioda_pe_get(pe); >>> pdn->pe_number = pe->pe_number; >>> pe->dma_weight += pnv_ioda_dev_dma_weight(dev); >>> if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) >>> @@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) >>> { >>> struct pci_controller *hose = pci_bus_to_host(bus); >>> struct pnv_phb *phb = hose->private_data; >>> - struct pnv_ioda_pe *pe; >>> + struct pnv_ioda_pe *pe = NULL; >>> int pe_num = IODA_INVALID_PE; >>> >>> /* For partial hotplug case, the PE instance hasn't been destroyed >>> @@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) >>> } >>> >>> /* PE number for root bus should have been reserved */ >>> - if (pci_is_root_bus(bus)) >>> - pe_num = phb->ioda.root_pe_no; >>> + if (pci_is_root_bus(bus) && >>> + phb->ioda.root_pe_no != IODA_INVALID_PE) >>> + pe = &phb->ioda.pe_array[phb->ioda.root_pe_no]; >>> >>> /* Check if PE is determined by M64 */ >>> - if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe) >>> - pe_num = phb->pick_m64_pe(phb, bus, all); >>> + if (!pe && phb->pick_m64_pe) >>> + pe = phb->pick_m64_pe(phb, bus, all); >>> >>> /* The PE number isn't pinned by M64 */ >>> - if (pe_num == IODA_INVALID_PE) >>> - pe_num = pnv_ioda_alloc_pe(phb); >>> + if (!pe) >>> + pe = pnv_ioda_alloc_pe(phb); >>> >>> - if (pe_num == IODA_INVALID_PE) { >>> - pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", >>> + if (!pe) { >>> + pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n", >>> __func__, pci_domain_nr(bus), bus->number); >>> return NULL; >>> } >>> >>> - pe = &phb->ioda.pe_array[pe_num]; >>> pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); >>> pe->pbus = bus; >>> pe->pdev = NULL; >>> @@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) >>> >>> if (pnv_ioda_configure_pe(phb, pe)) { >>> /* XXX What do we do here ? */ >>> - if (pe_num) >>> - pnv_ioda_free_pe(phb, pe_num); >>> - pe->pbus = NULL; >>> + pnv_ioda_pe_put(pe); >>> return NULL; >>> } >>> >>> pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), >>> - GFP_KERNEL, hose->node); >>> + GFP_KERNEL, hose->node); >> >> Seems like spaces change only - if you really want this change (which I hate >> - makes code look inaccurate to my taste but it seems I am in minority here >> :) ), please put it to the separate patch. >> > > Ok. Confirm with you: You prefer the original format? I don't know > why I prefer the later one. Maybe my eyes are quite broken :-) I prefer not to change existing whitespaces unless it is done once and for the entire file :) Just remove this change from the patch. >> >>> pe->tce32_table->data = pe; >>> >>> /* Associate it with all child devices */ >>> @@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) >>> list_del(&pe->list); >>> mutex_unlock(&phb->ioda.pe_list_mutex); >>> >>> - pnv_ioda_deconfigure_pe(phb, pe); >>> + pnv_ioda_deconfigure_pe(pe); >> >> >> Is this change necessary to get "Release PEs dynamically" working? Move it to >> mechanical changes patch may be? >> > > Ok. I'll try to do that. > >> >>> >>> - pnv_ioda_free_pe(phb, pe->pe_number); >>> + pnv_ioda_pe_put(pe); >>> } >>> } >>> >>> @@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) >>> >>> if (pnv_ioda_configure_pe(phb, pe)) { >>> /* XXX What do we do here ? */ >>> - if (pe_num) >>> - pnv_ioda_free_pe(phb, pe_num); >>> - pe->pdev = NULL; >>> + pnv_ioda_pe_put(pe); >>> continue; >>> } >>> >>> @@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode) >>> struct pnv_ioda_pe *pe; >>> int rc; >>> >>> - pe = pnv_ioda_get_pe(dev); >>> + pe = pnv_ioda_pci_dev_to_pe(dev); >> >> >> And this change could to separately. Not clear how this helps to "Release PEs >> dynamically". >> >> > > It's not related to "Release PEs dynamically". The change is introduced by > the function rename: Original pnv_ioda_get_pe() is renamed to pnv_ioda_pci_dev_to_pe(). But the rename happened in this patch and the patch's subj is "Release PEs dynamically" so it should be related somehow or move it to a simple separate patch "let's give the lalala function a better name to reflect what it actually does" (but in this case the new name does not make any more sense than the old one). >>> if (!pe) >>> return -ENODEV; >>> >>> @@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, >>> struct pnv_ioda_pe *pe; >>> int rc; >>> >>> - if (!(pe = pnv_ioda_get_pe(dev))) >>> + if (!(pe = pnv_ioda_pci_dev_to_pe(dev))) >>> return -ENODEV; >>> >>> /* Assign XIVE to PE */ >>> @@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, >>> unsigned int hwirq, unsigned int virq, >>> unsigned int is_64, struct msi_msg *msg) >>> { >>> - struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); >>> + struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev); >>> unsigned int xive_num = hwirq - phb->msi_base; >>> __be32 data; >>> int rc; >>> @@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, >>> pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge; >>> pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment; >>> pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus; >>> + pnv_pci_controller_ops.release_device = pnv_pci_release_device; >>> hose->controller_ops = pnv_pci_controller_ops; >>> >>> #ifdef CONFIG_PCI_IOV >>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h >>> index 1bea3a8..8b10f01 100644 >>> --- a/arch/powerpc/platforms/powernv/pci.h >>> +++ b/arch/powerpc/platforms/powernv/pci.h >>> @@ -28,6 +28,7 @@ enum pnv_phb_model { >>> /* Data associated with a PE, including IOMMU tracking etc.. */ >>> struct pnv_phb; >>> struct pnv_ioda_pe { >>> + struct kref kref; >>> unsigned long flags; >>> struct pnv_phb *phb; >>> >>> @@ -120,7 +121,8 @@ struct pnv_phb { >>> void (*shutdown)(struct pnv_phb *phb); >>> int (*init_m64)(struct pnv_phb *phb); >>> void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus); >>> - int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all); >>> + struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb, >>> + struct pci_bus *bus, int all); >>> int (*get_pe_state)(struct pnv_phb *phb, int pe_no); >>> void (*freeze_pe)(struct pnv_phb *phb, int pe_no); >>> int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt); >>> > > Thanks, > Gavin > -- Alexey From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pa0-f52.google.com (mail-pa0-f52.google.com [209.85.220.52]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by lists.ozlabs.org (Postfix) with ESMTPS id 4F8041A0191 for ; Mon, 11 May 2015 17:02:17 +1000 (AEST) Received: by pacwv17 with SMTP id wv17so102861358pac.0 for ; Mon, 11 May 2015 00:02:15 -0700 (PDT) Message-ID: <555053F0.1000409@ozlabs.ru> Date: Mon, 11 May 2015 17:02:08 +1000 From: Alexey Kardashevskiy MIME-Version: 1.0 To: Gavin Shan Subject: Re: [PATCH v4 07/21] powerpc/powernv: Release PEs dynamically References: <1430460188-31343-1-git-send-email-gwshan@linux.vnet.ibm.com> <1430460188-31343-8-git-send-email-gwshan@linux.vnet.ibm.com> <554E00EB.5070802@ozlabs.ru> <20150511062553.GB28073@gwshan> In-Reply-To: <20150511062553.GB28073@gwshan> Content-Type: text/plain; charset=koi8-r; format=flowed Cc: bhelgaas@google.com, linux-pci@vger.kernel.org, linuxppc-dev@lists.ozlabs.org List-Id: Linux on PowerPC Developers Mail List List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , On 05/11/2015 04:25 PM, Gavin Shan wrote: > On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote: >> On 05/01/2015 04:02 PM, Gavin Shan wrote: >>> The original code doesn't support releasing PEs dynamically, meaning >>> that PE and the associated resources (IO, M32, M64 and DMA) can't >>> be released when unplugging a PCI adapter from one hotpluggable slot. >>> >>> The patch takes object oriented methodology, introducs reference >>> count to PE, which is initialized to 1 and increased with 1 when a >>> new PCI device joins the PE. Once the last PCI device leaves the >>> PE, the PE is going to be release together with its associated >>> (IO, M32, M64, DMA) resources. >> >> >> Too little commit log for non-trivial non-cut-n-paste 30KB patch... >> > > Ok. I'll add more details in next revision. > >>> >>> Signed-off-by: Gavin Shan >>> --- >>> arch/powerpc/include/asm/pci-bridge.h | 3 + >>> arch/powerpc/kernel/pci-hotplug.c | 5 + >>> arch/powerpc/platforms/powernv/pci-ioda.c | 658 +++++++++++++++++++----------- >>> arch/powerpc/platforms/powernv/pci.h | 4 +- >>> 4 files changed, 432 insertions(+), 238 deletions(-) >>> >>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h >>> index 5367eb3..a6ad4b1 100644 >>> --- a/arch/powerpc/include/asm/pci-bridge.h >>> +++ b/arch/powerpc/include/asm/pci-bridge.h >>> @@ -31,6 +31,9 @@ struct pci_controller_ops { >>> resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type); >>> void (*setup_bridge)(struct pci_bus *, unsigned long); >>> void (*reset_secondary_bus)(struct pci_dev *dev); >>> + >>> + /* Called when PCI device is released */ >>> + void (*release_device)(struct pci_dev *); >>> }; >>> >>> /* >>> diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c >>> index 7ed85a6..0040343 100644 >>> --- a/arch/powerpc/kernel/pci-hotplug.c >>> +++ b/arch/powerpc/kernel/pci-hotplug.c >>> @@ -29,6 +29,11 @@ >>> */ >>> void pcibios_release_device(struct pci_dev *dev) >>> { >>> + struct pci_controller *hose = pci_bus_to_host(dev->bus); >>> + >>> + if (hose->controller_ops.release_device) >>> + hose->controller_ops.release_device(dev); >>> + >>> eeh_remove_device(dev); >>> } >>> >>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c >>> index 910fb67..ef8c216 100644 >>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c >>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>> @@ -12,6 +12,8 @@ >>> #undef DEBUG >>> >>> #include >>> +#include >>> +#include >>> #include >>> #include >>> #include >>> @@ -47,6 +49,8 @@ >>> /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ >>> #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) >>> >>> +static void pnv_ioda_release_pe(struct kref *kref); >>> + >>> static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, >>> const char *fmt, ...) >>> { >>> @@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) >>> (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); >>> } >>> >>> -static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) >>> +static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe) >>> { >>> - if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { >>> - pr_warn("%s: Invalid PE %d on PHB#%x\n", >>> - __func__, pe_no, phb->hose->global_number); >>> + if (!pe) >>> + return; >>> + >>> + kref_get(&pe->kref); >>> +} >>> + >>> +static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe) >>> +{ >>> + unsigned int count; >>> + >>> + if (!pe) >>> return; >>> + >>> + /* >>> + * The count is initialized to 1 and increased with 1 when >>> + * a new PCI device is bound with the PE. Once the last PCI >>> + * device is leaving from the PE, the PE is going to be >>> + * released. >>> + */ >>> + count = atomic_read(&pe->kref.refcount); >>> + if (count == 2) >>> + kref_sub(&pe->kref, 2, pnv_ioda_release_pe); >>> + else >>> + kref_put(&pe->kref, pnv_ioda_release_pe); >> >> >> What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()? >> > > Yeah, that would have problem. But it shouldn't happen because the > PCI devices are joining the parent PE# in strictly serialized mode. > Same thing happens when detaching PCI devices from its parent PE. oookay. Another thing then - why is this kref counter initialized to 1? It would make sense if you did something special when the counter becomes 1 after decrement but you do not. Also, this kref thing makes sense if you do kref_put() in multiple places and do not know which one will be the last one so you pass the callback to all of them. Here you do kref_put/sub in one place and you read the counter - so you can call pnv_ioda_release_pe() directly. And it feels like a simple atomic_t would do the job just fine. If you still feel that the counter should start from 1, there are atomic_dec_if_positive() and atomic_inc_not_zero() and others. >>> +} >>> + >>> +static void pnv_pci_release_device(struct pci_dev *pdev) >>> +{ >>> + struct pci_controller *hose = pci_bus_to_host(pdev->bus); >>> + struct pnv_phb *phb = hose->private_data; >>> + struct pci_dn *pdn = pci_get_pdn(pdev); >>> + struct pnv_ioda_pe *pe; >>> + >>> + if (pdn && pdn->pe_number != IODA_INVALID_PE) { >>> + pe = &phb->ioda.pe_array[pdn->pe_number]; >>> + pnv_ioda_pe_put(pe); >>> + pdn->pe_number = IODA_INVALID_PE; >>> } >>> +} >>> >>> - if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) { >>> - pr_warn("%s: PE %d was assigned on PHB#%x\n", >>> - __func__, pe_no, phb->hose->global_number); >>> +static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe) >>> +{ >>> + struct pnv_phb *phb = pe->phb; >>> + int index, count; >>> + unsigned long tbl_addr, tbl_size; >>> + >>> + /* No DMA capability for slave PEs */ >>> + if (pe->flags & PNV_IODA_PE_SLAVE) >>> + return; >>> + >>> + /* Bypass DMA window */ >>> + if (phb->type == PNV_PHB_IODA2 && >>> + pe->tce_bypass_enabled && >>> + pe->tce32_table && >>> + pe->tce32_table->set_bypass) >>> + pe->tce32_table->set_bypass(pe->tce32_table, false); >>> + >>> + /* 32-bits DMA window */ >>> + count = pe->tce32_seg_end - pe->tce32_seg_start; >>> + tbl_addr = pe->tce32_table->it_base; >>> + if (!count) >>> return; >>> + >>> + /* Free IOMMU table */ >>> + iommu_free_table(pe->tce32_table, >>> + of_node_full_name(phb->hose->dn)); >>> + >>> + /* Deconfigure TCE table */ >>> + switch (phb->type) { >>> + case PNV_PHB_IODA1: >>> + for (index = 0; index < count; index++) >>> + opal_pci_map_pe_dma_window(phb->opal_id, >>> + pe->pe_number, >>> + pe->tce32_seg_start + index, >>> + 1, >>> + __pa(tbl_addr) + >>> + index * TCE32_TABLE_SIZE, >>> + 0, >>> + 0x1000); >>> + bitmap_clear(phb->ioda.tce32_segmap, >>> + pe->tce32_seg_start, >>> + count); >>> + tbl_size = TCE32_TABLE_SIZE * count; >>> + break; >>> + case PNV_PHB_IODA2: >>> + opal_pci_map_pe_dma_window(phb->opal_id, >>> + pe->pe_number, >>> + pe->pe_number << 1, >>> + 1, >>> + __pa(tbl_addr), >>> + 0, >>> + 0x1000); >>> + tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base)); >>> + tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8; >>> + break; >>> + default: >>> + pe_warn(pe, "Unsupported PHB type %d\n", phb->type); >>> + return; >>> + } >>> + >>> + /* Free memory of IOMMU table */ >>> + free_pages(tbl_addr, get_order(tbl_size)); >> >> >> You just programmed the table address to TVT and then you are releasing the >> pages. It does not seem right, it will leave garbage in TVT. Also, I am >> adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits >>from there (I'll post v10 soon, you'll be in copy and you'll have to review >> that ;) ). >> > > I assume you're talking about TVE. I don't understand how garbage will be left > in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE > with zero'ed "tce_table_size". The pages previously allocated for TCE table is > released to buddy system, which can be allocated by somebody else (from buddy > or slab). opal_pci_map_pe_dma_window() takes __pa(tbl_addr) which points to some memory which is still allocated. This value goes to a table (which has 2 entries per PE, one for 32bit DMA window and one for bypass/hugewindow) which PHB uses to get the actual TCE table address. What is the name of this table? :) Anyway, you write an address there and then you call free_pages() so after free_pages(), the value in that TVE/TVT/whatever table is a garbage. > > Ok. Please put me into the cc list. I guess the whole series of patches is > better to rebased on your DDW patchset, which is to be merged first, I believe. > >> >>> + pe->tce32_table = NULL; >>> + pe->tce32_seg_start = 0; >>> + pe->tce32_seg_end = 0; >>> +} >>> + >>> +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe) >>> +{ >>> + struct pnv_phb *phb = pe->phb; >>> + unsigned long *segmap = NULL, *pe_segmap = NULL; >>> + int i; >>> + uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE, >>> + OPAL_M32_WINDOW_TYPE, >>> + OPAL_M64_WINDOW_TYPE }; >>> + >>> + for (win = 0; win < ARRAY_SIZE(win_type); win++) { >>> + switch (win_type[win]) { >>> + case OPAL_IO_WINDOW_TYPE: >>> + segmap = phb->ioda.io_segmap; >>> + pe_segmap = pe->io_segmap; >>> + break; >>> + case OPAL_M32_WINDOW_TYPE: >>> + segmap = phb->ioda.m32_segmap; >>> + pe_segmap = pe->m32_segmap; >>> + break; >>> + case OPAL_M64_WINDOW_TYPE: >>> + segmap = phb->ioda.m64_segmap; >>> + pe_segmap = pe->m64_segmap; >>> + break; >>> + } >>> + i = -1; >>> + while ((i = find_next_bit(pe_segmap, >>> + phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) { >>> + if (win_type[win] == OPAL_IO_WINDOW_TYPE || >>> + win_type[win] == OPAL_M32_WINDOW_TYPE) >>> + opal_pci_map_pe_mmio_window(phb->opal_id, >>> + phb->ioda.reserved_pe, >>> + win_type[win], 0, i); >>> + else if (phb->type == PNV_PHB_IODA1) >>> + opal_pci_map_pe_mmio_window(phb->opal_id, >>> + phb->ioda.reserved_pe, >>> + win_type[win], >>> + i / 8, i % 8); >> >> The function is called ""release" but it programs something what looks like >> reasonable values, is it correct? >> > > It's out of problem, When the segment is deallocated, it's mapped to the > reserved PE#. > >> >> >>> + >>> + clear_bit(i, pe_segmap); >>> + clear_bit(i, segmap); >>> + } >>> + } >>> +} >>> + >>> +static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, >>> + struct pnv_ioda_pe *parent, >>> + struct pnv_ioda_pe *child, >>> + bool is_add) >>> +{ >>> + const char *desc = is_add ? "adding" : "removing"; >>> + uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN : >>> + OPAL_REMOVE_PE_FROM_DOMAIN; >>> + struct pnv_ioda_pe *slave; >>> + long rc; >>> + >>> + /* Parent PE affects child PE */ >>> + rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, >>> + child->pe_number, op); >>> + if (rc != OPAL_SUCCESS) { >>> + pe_warn(child, "OPAL error %ld %s to parent PELTV\n", >>> + rc, desc); >>> + return -ENXIO; >>> + } >>> + >>> + if (!(child->flags & PNV_IODA_PE_MASTER)) >>> + return 0; >>> + >>> + /* Compound case: parent PE affects slave PEs */ >>> + list_for_each_entry(slave, &child->slaves, list) { >>> + rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, >>> + slave->pe_number, op); >>> + if (rc != OPAL_SUCCESS) { >>> + pe_warn(slave, "OPAL error %ld %s to parent PELTV\n", >>> + rc, desc); >>> + return -ENXIO; >>> + } >>> + } >>> + >>> + return 0; >>> +} >>> + >>> +static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add) >>> +{ >>> + struct pnv_phb *phb = pe->phb; >>> + struct pnv_ioda_pe *slave; >>> + struct pci_dev *pdev = NULL; >>> + int ret; >>> + >>> + /* >>> + * Clear PE frozen state. If it's master PE, we need >>> + * clear slave PE frozen state as well. >>> + */ >>> + opal_pci_eeh_freeze_clear(phb->opal_id, >>> + pe->pe_number, >>> + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> + if (pe->flags & PNV_IODA_PE_MASTER) { >>> + list_for_each_entry(slave, &pe->slaves, list) { >>> + opal_pci_eeh_freeze_clear(phb->opal_id, >>> + slave->pe_number, >>> + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> + } >>> + } >>> + >>> + /* >>> + * Associate PE in PELT. We need add the PE into the >>> + * corresponding PELT-V as well. Otherwise, the error >>> + * originated from the PE might contribute to other >>> + * PEs. >>> + */ >>> + ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add); >>> + if (ret) >>> + return ret; >>> + >>> + /* For compound PEs, any one affects all of them */ >>> + if (pe->flags & PNV_IODA_PE_MASTER) { >>> + list_for_each_entry(slave, &pe->slaves, list) { >>> + ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add); >>> + if (ret) >>> + return ret; >>> + } >>> + } >>> + >>> + if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) >>> + pdev = pe->pbus->self; >>> + else if (pe->flags & PNV_IODA_PE_DEV) >>> + pdev = pe->pdev->bus->self; >>> +#ifdef CONFIG_PCI_IOV >>> + else if (pe->flags & PNV_IODA_PE_VF) >>> + pdev = pe->parent_dev->bus->self; >>> +#endif /* CONFIG_PCI_IOV */ >>> + >>> + while (pdev) { >>> + struct pci_dn *pdn = pci_get_pdn(pdev); >>> + struct pnv_ioda_pe *parent; >>> + >>> + if (pdn && pdn->pe_number != IODA_INVALID_PE) { >>> + parent = &phb->ioda.pe_array[pdn->pe_number]; >>> + ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); >>> + if (ret) >>> + return ret; >>> + } >>> + >>> + pdev = pdev->bus->self; >>> + } >>> + >>> + return 0; >>> +} >>> + >>> +static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe) >> >> >> It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just >> moving of this function to a different place deserves a separate patch with a >> comment why ("it is going to be used now for non-SRIOV case too" may be?). >> > > Yeah, it makes sense to me. Will fix it up. > >> >>> +{ >>> + struct pnv_phb *phb = pe->phb; >>> + struct pci_dev *parent; >>> + uint8_t bcomp, dcomp, fcomp; >>> + long rid_end, rid; >>> + int64_t rc; >>> + >>> + /* Tear down MVE */ >>> + if (phb->type == PNV_PHB_IODA1 && >>> + pe->mve_number != -1) { >>> + rc = opal_pci_set_mve(phb->opal_id, >>> + pe->mve_number, >>> + phb->ioda.reserved_pe); >>> + if (rc != OPAL_SUCCESS) >>> + pe_warn(pe, "Error %lld unmapping MVE#%d\n", >>> + rc, pe->mve_number); >>> + rc = opal_pci_set_mve_enable(phb->opal_id, >>> + pe->mve_number, >>> + OPAL_DISABLE_MVE); >>> + if (rc != OPAL_SUCCESS) >>> + pe_warn(pe, "Error %lld disabling MVE#%d\n", >>> + rc, pe->mve_number); >>> + pe->mve_number = -1; >>> + } >>> + >>> + /* Unmapping PELTV */ >>> + pnv_ioda_set_peltv(pe, false); >>> + >>> + /* To unmap PELTM */ >>> + if (pe->pbus) { >>> + int count; >>> + >>> + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; >>> + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; >>> + parent = pe->pbus->self; >>> + if (pe->flags & PNV_IODA_PE_BUS_ALL) >>> + count = pe->pbus->busn_res.end - >>> + pe->pbus->busn_res.start + 1; >>> + else >>> + count = 1; >>> + >>> + switch(count) { >>> + case 1: bcomp = OpalPciBusAll; break; >>> + case 2: bcomp = OpalPciBus7Bits; break; >>> + case 4: bcomp = OpalPciBus6Bits; break; >>> + case 8: bcomp = OpalPciBus5Bits; break; >>> + case 16: bcomp = OpalPciBus4Bits; break; >>> + case 32: bcomp = OpalPciBus3Bits; break; >>> + default: >>> + /* Fail back to case of one bus */ >>> + pe_warn(pe, "Cannot support %d buses\n", count); >>> + bcomp = OpalPciBusAll; >>> + } >>> + rid_end = pe->rid + (count << 8); >>> + } else { >>> +#ifdef CONFIG_PCI_IOV >>> + if (pe->flags & PNV_IODA_PE_VF) >>> + parent = pe->parent_dev; >>> + else >>> +#endif >>> + parent = pe->pdev->bus->self; >>> + bcomp = OpalPciBusAll; >>> + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; >>> + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; >>> + rid_end = pe->rid + 1; >>> + } >>> + >>> + /* Clear RID mapping */ >>> + for (rid = pe->rid; rid < rid_end; rid++) >>> + phb->ioda.pe_rmap[rid] = IODA_INVALID_PE; >>> + >>> + /* Unmapping PELTM */ >>> + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, >>> + bcomp, dcomp, fcomp, OPAL_UNMAP_PE); >>> + if (rc) >>> + pe_warn(pe, "Error %ld unmapping PELTM\n", rc); >>> +} >>> + >>> +static void pnv_ioda_release_pe(struct kref *kref) >>> +{ >>> + struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref); >>> + struct pnv_ioda_pe *tmp, *slave; >>> + struct pnv_phb *phb = pe->phb; >>> + >>> + pnv_ioda_release_pe_dma(pe); >>> + pnv_ioda_release_pe_seg(pe); >>> + pnv_ioda_deconfigure_pe(pe); >>> + >>> + /* Release slave PEs for compound PE */ >>> + if (pe->flags & PNV_IODA_PE_MASTER) { >>> + list_for_each_entry_safe(slave, tmp, &pe->slaves, list) >>> + pnv_ioda_pe_put(slave); >>> + } >>> + >>> + /* Remove the PE from various list. We need remove slave >>> + * PE from master's list. >>> + */ >>> + list_del(&pe->dma_link); >>> + list_del(&pe->list); >>> + >>> + /* Free PE number */ >>> + clear_bit(pe->pe_number, phb->ioda.pe_alloc); >>> +} >>> + >>> +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, >>> + int pe_no) >>> +{ >>> + struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no]; >>> + >>> + kref_init(&pe->kref); >>> + pe->phb = phb; >>> + pe->pe_number = pe_no; >>> + INIT_LIST_HEAD(&pe->dma_link); >>> + INIT_LIST_HEAD(&pe->list); >>> + >>> + return pe; >>> +} >>> + >>> +static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb, >>> + int pe_no) >>> +{ >>> + if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { >>> + pr_warn("%s: Invalid PE %d on PHB#%x\n", >>> + __func__, pe_no, phb->hose->global_number); >>> + return NULL; >>> } >>> >>> - phb->ioda.pe_array[pe_no].phb = phb; >>> - phb->ioda.pe_array[pe_no].pe_number = pe_no; >>> + /* >>> + * Same PE might be reserved for multiple times, which >>> + * is out of problem actually. >>> + */ >>> + set_bit(pe_no, phb->ioda.pe_alloc); >>> + return pnv_ioda_init_pe(phb, pe_no); >>> } >>> >>> -static int pnv_ioda_alloc_pe(struct pnv_phb *phb) >>> +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) >>> { >>> unsigned long pe_no; >>> unsigned long limit = phb->ioda.total_pe - 1; >>> @@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb) >>> break; >>> >>> if (--limit >= phb->ioda.total_pe) >>> - return IODA_INVALID_PE; >>> + return NULL; >>> } while(1); >>> >>> - phb->ioda.pe_array[pe_no].phb = phb; >>> - phb->ioda.pe_array[pe_no].pe_number = pe_no; >>> - return pe_no; >>> -} >>> - >>> -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) >>> -{ >>> - WARN_ON(phb->ioda.pe_array[pe].pdev); >>> - >>> - memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); >>> - clear_bit(pe, phb->ioda.pe_alloc); >>> + return pnv_ioda_init_pe(phb, pe_no); >>> } >>> >>> static int pnv_ioda1_init_m64(struct pnv_phb *phb) >>> @@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb, >>> } >>> } >>> >>> -static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> - struct pci_bus *bus, int all) >>> +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> + struct pci_bus *bus, >>> + int all) >> >> >> Mechanic changes like this could easily go to a separate patch. >> > > Indeed. I'll see how I can split the patches up in next revision. > Thanks for the suggestion. > >>> { >>> resource_size_t segsz = phb->ioda.m64_segsize; >>> struct pci_dev *pdev; >>> @@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> int i; >>> >>> if (!pnv_ioda_need_m64_pe(phb, bus)) >>> - return IODA_INVALID_PE; >>> + return NULL; >>> >>> /* Allocate bitmap */ >>> size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); >>> pe_bitsmap = kzalloc(size, GFP_KERNEL); >>> if (!pe_bitsmap) { >>> pr_warn("%s: Out of memory !\n", __func__); >>> - return IODA_INVALID_PE; >>> + return NULL; >>> } >>> >>> /* The bridge's M64 window might be extended to PHB's M64 >>> @@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> /* No M64 window found ? */ >>> if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) { >>> kfree(pe_bitsmap); >>> - return IODA_INVALID_PE; >>> + return NULL; >>> } >>> >>> /* Figure out the master PE and put all slave PEs >>> @@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >>> } >>> >>> kfree(pe_bitsmap); >>> - return master_pe->pe_number; >>> + return master_pe; >>> } >>> >>> static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) >>> @@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) >>> * but in the meantime, we need to protect them to avoid warnings >>> */ >>> #ifdef CONFIG_PCI_MSI >>> -static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) >>> +static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev) >>> { >>> struct pci_controller *hose = pci_bus_to_host(dev->bus); >>> struct pnv_phb *phb = hose->private_data; >>> @@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) >>> } >>> #endif /* CONFIG_PCI_MSI */ >>> >>> -static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, >>> - struct pnv_ioda_pe *parent, >>> - struct pnv_ioda_pe *child, >>> - bool is_add) >>> -{ >>> - const char *desc = is_add ? "adding" : "removing"; >>> - uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN : >>> - OPAL_REMOVE_PE_FROM_DOMAIN; >>> - struct pnv_ioda_pe *slave; >>> - long rc; >>> - >>> - /* Parent PE affects child PE */ >>> - rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, >>> - child->pe_number, op); >>> - if (rc != OPAL_SUCCESS) { >>> - pe_warn(child, "OPAL error %ld %s to parent PELTV\n", >>> - rc, desc); >>> - return -ENXIO; >>> - } >>> - >>> - if (!(child->flags & PNV_IODA_PE_MASTER)) >>> - return 0; >>> - >>> - /* Compound case: parent PE affects slave PEs */ >>> - list_for_each_entry(slave, &child->slaves, list) { >>> - rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, >>> - slave->pe_number, op); >>> - if (rc != OPAL_SUCCESS) { >>> - pe_warn(slave, "OPAL error %ld %s to parent PELTV\n", >>> - rc, desc); >>> - return -ENXIO; >>> - } >>> - } >>> - >>> - return 0; >>> -} >>> - >>> -static int pnv_ioda_set_peltv(struct pnv_phb *phb, >>> - struct pnv_ioda_pe *pe, >>> - bool is_add) >>> -{ >>> - struct pnv_ioda_pe *slave; >>> - struct pci_dev *pdev = NULL; >>> - int ret; >>> - >>> - /* >>> - * Clear PE frozen state. If it's master PE, we need >>> - * clear slave PE frozen state as well. >>> - */ >>> - if (is_add) { >>> - opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, >>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> - if (pe->flags & PNV_IODA_PE_MASTER) { >>> - list_for_each_entry(slave, &pe->slaves, list) >>> - opal_pci_eeh_freeze_clear(phb->opal_id, >>> - slave->pe_number, >>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> - } >>> - } >>> - >>> - /* >>> - * Associate PE in PELT. We need add the PE into the >>> - * corresponding PELT-V as well. Otherwise, the error >>> - * originated from the PE might contribute to other >>> - * PEs. >>> - */ >>> - ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add); >>> - if (ret) >>> - return ret; >>> - >>> - /* For compound PEs, any one affects all of them */ >>> - if (pe->flags & PNV_IODA_PE_MASTER) { >>> - list_for_each_entry(slave, &pe->slaves, list) { >>> - ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add); >>> - if (ret) >>> - return ret; >>> - } >>> - } >>> - >>> - if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) >>> - pdev = pe->pbus->self; >>> - else if (pe->flags & PNV_IODA_PE_DEV) >>> - pdev = pe->pdev->bus->self; >>> -#ifdef CONFIG_PCI_IOV >>> - else if (pe->flags & PNV_IODA_PE_VF) >>> - pdev = pe->parent_dev->bus->self; >>> -#endif /* CONFIG_PCI_IOV */ >>> - while (pdev) { >>> - struct pci_dn *pdn = pci_get_pdn(pdev); >>> - struct pnv_ioda_pe *parent; >>> - >>> - if (pdn && pdn->pe_number != IODA_INVALID_PE) { >>> - parent = &phb->ioda.pe_array[pdn->pe_number]; >>> - ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); >>> - if (ret) >>> - return ret; >>> - } >>> - >>> - pdev = pdev->bus->self; >>> - } >>> - >>> - return 0; >>> -} >>> - >>> -#ifdef CONFIG_PCI_IOV >>> -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) >>> -{ >>> - struct pci_dev *parent; >>> - uint8_t bcomp, dcomp, fcomp; >>> - int64_t rc; >>> - long rid_end, rid; >>> - >>> - /* Currently, we just deconfigure VF PE. Bus PE will always there.*/ >>> - if (pe->pbus) { >>> - int count; >>> - >>> - dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; >>> - fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; >>> - parent = pe->pbus->self; >>> - if (pe->flags & PNV_IODA_PE_BUS_ALL) >>> - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; >>> - else >>> - count = 1; >>> - >>> - switch(count) { >>> - case 1: bcomp = OpalPciBusAll; break; >>> - case 2: bcomp = OpalPciBus7Bits; break; >>> - case 4: bcomp = OpalPciBus6Bits; break; >>> - case 8: bcomp = OpalPciBus5Bits; break; >>> - case 16: bcomp = OpalPciBus4Bits; break; >>> - case 32: bcomp = OpalPciBus3Bits; break; >>> - default: >>> - dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", >>> - count); >>> - /* Do an exact match only */ >>> - bcomp = OpalPciBusAll; >>> - } >>> - rid_end = pe->rid + (count << 8); >>> - } else { >>> - if (pe->flags & PNV_IODA_PE_VF) >>> - parent = pe->parent_dev; >>> - else >>> - parent = pe->pdev->bus->self; >>> - bcomp = OpalPciBusAll; >>> - dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; >>> - fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; >>> - rid_end = pe->rid + 1; >>> - } >>> - >>> - /* Clear the reverse map */ >>> - for (rid = pe->rid; rid < rid_end; rid++) >>> - phb->ioda.pe_rmap[rid] = IODA_INVALID_PE; >>> - >>> - /* Release from all parents PELT-V */ >>> - while (parent) { >>> - struct pci_dn *pdn = pci_get_pdn(parent); >>> - if (pdn && pdn->pe_number != IODA_INVALID_PE) { >>> - rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, >>> - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); >>> - /* XXX What to do in case of error ? */ >> >> >> Not much :) Free associated memory and mark it "dead" so it won't be used >> again till reboot. In what circumstance can this opal_pci_set_peltv() fail at >> all? >> > > Yeah, maybe. Until now, I didn't see this failure since the code is there > from the day. Note the code has been there for almost 4 years since the > day Ben wrote it. Sure. But if it starts failing, we won't even notice it - there is no even pr_err() or WARN_ON. > >> >>> - } >>> - parent = parent->bus->self; >>> - } >>> - >>> - opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number, >>> - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); >>> - >>> - /* Disassociate PE in PELT */ >>> - rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, >>> - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); >>> - if (rc) >>> - pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); >>> - rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, >>> - bcomp, dcomp, fcomp, OPAL_UNMAP_PE); >>> - if (rc) >>> - pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); >>> - >>> - pe->pbus = NULL; >>> - pe->pdev = NULL; >>> - pe->parent_dev = NULL; >>> - >>> - return 0; >>> -} >>> -#endif /* CONFIG_PCI_IOV */ >>> - >>> static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) >>> { >>> struct pci_dev *parent; >>> @@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) >>> } >>> >>> /* Configure PELTV */ >>> - pnv_ioda_set_peltv(phb, pe, true); >>> + pnv_ioda_set_peltv(pe, true); >>> >>> /* Setup reverse map */ >>> for (rid = pe->rid; rid < rid_end; rid++) >>> @@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) >>> if (pdn->pe_number != IODA_INVALID_PE) >>> continue; >>> >>> + /* Increase reference count of the parent PE */ >> >> When you comment like this, I read it as the comment belongs to the whole >> next chunk till the first empty line, i.e. to all 5 lines below, which is not >> the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() name >> suggests incrementing the reference counter 2) "pe" is always parent in this >> function. I do not insist though. >> > > Agree on your explaining. I'll remove this unuseful comments. > >> >>> + pnv_ioda_pe_get(pe); >>> pdn->pe_number = pe->pe_number; >>> pe->dma_weight += pnv_ioda_dev_dma_weight(dev); >>> if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) >>> @@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) >>> { >>> struct pci_controller *hose = pci_bus_to_host(bus); >>> struct pnv_phb *phb = hose->private_data; >>> - struct pnv_ioda_pe *pe; >>> + struct pnv_ioda_pe *pe = NULL; >>> int pe_num = IODA_INVALID_PE; >>> >>> /* For partial hotplug case, the PE instance hasn't been destroyed >>> @@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) >>> } >>> >>> /* PE number for root bus should have been reserved */ >>> - if (pci_is_root_bus(bus)) >>> - pe_num = phb->ioda.root_pe_no; >>> + if (pci_is_root_bus(bus) && >>> + phb->ioda.root_pe_no != IODA_INVALID_PE) >>> + pe = &phb->ioda.pe_array[phb->ioda.root_pe_no]; >>> >>> /* Check if PE is determined by M64 */ >>> - if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe) >>> - pe_num = phb->pick_m64_pe(phb, bus, all); >>> + if (!pe && phb->pick_m64_pe) >>> + pe = phb->pick_m64_pe(phb, bus, all); >>> >>> /* The PE number isn't pinned by M64 */ >>> - if (pe_num == IODA_INVALID_PE) >>> - pe_num = pnv_ioda_alloc_pe(phb); >>> + if (!pe) >>> + pe = pnv_ioda_alloc_pe(phb); >>> >>> - if (pe_num == IODA_INVALID_PE) { >>> - pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", >>> + if (!pe) { >>> + pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n", >>> __func__, pci_domain_nr(bus), bus->number); >>> return NULL; >>> } >>> >>> - pe = &phb->ioda.pe_array[pe_num]; >>> pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); >>> pe->pbus = bus; >>> pe->pdev = NULL; >>> @@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) >>> >>> if (pnv_ioda_configure_pe(phb, pe)) { >>> /* XXX What do we do here ? */ >>> - if (pe_num) >>> - pnv_ioda_free_pe(phb, pe_num); >>> - pe->pbus = NULL; >>> + pnv_ioda_pe_put(pe); >>> return NULL; >>> } >>> >>> pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), >>> - GFP_KERNEL, hose->node); >>> + GFP_KERNEL, hose->node); >> >> Seems like spaces change only - if you really want this change (which I hate >> - makes code look inaccurate to my taste but it seems I am in minority here >> :) ), please put it to the separate patch. >> > > Ok. Confirm with you: You prefer the original format? I don't know > why I prefer the later one. Maybe my eyes are quite broken :-) I prefer not to change existing whitespaces unless it is done once and for the entire file :) Just remove this change from the patch. >> >>> pe->tce32_table->data = pe; >>> >>> /* Associate it with all child devices */ >>> @@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) >>> list_del(&pe->list); >>> mutex_unlock(&phb->ioda.pe_list_mutex); >>> >>> - pnv_ioda_deconfigure_pe(phb, pe); >>> + pnv_ioda_deconfigure_pe(pe); >> >> >> Is this change necessary to get "Release PEs dynamically" working? Move it to >> mechanical changes patch may be? >> > > Ok. I'll try to do that. > >> >>> >>> - pnv_ioda_free_pe(phb, pe->pe_number); >>> + pnv_ioda_pe_put(pe); >>> } >>> } >>> >>> @@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) >>> >>> if (pnv_ioda_configure_pe(phb, pe)) { >>> /* XXX What do we do here ? */ >>> - if (pe_num) >>> - pnv_ioda_free_pe(phb, pe_num); >>> - pe->pdev = NULL; >>> + pnv_ioda_pe_put(pe); >>> continue; >>> } >>> >>> @@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode) >>> struct pnv_ioda_pe *pe; >>> int rc; >>> >>> - pe = pnv_ioda_get_pe(dev); >>> + pe = pnv_ioda_pci_dev_to_pe(dev); >> >> >> And this change could to separately. Not clear how this helps to "Release PEs >> dynamically". >> >> > > It's not related to "Release PEs dynamically". The change is introduced by > the function rename: Original pnv_ioda_get_pe() is renamed to pnv_ioda_pci_dev_to_pe(). But the rename happened in this patch and the patch's subj is "Release PEs dynamically" so it should be related somehow or move it to a simple separate patch "let's give the lalala function a better name to reflect what it actually does" (but in this case the new name does not make any more sense than the old one). >>> if (!pe) >>> return -ENODEV; >>> >>> @@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, >>> struct pnv_ioda_pe *pe; >>> int rc; >>> >>> - if (!(pe = pnv_ioda_get_pe(dev))) >>> + if (!(pe = pnv_ioda_pci_dev_to_pe(dev))) >>> return -ENODEV; >>> >>> /* Assign XIVE to PE */ >>> @@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, >>> unsigned int hwirq, unsigned int virq, >>> unsigned int is_64, struct msi_msg *msg) >>> { >>> - struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); >>> + struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev); >>> unsigned int xive_num = hwirq - phb->msi_base; >>> __be32 data; >>> int rc; >>> @@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, >>> pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge; >>> pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment; >>> pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus; >>> + pnv_pci_controller_ops.release_device = pnv_pci_release_device; >>> hose->controller_ops = pnv_pci_controller_ops; >>> >>> #ifdef CONFIG_PCI_IOV >>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h >>> index 1bea3a8..8b10f01 100644 >>> --- a/arch/powerpc/platforms/powernv/pci.h >>> +++ b/arch/powerpc/platforms/powernv/pci.h >>> @@ -28,6 +28,7 @@ enum pnv_phb_model { >>> /* Data associated with a PE, including IOMMU tracking etc.. */ >>> struct pnv_phb; >>> struct pnv_ioda_pe { >>> + struct kref kref; >>> unsigned long flags; >>> struct pnv_phb *phb; >>> >>> @@ -120,7 +121,8 @@ struct pnv_phb { >>> void (*shutdown)(struct pnv_phb *phb); >>> int (*init_m64)(struct pnv_phb *phb); >>> void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus); >>> - int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all); >>> + struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb, >>> + struct pci_bus *bus, int all); >>> int (*get_pe_state)(struct pnv_phb *phb, int pe_no); >>> void (*freeze_pe)(struct pnv_phb *phb, int pe_no); >>> int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt); >>> > > Thanks, > Gavin > -- Alexey