* [RFC PATCH kernel 1/2] irq: Add reference counting to IRQ mappings
@ 2020-10-27 9:06 ` Alexey Kardashevskiy
0 siblings, 0 replies; 13+ messages in thread
From: Alexey Kardashevskiy @ 2020-10-27 9:06 UTC (permalink / raw)
To: linuxppc-dev
Cc: Cédric Le Goater, Oliver O'Halloran, Marc Zyngier,
Michael Ellerman, Qian Cai, Rob Herring, Thomas Gleixner,
linux-kernel, Frederic Barrat, Michal Suchánek,
Alexey Kardashevskiy
PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
Device drivers map/unmap hardware interrupts via irq_create_mapping()/
irq_dispose_mapping(). The problem with that these interrupts are
shared and when performing hot unplug, we need to unmap the interrupt
only when the last device is released.
This reuses already existing irq_desc::kobj for this purpose.
The refcounter is naturally 1 when the descriptor is allocated already;
this adds kobject_get() in places where already existing mapped virq
is returned.
This reorganizes irq_dispose_mapping() to release the kobj and let
the release callback do the cleanup.
If some driver or platform does its own reference counting, this expects
those parties to call irq_find_mapping() and call irq_dispose_mapping()
for every irq_create_fwspec_mapping()/irq_create_mapping().
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
kernel/irq/irqdesc.c | 35 +++++++++++++++++++++++------------
kernel/irq/irqdomain.c | 27 +++++++++++++--------------
2 files changed, 36 insertions(+), 26 deletions(-)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1a7723604399..dae096238500 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -419,20 +419,39 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
return NULL;
}
+static void delayed_free_desc(struct rcu_head *rhp);
static void irq_kobj_release(struct kobject *kobj)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ struct irq_domain *domain;
+ unsigned int virq = desc->irq_data.irq;
- free_masks(desc);
- free_percpu(desc->kstat_irqs);
- kfree(desc);
+ domain = desc->irq_data.domain;
+ if (domain) {
+ if (irq_domain_is_hierarchy(domain)) {
+ irq_domain_free_irqs(virq, 1);
+ } else {
+ irq_domain_disassociate(domain, virq);
+ irq_free_desc(virq);
+ }
+ }
+
+ /*
+ * We free the descriptor, masks and stat fields via RCU. That
+ * allows demultiplex interrupts to do rcu based management of
+ * the child interrupts.
+ * This also allows us to use rcu in kstat_irqs_usr().
+ */
+ call_rcu(&desc->rcu, delayed_free_desc);
}
static void delayed_free_desc(struct rcu_head *rhp)
{
struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
- kobject_put(&desc->kobj);
+ free_masks(desc);
+ free_percpu(desc->kstat_irqs);
+ kfree(desc);
}
static void free_desc(unsigned int irq)
@@ -453,14 +472,6 @@ static void free_desc(unsigned int irq)
*/
irq_sysfs_del(desc);
delete_irq_desc(irq);
-
- /*
- * We free the descriptor, masks and stat fields via RCU. That
- * allows demultiplex interrupts to do rcu based management of
- * the child interrupts.
- * This also allows us to use rcu in kstat_irqs_usr().
- */
- call_rcu(&desc->rcu, delayed_free_desc);
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf8b374b892d..02733ddc321f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -638,6 +638,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
{
struct device_node *of_node;
int virq;
+ struct irq_desc *desc;
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -655,7 +656,9 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
+ desc = irq_to_desc(virq);
pr_debug("-> existing mapping on virq %d\n", virq);
+ kobject_get(&desc->kobj);
return virq;
}
@@ -751,6 +754,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
int virq;
+ struct irq_desc *desc;
if (fwspec->fwnode) {
domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
@@ -787,8 +791,11 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
* current trigger type then we are done so return the
* interrupt number.
*/
- if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
+ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) {
+ desc = irq_to_desc(virq);
+ kobject_get(&desc->kobj);
return virq;
+ }
/*
* If the trigger type has not been set yet, then set
@@ -800,6 +807,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
return 0;
irqd_set_trigger_type(irq_data, type);
+ desc = irq_to_desc(virq);
+ kobject_get(&desc->kobj);
return virq;
}
@@ -852,22 +861,12 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
*/
void irq_dispose_mapping(unsigned int virq)
{
- struct irq_data *irq_data = irq_get_irq_data(virq);
- struct irq_domain *domain;
+ struct irq_desc *desc = irq_to_desc(virq);
- if (!virq || !irq_data)
+ if (!virq || !desc)
return;
- domain = irq_data->domain;
- if (WARN_ON(domain == NULL))
- return;
-
- if (irq_domain_is_hierarchy(domain)) {
- irq_domain_free_irqs(virq, 1);
- } else {
- irq_domain_disassociate(domain, virq);
- irq_free_desc(virq);
- }
+ kobject_put(&desc->kobj);
}
EXPORT_SYMBOL_GPL(irq_dispose_mapping);
--
2.17.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [RFC PATCH kernel 1/2] irq: Add reference counting to IRQ mappings
2020-10-27 9:06 ` Alexey Kardashevskiy
(?)
@ 2020-10-27 13:06 ` kernel test robot
-1 siblings, 0 replies; 13+ messages in thread
From: kernel test robot @ 2020-10-27 13:06 UTC (permalink / raw)
To: kbuild-all
[-- Attachment #1: Type: text/plain, Size: 3468 bytes --]
Hi Alexey,
[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on powerpc/next]
[also build test ERROR on tip/irq/core v5.10-rc1 next-20201027]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Alexey-Kardashevskiy/irq-Add-reference-counting-to-IRQ-mappings/20201027-170907
base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: s390-zfcpdump_defconfig (attached as .config)
compiler: s390-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/63684bd94a7e53ec13917e2a23d55ce572bf2878
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Alexey-Kardashevskiy/irq-Add-reference-counting-to-IRQ-mappings/20201027-170907
git checkout 63684bd94a7e53ec13917e2a23d55ce572bf2878
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=s390
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
kernel/irq/irqdesc.c: In function 'irq_kobj_release':
>> kernel/irq/irqdesc.c:431:7: error: implicit declaration of function 'irq_domain_is_hierarchy' [-Werror=implicit-function-declaration]
431 | if (irq_domain_is_hierarchy(domain)) {
| ^~~~~~~~~~~~~~~~~~~~~~~
>> kernel/irq/irqdesc.c:432:4: error: implicit declaration of function 'irq_domain_free_irqs'; did you mean 'irq_domain_activate_irq'? [-Werror=implicit-function-declaration]
432 | irq_domain_free_irqs(virq, 1);
| ^~~~~~~~~~~~~~~~~~~~
| irq_domain_activate_irq
>> kernel/irq/irqdesc.c:434:4: error: implicit declaration of function 'irq_domain_disassociate' [-Werror=implicit-function-declaration]
434 | irq_domain_disassociate(domain, virq);
| ^~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/irq_domain_is_hierarchy +431 kernel/irq/irqdesc.c
421
422 static void delayed_free_desc(struct rcu_head *rhp);
423 static void irq_kobj_release(struct kobject *kobj)
424 {
425 struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
426 struct irq_domain *domain;
427 unsigned int virq = desc->irq_data.irq;
428
429 domain = desc->irq_data.domain;
430 if (domain) {
> 431 if (irq_domain_is_hierarchy(domain)) {
> 432 irq_domain_free_irqs(virq, 1);
433 } else {
> 434 irq_domain_disassociate(domain, virq);
435 irq_free_desc(virq);
436 }
437 }
438
439 /*
440 * We free the descriptor, masks and stat fields via RCU. That
441 * allows demultiplex interrupts to do rcu based management of
442 * the child interrupts.
443 * This also allows us to use rcu in kstat_irqs_usr().
444 */
445 call_rcu(&desc->rcu, delayed_free_desc);
446 }
447
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org
[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 7974 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [RFC PATCH kernel 1/2] irq: Add reference counting to IRQ mappings
2020-10-27 9:06 ` Alexey Kardashevskiy
@ 2020-10-27 16:09 ` Marc Zyngier
-1 siblings, 0 replies; 13+ messages in thread
From: Marc Zyngier @ 2020-10-27 16:09 UTC (permalink / raw)
To: Alexey Kardashevskiy
Cc: Rob Herring, linux-kernel, Qian Cai, Cédric Le Goater,
Frederic Barrat, Oliver O'Halloran, Thomas Gleixner,
Michal Suchánek, linuxppc-dev
Hi Alexey,
On 2020-10-27 09:06, Alexey Kardashevskiy wrote:
> PCI devices share 4 legacy INTx interrupts from the same PCI host
> bridge.
> Device drivers map/unmap hardware interrupts via irq_create_mapping()/
> irq_dispose_mapping(). The problem with that these interrupts are
> shared and when performing hot unplug, we need to unmap the interrupt
> only when the last device is released.
>
> This reuses already existing irq_desc::kobj for this purpose.
> The refcounter is naturally 1 when the descriptor is allocated already;
> this adds kobject_get() in places where already existing mapped virq
> is returned.
That's quite interesting, as I was about to revive a patch series that
rework the irqdomain subsystem to directly cache irq_desc instead of
raw interrupt numbers. And for that, I needed some form of
refcounting...
>
> This reorganizes irq_dispose_mapping() to release the kobj and let
> the release callback do the cleanup.
>
> If some driver or platform does its own reference counting, this
> expects
> those parties to call irq_find_mapping() and call irq_dispose_mapping()
> for every irq_create_fwspec_mapping()/irq_create_mapping().
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> kernel/irq/irqdesc.c | 35 +++++++++++++++++++++++------------
> kernel/irq/irqdomain.c | 27 +++++++++++++--------------
> 2 files changed, 36 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 1a7723604399..dae096238500 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -419,20 +419,39 @@ static struct irq_desc *alloc_desc(int irq, int
> node, unsigned int flags,
> return NULL;
> }
>
> +static void delayed_free_desc(struct rcu_head *rhp);
> static void irq_kobj_release(struct kobject *kobj)
> {
> struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
> + struct irq_domain *domain;
> + unsigned int virq = desc->irq_data.irq;
>
> - free_masks(desc);
> - free_percpu(desc->kstat_irqs);
> - kfree(desc);
> + domain = desc->irq_data.domain;
> + if (domain) {
> + if (irq_domain_is_hierarchy(domain)) {
> + irq_domain_free_irqs(virq, 1);
How does this work with hierarchical domains? Each domain should
contribute as a reference on the irq_desc. But if you got here,
it means the refcount has already dropped to 0.
So either there is nothing to free here, or you don't track the
references implied by the hierarchy. I suspect the latter.
> + } else {
> + irq_domain_disassociate(domain, virq);
> + irq_free_desc(virq);
> + }
> + }
> +
> + /*
> + * We free the descriptor, masks and stat fields via RCU. That
> + * allows demultiplex interrupts to do rcu based management of
> + * the child interrupts.
> + * This also allows us to use rcu in kstat_irqs_usr().
> + */
> + call_rcu(&desc->rcu, delayed_free_desc);
> }
>
> static void delayed_free_desc(struct rcu_head *rhp)
> {
> struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
>
> - kobject_put(&desc->kobj);
> + free_masks(desc);
> + free_percpu(desc->kstat_irqs);
> + kfree(desc);
> }
>
> static void free_desc(unsigned int irq)
> @@ -453,14 +472,6 @@ static void free_desc(unsigned int irq)
> */
> irq_sysfs_del(desc);
> delete_irq_desc(irq);
> -
> - /*
> - * We free the descriptor, masks and stat fields via RCU. That
> - * allows demultiplex interrupts to do rcu based management of
> - * the child interrupts.
> - * This also allows us to use rcu in kstat_irqs_usr().
> - */
> - call_rcu(&desc->rcu, delayed_free_desc);
> }
>
> static int alloc_descs(unsigned int start, unsigned int cnt, int node,
> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
> index cf8b374b892d..02733ddc321f 100644
> --- a/kernel/irq/irqdomain.c
> +++ b/kernel/irq/irqdomain.c
> @@ -638,6 +638,7 @@ unsigned int irq_create_mapping(struct irq_domain
> *domain,
> {
> struct device_node *of_node;
> int virq;
> + struct irq_desc *desc;
>
> pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
>
> @@ -655,7 +656,9 @@ unsigned int irq_create_mapping(struct irq_domain
> *domain,
> /* Check if mapping already exists */
> virq = irq_find_mapping(domain, hwirq);
> if (virq) {
> + desc = irq_to_desc(virq);
> pr_debug("-> existing mapping on virq %d\n", virq);
> + kobject_get(&desc->kobj);
My worry with this is that there is probably a significant amount of
code out there that relies on multiple calls to irq_create_mapping()
with the same parameters not to have any side effects. They would
expect a subsequent irq_dispose_mapping() to drop the translation
altogether, and that's obviously not the case here.
Have you audited the various call sites to see what could break?
> return virq;
> }
>
> @@ -751,6 +754,7 @@ unsigned int irq_create_fwspec_mapping(struct
> irq_fwspec *fwspec)
> irq_hw_number_t hwirq;
> unsigned int type = IRQ_TYPE_NONE;
> int virq;
> + struct irq_desc *desc;
>
> if (fwspec->fwnode) {
> domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
> @@ -787,8 +791,11 @@ unsigned int irq_create_fwspec_mapping(struct
> irq_fwspec *fwspec)
> * current trigger type then we are done so return the
> * interrupt number.
> */
> - if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
> + if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) {
> + desc = irq_to_desc(virq);
> + kobject_get(&desc->kobj);
> return virq;
> + }
>
> /*
> * If the trigger type has not been set yet, then set
> @@ -800,6 +807,8 @@ unsigned int irq_create_fwspec_mapping(struct
> irq_fwspec *fwspec)
> return 0;
>
> irqd_set_trigger_type(irq_data, type);
> + desc = irq_to_desc(virq);
> + kobject_get(&desc->kobj);
> return virq;
> }
>
> @@ -852,22 +861,12 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
> */
> void irq_dispose_mapping(unsigned int virq)
> {
> - struct irq_data *irq_data = irq_get_irq_data(virq);
> - struct irq_domain *domain;
> + struct irq_desc *desc = irq_to_desc(virq);
>
> - if (!virq || !irq_data)
> + if (!virq || !desc)
> return;
>
> - domain = irq_data->domain;
> - if (WARN_ON(domain == NULL))
> - return;
> -
> - if (irq_domain_is_hierarchy(domain)) {
> - irq_domain_free_irqs(virq, 1);
> - } else {
> - irq_domain_disassociate(domain, virq);
> - irq_free_desc(virq);
> - }
> + kobject_put(&desc->kobj);
> }
> EXPORT_SYMBOL_GPL(irq_dispose_mapping);
Thanks,
M.
--
Jazz is not dead. It just smells funny...
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [RFC PATCH kernel 1/2] irq: Add reference counting to IRQ mappings
@ 2020-10-27 16:09 ` Marc Zyngier
0 siblings, 0 replies; 13+ messages in thread
From: Marc Zyngier @ 2020-10-27 16:09 UTC (permalink / raw)
To: Alexey Kardashevskiy
Cc: linuxppc-dev, Cédric Le Goater, Oliver O'Halloran,
Michael Ellerman, Qian Cai, Rob Herring, Thomas Gleixner,
linux-kernel, Frederic Barrat, Michal Suchánek
Hi Alexey,
On 2020-10-27 09:06, Alexey Kardashevskiy wrote:
> PCI devices share 4 legacy INTx interrupts from the same PCI host
> bridge.
> Device drivers map/unmap hardware interrupts via irq_create_mapping()/
> irq_dispose_mapping(). The problem with that these interrupts are
> shared and when performing hot unplug, we need to unmap the interrupt
> only when the last device is released.
>
> This reuses already existing irq_desc::kobj for this purpose.
> The refcounter is naturally 1 when the descriptor is allocated already;
> this adds kobject_get() in places where already existing mapped virq
> is returned.
That's quite interesting, as I was about to revive a patch series that
rework the irqdomain subsystem to directly cache irq_desc instead of
raw interrupt numbers. And for that, I needed some form of
refcounting...
>
> This reorganizes irq_dispose_mapping() to release the kobj and let
> the release callback do the cleanup.
>
> If some driver or platform does its own reference counting, this
> expects
> those parties to call irq_find_mapping() and call irq_dispose_mapping()
> for every irq_create_fwspec_mapping()/irq_create_mapping().
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> kernel/irq/irqdesc.c | 35 +++++++++++++++++++++++------------
> kernel/irq/irqdomain.c | 27 +++++++++++++--------------
> 2 files changed, 36 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 1a7723604399..dae096238500 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -419,20 +419,39 @@ static struct irq_desc *alloc_desc(int irq, int
> node, unsigned int flags,
> return NULL;
> }
>
> +static void delayed_free_desc(struct rcu_head *rhp);
> static void irq_kobj_release(struct kobject *kobj)
> {
> struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
> + struct irq_domain *domain;
> + unsigned int virq = desc->irq_data.irq;
>
> - free_masks(desc);
> - free_percpu(desc->kstat_irqs);
> - kfree(desc);
> + domain = desc->irq_data.domain;
> + if (domain) {
> + if (irq_domain_is_hierarchy(domain)) {
> + irq_domain_free_irqs(virq, 1);
How does this work with hierarchical domains? Each domain should
contribute as a reference on the irq_desc. But if you got here,
it means the refcount has already dropped to 0.
So either there is nothing to free here, or you don't track the
references implied by the hierarchy. I suspect the latter.
> + } else {
> + irq_domain_disassociate(domain, virq);
> + irq_free_desc(virq);
> + }
> + }
> +
> + /*
> + * We free the descriptor, masks and stat fields via RCU. That
> + * allows demultiplex interrupts to do rcu based management of
> + * the child interrupts.
> + * This also allows us to use rcu in kstat_irqs_usr().
> + */
> + call_rcu(&desc->rcu, delayed_free_desc);
> }
>
> static void delayed_free_desc(struct rcu_head *rhp)
> {
> struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
>
> - kobject_put(&desc->kobj);
> + free_masks(desc);
> + free_percpu(desc->kstat_irqs);
> + kfree(desc);
> }
>
> static void free_desc(unsigned int irq)
> @@ -453,14 +472,6 @@ static void free_desc(unsigned int irq)
> */
> irq_sysfs_del(desc);
> delete_irq_desc(irq);
> -
> - /*
> - * We free the descriptor, masks and stat fields via RCU. That
> - * allows demultiplex interrupts to do rcu based management of
> - * the child interrupts.
> - * This also allows us to use rcu in kstat_irqs_usr().
> - */
> - call_rcu(&desc->rcu, delayed_free_desc);
> }
>
> static int alloc_descs(unsigned int start, unsigned int cnt, int node,
> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
> index cf8b374b892d..02733ddc321f 100644
> --- a/kernel/irq/irqdomain.c
> +++ b/kernel/irq/irqdomain.c
> @@ -638,6 +638,7 @@ unsigned int irq_create_mapping(struct irq_domain
> *domain,
> {
> struct device_node *of_node;
> int virq;
> + struct irq_desc *desc;
>
> pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
>
> @@ -655,7 +656,9 @@ unsigned int irq_create_mapping(struct irq_domain
> *domain,
> /* Check if mapping already exists */
> virq = irq_find_mapping(domain, hwirq);
> if (virq) {
> + desc = irq_to_desc(virq);
> pr_debug("-> existing mapping on virq %d\n", virq);
> + kobject_get(&desc->kobj);
My worry with this is that there is probably a significant amount of
code out there that relies on multiple calls to irq_create_mapping()
with the same parameters not to have any side effects. They would
expect a subsequent irq_dispose_mapping() to drop the translation
altogether, and that's obviously not the case here.
Have you audited the various call sites to see what could break?
> return virq;
> }
>
> @@ -751,6 +754,7 @@ unsigned int irq_create_fwspec_mapping(struct
> irq_fwspec *fwspec)
> irq_hw_number_t hwirq;
> unsigned int type = IRQ_TYPE_NONE;
> int virq;
> + struct irq_desc *desc;
>
> if (fwspec->fwnode) {
> domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
> @@ -787,8 +791,11 @@ unsigned int irq_create_fwspec_mapping(struct
> irq_fwspec *fwspec)
> * current trigger type then we are done so return the
> * interrupt number.
> */
> - if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
> + if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) {
> + desc = irq_to_desc(virq);
> + kobject_get(&desc->kobj);
> return virq;
> + }
>
> /*
> * If the trigger type has not been set yet, then set
> @@ -800,6 +807,8 @@ unsigned int irq_create_fwspec_mapping(struct
> irq_fwspec *fwspec)
> return 0;
>
> irqd_set_trigger_type(irq_data, type);
> + desc = irq_to_desc(virq);
> + kobject_get(&desc->kobj);
> return virq;
> }
>
> @@ -852,22 +861,12 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
> */
> void irq_dispose_mapping(unsigned int virq)
> {
> - struct irq_data *irq_data = irq_get_irq_data(virq);
> - struct irq_domain *domain;
> + struct irq_desc *desc = irq_to_desc(virq);
>
> - if (!virq || !irq_data)
> + if (!virq || !desc)
> return;
>
> - domain = irq_data->domain;
> - if (WARN_ON(domain == NULL))
> - return;
> -
> - if (irq_domain_is_hierarchy(domain)) {
> - irq_domain_free_irqs(virq, 1);
> - } else {
> - irq_domain_disassociate(domain, virq);
> - irq_free_desc(virq);
> - }
> + kobject_put(&desc->kobj);
> }
> EXPORT_SYMBOL_GPL(irq_dispose_mapping);
Thanks,
M.
--
Jazz is not dead. It just smells funny...
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [RFC PATCH kernel 1/2] irq: Add reference counting to IRQ mappings
2020-10-27 16:09 ` Marc Zyngier
@ 2020-10-29 5:04 ` Alexey Kardashevskiy
-1 siblings, 0 replies; 13+ messages in thread
From: Alexey Kardashevskiy @ 2020-10-29 5:04 UTC (permalink / raw)
To: Marc Zyngier
Cc: Rob Herring, linux-kernel, Qian Cai, Cédric Le Goater,
Frederic Barrat, Oliver O'Halloran, Thomas Gleixner,
Michal Suchánek, linuxppc-dev
On 28/10/2020 03:09, Marc Zyngier wrote:
> Hi Alexey,
>
> On 2020-10-27 09:06, Alexey Kardashevskiy wrote:
>> PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
>> Device drivers map/unmap hardware interrupts via irq_create_mapping()/
>> irq_dispose_mapping(). The problem with that these interrupts are
>> shared and when performing hot unplug, we need to unmap the interrupt
>> only when the last device is released.
>>
>> This reuses already existing irq_desc::kobj for this purpose.
>> The refcounter is naturally 1 when the descriptor is allocated already;
>> this adds kobject_get() in places where already existing mapped virq
>> is returned.
>
> That's quite interesting, as I was about to revive a patch series that
> rework the irqdomain subsystem to directly cache irq_desc instead of
> raw interrupt numbers. And for that, I needed some form of refcounting...
>
>>
>> This reorganizes irq_dispose_mapping() to release the kobj and let
>> the release callback do the cleanup.
>>
>> If some driver or platform does its own reference counting, this expects
>> those parties to call irq_find_mapping() and call irq_dispose_mapping()
>> for every irq_create_fwspec_mapping()/irq_create_mapping().
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> kernel/irq/irqdesc.c | 35 +++++++++++++++++++++++------------
>> kernel/irq/irqdomain.c | 27 +++++++++++++--------------
>> 2 files changed, 36 insertions(+), 26 deletions(-)
>>
>> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
>> index 1a7723604399..dae096238500 100644
>> --- a/kernel/irq/irqdesc.c
>> +++ b/kernel/irq/irqdesc.c
>> @@ -419,20 +419,39 @@ static struct irq_desc *alloc_desc(int irq, int
>> node, unsigned int flags,
>> return NULL;
>> }
>>
>> +static void delayed_free_desc(struct rcu_head *rhp);
>> static void irq_kobj_release(struct kobject *kobj)
>> {
>> struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
>> + struct irq_domain *domain;
>> + unsigned int virq = desc->irq_data.irq;
>>
>> - free_masks(desc);
>> - free_percpu(desc->kstat_irqs);
>> - kfree(desc);
>> + domain = desc->irq_data.domain;
>> + if (domain) {
>> + if (irq_domain_is_hierarchy(domain)) {
>> + irq_domain_free_irqs(virq, 1);
>
> How does this work with hierarchical domains? Each domain should
> contribute as a reference on the irq_desc. But if you got here,
> it means the refcount has already dropped to 0.
>
> So either there is nothing to free here, or you don't track the
> references implied by the hierarchy. I suspect the latter.
This is correct, I did not look at hierarchy yet, looking now...
>> + } else {
>> + irq_domain_disassociate(domain, virq);
>> + irq_free_desc(virq);
>> + }
>> + }
>> +
>> + /*
>> + * We free the descriptor, masks and stat fields via RCU. That
>> + * allows demultiplex interrupts to do rcu based management of
>> + * the child interrupts.
>> + * This also allows us to use rcu in kstat_irqs_usr().
>> + */
>> + call_rcu(&desc->rcu, delayed_free_desc);
>> }
>>
>> static void delayed_free_desc(struct rcu_head *rhp)
>> {
>> struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
>>
>> - kobject_put(&desc->kobj);
>> + free_masks(desc);
>> + free_percpu(desc->kstat_irqs);
>> + kfree(desc);
>> }
>>
>> static void free_desc(unsigned int irq)
>> @@ -453,14 +472,6 @@ static void free_desc(unsigned int irq)
>> */
>> irq_sysfs_del(desc);
>> delete_irq_desc(irq);
>> -
>> - /*
>> - * We free the descriptor, masks and stat fields via RCU. That
>> - * allows demultiplex interrupts to do rcu based management of
>> - * the child interrupts.
>> - * This also allows us to use rcu in kstat_irqs_usr().
>> - */
>> - call_rcu(&desc->rcu, delayed_free_desc);
>> }
>>
>> static int alloc_descs(unsigned int start, unsigned int cnt, int node,
>> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
>> index cf8b374b892d..02733ddc321f 100644
>> --- a/kernel/irq/irqdomain.c
>> +++ b/kernel/irq/irqdomain.c
>> @@ -638,6 +638,7 @@ unsigned int irq_create_mapping(struct irq_domain
>> *domain,
>> {
>> struct device_node *of_node;
>> int virq;
>> + struct irq_desc *desc;
>>
>> pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
>>
>> @@ -655,7 +656,9 @@ unsigned int irq_create_mapping(struct irq_domain
>> *domain,
>> /* Check if mapping already exists */
>> virq = irq_find_mapping(domain, hwirq);
>> if (virq) {
>> + desc = irq_to_desc(virq);
>> pr_debug("-> existing mapping on virq %d\n", virq);
>> + kobject_get(&desc->kobj);
>
> My worry with this is that there is probably a significant amount of
> code out there that relies on multiple calls to irq_create_mapping()
> with the same parameters not to have any side effects. They would
> expect a subsequent irq_dispose_mapping() to drop the translation
> altogether, and that's obviously not the case here.
>
> Have you audited the various call sites to see what could break?
The vast majority calls one of irq..create_mapping in init/probe and
then calls irq_dispose_mapping() right there if probing failed or when
the driver is unloaded. I could not spot any reference counting
anywhere, everyone seems to call irq_dispose_mapping() per every
irq_of_parse_and_map() (or friends).
Then there is a minority (such as the code I am fixing in
powerpc/pseries) but it is either broken (such as pseries/pci which does
not call irq_dispose_mapping at all) or it is 1 disposal per 1 mapping
(PPC KVM).
I did not spend awful amount of time though, git grep
irq_dispose_mapping gives 200 lines...
>
>> return virq;
>> }
>>
>> @@ -751,6 +754,7 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>> irq_hw_number_t hwirq;
>> unsigned int type = IRQ_TYPE_NONE;
>> int virq;
>> + struct irq_desc *desc;
>>
>> if (fwspec->fwnode) {
>> domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
>> @@ -787,8 +791,11 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>> * current trigger type then we are done so return the
>> * interrupt number.
>> */
>> - if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
>> + if (type == IRQ_TYPE_NONE || type ==
>> irq_get_trigger_type(virq)) {
>> + desc = irq_to_desc(virq);
>> + kobject_get(&desc->kobj);
>> return virq;
>> + }
>>
>> /*
>> * If the trigger type has not been set yet, then set
>> @@ -800,6 +807,8 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>> return 0;
>>
>> irqd_set_trigger_type(irq_data, type);
>> + desc = irq_to_desc(virq);
>> + kobject_get(&desc->kobj);
>> return virq;
>> }
>>
>> @@ -852,22 +861,12 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
>> */
>> void irq_dispose_mapping(unsigned int virq)
>> {
>> - struct irq_data *irq_data = irq_get_irq_data(virq);
>> - struct irq_domain *domain;
>> + struct irq_desc *desc = irq_to_desc(virq);
>>
>> - if (!virq || !irq_data)
>> + if (!virq || !desc)
>> return;
>>
>> - domain = irq_data->domain;
>> - if (WARN_ON(domain == NULL))
>> - return;
>> -
>> - if (irq_domain_is_hierarchy(domain)) {
>> - irq_domain_free_irqs(virq, 1);
>> - } else {
>> - irq_domain_disassociate(domain, virq);
>> - irq_free_desc(virq);
>> - }
>> + kobject_put(&desc->kobj);
>> }
>> EXPORT_SYMBOL_GPL(irq_dispose_mapping);
>
> Thanks,
>
> M.
--
Alexey
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [RFC PATCH kernel 1/2] irq: Add reference counting to IRQ mappings
@ 2020-10-29 5:04 ` Alexey Kardashevskiy
0 siblings, 0 replies; 13+ messages in thread
From: Alexey Kardashevskiy @ 2020-10-29 5:04 UTC (permalink / raw)
To: Marc Zyngier
Cc: linuxppc-dev, Cédric Le Goater, Oliver O'Halloran,
Michael Ellerman, Qian Cai, Rob Herring, Thomas Gleixner,
linux-kernel, Frederic Barrat, Michal Suchánek
On 28/10/2020 03:09, Marc Zyngier wrote:
> Hi Alexey,
>
> On 2020-10-27 09:06, Alexey Kardashevskiy wrote:
>> PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
>> Device drivers map/unmap hardware interrupts via irq_create_mapping()/
>> irq_dispose_mapping(). The problem with that these interrupts are
>> shared and when performing hot unplug, we need to unmap the interrupt
>> only when the last device is released.
>>
>> This reuses already existing irq_desc::kobj for this purpose.
>> The refcounter is naturally 1 when the descriptor is allocated already;
>> this adds kobject_get() in places where already existing mapped virq
>> is returned.
>
> That's quite interesting, as I was about to revive a patch series that
> rework the irqdomain subsystem to directly cache irq_desc instead of
> raw interrupt numbers. And for that, I needed some form of refcounting...
>
>>
>> This reorganizes irq_dispose_mapping() to release the kobj and let
>> the release callback do the cleanup.
>>
>> If some driver or platform does its own reference counting, this expects
>> those parties to call irq_find_mapping() and call irq_dispose_mapping()
>> for every irq_create_fwspec_mapping()/irq_create_mapping().
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> kernel/irq/irqdesc.c | 35 +++++++++++++++++++++++------------
>> kernel/irq/irqdomain.c | 27 +++++++++++++--------------
>> 2 files changed, 36 insertions(+), 26 deletions(-)
>>
>> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
>> index 1a7723604399..dae096238500 100644
>> --- a/kernel/irq/irqdesc.c
>> +++ b/kernel/irq/irqdesc.c
>> @@ -419,20 +419,39 @@ static struct irq_desc *alloc_desc(int irq, int
>> node, unsigned int flags,
>> return NULL;
>> }
>>
>> +static void delayed_free_desc(struct rcu_head *rhp);
>> static void irq_kobj_release(struct kobject *kobj)
>> {
>> struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
>> + struct irq_domain *domain;
>> + unsigned int virq = desc->irq_data.irq;
>>
>> - free_masks(desc);
>> - free_percpu(desc->kstat_irqs);
>> - kfree(desc);
>> + domain = desc->irq_data.domain;
>> + if (domain) {
>> + if (irq_domain_is_hierarchy(domain)) {
>> + irq_domain_free_irqs(virq, 1);
>
> How does this work with hierarchical domains? Each domain should
> contribute as a reference on the irq_desc. But if you got here,
> it means the refcount has already dropped to 0.
>
> So either there is nothing to free here, or you don't track the
> references implied by the hierarchy. I suspect the latter.
This is correct, I did not look at hierarchy yet, looking now...
>> + } else {
>> + irq_domain_disassociate(domain, virq);
>> + irq_free_desc(virq);
>> + }
>> + }
>> +
>> + /*
>> + * We free the descriptor, masks and stat fields via RCU. That
>> + * allows demultiplex interrupts to do rcu based management of
>> + * the child interrupts.
>> + * This also allows us to use rcu in kstat_irqs_usr().
>> + */
>> + call_rcu(&desc->rcu, delayed_free_desc);
>> }
>>
>> static void delayed_free_desc(struct rcu_head *rhp)
>> {
>> struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
>>
>> - kobject_put(&desc->kobj);
>> + free_masks(desc);
>> + free_percpu(desc->kstat_irqs);
>> + kfree(desc);
>> }
>>
>> static void free_desc(unsigned int irq)
>> @@ -453,14 +472,6 @@ static void free_desc(unsigned int irq)
>> */
>> irq_sysfs_del(desc);
>> delete_irq_desc(irq);
>> -
>> - /*
>> - * We free the descriptor, masks and stat fields via RCU. That
>> - * allows demultiplex interrupts to do rcu based management of
>> - * the child interrupts.
>> - * This also allows us to use rcu in kstat_irqs_usr().
>> - */
>> - call_rcu(&desc->rcu, delayed_free_desc);
>> }
>>
>> static int alloc_descs(unsigned int start, unsigned int cnt, int node,
>> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
>> index cf8b374b892d..02733ddc321f 100644
>> --- a/kernel/irq/irqdomain.c
>> +++ b/kernel/irq/irqdomain.c
>> @@ -638,6 +638,7 @@ unsigned int irq_create_mapping(struct irq_domain
>> *domain,
>> {
>> struct device_node *of_node;
>> int virq;
>> + struct irq_desc *desc;
>>
>> pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
>>
>> @@ -655,7 +656,9 @@ unsigned int irq_create_mapping(struct irq_domain
>> *domain,
>> /* Check if mapping already exists */
>> virq = irq_find_mapping(domain, hwirq);
>> if (virq) {
>> + desc = irq_to_desc(virq);
>> pr_debug("-> existing mapping on virq %d\n", virq);
>> + kobject_get(&desc->kobj);
>
> My worry with this is that there is probably a significant amount of
> code out there that relies on multiple calls to irq_create_mapping()
> with the same parameters not to have any side effects. They would
> expect a subsequent irq_dispose_mapping() to drop the translation
> altogether, and that's obviously not the case here.
>
> Have you audited the various call sites to see what could break?
The vast majority calls one of irq..create_mapping in init/probe and
then calls irq_dispose_mapping() right there if probing failed or when
the driver is unloaded. I could not spot any reference counting
anywhere, everyone seems to call irq_dispose_mapping() per every
irq_of_parse_and_map() (or friends).
Then there is a minority (such as the code I am fixing in
powerpc/pseries) but it is either broken (such as pseries/pci which does
not call irq_dispose_mapping at all) or it is 1 disposal per 1 mapping
(PPC KVM).
I did not spend awful amount of time though, git grep
irq_dispose_mapping gives 200 lines...
>
>> return virq;
>> }
>>
>> @@ -751,6 +754,7 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>> irq_hw_number_t hwirq;
>> unsigned int type = IRQ_TYPE_NONE;
>> int virq;
>> + struct irq_desc *desc;
>>
>> if (fwspec->fwnode) {
>> domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
>> @@ -787,8 +791,11 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>> * current trigger type then we are done so return the
>> * interrupt number.
>> */
>> - if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
>> + if (type == IRQ_TYPE_NONE || type ==
>> irq_get_trigger_type(virq)) {
>> + desc = irq_to_desc(virq);
>> + kobject_get(&desc->kobj);
>> return virq;
>> + }
>>
>> /*
>> * If the trigger type has not been set yet, then set
>> @@ -800,6 +807,8 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>> return 0;
>>
>> irqd_set_trigger_type(irq_data, type);
>> + desc = irq_to_desc(virq);
>> + kobject_get(&desc->kobj);
>> return virq;
>> }
>>
>> @@ -852,22 +861,12 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
>> */
>> void irq_dispose_mapping(unsigned int virq)
>> {
>> - struct irq_data *irq_data = irq_get_irq_data(virq);
>> - struct irq_domain *domain;
>> + struct irq_desc *desc = irq_to_desc(virq);
>>
>> - if (!virq || !irq_data)
>> + if (!virq || !desc)
>> return;
>>
>> - domain = irq_data->domain;
>> - if (WARN_ON(domain == NULL))
>> - return;
>> -
>> - if (irq_domain_is_hierarchy(domain)) {
>> - irq_domain_free_irqs(virq, 1);
>> - } else {
>> - irq_domain_disassociate(domain, virq);
>> - irq_free_desc(virq);
>> - }
>> + kobject_put(&desc->kobj);
>> }
>> EXPORT_SYMBOL_GPL(irq_dispose_mapping);
>
> Thanks,
>
> M.
--
Alexey
^ permalink raw reply [flat|nested] 13+ messages in thread