LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH 3/3] powerpc/mm/hugetlb: Don't enable HugeTLB if we don't have a page table cache
From: Aneesh Kumar K.V @ 2019-05-14 14:50 UTC (permalink / raw)
  To: npiggin, paulus, mpe; +Cc: Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <20190514145041.7836-1-aneesh.kumar@linux.ibm.com>

This makes sure we don't enable HugeTLB if the cache is not configured.
I am still not sure about this. IMHO hugetlb support should be a hardware
support derivative and any cache allocation failure should be handled as I did
in the earlier patch. But then if we were not able to create hugetlb page table
cache, we can as well declare hugetlb support disabled thereby avoiding calling
into allocation routines.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/hugetlbpage.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index ee16a3fb788a..4bf8bc659cc7 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -602,6 +602,7 @@ __setup("hugepagesz=", hugepage_setup_sz);
 static int __init hugetlbpage_init(void)
 {
 	int psize;
+	bool configured = false;
 
 	if (hugetlb_disabled) {
 		pr_info("HugeTLB support is disabled!\n");
@@ -651,10 +652,16 @@ static int __init hugetlbpage_init(void)
 			pgtable_cache_add(pdshift - shift);
 		else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx))
 			pgtable_cache_add(PTE_T_ORDER);
+
+		if (!configured)
+			configured = true;
 	}
 
-	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
-		hugetlbpage_init_default();
+	if (configured) {
+		if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
+			hugetlbpage_init_default();
+	} else
+		pr_info("Disabling HugeTLB");
 
 	return 0;
 }
-- 
2.21.0


^ permalink raw reply related

* [RFC PATCH 2/3] powerpc/mm/hugetlb: Fix kernel crash if we fail to allocate page table caches
From: Aneesh Kumar K.V @ 2019-05-14 14:50 UTC (permalink / raw)
  To: npiggin, paulus, mpe; +Cc: Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <20190514145041.7836-1-aneesh.kumar@linux.ibm.com>

We only check for hugetlb allocations, because with hugetlb we do conditional
registration. For PGD/PUD/PMD levels we register them always in
pgtable_cache_init.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/hugetlbpage.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index ae9d71da5219..ee16a3fb788a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -61,12 +61,17 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 		num_hugepd = 1;
 	}
 
+	if (!cachep) {
+		WARN_ONCE(1, "No page table cache created for hugetlb tables");
+		return -ENOMEM;
+	}
+
 	new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
 
 	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
 	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
 
-	if (! new)
+	if (!new)
 		return -ENOMEM;
 
 	/*
-- 
2.21.0


^ permalink raw reply related

* [RFC PATCH 1/3] powerpc/mm: Handle page table allocation failures
From: Aneesh Kumar K.V @ 2019-05-14 14:50 UTC (permalink / raw)
  To: npiggin, paulus, mpe; +Cc: Aneesh Kumar K.V, linuxppc-dev

This fixes the below crash that arises due to not handling page table allocation
failures while allocating hugetlb page table.

Fixes: e2b3d202d1db ("powerpc: Switch 16GB and 16MB explicit hugepages to a different page table format")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/hugetlbpage.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index c5c9ff2d7afc..ae9d71da5219 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -130,6 +130,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	} else {
 		pdshift = PUD_SHIFT;
 		pu = pud_alloc(mm, pg, addr);
+		if (!pu)
+			return NULL;
 		if (pshift == PUD_SHIFT)
 			return (pte_t *)pu;
 		else if (pshift > PMD_SHIFT) {
@@ -138,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 		} else {
 			pdshift = PMD_SHIFT;
 			pm = pmd_alloc(mm, pu, addr);
+			if (!pm)
+				return NULL;
 			if (pshift == PMD_SHIFT)
 				/* 16MB hugepage */
 				return (pte_t *)pm;
@@ -154,12 +158,16 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	} else {
 		pdshift = PUD_SHIFT;
 		pu = pud_alloc(mm, pg, addr);
+		if (!pu)
+			return NULL;
 		if (pshift >= PUD_SHIFT) {
 			ptl = pud_lockptr(mm, pu);
 			hpdp = (hugepd_t *)pu;
 		} else {
 			pdshift = PMD_SHIFT;
 			pm = pmd_alloc(mm, pu, addr);
+			if (!pm)
+				return NULL;
 			ptl = pmd_lockptr(mm, pm);
 			hpdp = (hugepd_t *)pm;
 		}
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v5 5/8] powerpc/pci/IOV: Add support for runtime enabling the VFs
From: Sergey Miroshnichenko @ 2019-05-14 14:44 UTC (permalink / raw)
  To: Oliver O'Halloran
  Cc: Stewart Smith, Alexey Kardashevskiy, linux-pci, linux,
	linuxppc-dev
In-Reply-To: <e27c1caebdbc4dc71fb8d132db24f04fa65a7a69.camel@gmail.com>

On 4/30/19 9:00 AM, Oliver O'Halloran wrote:
> On Mon, 2019-03-11 at 14:52 +0300, Sergey Miroshnichenko wrote:
> 
>> When called within pcibios_sriov_enable(), the pci_sriov_get_totalvfs(pdev)
>> returns zero, because the device is yet preparing to enable the VFs.
> 
> I don't think this is correct. The earliest pcibios_sriov_enable() can
> be called is during a driver probe function. The totalvfs field is
> initialised by pci_iov_init() which is called before the device has
> been added to the bus. If it's returning zero then maybe the driver
> limited the number of VFs to zero?
> 
> That said, you need to reset numvfs to zero before changing the value. 
> So limiting the number of pci_dns that are created to the number
> actually required rather than totalvfs doesn't hurt.
> 
>> With this patch it becomes possible to enable VFs via sysfs "sriov_numvfs"
>> on PowerNV.
> 
> I tested on a few of our lab systems with random kernel versions
> spanning from 4.15 to 5.0 and sriov_numvfs seemed to work fine on all
> of them. Is there a specific configuration you're testing that needed
> this change?
> 

Thanks a lot for the review and testing!

I've just received back the hardware (Mellanox ConnectX-4 -
drivers/net/ethernet/mellanox/mlx5), and got surprised: the issue with the
pci_sriov_get_totalvfs(pdev) returning zero can't be reproduced anymore :/ I've rechecked
the code and don't know how could this even happen. I'm sorry about that; if it will
happen again, I have to investigate deeper.

The PCI subsystem doesn't let the number of VFs to be changed from non-zero value to
another non-zero value: it needs to sriov_disable() first. I guess we can rely on that and
don't reset the numvfs to zero explicitly.

I'll change the patch description and resend it in v6 with other fixes of this patchset.

Best regards,
Serge

>> Signed-off-by: Sergey Miroshnichenko <s.miroshnichenko@yadro.com>
>> ---
>>  arch/powerpc/include/asm/pci-bridge.h     |  4 +--
>>  arch/powerpc/kernel/pci_dn.c              | 32 ++++++++++++++---------
>>  arch/powerpc/platforms/powernv/pci-ioda.c |  4 +--
>>  arch/powerpc/platforms/pseries/pci.c      |  4 +--
>>  4 files changed, 25 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>> index fc188e0e9179..6479bc96e0b6 100644
>> --- a/arch/powerpc/include/asm/pci-bridge.h
>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>> @@ -225,8 +225,8 @@ struct pci_dn {
>>  extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
>>  					   int devfn);
>>  extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
>> -extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev);
>> -extern void remove_dev_pci_data(struct pci_dev *pdev);
>> +extern struct pci_dn *pci_create_vf_pdns(struct pci_dev *pdev, int num_vfs);
>> +extern void pci_destroy_vf_pdns(struct pci_dev *pdev);
>>  extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
>>  					       struct device_node *dn);
>>  extern void pci_remove_device_node_info(struct device_node *dn);
>> diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
>> index 7f12882d8882..7fa362f8038d 100644
>> --- a/arch/powerpc/kernel/pci_dn.c
>> +++ b/arch/powerpc/kernel/pci_dn.c
>> @@ -222,18 +222,19 @@ static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev,
>>  	return pdn;
>>  }
>>  
>> -struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
>> +struct pci_dn *pci_create_vf_pdns(struct pci_dev *pdev, int num_vfs)
>>  {
>> +	struct pci_dn *pdn = pci_get_pdn(pdev);
>> +
>>  #ifdef CONFIG_PCI_IOV
>> -	struct pci_dn *parent, *pdn;
>> +	struct pci_dn *parent;
>>  	int i;
>>  
>>  	/* Only support IOV for now */
>>  	if (!pdev->is_physfn)
>> -		return pci_get_pdn(pdev);
>> +		return pdn;
>>  
>>  	/* Check if VFs have been populated */
>> -	pdn = pci_get_pdn(pdev);
>>  	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
>>  		return NULL;
>>  
>> @@ -242,33 +243,38 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
>>  	if (!parent)
>>  		return NULL;
>>  
>> -	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
>> +	for (i = 0; i < num_vfs; i++) {
>>  		struct eeh_dev *edev __maybe_unused;
>> +		struct pci_dn *vpdn;
>>  
>> -		pdn = pci_alloc_pdn(parent,
>> -				    pci_iov_virtfn_bus(pdev, i),
>> -				    pci_iov_virtfn_devfn(pdev, i));
>> -		if (!pdn) {
>> +		vpdn = pci_alloc_pdn(parent,
>> +				     pci_iov_virtfn_bus(pdev, i),
>> +				     pci_iov_virtfn_devfn(pdev, i));
>> +		if (!vpdn) {
>>  			dev_warn(&pdev->dev, "%s: Cannot create firmware data for VF#%d\n",
>>  				 __func__, i);
>>  			return NULL;
>>  		}
>>  
>> -		pdn->vf_index = i;
>> +		vpdn->vf_index = i;
>> +		vpdn->vendor_id = pdn->vendor_id;
>> +		vpdn->device_id = pdn->device_id;
>> +		vpdn->class_code = pdn->class_code;
>> +		vpdn->pci_ext_config_space = 0;
>>  
>>  #ifdef CONFIG_EEH
>>  		/* Create the EEH device for the VF */
>> -		edev = eeh_dev_init(pdn);
>> +		edev = eeh_dev_init(vpdn);
>>  		BUG_ON(!edev);
>>  		edev->physfn = pdev;
>>  #endif /* CONFIG_EEH */
>>  	}
>>  #endif /* CONFIG_PCI_IOV */
>>  
>> -	return pci_get_pdn(pdev);
>> +	return pdn;
>>  }
>>  
>> -void remove_dev_pci_data(struct pci_dev *pdev)
>> +void pci_destroy_vf_pdns(struct pci_dev *pdev)
>>  {
>>  #ifdef CONFIG_PCI_IOV
>>  	struct pci_dn *parent;
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index ed500f51d449..979c901535f2 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1720,14 +1720,14 @@ int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
>>  	pnv_pci_sriov_disable(pdev);
>>  
>>  	/* Release PCI data */
>> -	remove_dev_pci_data(pdev);
>> +	pci_destroy_vf_pdns(pdev);
>>  	return 0;
>>  }
>>  
>>  int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>  {
>>  	/* Allocate PCI data */
>> -	add_dev_pci_data(pdev);
>> +	pci_create_vf_pdns(pdev, num_vfs);
>>  
>>  	return pnv_pci_sriov_enable(pdev, num_vfs);
>>  }
>> diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
>> index 37a77e57893e..5e87596903a6 100644
>> --- a/arch/powerpc/platforms/pseries/pci.c
>> +++ b/arch/powerpc/platforms/pseries/pci.c
>> @@ -205,7 +205,7 @@ int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>  int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>  {
>>  	/* Allocate PCI data */
>> -	add_dev_pci_data(pdev);
>> +	pci_create_vf_pdns(pdev, num_vfs);
>>  	return pseries_pci_sriov_enable(pdev, num_vfs);
>>  }
>>  
>> @@ -217,7 +217,7 @@ int pseries_pcibios_sriov_disable(struct pci_dev *pdev)
>>  	/* Releasing pe_num_map */
>>  	kfree(pdn->pe_num_map);
>>  	/* Release PCI data */
>> -	remove_dev_pci_data(pdev);
>> +	pci_destroy_vf_pdns(pdev);
>>  	pci_vf_drivers_autoprobe(pdev, true);
>>  	return 0;
>>  }
> 

^ permalink raw reply

* [RFC PATCH v3 10/21] watchdog/hardlockup: Add function to enable NMI watchdog on all allowed CPUs at once
From: Ricardo Neri @ 2019-05-14 14:02 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Borislav Petkov
  Cc: Rafael J. Wysocki, Peter Zijlstra, Alexei Starovoitov,
	Stephane Eranian, Kai-Heng Feng, Paul Mackerras, H. Peter Anvin,
	sparclinux, Davidlohr Bueso, Ashok Raj, Joerg Roedel, x86,
	Luis R. Rodriguez, David Rientjes, Andi Kleen, Waiman Long,
	Paul E. McKenney, Don Zickus, Ravi V. Shankar,
	Konrad Rzeszutek Wilk, Marc Zyngier, Ricardo Neri,
	Frederic Weisbecker, Nicholas Piggin, Ricardo Neri,
	Byungchul Park, Babu Moger, Mathieu Desnoyers, Josh Poimboeuf,
	Tony Luck, Randy Dunlap, linux-kernel, iommu, Masami Hiramatsu,
	Suravee Suthikulpanit, Philippe Ombredanne, Colin Ian King,
	Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <1557842534-4266-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

When there are more than one implementation of the NMI watchdog, there may
be situations in which switching from one to another is needed (e.g., if
the time-stamp counter becomes unstable, the HPET-based NMI watchdog can
no longer be used.

The perf-based implementation of the hardlockup detector makes use of
various per-CPU variables which are accessed via this_cpu operations.
Hence, each CPU needs to enable its own NMI watchdog if using the perf
implementation.

Add functionality to switch from one NMI watchdog to another and do it
from each allowed CPU.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <Babu.Moger@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 include/linux/nmi.h |  2 ++
 kernel/watchdog.c   | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e5f1a86e20b7..6d828334348b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -83,9 +83,11 @@ static inline void reset_hung_task_detector(void) { }
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
+extern void hardlockup_start_all(void);
 extern unsigned int hardlockup_panic;
 #else
 static inline void hardlockup_detector_disable(void) {}
+static inline void hardlockup_start_all(void) {}
 #endif
 
 #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9e7b9306fe..be589001200a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -566,6 +566,21 @@ int lockup_detector_offline_cpu(unsigned int cpu)
 	return 0;
 }
 
+static int hardlockup_start_fn(void *data)
+{
+	watchdog_nmi_enable(smp_processor_id());
+	return 0;
+}
+
+void hardlockup_start_all(void)
+{
+	int cpu;
+
+	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+	for_each_cpu(cpu, &watchdog_allowed_mask)
+		smp_call_on_cpu(cpu, hardlockup_start_fn, NULL, false);
+}
+
 static void lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH v3 08/21] watchdog/hardlockup: Decouple the hardlockup detector from perf
From: Ricardo Neri @ 2019-05-14 14:02 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Borislav Petkov
  Cc: Rafael J. Wysocki, Peter Zijlstra, Alexei Starovoitov,
	Stephane Eranian, Kai-Heng Feng, Paul Mackerras, H. Peter Anvin,
	sparclinux, Davidlohr Bueso, Ashok Raj, Joerg Roedel, x86,
	Luis R. Rodriguez, David Rientjes, Andi Kleen, Waiman Long,
	Paul E. McKenney, Don Zickus, Ravi V. Shankar,
	Konrad Rzeszutek Wilk, Marc Zyngier, Ricardo Neri,
	Frederic Weisbecker, Nicholas Piggin, Ricardo Neri,
	Byungchul Park, Babu Moger, Mathieu Desnoyers, Josh Poimboeuf,
	Tony Luck, Randy Dunlap, linux-kernel, iommu, Masami Hiramatsu,
	Suravee Suthikulpanit, Philippe Ombredanne, Colin Ian King,
	Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <1557842534-4266-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

The current default implementation of the hardlockup detector assumes that
it is implemented using perf events. However, the hardlockup detector can
be driven by other sources of non-maskable interrupts (e.g., a properly
configured timer).

Group and wrap in #ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF all the code
specific to perf: create and manage perf events, stop and start the perf-
based detector.

The generic portion of the detector (monitor the timers' thresholds, check
timestamps and detect hardlockups as well as the implementation of
arch_touch_nmi_watchdog()) is now selected with the new intermediate config
symbol CONFIG_HARDLOCKUP_DETECTOR_CORE.

The perf-based implementation of the detector selects the new intermediate
symbol. Other implementations should do the same.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <Babu.Moger@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 include/linux/nmi.h   |  5 ++++-
 kernel/Makefile       |  2 +-
 kernel/watchdog_hld.c | 32 ++++++++++++++++++++------------
 lib/Kconfig.debug     |  4 ++++
 4 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5a8b19749769..e5f1a86e20b7 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -94,8 +94,11 @@ static inline void hardlockup_detector_disable(void) {}
 # define NMI_WATCHDOG_SYSCTL_PERM	0444
 #endif
 
-#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_CORE)
 extern void arch_touch_nmi_watchdog(void);
+#endif
+
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_detector_perf_disable(void);
diff --git a/kernel/Makefile b/kernel/Makefile
index 62471e75a2b0..e9bdbaa1ed50 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_CORE) += watchdog_hld.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index b352e507b17f..bb6435978c46 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -22,12 +22,8 @@
 
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-static DEFINE_PER_CPU(struct perf_event *, dead_event);
-static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
-static atomic_t watchdog_cpus = ATOMIC_INIT(0);
 
 notrace void arch_touch_nmi_watchdog(void)
 {
@@ -98,14 +94,6 @@ static inline bool watchdog_check_timestamp(void)
 }
 #endif
 
-static struct perf_event_attr wd_hw_attr = {
-	.type		= PERF_TYPE_HARDWARE,
-	.config		= PERF_COUNT_HW_CPU_CYCLES,
-	.size		= sizeof(struct perf_event_attr),
-	.pinned		= 1,
-	.disabled	= 1,
-};
-
 void inspect_for_hardlockups(struct pt_regs *regs)
 {
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
@@ -157,6 +145,24 @@ void inspect_for_hardlockups(struct pt_regs *regs)
 	return;
 }
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+#undef pr_fmt
+#define pr_fmt(fmt) "NMI perf watchdog: " fmt
+
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
+static struct cpumask dead_events_mask;
+
+static atomic_t watchdog_cpus = ATOMIC_INIT(0);
+
+static struct perf_event_attr wd_hw_attr = {
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
+};
+
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
 				       struct perf_sample_data *data,
@@ -298,3 +304,5 @@ int __init hardlockup_detector_perf_init(void)
 	}
 	return ret;
 }
+
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_PERF */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d5a4a4036d2f..c7eaa4091743 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -848,9 +848,13 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
 	default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
 	default 1 if BOOTPARAM_SOFTLOCKUP_PANIC
 
+config HARDLOCKUP_DETECTOR_CORE
+	bool
+
 config HARDLOCKUP_DETECTOR_PERF
 	bool
 	select SOFTLOCKUP_DETECTOR
+	select HARDLOCUP_DETECTOR_CORE
 
 #
 # Enables a timestamp based low pass filter to compensate for perf based
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH v3 07/21] watchdog/hardlockup: Define a generic function to detect hardlockups
From: Ricardo Neri @ 2019-05-14 14:02 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Borislav Petkov
  Cc: Peter Zijlstra, Ricardo Neri, Stephane Eranian, Paul Mackerras,
	H. Peter Anvin, sparclinux, Ashok Raj, Joerg Roedel, x86,
	Luis R. Rodriguez, Andi Kleen, Don Zickus, Ravi V. Shankar,
	Ricardo Neri, Frederic Weisbecker, Nicholas Piggin, Babu Moger,
	Mathieu Desnoyers, Tony Luck, linux-kernel, iommu,
	Masami Hiramatsu, Suravee Suthikulpanit, Philippe Ombredanne,
	Colin Ian King, Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <1557842534-4266-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

The procedure to detect hardlockups is independent of the underlying
mechanism that generates the non-maskable interrupt used to drive the
detector. Thus, it can be put in a separate, generic function. In this
manner, it can be invoked by various implementations of the NMI watchdog.

For this purpose, move the bulk of watchdog_overflow_callback() to the
new function inspect_for_hardlockups(). This function can then be called
from the applicable NMI handlers.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Babu Moger <Babu.Moger@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 include/linux/nmi.h   |  1 +
 kernel/watchdog_hld.c | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 9003e29cde46..5a8b19749769 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -212,6 +212,7 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
+void inspect_for_hardlockups(struct pt_regs *regs);
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 #include <asm/nmi.h>
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..b352e507b17f 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,14 +106,8 @@ static struct perf_event_attr wd_hw_attr = {
 	.disabled	= 1,
 };
 
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
-				       struct perf_sample_data *data,
-				       struct pt_regs *regs)
+void inspect_for_hardlockups(struct pt_regs *regs)
 {
-	/* Ensure the watchdog never gets throttled */
-	event->hw.interrupts = 0;
-
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
 		__this_cpu_write(watchdog_nmi_touch, false);
 		return;
@@ -163,6 +157,16 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	return;
 }
 
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+				       struct perf_sample_data *data,
+				       struct pt_regs *regs)
+{
+	/* Ensure the watchdog never gets throttled */
+	event->hw.interrupts = 0;
+	inspect_for_hardlockups(regs);
+}
+
 static int hardlockup_detector_event_create(void)
 {
 	unsigned int cpu = smp_processor_id();
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH] powerpc/mm: Fix crashes with hugepages & 4K pages
From: Christophe Leroy @ 2019-05-14 13:57 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: aneesh.kumar, sachinp
In-Reply-To: <20190514134321.25575-1-mpe@ellerman.id.au>



Le 14/05/2019 à 15:43, Michael Ellerman a écrit :
> The recent commit to cleanup ifdefs in the hugepage initialisation led
> to crashes when using 4K pages as reported by Sachin:
> 
>    BUG: Kernel NULL pointer dereference at 0x0000001c
>    Faulting instruction address: 0xc000000001d1e58c
>    Oops: Kernel access of bad area, sig: 11 [#1]
>    LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>    ...
>    CPU: 3 PID: 4635 Comm: futex_wake04 Tainted: G        W  O      5.1.0-next-20190507-autotest #1
>    NIP:  c000000001d1e58c LR: c000000001d1e54c CTR: 0000000000000000
>    REGS: c000000004937890 TRAP: 0300
>    MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22424822  XER: 00000000
>    CFAR: c00000000183e9e0 DAR: 000000000000001c DSISR: 40000000 IRQMASK: 0
>    ...
>    NIP kmem_cache_alloc+0xbc/0x5a0
>    LR  kmem_cache_alloc+0x7c/0x5a0
>    Call Trace:
>      huge_pte_alloc+0x580/0x950
>      hugetlb_fault+0x9a0/0x1250
>      handle_mm_fault+0x490/0x4a0
>      __do_page_fault+0x77c/0x1f00
>      do_page_fault+0x28/0x50
>      handle_page_fault+0x18/0x38
> 
> This is caused by us trying to allocate from a NULL kmem cache in
> __hugepte_alloc(). The kmem cache is NULL because it was never
> allocated in hugetlbpage_init(), because add_huge_page_size() returned
> an error.
> 
> The reason add_huge_page_size() returned an error is a simple typo, we
> are calling check_and_get_huge_psize(size) when we should be passing
> shift instead.
> 
> The fact that we're able to trigger this path when the kmem caches are
> NULL is a separate bug, ie. we should not advertise any hugepage sizes
> if we haven't setup the required caches for them.
> 
> This was only seen with 4K pages, with 64K pages we don't need to
> allocate any extra kmem caches because the 16M hugepage just occupies
> a single entry at the PMD level.
> 
> Fixes: 723f268f19da ("powerpc/mm: cleanup ifdef mess in add_huge_page_size()")
> Reported-by: Sachin Sant <sachinp@linux.ibm.com>
> Tested-by: Sachin Sant <sachinp@linux.ibm.com>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Reviewed-by: Christophe Leroy <christophe.leroy@c-s.fr>

> ---
>   arch/powerpc/mm/hugetlbpage.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index c5c9ff2d7afc..b5d92dc32844 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -556,7 +556,7 @@ static int __init add_huge_page_size(unsigned long long size)
>   	if (size <= PAGE_SIZE || !is_power_of_2(size))
>   		return -EINVAL;
>   
> -	mmu_psize = check_and_get_huge_psize(size);
> +	mmu_psize = check_and_get_huge_psize(shift);
>   	if (mmu_psize < 0)
>   		return -EINVAL;
>   
> 

^ permalink raw reply

* [PATCH] powerpc/mm: Fix crashes with hugepages & 4K pages
From: Michael Ellerman @ 2019-05-14 13:43 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: aneesh.kumar, sachinp

The recent commit to cleanup ifdefs in the hugepage initialisation led
to crashes when using 4K pages as reported by Sachin:

  BUG: Kernel NULL pointer dereference at 0x0000001c
  Faulting instruction address: 0xc000000001d1e58c
  Oops: Kernel access of bad area, sig: 11 [#1]
  LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
  ...
  CPU: 3 PID: 4635 Comm: futex_wake04 Tainted: G        W  O      5.1.0-next-20190507-autotest #1
  NIP:  c000000001d1e58c LR: c000000001d1e54c CTR: 0000000000000000
  REGS: c000000004937890 TRAP: 0300
  MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22424822  XER: 00000000
  CFAR: c00000000183e9e0 DAR: 000000000000001c DSISR: 40000000 IRQMASK: 0
  ...
  NIP kmem_cache_alloc+0xbc/0x5a0
  LR  kmem_cache_alloc+0x7c/0x5a0
  Call Trace:
    huge_pte_alloc+0x580/0x950
    hugetlb_fault+0x9a0/0x1250
    handle_mm_fault+0x490/0x4a0
    __do_page_fault+0x77c/0x1f00
    do_page_fault+0x28/0x50
    handle_page_fault+0x18/0x38

This is caused by us trying to allocate from a NULL kmem cache in
__hugepte_alloc(). The kmem cache is NULL because it was never
allocated in hugetlbpage_init(), because add_huge_page_size() returned
an error.

The reason add_huge_page_size() returned an error is a simple typo, we
are calling check_and_get_huge_psize(size) when we should be passing
shift instead.

The fact that we're able to trigger this path when the kmem caches are
NULL is a separate bug, ie. we should not advertise any hugepage sizes
if we haven't setup the required caches for them.

This was only seen with 4K pages, with 64K pages we don't need to
allocate any extra kmem caches because the 16M hugepage just occupies
a single entry at the PMD level.

Fixes: 723f268f19da ("powerpc/mm: cleanup ifdef mess in add_huge_page_size()")
Reported-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hugetlbpage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index c5c9ff2d7afc..b5d92dc32844 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -556,7 +556,7 @@ static int __init add_huge_page_size(unsigned long long size)
 	if (size <= PAGE_SIZE || !is_power_of_2(size))
 		return -EINVAL;

-	mmu_psize = check_and_get_huge_psize(size);
+	mmu_psize = check_and_get_huge_psize(shift);
 	if (mmu_psize < 0)
 		return -EINVAL;

-- 
2.20.1

^ permalink raw reply related

* [Bug 203161] ppc64 started failing to build: error: 'MAX_PHYSMEM_BITS' undeclared
From: bugzilla-daemon @ 2019-05-14 13:27 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-203161-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=203161

Michael Ellerman (michael@ellerman.id.au) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|RESOLVED                    |CLOSED
                 CC|                            |michael@ellerman.id.au

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: Kernel OOPS followed by a panic on next20190507 with 4K page size
From: Christophe Leroy @ 2019-05-14 13:08 UTC (permalink / raw)
  To: Michael Ellerman, Sachin Sant, Aneesh Kumar K.V; +Cc: linux-next, linuxppc-dev
In-Reply-To: <87pnolrxri.fsf@concordia.ellerman.id.au>



Le 14/05/2019 à 15:06, Michael Ellerman a écrit :
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>> Le 14/05/2019 à 10:57, Sachin Sant a écrit :
>>>> On 14-May-2019, at 7:00 AM, Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> wrote:
>>>> On 5/8/19 4:30 PM, Sachin Sant wrote:
>>>>> While running LTP tests (specifically futex_wake04) against next-20199597
>>>>> build with 4K page size on a POWER8 LPAR following crash is observed.
>>>>> [ 4233.214876] BUG: Kernel NULL pointer dereference at 0x0000001c
>>>>> [ 4233.214898] Faulting instruction address: 0xc000000001d1e58c
>>>>> [ 4233.214905] Oops: Kernel access of bad area, sig: 11 [#1]
>>>>> [ 4233.214911] LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>>>>> [ 4233.214920] Dumping ftrace buffer:
>>>>> [ 4233.214928]    (ftrace buffer empty)
>>>>> [ 4233.214933] Modules linked in: overlay rpadlpar_io rpaphp iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc kvm iptable_filter pseries_rng rng_core vmx_crypto ip_tables x_tables autofs4 [last unloaded: dummy_del_mod]
>>>>> [ 4233.214973] CPU: 3 PID: 4635 Comm: futex_wake04 Tainted: G        W  O      5.1.0-next-20190507-autotest #1
>>>>> [ 4233.214980] NIP:  c000000001d1e58c LR: c000000001d1e54c CTR: 0000000000000000
>>>>> [ 4233.214987] REGS: c000000004937890 TRAP: 0300   Tainted: G        W  O       (5.1.0-next-20190507-autotest)
>>>>> [ 4233.214993] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22424822  XER: 00000000
>>>>> [ 4233.215005] CFAR: c00000000183e9e0 DAR: 000000000000001c DSISR: 40000000 IRQMASK: 0
>>>>> [ 4233.215005] GPR00: c000000001901a80 c000000004937b20 c000000003938700 0000000000000000
>>>>> [ 4233.215005] GPR04: 0000000000400cc0 000000000003efff 000000027966e000 c000000003ba8700
>>>>> [ 4233.215005] GPR08: c000000003ba8700 000000000d601125 c000000003ba8700 0000000080000000
>>>>> [ 4233.215005] GPR12: 0000000022424822 c00000001ecae280 0000000000000000 0000000000000000
>>>>> [ 4233.215005] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>>>> [ 4233.215005] GPR20: 0000000000000018 c0000000039e2d30 c0000000039e2d28 c0000002762da460
>>>>> [ 4233.215005] GPR24: 000000000000001c 0000000000000000 0000000000000001 c000000001901a80
>>>>> [ 4233.215005] GPR28: 0000000000400cc0 0000000000000000 0000000000000000 0000000000400cc0
>>>>> [ 4233.215065] NIP [c000000001d1e58c] kmem_cache_alloc+0xbc/0x5a0
>>>>> [ 4233.215071] LR [c000000001d1e54c] kmem_cache_alloc+0x7c/0x5a0
>>>>> [ 4233.215075] Call Trace:
>>>>> [ 4233.215081] [c000000004937b20] [c000000001c91150] __pud_alloc+0x160/0x200 (unreliable)
>>>>> [ 4233.215090] [c000000004937b80] [c000000001901a80] huge_pte_alloc+0x580/0x950
>>>>> [ 4233.215098] [c000000004937c00] [c000000001cf7910] hugetlb_fault+0x9a0/0x1250
>>>>> [ 4233.215106] [c000000004937ce0] [c000000001c94a80] handle_mm_fault+0x490/0x4a0
>>>>> [ 4233.215114] [c000000004937d20] [c0000000018d529c] __do_page_fault+0x77c/0x1f00
>>>>> [ 4233.215121] [c000000004937e00] [c0000000018d6a48] do_page_fault+0x28/0x50
>>>>> [ 4233.215129] [c000000004937e20] [c00000000183b0d4] handle_page_fault+0x18/0x38
>>>>> [ 4233.215135] Instruction dump:
>>>>> [ 4233.215139] 39290001 f92ac1b0 419e009c 3ce20027 3ba00000 e927c1f0 39290001 f927c1f0
>>>>> [ 4233.215149] 3d420027 e92ac290 39290001 f92ac290 <8359001c> 83390018 60000000 3ce20027
>>>>
>>>> I did send a patch to the list to handle page allocation failures in this patch. But i guess what we are finding here is get_current() crashing. Any chance to bisect this?
>>>>
>>>
>>> Following commit seems to have introduced this problem.
>>>
>>> 723f268f19 - powerpc/mm: cleanup ifdef mess in add_huge_page_size()
>>>
>>> Reverting this patch allows the test case to execute properly without a crash.
>>
>> Oops ...
>>
>> Can you check by replacing
>>
>> mmu_psize = check_and_get_huge_psize(size);
>>
>> by
>>
>> mmu_psize = check_and_get_huge_psize(shift);
>>
>> in add_huge_page_size()
> 
> Yeah that's it :)
> 
> I'm writing a commit, unless you have already?
> 

No I haven't.

Christophe

^ permalink raw reply

* Re: Kernel OOPS followed by a panic on next20190507 with 4K page size
From: Michael Ellerman @ 2019-05-14 13:06 UTC (permalink / raw)
  To: Christophe Leroy, Sachin Sant, Aneesh Kumar K.V; +Cc: linux-next, linuxppc-dev
In-Reply-To: <fb4c0e92-ef29-c26e-9e24-602203edd45a@c-s.fr>

Christophe Leroy <christophe.leroy@c-s.fr> writes:
> Le 14/05/2019 à 10:57, Sachin Sant a écrit :
>>> On 14-May-2019, at 7:00 AM, Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> wrote:
>>> On 5/8/19 4:30 PM, Sachin Sant wrote:
>>>> While running LTP tests (specifically futex_wake04) against next-20199597
>>>> build with 4K page size on a POWER8 LPAR following crash is observed.
>>>> [ 4233.214876] BUG: Kernel NULL pointer dereference at 0x0000001c
>>>> [ 4233.214898] Faulting instruction address: 0xc000000001d1e58c
>>>> [ 4233.214905] Oops: Kernel access of bad area, sig: 11 [#1]
>>>> [ 4233.214911] LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>>>> [ 4233.214920] Dumping ftrace buffer:
>>>> [ 4233.214928]    (ftrace buffer empty)
>>>> [ 4233.214933] Modules linked in: overlay rpadlpar_io rpaphp iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc kvm iptable_filter pseries_rng rng_core vmx_crypto ip_tables x_tables autofs4 [last unloaded: dummy_del_mod]
>>>> [ 4233.214973] CPU: 3 PID: 4635 Comm: futex_wake04 Tainted: G        W  O      5.1.0-next-20190507-autotest #1
>>>> [ 4233.214980] NIP:  c000000001d1e58c LR: c000000001d1e54c CTR: 0000000000000000
>>>> [ 4233.214987] REGS: c000000004937890 TRAP: 0300   Tainted: G        W  O       (5.1.0-next-20190507-autotest)
>>>> [ 4233.214993] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22424822  XER: 00000000
>>>> [ 4233.215005] CFAR: c00000000183e9e0 DAR: 000000000000001c DSISR: 40000000 IRQMASK: 0
>>>> [ 4233.215005] GPR00: c000000001901a80 c000000004937b20 c000000003938700 0000000000000000
>>>> [ 4233.215005] GPR04: 0000000000400cc0 000000000003efff 000000027966e000 c000000003ba8700
>>>> [ 4233.215005] GPR08: c000000003ba8700 000000000d601125 c000000003ba8700 0000000080000000
>>>> [ 4233.215005] GPR12: 0000000022424822 c00000001ecae280 0000000000000000 0000000000000000
>>>> [ 4233.215005] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>>> [ 4233.215005] GPR20: 0000000000000018 c0000000039e2d30 c0000000039e2d28 c0000002762da460
>>>> [ 4233.215005] GPR24: 000000000000001c 0000000000000000 0000000000000001 c000000001901a80
>>>> [ 4233.215005] GPR28: 0000000000400cc0 0000000000000000 0000000000000000 0000000000400cc0
>>>> [ 4233.215065] NIP [c000000001d1e58c] kmem_cache_alloc+0xbc/0x5a0
>>>> [ 4233.215071] LR [c000000001d1e54c] kmem_cache_alloc+0x7c/0x5a0
>>>> [ 4233.215075] Call Trace:
>>>> [ 4233.215081] [c000000004937b20] [c000000001c91150] __pud_alloc+0x160/0x200 (unreliable)
>>>> [ 4233.215090] [c000000004937b80] [c000000001901a80] huge_pte_alloc+0x580/0x950
>>>> [ 4233.215098] [c000000004937c00] [c000000001cf7910] hugetlb_fault+0x9a0/0x1250
>>>> [ 4233.215106] [c000000004937ce0] [c000000001c94a80] handle_mm_fault+0x490/0x4a0
>>>> [ 4233.215114] [c000000004937d20] [c0000000018d529c] __do_page_fault+0x77c/0x1f00
>>>> [ 4233.215121] [c000000004937e00] [c0000000018d6a48] do_page_fault+0x28/0x50
>>>> [ 4233.215129] [c000000004937e20] [c00000000183b0d4] handle_page_fault+0x18/0x38
>>>> [ 4233.215135] Instruction dump:
>>>> [ 4233.215139] 39290001 f92ac1b0 419e009c 3ce20027 3ba00000 e927c1f0 39290001 f927c1f0
>>>> [ 4233.215149] 3d420027 e92ac290 39290001 f92ac290 <8359001c> 83390018 60000000 3ce20027
>>>
>>> I did send a patch to the list to handle page allocation failures in this patch. But i guess what we are finding here is get_current() crashing. Any chance to bisect this?
>>>
>> 
>> Following commit seems to have introduced this problem.
>> 
>> 723f268f19 - powerpc/mm: cleanup ifdef mess in add_huge_page_size()
>> 
>> Reverting this patch allows the test case to execute properly without a crash.
>
> Oops ...
>
> Can you check by replacing
>
> mmu_psize = check_and_get_huge_psize(size);
>
> by
>
> mmu_psize = check_and_get_huge_psize(shift);
>
> in add_huge_page_size()

Yeah that's it :)

I'm writing a commit, unless you have already?

cheers

^ permalink raw reply

* [PATCH] powerpc: Allow may_use_simd() to function as feature detection
From: Shawn Landden @ 2019-05-14 12:49 UTC (permalink / raw)
  Cc: Shawn Landden, Paul Mackerras, linuxppc-dev
In-Reply-To: <20190513005104.20140-1-shawn@git.icu>

ARM does this, so we might as well too.
I am a bit confused however as CONFIG_ALTIVEC does not select
CONFIG_PPC_FPU. Would you ever have altivec without a fpu?

Signed-off-by: Shawn Landden <shawn@git.icu>
---
 arch/powerpc/include/asm/simd.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/include/asm/simd.h b/arch/powerpc/include/asm/simd.h
index 9b066d633..2fe26f258 100644
--- a/arch/powerpc/include/asm/simd.h
+++ b/arch/powerpc/include/asm/simd.h
@@ -7,4 +7,11 @@
  * It's always ok in process context (ie "not interrupt")
  * but it is sometimes ok even from an irq.
  */
+#ifdef CONFIG_PPC_FPU
 extern bool may_use_simd(void);
+#else
+static inline bool may_use_simd(void)
+{
+	return false;
+}
+#endif
-- 
2.21.0.1020.gf2820cf01a


^ permalink raw reply related

* Re: [PATCH] powerpc/powernv/npu: Fix reference leak
From: Greg Kurz @ 2019-05-14 12:25 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Alexey Kardashevskiy, Alistair Popple, linuxppc-dev, linux-kernel,
	stable
In-Reply-To: <87sgths2zf.fsf@concordia.ellerman.id.au>

On Tue, 14 May 2019 21:13:40 +1000
Michael Ellerman <mpe@ellerman.id.au> wrote:

> Greg Kurz <groug@kaod.org> writes:
> > Michael,
> >
> > Any comments on this patch ? Should I repost with a shorter comment
> > as suggested by Alexey ?  
> 
> No the longer comment seems fine to me.
> 
> I'm not a big fan of the patch, it's basically a hack :)
> 

Yeah :)

> But for a backportable fix I guess it is OK.
> 
> I would be happier though if we eventually fix up the code to do the
> refcounting properly.
> 

I had started to do just that before deciding to go for the backportable
hack. Should I rebase the other patches I have on top of this patch and
repost the whole thing, so that we have both the ugly fix for stable and
the pretty one for 5.2 ?

Cheers,

--
Greg

> cheers
> 
> > On Mon, 29 Apr 2019 12:36:59 +0200
> > Greg Kurz <groug@kaod.org> wrote:  
> >> On Mon, 29 Apr 2019 16:01:29 +1000
> >> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>   
> >> > On 20/04/2019 01:34, Greg Kurz wrote:    
> >> > > Since 902bdc57451c, get_pci_dev() calls pci_get_domain_bus_and_slot(). This
> >> > > has the effect of incrementing the reference count of the PCI device, as
> >> > > explained in drivers/pci/search.c:
> >> > > 
> >> > >  * Given a PCI domain, bus, and slot/function number, the desired PCI
> >> > >  * device is located in the list of PCI devices. If the device is
> >> > >  * found, its reference count is increased and this function returns a
> >> > >  * pointer to its data structure.  The caller must decrement the
> >> > >  * reference count by calling pci_dev_put().  If no device is found,
> >> > >  * %NULL is returned.
> >> > > 
> >> > > Nothing was done to call pci_dev_put() and the reference count of GPU and
> >> > > NPU PCI devices rockets up.
> >> > > 
> >> > > A natural way to fix this would be to teach the callers about the change,
> >> > > so that they call pci_dev_put() when done with the pointer. This turns
> >> > > out to be quite intrusive, as it affects many paths in npu-dma.c,
> >> > > pci-ioda.c and vfio_pci_nvlink2.c.      
> >> > 
> >> > 
> >> > afaict this referencing is only done to protect the current traverser
> >> > and what you've done is actually a natural way (and the generic
> >> > pci_get_dev_by_id() does exactly the same), although this looks a bit weird.
> >> >     
> >> 
> >> Not exactly the same: pci_get_dev_by_id() always increment the refcount
> >> of the returned PCI device. The refcount is only decremented when this
> >> device is passed to pci_get_dev_by_id() to continue searching.
> >> 
> >> That means that the users of the PCI device pointer returned by
> >> pci_get_dev_by_id() or its exported variants pci_get_subsys(),
> >> pci_get_device() and pci_get_class() do handle the refcount. They
> >> all pass the pointer to pci_dev_put() or continue the search,
> >> which calls pci_dev_put() internally.
> >> 
> >> Direct and indirect callers of get_pci_dev() don't care for the
> >> refcount at all unless I'm missing something.
> >>   
> >> >     
> >> > > Also, the issue appeared in 4.16 and
> >> > > some affected code got moved around since then: it would be problematic
> >> > > to backport the fix to stable releases.
> >> > > 
> >> > > All that code never cared for reference counting anyway. Call pci_dev_put()
> >> > > from get_pci_dev() to revert to the previous behavior.      
> >> > >> Fixes: 902bdc57451c ("powerpc/powernv/idoa: Remove unnecessary pcidev      
> >> > from pci_dn")    
> >> > > Cc: stable@vger.kernel.org # v4.16
> >> > > Signed-off-by: Greg Kurz <groug@kaod.org>
> >> > > ---
> >> > >  arch/powerpc/platforms/powernv/npu-dma.c |   15 ++++++++++++++-
> >> > >  1 file changed, 14 insertions(+), 1 deletion(-)
> >> > > 
> >> > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> >> > > index e713ade30087..d8f3647e8fb2 100644
> >> > > --- a/arch/powerpc/platforms/powernv/npu-dma.c
> >> > > +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> >> > > @@ -31,9 +31,22 @@ static DEFINE_SPINLOCK(npu_context_lock);
> >> > >  static struct pci_dev *get_pci_dev(struct device_node *dn)
> >> > >  {
> >> > >  	struct pci_dn *pdn = PCI_DN(dn);
> >> > > +	struct pci_dev *pdev;
> >> > >  
> >> > > -	return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
> >> > > +	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
> >> > >  					   pdn->busno, pdn->devfn);
> >> > > +
> >> > > +	/*
> >> > > +	 * pci_get_domain_bus_and_slot() increased the reference count of
> >> > > +	 * the PCI device, but callers don't need that actually as the PE
> >> > > +	 * already holds a reference to the device.      
> >> > 
> >> > Imho this would be just enough.
> >> > 
> >> > Anyway,
> >> > 
> >> > Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> >     
> >> 
> >> Thanks !
> >> 
> >> I now realize that I forgot to add the --cc option for stable on my stgit
> >> command line :-\.
> >> 
> >> Cc'ing now.
> >>   
> >> > 
> >> > How did you find it? :)
> >> >     
> >> 
> >> While reading code to find some inspiration for OpenCAPI passthrough. :)
> >> 
> >> I saw the following in vfio_pci_ibm_npu2_init():
> >> 
> >> 	if (!pnv_pci_get_gpu_dev(vdev->pdev))
> >> 		return -ENODEV;
> >> 
> >> and simply followed the function calls.
> >>   
> >> >     
> >> > > Since callers aren't
> >> > > +	 * aware of the reference count change, call pci_dev_put() now to
> >> > > +	 * avoid leaks.
> >> > > +	 */
> >> > > +	if (pdev)
> >> > > +		pci_dev_put(pdev);
> >> > > +
> >> > > +	return pdev;
> >> > >  }
> >> > >  
> >> > >  /* Given a NPU device get the associated PCI device. */
> >> > >       
> >> >     
> >>   


^ permalink raw reply

* Re: [PATCH v10 09/12] ima: Implement support for module-style appended signatures
From: Mimi Zohar @ 2019-05-14 12:09 UTC (permalink / raw)
  To: Thiago Jung Bauermann, linux-integrity
  Cc: Herbert Xu, linux-doc, Dmitry Kasatkin, David S. Miller,
	Jonathan Corbet, linux-kernel, James Morris, David Howells,
	AKASHI, Takahiro, linux-security-module, keyrings, linux-crypto,
	Jessica Yu, linuxppc-dev, David Woodhouse, Serge E. Hallyn
In-Reply-To: <20190418035120.2354-10-bauerman@linux.ibm.com>

Hi Thiago,

On Thu, 2019-04-18 at 00:51 -0300, Thiago Jung Bauermann wrote:
> 
> @@ -326,6 +356,10 @@ int ima_appraise_measurement(enum ima_hooks func,
>         case INTEGRITY_UNKNOWN:
>                 break;
>         case INTEGRITY_NOXATTRS:        /* No EVM protected xattrs. */
> +               /* It's fine not to have xattrs when using a modsig. */
> +               if (try_modsig)
> +                       break;
> +               /* fall through */
>         case INTEGRITY_NOLABEL:         /* No security.evm xattr. */
>                 cause = "missing-HMAC";
>                 goto out;
> @@ -340,6 +374,14 @@ int ima_appraise_measurement(enum ima_hooks func,
>                 rc = xattr_verify(func, iint, xattr_value, xattr_len, &status,
>                                   &cause);
>  
> +       /*
> +        * If we have a modsig and either no imasig or the imasig's key isn't
> +        * known, then try verifying the modsig.
> +        */
> +       if (status != INTEGRITY_PASS && try_modsig &&
> +           (!xattr_value || rc == -ENOKEY))
> +               rc = modsig_verify(func, modsig, &status, &cause);

EVM protects other security xattrs, not just security.ima, if they
exist.  As a result, evm_verifyxattr() could pass based on the other
security xattrs.

Mimi

> +
>  out:
>         /*
>          * File signatures on some filesystems can not be properly verified.


^ permalink raw reply

* Re: Kernel OOPS followed by a panic on next20190507 with 4K page size
From: Sachin Sant @ 2019-05-14 11:50 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: Aneesh Kumar K.V, linux-next, linuxppc-dev
In-Reply-To: <fb4c0e92-ef29-c26e-9e24-602203edd45a@c-s.fr>



> On 14-May-2019, at 4:35 PM, Christophe Leroy <christophe.leroy@c-s.fr> wrote:
> 
> 
> 
> Le 14/05/2019 à 10:57, Sachin Sant a écrit :
>>> On 14-May-2019, at 7:00 AM, Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> wrote:
>>> 
>>> On 5/8/19 4:30 PM, Sachin Sant wrote:
>>>> While running LTP tests (specifically futex_wake04) against next-20199597
>>>> build with 4K page size on a POWER8 LPAR following crash is observed.
>>>> [ 4233.214876] BUG: Kernel NULL pointer dereference at 0x0000001c
>>>> [ 4233.214898] Faulting instruction address: 0xc000000001d1e58c
>>>> [ 4233.214905] Oops: Kernel access of bad area, sig: 11 [#1]
>>>> [ 4233.214911] LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>>>> [ 4233.214920] Dumping ftrace buffer:
>>>> [ 4233.214928]    (ftrace buffer empty)
>>>> [ 4233.214933] Modules linked in: overlay rpadlpar_io rpaphp iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc kvm iptable_filter pseries_rng rng_core vmx_crypto ip_tables x_tables autofs4 [last unloaded: dummy_del_mod]
>>>> [ 4233.214973] CPU: 3 PID: 4635 Comm: futex_wake04 Tainted: G        W  O      5.1.0-next-20190507-autotest #1
>>>> [ 4233.214980] NIP:  c000000001d1e58c LR: c000000001d1e54c CTR: 0000000000000000
>>>> [ 4233.214987] REGS: c000000004937890 TRAP: 0300   Tainted: G        W  O       (5.1.0-next-20190507-autotest)
>>>> [ 4233.214993] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22424822  XER: 00000000
>>>> [ 4233.215005] CFAR: c00000000183e9e0 DAR: 000000000000001c DSISR: 40000000 IRQMASK: 0
>>>> [ 4233.215005] GPR00: c000000001901a80 c000000004937b20 c000000003938700 0000000000000000
>>>> [ 4233.215005] GPR04: 0000000000400cc0 000000000003efff 000000027966e000 c000000003ba8700
>>>> [ 4233.215005] GPR08: c000000003ba8700 000000000d601125 c000000003ba8700 0000000080000000
>>>> [ 4233.215005] GPR12: 0000000022424822 c00000001ecae280 0000000000000000 0000000000000000
>>>> [ 4233.215005] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>>> [ 4233.215005] GPR20: 0000000000000018 c0000000039e2d30 c0000000039e2d28 c0000002762da460
>>>> [ 4233.215005] GPR24: 000000000000001c 0000000000000000 0000000000000001 c000000001901a80
>>>> [ 4233.215005] GPR28: 0000000000400cc0 0000000000000000 0000000000000000 0000000000400cc0
>>>> [ 4233.215065] NIP [c000000001d1e58c] kmem_cache_alloc+0xbc/0x5a0
>>>> [ 4233.215071] LR [c000000001d1e54c] kmem_cache_alloc+0x7c/0x5a0
>>>> [ 4233.215075] Call Trace:
>>>> [ 4233.215081] [c000000004937b20] [c000000001c91150] __pud_alloc+0x160/0x200 (unreliable)
>>>> [ 4233.215090] [c000000004937b80] [c000000001901a80] huge_pte_alloc+0x580/0x950
>>>> [ 4233.215098] [c000000004937c00] [c000000001cf7910] hugetlb_fault+0x9a0/0x1250
>>>> [ 4233.215106] [c000000004937ce0] [c000000001c94a80] handle_mm_fault+0x490/0x4a0
>>>> [ 4233.215114] [c000000004937d20] [c0000000018d529c] __do_page_fault+0x77c/0x1f00
>>>> [ 4233.215121] [c000000004937e00] [c0000000018d6a48] do_page_fault+0x28/0x50
>>>> [ 4233.215129] [c000000004937e20] [c00000000183b0d4] handle_page_fault+0x18/0x38
>>>> [ 4233.215135] Instruction dump:
>>>> [ 4233.215139] 39290001 f92ac1b0 419e009c 3ce20027 3ba00000 e927c1f0 39290001 f927c1f0
>>>> [ 4233.215149] 3d420027 e92ac290 39290001 f92ac290 <8359001c> 83390018 60000000 3ce20027
>>> 
>>> I did send a patch to the list to handle page allocation failures in this patch. But i guess what we are finding here is get_current() crashing. Any chance to bisect this?
>>> 
>> Following commit seems to have introduced this problem.
>> 723f268f19 - powerpc/mm: cleanup ifdef mess in add_huge_page_size()
>> Reverting this patch allows the test case to execute properly without a crash.
> 
> Oops ...
> 
> Can you check by replacing
> 
> mmu_psize = check_and_get_huge_psize(size);
> 
> by
> 
> mmu_psize = check_and_get_huge_psize(shift);
> 
> in add_huge_page_size()

Yup this allowed the test to PASS without any crash.

Thanks
-Sachin


^ permalink raw reply

* Re: [PATCH] powerpc/powernv/npu: Fix reference leak
From: Michael Ellerman @ 2019-05-14 11:13 UTC (permalink / raw)
  To: Greg Kurz
  Cc: Alexey Kardashevskiy, Alistair Popple, linuxppc-dev, linux-kernel,
	stable
In-Reply-To: <20190513135606.7d9a0902@bahia.lan>

Greg Kurz <groug@kaod.org> writes:
> Michael,
>
> Any comments on this patch ? Should I repost with a shorter comment
> as suggested by Alexey ?

No the longer comment seems fine to me.

I'm not a big fan of the patch, it's basically a hack :)

But for a backportable fix I guess it is OK.

I would be happier though if we eventually fix up the code to do the
refcounting properly.

cheers

> On Mon, 29 Apr 2019 12:36:59 +0200
> Greg Kurz <groug@kaod.org> wrote:
>> On Mon, 29 Apr 2019 16:01:29 +1000
>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>> 
>> > On 20/04/2019 01:34, Greg Kurz wrote:  
>> > > Since 902bdc57451c, get_pci_dev() calls pci_get_domain_bus_and_slot(). This
>> > > has the effect of incrementing the reference count of the PCI device, as
>> > > explained in drivers/pci/search.c:
>> > > 
>> > >  * Given a PCI domain, bus, and slot/function number, the desired PCI
>> > >  * device is located in the list of PCI devices. If the device is
>> > >  * found, its reference count is increased and this function returns a
>> > >  * pointer to its data structure.  The caller must decrement the
>> > >  * reference count by calling pci_dev_put().  If no device is found,
>> > >  * %NULL is returned.
>> > > 
>> > > Nothing was done to call pci_dev_put() and the reference count of GPU and
>> > > NPU PCI devices rockets up.
>> > > 
>> > > A natural way to fix this would be to teach the callers about the change,
>> > > so that they call pci_dev_put() when done with the pointer. This turns
>> > > out to be quite intrusive, as it affects many paths in npu-dma.c,
>> > > pci-ioda.c and vfio_pci_nvlink2.c.    
>> > 
>> > 
>> > afaict this referencing is only done to protect the current traverser
>> > and what you've done is actually a natural way (and the generic
>> > pci_get_dev_by_id() does exactly the same), although this looks a bit weird.
>> >   
>> 
>> Not exactly the same: pci_get_dev_by_id() always increment the refcount
>> of the returned PCI device. The refcount is only decremented when this
>> device is passed to pci_get_dev_by_id() to continue searching.
>> 
>> That means that the users of the PCI device pointer returned by
>> pci_get_dev_by_id() or its exported variants pci_get_subsys(),
>> pci_get_device() and pci_get_class() do handle the refcount. They
>> all pass the pointer to pci_dev_put() or continue the search,
>> which calls pci_dev_put() internally.
>> 
>> Direct and indirect callers of get_pci_dev() don't care for the
>> refcount at all unless I'm missing something.
>> 
>> >   
>> > > Also, the issue appeared in 4.16 and
>> > > some affected code got moved around since then: it would be problematic
>> > > to backport the fix to stable releases.
>> > > 
>> > > All that code never cared for reference counting anyway. Call pci_dev_put()
>> > > from get_pci_dev() to revert to the previous behavior.    
>> > >> Fixes: 902bdc57451c ("powerpc/powernv/idoa: Remove unnecessary pcidev    
>> > from pci_dn")  
>> > > Cc: stable@vger.kernel.org # v4.16
>> > > Signed-off-by: Greg Kurz <groug@kaod.org>
>> > > ---
>> > >  arch/powerpc/platforms/powernv/npu-dma.c |   15 ++++++++++++++-
>> > >  1 file changed, 14 insertions(+), 1 deletion(-)
>> > > 
>> > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
>> > > index e713ade30087..d8f3647e8fb2 100644
>> > > --- a/arch/powerpc/platforms/powernv/npu-dma.c
>> > > +++ b/arch/powerpc/platforms/powernv/npu-dma.c
>> > > @@ -31,9 +31,22 @@ static DEFINE_SPINLOCK(npu_context_lock);
>> > >  static struct pci_dev *get_pci_dev(struct device_node *dn)
>> > >  {
>> > >  	struct pci_dn *pdn = PCI_DN(dn);
>> > > +	struct pci_dev *pdev;
>> > >  
>> > > -	return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
>> > > +	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
>> > >  					   pdn->busno, pdn->devfn);
>> > > +
>> > > +	/*
>> > > +	 * pci_get_domain_bus_and_slot() increased the reference count of
>> > > +	 * the PCI device, but callers don't need that actually as the PE
>> > > +	 * already holds a reference to the device.    
>> > 
>> > Imho this would be just enough.
>> > 
>> > Anyway,
>> > 
>> > Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> >   
>> 
>> Thanks !
>> 
>> I now realize that I forgot to add the --cc option for stable on my stgit
>> command line :-\.
>> 
>> Cc'ing now.
>> 
>> > 
>> > How did you find it? :)
>> >   
>> 
>> While reading code to find some inspiration for OpenCAPI passthrough. :)
>> 
>> I saw the following in vfio_pci_ibm_npu2_init():
>> 
>> 	if (!pnv_pci_get_gpu_dev(vdev->pdev))
>> 		return -ENODEV;
>> 
>> and simply followed the function calls.
>> 
>> >   
>> > > Since callers aren't
>> > > +	 * aware of the reference count change, call pci_dev_put() now to
>> > > +	 * avoid leaks.
>> > > +	 */
>> > > +	if (pdev)
>> > > +		pci_dev_put(pdev);
>> > > +
>> > > +	return pdev;
>> > >  }
>> > >  
>> > >  /* Given a NPU device get the associated PCI device. */
>> > >     
>> >   
>> 

^ permalink raw reply

* Re: Kernel OOPS followed by a panic on next20190507 with 4K page size
From: Christophe Leroy @ 2019-05-14 11:05 UTC (permalink / raw)
  To: Sachin Sant, Aneesh Kumar K.V; +Cc: linux-next, linuxppc-dev
In-Reply-To: <4465D9C6-BE89-4215-9730-21CD40ABEA50@linux.vnet.ibm.com>



Le 14/05/2019 à 10:57, Sachin Sant a écrit :
> 
> 
>> On 14-May-2019, at 7:00 AM, Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> wrote:
>>
>> On 5/8/19 4:30 PM, Sachin Sant wrote:
>>> While running LTP tests (specifically futex_wake04) against next-20199597
>>> build with 4K page size on a POWER8 LPAR following crash is observed.
>>> [ 4233.214876] BUG: Kernel NULL pointer dereference at 0x0000001c
>>> [ 4233.214898] Faulting instruction address: 0xc000000001d1e58c
>>> [ 4233.214905] Oops: Kernel access of bad area, sig: 11 [#1]
>>> [ 4233.214911] LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>>> [ 4233.214920] Dumping ftrace buffer:
>>> [ 4233.214928]    (ftrace buffer empty)
>>> [ 4233.214933] Modules linked in: overlay rpadlpar_io rpaphp iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc kvm iptable_filter pseries_rng rng_core vmx_crypto ip_tables x_tables autofs4 [last unloaded: dummy_del_mod]
>>> [ 4233.214973] CPU: 3 PID: 4635 Comm: futex_wake04 Tainted: G        W  O      5.1.0-next-20190507-autotest #1
>>> [ 4233.214980] NIP:  c000000001d1e58c LR: c000000001d1e54c CTR: 0000000000000000
>>> [ 4233.214987] REGS: c000000004937890 TRAP: 0300   Tainted: G        W  O       (5.1.0-next-20190507-autotest)
>>> [ 4233.214993] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22424822  XER: 00000000
>>> [ 4233.215005] CFAR: c00000000183e9e0 DAR: 000000000000001c DSISR: 40000000 IRQMASK: 0
>>> [ 4233.215005] GPR00: c000000001901a80 c000000004937b20 c000000003938700 0000000000000000
>>> [ 4233.215005] GPR04: 0000000000400cc0 000000000003efff 000000027966e000 c000000003ba8700
>>> [ 4233.215005] GPR08: c000000003ba8700 000000000d601125 c000000003ba8700 0000000080000000
>>> [ 4233.215005] GPR12: 0000000022424822 c00000001ecae280 0000000000000000 0000000000000000
>>> [ 4233.215005] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>> [ 4233.215005] GPR20: 0000000000000018 c0000000039e2d30 c0000000039e2d28 c0000002762da460
>>> [ 4233.215005] GPR24: 000000000000001c 0000000000000000 0000000000000001 c000000001901a80
>>> [ 4233.215005] GPR28: 0000000000400cc0 0000000000000000 0000000000000000 0000000000400cc0
>>> [ 4233.215065] NIP [c000000001d1e58c] kmem_cache_alloc+0xbc/0x5a0
>>> [ 4233.215071] LR [c000000001d1e54c] kmem_cache_alloc+0x7c/0x5a0
>>> [ 4233.215075] Call Trace:
>>> [ 4233.215081] [c000000004937b20] [c000000001c91150] __pud_alloc+0x160/0x200 (unreliable)
>>> [ 4233.215090] [c000000004937b80] [c000000001901a80] huge_pte_alloc+0x580/0x950
>>> [ 4233.215098] [c000000004937c00] [c000000001cf7910] hugetlb_fault+0x9a0/0x1250
>>> [ 4233.215106] [c000000004937ce0] [c000000001c94a80] handle_mm_fault+0x490/0x4a0
>>> [ 4233.215114] [c000000004937d20] [c0000000018d529c] __do_page_fault+0x77c/0x1f00
>>> [ 4233.215121] [c000000004937e00] [c0000000018d6a48] do_page_fault+0x28/0x50
>>> [ 4233.215129] [c000000004937e20] [c00000000183b0d4] handle_page_fault+0x18/0x38
>>> [ 4233.215135] Instruction dump:
>>> [ 4233.215139] 39290001 f92ac1b0 419e009c 3ce20027 3ba00000 e927c1f0 39290001 f927c1f0
>>> [ 4233.215149] 3d420027 e92ac290 39290001 f92ac290 <8359001c> 83390018 60000000 3ce20027
>>
>> I did send a patch to the list to handle page allocation failures in this patch. But i guess what we are finding here is get_current() crashing. Any chance to bisect this?
>>
> 
> Following commit seems to have introduced this problem.
> 
> 723f268f19 - powerpc/mm: cleanup ifdef mess in add_huge_page_size()
> 
> Reverting this patch allows the test case to execute properly without a crash.

Oops ...

Can you check by replacing

mmu_psize = check_and_get_huge_psize(size);

by

mmu_psize = check_and_get_huge_psize(shift);

in add_huge_page_size()

Thanks
Christophe

> 
> Thanks
> -Sachin
> 

^ permalink raw reply

* Re: Kernel OOPS followed by a panic on next20190507 with 4K page size
From: Michael Ellerman @ 2019-05-14 10:24 UTC (permalink / raw)
  To: Sachin Sant, Aneesh Kumar K.V; +Cc: linux-next, linuxppc-dev
In-Reply-To: <4465D9C6-BE89-4215-9730-21CD40ABEA50@linux.vnet.ibm.com>

Sachin Sant <sachinp@linux.vnet.ibm.com> writes:
>> On 14-May-2019, at 7:00 AM, Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> wrote:
>> 
>> On 5/8/19 4:30 PM, Sachin Sant wrote:
>>> While running LTP tests (specifically futex_wake04) against next-20199597
>>> build with 4K page size on a POWER8 LPAR following crash is observed.
>>> [ 4233.214876] BUG: Kernel NULL pointer dereference at 0x0000001c
>>> [ 4233.214898] Faulting instruction address: 0xc000000001d1e58c
>>> [ 4233.214905] Oops: Kernel access of bad area, sig: 11 [#1]
>>> [ 4233.214911] LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>>> [ 4233.214920] Dumping ftrace buffer:
>>> [ 4233.214928]    (ftrace buffer empty)
>>> [ 4233.214933] Modules linked in: overlay rpadlpar_io rpaphp iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc kvm iptable_filter pseries_rng rng_core vmx_crypto ip_tables x_tables autofs4 [last unloaded: dummy_del_mod]
>>> [ 4233.214973] CPU: 3 PID: 4635 Comm: futex_wake04 Tainted: G        W  O      5.1.0-next-20190507-autotest #1
>>> [ 4233.214980] NIP:  c000000001d1e58c LR: c000000001d1e54c CTR: 0000000000000000
>>> [ 4233.214987] REGS: c000000004937890 TRAP: 0300   Tainted: G        W  O       (5.1.0-next-20190507-autotest)
>>> [ 4233.214993] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22424822  XER: 00000000
>>> [ 4233.215005] CFAR: c00000000183e9e0 DAR: 000000000000001c DSISR: 40000000 IRQMASK: 0
>>> [ 4233.215005] GPR00: c000000001901a80 c000000004937b20 c000000003938700 0000000000000000
>>> [ 4233.215005] GPR04: 0000000000400cc0 000000000003efff 000000027966e000 c000000003ba8700
>>> [ 4233.215005] GPR08: c000000003ba8700 000000000d601125 c000000003ba8700 0000000080000000
>>> [ 4233.215005] GPR12: 0000000022424822 c00000001ecae280 0000000000000000 0000000000000000
>>> [ 4233.215005] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>> [ 4233.215005] GPR20: 0000000000000018 c0000000039e2d30 c0000000039e2d28 c0000002762da460
>>> [ 4233.215005] GPR24: 000000000000001c 0000000000000000 0000000000000001 c000000001901a80
>>> [ 4233.215005] GPR28: 0000000000400cc0 0000000000000000 0000000000000000 0000000000400cc0
>>> [ 4233.215065] NIP [c000000001d1e58c] kmem_cache_alloc+0xbc/0x5a0
>>> [ 4233.215071] LR [c000000001d1e54c] kmem_cache_alloc+0x7c/0x5a0
>>> [ 4233.215075] Call Trace:
>>> [ 4233.215081] [c000000004937b20] [c000000001c91150] __pud_alloc+0x160/0x200 (unreliable)
>>> [ 4233.215090] [c000000004937b80] [c000000001901a80] huge_pte_alloc+0x580/0x950
>>> [ 4233.215098] [c000000004937c00] [c000000001cf7910] hugetlb_fault+0x9a0/0x1250
>>> [ 4233.215106] [c000000004937ce0] [c000000001c94a80] handle_mm_fault+0x490/0x4a0
>>> [ 4233.215114] [c000000004937d20] [c0000000018d529c] __do_page_fault+0x77c/0x1f00
>>> [ 4233.215121] [c000000004937e00] [c0000000018d6a48] do_page_fault+0x28/0x50
>>> [ 4233.215129] [c000000004937e20] [c00000000183b0d4] handle_page_fault+0x18/0x38
>>> [ 4233.215135] Instruction dump:
>>> [ 4233.215139] 39290001 f92ac1b0 419e009c 3ce20027 3ba00000 e927c1f0 39290001 f927c1f0
>>> [ 4233.215149] 3d420027 e92ac290 39290001 f92ac290 <8359001c> 83390018 60000000 3ce20027
>> 
>> I did send a patch to the list to handle page allocation failures in this patch. But i guess what we are finding here is get_current() crashing. Any chance to bisect this?
>> 
>
> Following commit seems to have introduced this problem.
>
> 723f268f19 - powerpc/mm: cleanup ifdef mess in add_huge_page_size()
>
> Reverting this patch allows the test case to execute properly without a crash.

I think I see the bug, let me test.

cheers

^ permalink raw reply

* Re: [RESEND PATCH] powerpc/pseries: Fix cpu_hotplug_lock acquisition in resize_hpt
From: Gautham R Shenoy @ 2019-05-14 10:18 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Gautham R. Shenoy, Aneesh Kumar K.V, linux-kernel,
	Nicholas Piggin, Paul Mackerras, linuxppc-dev
In-Reply-To: <874l5xtt6v.fsf@concordia.ellerman.id.au>

On Tue, May 14, 2019 at 05:02:16PM +1000, Michael Ellerman wrote:
> "Gautham R. Shenoy" <ego@linux.vnet.ibm.com> writes:
> > From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
> >
> > Subject: Re: [RESEND PATCH] powerpc/pseries: Fix cpu_hotplug_lock acquisition in resize_hpt
> 
> ps. A "RESEND" implies the patch is unchanged and you're just resending
> it because it was ignored.
> 
> In this case it should have just been "PATCH v2", with a note below the "---"
> saying "v2: Rebased onto powerpc/next ..."

Ok. I will send a v3 :-)

> 
> cheers
> 
> > During a memory hotplug operations involving resizing of the HPT, we
> > invoke a stop_machine() to perform the resizing. In this code path, we
> > end up recursively taking the cpu_hotplug_lock, first in
> > memory_hotplug_begin() and then subsequently in stop_machine(). This
> > causes the system to hang. With lockdep enabled we get the following
> > error message before the hang.
> >
> >   swapper/0/1 is trying to acquire lock:
> >   (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: stop_machine+0x2c/0x60
> >
> >   but task is already holding lock:
> >   (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
> >
> >   other info that might help us debug this:
> >    Possible unsafe locking scenario:
> >
> >          CPU0
> >          ----
> >     lock(cpu_hotplug_lock.rw_sem);
> >     lock(cpu_hotplug_lock.rw_sem);
> >
> >    *** DEADLOCK ***
> >
> > Fix this issue by
> >   1) Requiring all the calls to pseries_lpar_resize_hpt() be made
> >      with cpu_hotplug_lock held.
> >
> >   2) In pseries_lpar_resize_hpt() invoke stop_machine_cpuslocked()
> >      as a consequence of 1)
> >
> >   3) To satisfy 1), in hpt_order_set(), call mmu_hash_ops.resize_hpt()
> >      with cpu_hotplug_lock held.
> >
> > Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> > Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
> > ---
> >
> > Rebased this one against powerpc/next instead of linux/master.
> >
> >  arch/powerpc/mm/book3s64/hash_utils.c | 9 ++++++++-
> >  arch/powerpc/platforms/pseries/lpar.c | 8 ++++++--
> >  2 files changed, 14 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
> > index 919a861..d07fcafd 100644
> > --- a/arch/powerpc/mm/book3s64/hash_utils.c
> > +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> > @@ -38,6 +38,7 @@
> >  #include <linux/libfdt.h>
> >  #include <linux/pkeys.h>
> >  #include <linux/hugetlb.h>
> > +#include <linux/cpu.h>
> >  
> >  #include <asm/debugfs.h>
> >  #include <asm/processor.h>
> > @@ -1928,10 +1929,16 @@ static int hpt_order_get(void *data, u64 *val)
> >  
> >  static int hpt_order_set(void *data, u64 val)
> >  {
> > +	int ret;
> > +
> >  	if (!mmu_hash_ops.resize_hpt)
> >  		return -ENODEV;
> >  
> > -	return mmu_hash_ops.resize_hpt(val);
> > +	cpus_read_lock();
> > +	ret = mmu_hash_ops.resize_hpt(val);
> > +	cpus_read_unlock();
> > +
> > +	return ret;
> >  }
> >  
> >  DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
> > diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
> > index 1034ef1..2fc9756 100644
> > --- a/arch/powerpc/platforms/pseries/lpar.c
> > +++ b/arch/powerpc/platforms/pseries/lpar.c
> > @@ -859,7 +859,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
> >  	return 0;
> >  }
> >  
> > -/* Must be called in user context */
> > +/*
> > + * Must be called in user context. The caller should hold the
> > + * cpus_lock.
> > + */
> >  static int pseries_lpar_resize_hpt(unsigned long shift)
> >  {
> >  	struct hpt_resize_state state = {
> > @@ -913,7 +916,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
> >  
> >  	t1 = ktime_get();
> >  
> > -	rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
> > +	rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
> > +				     &state, NULL);
> >  
> >  	t2 = ktime_get();
> >  
> > -- 
> > 1.9.4
> 


^ permalink raw reply

* Re: [RESEND PATCH] powerpc/pseries: Fix cpu_hotplug_lock acquisition in resize_hpt
From: Gautham R Shenoy @ 2019-05-14 10:18 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Gautham R. Shenoy, Aneesh Kumar K.V, linux-kernel,
	Nicholas Piggin, Paul Mackerras, linuxppc-dev
In-Reply-To: <877eattta4.fsf@concordia.ellerman.id.au>

Hi Michael,

On Tue, May 14, 2019 at 05:00:19PM +1000, Michael Ellerman wrote:
> "Gautham R. Shenoy" <ego@linux.vnet.ibm.com> writes:
> > From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
> >
> > During a memory hotplug operations involving resizing of the HPT, we
> > invoke a stop_machine() to perform the resizing. In this code path, we
> > end up recursively taking the cpu_hotplug_lock, first in
> > memory_hotplug_begin() and then subsequently in stop_machine(). This
> > causes the system to hang.
>
> This implies we have never tested a memory hotplug that resized the HPT.
> Is that really true? Or did something change?
>

This was reported by Aneesh during a testcase involving reconfiguring
the namespace for nvdimm where we do a memory remove followed by
add. The memory add invokes resize_hpt().

It seems we can hit this issue when we perform a memory hotplug/unplug
in the guest.

> > With lockdep enabled we get the following
> > error message before the hang.
> >
> >   swapper/0/1 is trying to acquire lock:
> >   (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: stop_machine+0x2c/0x60
> >
> >   but task is already holding lock:
> >   (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
> 
> Do we have the full stack trace?

Yes, here is the complete log:

[    0.537123] swapper/0/1 is trying to acquire lock:
[    0.537197] (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: stop_machine+0x2c/0x60
[    0.537336]
[    0.537336] but task is already holding lock:
[    0.537429] (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50

[    0.537570]                                   
[    0.537570] other info that might help us debug this:
[    0.537663]  Possible unsafe locking scenario:
[    0.537663]
[    0.537756]        CPU0                                     
[    0.537794]        ----                                       
[    0.537832]   lock(cpu_hotplug_lock.rw_sem);                                                        
[    0.537906]   lock(cpu_hotplug_lock.rw_sem);             
[    0.537980]                                       
[    0.537980]  *** DEADLOCK ***                                                           
[    0.537980]                                                                                      
[    0.538074]  May be due to missing lock nesting notation                           
[    0.538074]                                                                    
[    0.538168] 3 locks held by swapper/0/1:               
[    0.538224]  #0: (____ptrval____) (&dev->mutex){....}, at: __driver_attach+0x12c/0x1b0
[    0.538348]  #1: (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50
[    0.538477]  #2: (____ptrval____) (mem_hotplug_lock.rw_sem){++++}, at: percpu_down_write+0x54/0x1a0
[    0.538608]
[    0.538608] stack backtrace:                                              
[    0.538685] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc5-58373-gbc99402235f3-dirty #166
[    0.538812] Call Trace:                                          
[    0.538863] [c0000000feb03150] [c000000000e32bd4] dump_stack+0xe8/0x164 (unreliable)
[    0.538975] [c0000000feb031a0] [c00000000020d6c0] __lock_acquire+0x1110/0x1c70
[    0.539086] [c0000000feb03320] [c00000000020f080] lock_acquire+0x240/0x290
[    0.539180] [c0000000feb033e0] [c00000000017f554] cpus_read_lock+0x64/0xf0
[    0.539273] [c0000000feb03420] [c00000000029ebac] stop_machine+0x2c/0x60     
[    0.539367] [c0000000feb03460] [c0000000000d7f7c] pseries_lpar_resize_hpt+0x19c/0x2c0
[    0.539479] [c0000000feb03500] [c0000000000788d0] resize_hpt_for_hotplug+0x70/0xd0
[    0.539590] [c0000000feb03570] [c000000000e5d278] arch_add_memory+0x58/0xfc
[    0.539683] [c0000000feb03610] [c0000000003553a8] devm_memremap_pages+0x5e8/0x8f0
[    0.539804] [c0000000feb036c0] [c0000000009c2394] pmem_attach_disk+0x764/0x830    
[    0.539916] [c0000000feb037d0] [c0000000009a7c38] nvdimm_bus_probe+0x118/0x240
[    0.540026] [c0000000feb03860] [c000000000968500] really_probe+0x230/0x4b0
[    0.540119] [c0000000feb038f0] [c000000000968aec] driver_probe_device+0x16c/0x1e0
[    0.540230] [c0000000feb03970] [c000000000968ca8] __driver_attach+0x148/0x1b0
[    0.540340] [c0000000feb039f0] [c0000000009650b0] bus_for_each_dev+0x90/0x130
[    0.540451] [c0000000feb03a50] [c000000000967dd4] driver_attach+0x34/0x50
[    0.540544] [c0000000feb03a70] [c000000000967068] bus_add_driver+0x1a8/0x360
[    0.540654] [c0000000feb03b00] [c00000000096a498] driver_register+0x108/0x170
[    0.540766] [c0000000feb03b70] [c0000000009a7400] __nd_driver_register+0xd0/0xf0
[    0.540898] [c0000000feb03bd0] [c00000000128aa90] nd_pmem_driver_init+0x34/0x48
[    0.541010] [c0000000feb03bf0] [c000000000010a10] do_one_initcall+0x1e0/0x45c
[    0.541122] [c0000000feb03cd0] [c00000000122462c] kernel_init_freeable+0x540/0x64c          
[    0.541232] [c0000000feb03db0] [c00000000001110c] kernel_init+0x2c/0x160
[    0.541326] [c0000000feb03e20] [c00000000000bed4] ret_from_kernel_thread+0x5c/0x68


> 
> >   other info that might help us debug this:
> >    Possible unsafe locking scenario:
> >
> >          CPU0
> >          ----
> >     lock(cpu_hotplug_lock.rw_sem);
> >     lock(cpu_hotplug_lock.rw_sem);
> >
> >    *** DEADLOCK ***
> >
> > Fix this issue by
> >   1) Requiring all the calls to pseries_lpar_resize_hpt() be made
> >      with cpu_hotplug_lock held.
> >
> >   2) In pseries_lpar_resize_hpt() invoke stop_machine_cpuslocked()
> >      as a consequence of 1)
> >
> >   3) To satisfy 1), in hpt_order_set(), call mmu_hash_ops.resize_hpt()
> >      with cpu_hotplug_lock held.
> >
> > Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> > Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
> > ---
> >
> > Rebased this one against powerpc/next instead of linux/master.
> >
> >  arch/powerpc/mm/book3s64/hash_utils.c | 9 ++++++++-
> >  arch/powerpc/platforms/pseries/lpar.c | 8 ++++++--
> >  2 files changed, 14 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
> > index 919a861..d07fcafd 100644
> > --- a/arch/powerpc/mm/book3s64/hash_utils.c
> > +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> > @@ -38,6 +38,7 @@
> >  #include <linux/libfdt.h>
> >  #include <linux/pkeys.h>
> >  #include <linux/hugetlb.h>
> > +#include <linux/cpu.h>
> >  
> >  #include <asm/debugfs.h>
> >  #include <asm/processor.h>
> > @@ -1928,10 +1929,16 @@ static int hpt_order_get(void *data, u64 *val)
> >  
> >  static int hpt_order_set(void *data, u64 val)
> >  {
> > +	int ret;
> > +
> >  	if (!mmu_hash_ops.resize_hpt)
> >  		return -ENODEV;
> >  
> > -	return mmu_hash_ops.resize_hpt(val);
> > +	cpus_read_lock();
> > +	ret = mmu_hash_ops.resize_hpt(val);
> > +	cpus_read_unlock();
> > +
> > +	return ret;
> >  }
> >  
> >  DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
> > diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
> > index 1034ef1..2fc9756 100644
> > --- a/arch/powerpc/platforms/pseries/lpar.c
> > +++ b/arch/powerpc/platforms/pseries/lpar.c
> > @@ -859,7 +859,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
> >  	return 0;
> >  }
> >  
> > -/* Must be called in user context */
> > +/*
> > + * Must be called in user context. The caller should hold the
> 
> I realise you're just copying that comment, but it seems wrong. "user
> context" means userspace. I think it means "process context" doesn't it?
>

Yes, from the qemu process context. I will fix this part of the
comment and also change the should to must.

> Also "should" should be "must" :)
>

Thanks for the review.


> > + * cpus_lock.
> > + */
> >  static int pseries_lpar_resize_hpt(unsigned long shift)
> >  {
> >  	struct hpt_resize_state state = {
> > @@ -913,7 +916,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
> >  
> >  	t1 = ktime_get();
> >  
> > -	rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
> > +	rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
> > +				     &state, NULL);
> >  
> >  	t2 = ktime_get();
> 
> cheers
> 


^ permalink raw reply

* Re: [PATCH stable 4.9] powerpc/lib: fix code patching during early init on PPC32
From: Michael Ellerman @ 2019-05-14 10:17 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras, Greg KH,
	erhard_f, Michael Neuling
  Cc: linuxppc-dev, linux-kernel, stable@vger.kernel.org
In-Reply-To: <015040d922a68fa15c81879cc2fc9ed7ac106e60.1557827275.git.christophe.leroy@c-s.fr>

Christophe Leroy <christophe.leroy@c-s.fr> writes:
> [Backport of upstream commit b45ba4a51cde29b2939365ef0c07ad34c8321789]
>
> On powerpc32, patch_instruction() is called by apply_feature_fixups()
> which is called from early_init()
>
> There is the following note in front of early_init():
>  * Note that the kernel may be running at an address which is different
>  * from the address that it was linked at, so we must use RELOC/PTRRELOC
>  * to access static data (including strings).  -- paulus
>
> Therefore init_mem_is_free must be accessed with PTRRELOC()
>
> Fixes: 1c38a84d4586 ("powerpc: Avoid code patching freed init sections")
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=203597
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>
> ---
> Can't apply the upstream commit as such due to several of other unrelated stuff
> like STRICT_KERNEL_RWX which are missing for instance.
> So instead, using same approach as for commit 252eb55816a6f69ef9464cad303cdb3326cdc61d

Yeah this looks good to me.

Though should we keep the same subject as the upstream commit this is
sort of a backport of? That might make it simpler for people who are
trying to keep track of things?

ie. "powerpc/lib: fix book3s/32 boot failure due to code patching"

cheers

> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> index 14535ad4cdd1..c312955977ce 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -23,7 +23,7 @@ int patch_instruction(unsigned int *addr, unsigned int instr)
>  	int err;
>  
>  	/* Make sure we aren't patching a freed init section */
> -	if (init_mem_is_free && init_section_contains(addr, 4)) {
> +	if (*PTRRELOC(&init_mem_is_free) && init_section_contains(addr, 4)) {
>  		pr_debug("Skipping init section patching addr: 0x%px\n", addr);
>  		return 0;
>  	}
> -- 
> 2.13.3

^ permalink raw reply

* [PATCH stable 4.9] powerpc/lib: fix code patching during early init on PPC32
From: Christophe Leroy @ 2019-05-14  9:57 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, Greg KH,
	erhard_f, Michael Neuling
  Cc: linuxppc-dev, linux-kernel, stable

[Backport of upstream commit b45ba4a51cde29b2939365ef0c07ad34c8321789]

On powerpc32, patch_instruction() is called by apply_feature_fixups()
which is called from early_init()

There is the following note in front of early_init():
 * Note that the kernel may be running at an address which is different
 * from the address that it was linked at, so we must use RELOC/PTRRELOC
 * to access static data (including strings).  -- paulus

Therefore init_mem_is_free must be accessed with PTRRELOC()

Fixes: 1c38a84d4586 ("powerpc: Avoid code patching freed init sections")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=203597
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>

---
Can't apply the upstream commit as such due to several of other unrelated stuff
like STRICT_KERNEL_RWX which are missing for instance.
So instead, using same approach as for commit 252eb55816a6f69ef9464cad303cdb3326cdc61d

 arch/powerpc/lib/code-patching.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 14535ad4cdd1..c312955977ce 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -23,7 +23,7 @@ int patch_instruction(unsigned int *addr, unsigned int instr)
 	int err;
 
 	/* Make sure we aren't patching a freed init section */
-	if (init_mem_is_free && init_section_contains(addr, 4)) {
+	if (*PTRRELOC(&init_mem_is_free) && init_section_contains(addr, 4)) {
 		pr_debug("Skipping init section patching addr: 0x%px\n", addr);
 		return 0;
 	}
-- 
2.13.3


^ permalink raw reply related

* Re: [PATCH] powerpc/mm: Handle page table allocation failures
From: Sachin Sant @ 2019-05-14  9:33 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Aneesh Kumar K.V, paulus, linuxppc-dev, npiggin
In-Reply-To: <87ftphtu6s.fsf@concordia.ellerman.id.au>



> On 14-May-2019, at 12:10 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> This fix the below crash that arise due to not handling page table allocation
>> failures while allocating hugetlb page table.
>> 
>> BUG: Kernel NULL pointer dereference at 0x0000001c
>> Faulting instruction address: 0xc000000001d1e58c
>> Oops: Kernel access of bad area, sig: 11 [#1]
>> LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>> 
>> CPU: 3 PID: 4635 Comm: futex_wake04 Tainted: G        W  O      5.1.0-next-20190507-autotest #1
>> NIP:  c000000001d1e58c LR: c000000001d1e54c CTR: 0000000000000000
>> REGS: c000000004937890 TRAP: 0300   Tainted: G        W  O       (5.1.0-next-20190507-autotest)
>> MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22424822  XER: 00000000
>> CFAR: c00000000183e9e0 DAR: 000000000000001c DSISR: 40000000 IRQMASK: 0
>> GPR00: c000000001901a80 c000000004937b20 c000000003938700 0000000000000000
>> GPR04: 0000000000400cc0 000000000003efff 000000027966e000 c000000003ba8700
>> GPR08: c000000003ba8700 000000000d601125 c000000003ba8700 0000000080000000
>> GPR12: 0000000022424822 c00000001ecae280 0000000000000000 0000000000000000
>> GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>> GPR20: 0000000000000018 c0000000039e2d30 c0000000039e2d28 c0000002762da460
>> GPR24: 000000000000001c 0000000000000000 0000000000000001 c000000001901a80
>> GPR28: 0000000000400cc0 0000000000000000 0000000000000000 0000000000400cc0
>> NIP [c000000001d1e58c] kmem_cache_alloc+0xbc/0x5a0
>> LR [c000000001d1e54c] kmem_cache_alloc+0x7c/0x5a0
>> Call Trace:
>>  [c000000001c91150] __pud_alloc+0x160/0x200 (unreliable)
>>  [c000000001901a80] huge_pte_alloc+0x580/0x950
>>  [c000000001cf7910] hugetlb_fault+0x9a0/0x1250
>>  [c000000001c94a80] handle_mm_fault+0x490/0x4a0
>>  [c0000000018d529c] __do_page_fault+0x77c/0x1f00
>>  [c0000000018d6a48] do_page_fault+0x28/0x50
>>  [c00000000183b0d4] handle_page_fault+0x18/0x38
>> 
>> Fixes: e2b3d202d1db ("powerpc: Switch 16GB and 16MB explicit hugepages to a different page table format")
>> Reported-by: Sachin Sant <sachinp@linux.vnet.ibm.com>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>> 
>> Note: I did add a recent commit for the Fixes tag. But in reality we never checked for page table
>> allocation failure there. If we want to go to that old commit, then we may need.
> 
> If we never checked for failure in that path, is there some reason we've
> only just noticed the crashes? Are we just testing under memory pressure
> more effectively than we used to?
> 
Actually the reported crash seems to be due to commit 723f268f19

723f268f19 - powerpc/mm: cleanup ifdef mess in add_huge_page_size()

Reverting this patch allows the test case to execute correctly without a crash.

Thanks
-Sachin


^ permalink raw reply

* [PATCH 3/4] powerpc/32: define helpers to get L1 cache sizes.
From: Christophe Leroy @ 2019-05-14  9:05 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Oliver O'Halloran, Segher Boessenkool
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <239d1c8f15b8bedc161a234f9f1a22a07160dbdf.1557824379.git.christophe.leroy@c-s.fr>

This patch defines C helpers to retrieve the size of
cache blocks and uses them in the cacheflush functions.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/cache.h      | 16 ++++++++++++++--
 arch/powerpc/include/asm/cacheflush.h | 24 +++++++++++++++---------
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 40ea5b3781c6..0009a0a82e86 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -33,7 +33,8 @@
 
 #define IFETCH_ALIGN_BYTES	(1 << IFETCH_ALIGN_SHIFT)
 
-#if defined(__powerpc64__) && !defined(__ASSEMBLY__)
+#if !defined(__ASSEMBLY__)
+#ifdef CONFIG_PPC64
 
 struct ppc_cache_info {
 	u32 size;
@@ -53,7 +54,18 @@ struct ppc64_caches {
 };
 
 extern struct ppc64_caches ppc64_caches;
-#endif /* __powerpc64__ && ! __ASSEMBLY__ */
+#else
+static inline u32 l1_cache_shift(void)
+{
+	return L1_CACHE_SHIFT;
+}
+
+static inline u32 l1_cache_bytes(void)
+{
+	return L1_CACHE_BYTES;
+}
+#endif
+#endif /* ! __ASSEMBLY__ */
 
 #if defined(__ASSEMBLY__)
 /*
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index e9a40b110f1d..d405f18441cd 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -64,11 +64,13 @@ static inline void __flush_dcache_icache_phys(unsigned long physaddr)
  */
 static inline void flush_dcache_range(unsigned long start, unsigned long stop)
 {
-	void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
-	unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
+	unsigned long shift = l1_cache_shift();
+	unsigned long bytes = l1_cache_bytes();
+	void *addr = (void *)(start & ~(bytes - 1));
+	unsigned long size = stop - (unsigned long)addr + (bytes - 1);
 	unsigned long i;
 
-	for (i = 0; i < size >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
+	for (i = 0; i < size >> shift; i++, addr += bytes)
 		dcbf(addr);
 	mb();	/* sync */
 }
@@ -80,11 +82,13 @@ static inline void flush_dcache_range(unsigned long start, unsigned long stop)
  */
 static inline void clean_dcache_range(unsigned long start, unsigned long stop)
 {
-	void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
-	unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
+	unsigned long shift = l1_cache_shift();
+	unsigned long bytes = l1_cache_bytes();
+	void *addr = (void *)(start & ~(bytes - 1));
+	unsigned long size = stop - (unsigned long)addr + (bytes - 1);
 	unsigned long i;
 
-	for (i = 0; i < size >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
+	for (i = 0; i < size >> shift; i++, addr += bytes)
 		dcbst(addr);
 	mb();	/* sync */
 }
@@ -97,11 +101,13 @@ static inline void clean_dcache_range(unsigned long start, unsigned long stop)
 static inline void invalidate_dcache_range(unsigned long start,
 					   unsigned long stop)
 {
-	void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
-	unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
+	unsigned long shift = l1_cache_shift();
+	unsigned long bytes = l1_cache_bytes();
+	void *addr = (void *)(start & ~(bytes - 1));
+	unsigned long size = stop - (unsigned long)addr + (bytes - 1);
 	unsigned long i;
 
-	for (i = 0; i < size >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
+	for (i = 0; i < size >> shift; i++, addr += bytes)
 		dcbi(addr);
 	mb();	/* sync */
 }
-- 
2.13.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox