LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v6 08/15] memory-hotplug: Common APIs to support page tables hot-remove
From: Simon Jeons @ 2013-01-29 13:04 UTC (permalink / raw)
  To: Tang Chen
  Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
	linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
	kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
	wujianguo, yinghai, laijs, linux-kernel, minchan.kim, akpm,
	linuxppc-dev
In-Reply-To: <1357723959-5416-9-git-send-email-tangchen@cn.fujitsu.com>

Hi Tang,
On Wed, 2013-01-09 at 17:32 +0800, Tang Chen wrote:
> From: Wen Congyang <wency@cn.fujitsu.com>
> 
> When memory is removed, the corresponding pagetables should also be removed.
> This patch introduces some common APIs to support vmemmap pagetable and x86_64
> architecture pagetable removing.

Why is there no need to call build_all_zonelists() here, the way
online_pages() does in the hot-add path (add_memory)?

> 
> All pages of virtual mapping in removed memory cannot be freed if some pages
> used as PGD/PUD includes not only removed memory but also other memory. So the
> patch uses the following way to check whether page can be freed or not.
> 
>  1. When removing memory, the page structs of the removed memory are filled
>     with 0xFD.
>  2. All page structs are filled with 0xFD on PT/PMD, PT/PMD can be cleared.
>     In this case, the page used as PT/PMD can be freed.
> 
> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/pgtable_types.h |    1 +
>  arch/x86/mm/init_64.c                |  299 ++++++++++++++++++++++++++++++++++
>  arch/x86/mm/pageattr.c               |   47 +++---
>  include/linux/bootmem.h              |    1 +
>  4 files changed, 326 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 3c32db8..4b6fd2a 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -352,6 +352,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
>   * as a pte too.
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 9ac1723..fe01116 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -682,6 +682,305 @@ int arch_add_memory(int nid, u64 start, u64 size)
>  }
>  EXPORT_SYMBOL_GPL(arch_add_memory);
>  
> +#define PAGE_INUSE 0xFD
> +
> +static void __meminit free_pagetable(struct page *page, int order)
> +{
> +	struct zone *zone;
> +	bool bootmem = false;
> +	unsigned long magic;
> +	unsigned int nr_pages = 1 << order;
> +
> +	/* bootmem page has reserved flag */
> +	if (PageReserved(page)) {
> +		__ClearPageReserved(page);
> +		bootmem = true;
> +
> +		magic = (unsigned long)page->lru.next;
> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
> +			while (nr_pages--)
> +				put_page_bootmem(page++);
> +		} else
> +			__free_pages_bootmem(page, order);
> +	} else
> +		free_pages((unsigned long)page_address(page), order);
> +
> +	/*
> +	 * SECTION_INFO pages and MIX_SECTION_INFO pages
> +	 * are all allocated by bootmem.
> +	 */
> +	if (bootmem) {
> +		zone = page_zone(page);
> +		zone_span_writelock(zone);
> +		zone->present_pages += nr_pages;
> +		zone_span_writeunlock(zone);
> +		totalram_pages += nr_pages;
> +	}
> +}
> +
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +	pte_t *pte;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		pte = pte_start + i;
> +		if (pte_val(*pte))
> +			return;
> +	}
> +
> +	/* free a pte talbe */
> +	free_pagetable(pmd_page(*pmd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pmd_clear(pmd);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +	pmd_t *pmd;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		pmd = pmd_start + i;
> +		if (pmd_val(*pmd))
> +			return;
> +	}
> +
> +	/* free a pmd talbe */
> +	free_pagetable(pud_page(*pud), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pud_clear(pud);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +/* Return true if pgd is changed, otherwise return false. */
> +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
> +{
> +	pud_t *pud;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		pud = pud_start + i;
> +		if (pud_val(*pud))
> +			return false;
> +	}
> +
> +	/* free a pud table */
> +	free_pagetable(pgd_page(*pgd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pgd_clear(pgd);
> +	spin_unlock(&init_mm.page_table_lock);
> +
> +	return true;
> +}
> +
> +static void __meminit
> +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long next, pages = 0;
> +	pte_t *pte;
> +	void *page_addr;
> +	phys_addr_t phys_addr;
> +
> +	pte = pte_start + pte_index(addr);
> +	for (; addr < end; addr = next, pte++) {
> +		next = (addr + PAGE_SIZE) & PAGE_MASK;
> +		if (next > end)
> +			next = end;
> +
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		/*
> +		 * We mapped [0,1G) memory as identity mapping when
> +		 * initializing, in arch/x86/kernel/head_64.S. These
> +		 * pagetables cannot be removed.
> +		 */
> +		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
> +		if (phys_addr < (phys_addr_t)0x40000000)
> +			return;
> +
> +		if (IS_ALIGNED(addr, PAGE_SIZE) &&
> +		    IS_ALIGNED(next, PAGE_SIZE)) {
> +			if (!direct) {
> +				free_pagetable(pte_page(*pte), 0);
> +				pages++;
> +			}
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pte_clear(&init_mm, addr, pte);
> +			spin_unlock(&init_mm.page_table_lock);
> +		} else {
> +			/*
> +			 * If we are not removing the whole page, it means
> +			 * other ptes in this page are being used and we canot
> +			 * remove them. So fill the unused ptes with 0xFD, and
> +			 * remove the page when it is wholly filled with 0xFD.
> +			 */
> +			memset((void *)addr, PAGE_INUSE, next - addr);
> +			page_addr = page_address(pte_page(*pte));
> +
> +			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
> +				free_pagetable(pte_page(*pte), 0);
> +				pages++;
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pte_clear(&init_mm, addr, pte);
> +				spin_unlock(&init_mm.page_table_lock);
> +			}
> +		}
> +	}
> +
> +	/* Call free_pte_table() in remove_pmd_table(). */
> +	flush_tlb_all();
> +	if (direct)
> +		update_page_count(PG_LEVEL_4K, -pages);
> +}
> +
> +static void __meminit
> +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long pte_phys, next, pages = 0;
> +	pte_t *pte_base;
> +	pmd_t *pmd;
> +
> +	pmd = pmd_start + pmd_index(addr);
> +	for (; addr < end; addr = next, pmd++) {
> +		next = pmd_addr_end(addr, end);
> +
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_large(*pmd)) {
> +			if (IS_ALIGNED(addr, PMD_SIZE) &&
> +			    IS_ALIGNED(next, PMD_SIZE)) {
> +				if (!direct) {
> +					free_pagetable(pmd_page(*pmd),
> +						       get_order(PMD_SIZE));
> +					pages++;
> +				}
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pmd_clear(pmd);
> +				spin_unlock(&init_mm.page_table_lock);
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 2M page, but we need to remove part of them,
> +			 * so split 2M page to 4K page.
> +			 */
> +			pte_base = (pte_t *)alloc_low_page(&pte_phys);
> +			BUG_ON(!pte_base);
> +			__split_large_page((pte_t *)pmd, addr,
> +					   (pte_t *)pte_base);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +
> +			flush_tlb_all();
> +		}
> +
> +		pte_base = (pte_t *)map_low_page((pte_t *)pmd_page_vaddr(*pmd));
> +		remove_pte_table(pte_base, addr, next, direct);
> +		free_pte_table(pte_base, pmd);
> +		unmap_low_page(pte_base);
> +	}
> +
> +	/* Call free_pmd_table() in remove_pud_table(). */
> +	if (direct)
> +		update_page_count(PG_LEVEL_2M, -pages);
> +}
> +
> +static void __meminit
> +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long pmd_phys, next, pages = 0;
> +	pmd_t *pmd_base;
> +	pud_t *pud;
> +
> +	pud = pud_start + pud_index(addr);
> +	for (; addr < end; addr = next, pud++) {
> +		next = pud_addr_end(addr, end);
> +
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_large(*pud)) {
> +			if (IS_ALIGNED(addr, PUD_SIZE) &&
> +			    IS_ALIGNED(next, PUD_SIZE)) {
> +				if (!direct) {
> +					free_pagetable(pud_page(*pud),
> +						       get_order(PUD_SIZE));
> +					pages++;
> +				}
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pud_clear(pud);
> +				spin_unlock(&init_mm.page_table_lock);
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 1G page, but we need to remove part of them,
> +			 * so split 1G page to 2M page.
> +			 */
> +			pmd_base = (pmd_t *)alloc_low_page(&pmd_phys);
> +			BUG_ON(!pmd_base);
> +			__split_large_page((pte_t *)pud, addr,
> +					   (pte_t *)pmd_base);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pud_populate(&init_mm, pud, __va(pmd_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +
> +			flush_tlb_all();
> +		}
> +
> +		pmd_base = (pmd_t *)map_low_page((pmd_t *)pud_page_vaddr(*pud));
> +		remove_pmd_table(pmd_base, addr, next, direct);
> +		free_pmd_table(pmd_base, pud);
> +		unmap_low_page(pmd_base);
> +	}
> +
> +	if (direct)
> +		update_page_count(PG_LEVEL_1G, -pages);
> +}
> +
> +/* start and end are both virtual address. */
> +static void __meminit
> +remove_pagetable(unsigned long start, unsigned long end, bool direct)
> +{
> +	unsigned long next;
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	bool pgd_changed = false;
> +
> +	for (; start < end; start = next) {
> +		pgd = pgd_offset_k(start);
> +		if (!pgd_present(*pgd))
> +			continue;
> +
> +		next = pgd_addr_end(start, end);
> +
> +		pud = (pud_t *)map_low_page((pud_t *)pgd_page_vaddr(*pgd));
> +		remove_pud_table(pud, start, next, direct);
> +		if (free_pud_table(pud, pgd))
> +			pgd_changed = true;
> +		unmap_low_page(pud);
> +	}
> +
> +	if (pgd_changed)
> +		sync_global_pgds(start, end - 1);
> +
> +	flush_tlb_all();
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  int __ref arch_remove_memory(u64 start, u64 size)
>  {
> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index a718e0d..7dcb6f9 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -501,21 +501,13 @@ out_unlock:
>  	return do_split;
>  }
>  
> -static int split_large_page(pte_t *kpte, unsigned long address)
> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
>  {
>  	unsigned long pfn, pfninc = 1;
>  	unsigned int i, level;
> -	pte_t *pbase, *tmp;
> +	pte_t *tmp;
>  	pgprot_t ref_prot;
> -	struct page *base;
> -
> -	if (!debug_pagealloc)
> -		spin_unlock(&cpa_lock);
> -	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> -	if (!debug_pagealloc)
> -		spin_lock(&cpa_lock);
> -	if (!base)
> -		return -ENOMEM;
> +	struct page *base = virt_to_page(pbase);
>  
>  	spin_lock(&pgd_lock);
>  	/*
> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
>  	 * up for us already:
>  	 */
>  	tmp = lookup_address(address, &level);
> -	if (tmp != kpte)
> -		goto out_unlock;
> +	if (tmp != kpte) {
> +		spin_unlock(&pgd_lock);
> +		return 1;
> +	}
>  
> -	pbase = (pte_t *)page_address(base);
>  	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>  	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
>  	/*
> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
>  	 * going on.
>  	 */
>  	__flush_tlb_all();
> +	spin_unlock(&pgd_lock);
>  
> -	base = NULL;
> +	return 0;
> +}
>  
> -out_unlock:
> -	/*
> -	 * If we dropped out via the lookup_address check under
> -	 * pgd_lock then stick the page back into the pool:
> -	 */
> -	if (base)
> +static int split_large_page(pte_t *kpte, unsigned long address)
> +{
> +	pte_t *pbase;
> +	struct page *base;
> +
> +	if (!debug_pagealloc)
> +		spin_unlock(&cpa_lock);
> +	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> +	if (!debug_pagealloc)
> +		spin_lock(&cpa_lock);
> +	if (!base)
> +		return -ENOMEM;
> +
> +	pbase = (pte_t *)page_address(base);
> +	if (__split_large_page(kpte, address, pbase))
>  		__free_page(base);
> -	spin_unlock(&pgd_lock);
>  
>  	return 0;
>  }
> diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
> index 3f778c2..190ff06 100644
> --- a/include/linux/bootmem.h
> +++ b/include/linux/bootmem.h
> @@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
>  			      unsigned long size);
>  extern void free_bootmem(unsigned long physaddr, unsigned long size);
>  extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
> +extern void __free_pages_bootmem(struct page *page, unsigned int order);
>  
>  /*
>   * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,

^ permalink raw reply

* Re: [PATCH v6 08/15] memory-hotplug: Common APIs to support page tables hot-remove
From: Simon Jeons @ 2013-01-29 13:02 UTC (permalink / raw)
  To: Tang Chen
  Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
	linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
	kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
	wujianguo, yinghai, laijs, linux-kernel, minchan.kim, akpm,
	linuxppc-dev
In-Reply-To: <1357723959-5416-9-git-send-email-tangchen@cn.fujitsu.com>

Hi Tang,
On Wed, 2013-01-09 at 17:32 +0800, Tang Chen wrote:
> From: Wen Congyang <wency@cn.fujitsu.com>
> 
> When memory is removed, the corresponding pagetables should also be removed.
> This patch introduces some common APIs to support vmemmap pagetable and x86_64
> architecture pagetable removing.
> 

When is the page table for hot-added memory created?

> All pages of virtual mapping in removed memory cannot be freed if some pages
> used as PGD/PUD includes not only removed memory but also other memory. So the
> patch uses the following way to check whether page can be freed or not.
> 
>  1. When removing memory, the page structs of the removed memory are filled
>     with 0xFD.
>  2. All page structs are filled with 0xFD on PT/PMD, PT/PMD can be cleared.
>     In this case, the page used as PT/PMD can be freed.
> 
> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/pgtable_types.h |    1 +
>  arch/x86/mm/init_64.c                |  299 ++++++++++++++++++++++++++++++++++
>  arch/x86/mm/pageattr.c               |   47 +++---
>  include/linux/bootmem.h              |    1 +
>  4 files changed, 326 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 3c32db8..4b6fd2a 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -352,6 +352,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
>   * as a pte too.
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 9ac1723..fe01116 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -682,6 +682,305 @@ int arch_add_memory(int nid, u64 start, u64 size)
>  }
>  EXPORT_SYMBOL_GPL(arch_add_memory);
>  
> +#define PAGE_INUSE 0xFD
> +
> +static void __meminit free_pagetable(struct page *page, int order)
> +{
> +	struct zone *zone;
> +	bool bootmem = false;
> +	unsigned long magic;
> +	unsigned int nr_pages = 1 << order;
> +
> +	/* bootmem page has reserved flag */
> +	if (PageReserved(page)) {
> +		__ClearPageReserved(page);
> +		bootmem = true;
> +
> +		magic = (unsigned long)page->lru.next;
> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
> +			while (nr_pages--)
> +				put_page_bootmem(page++);
> +		} else
> +			__free_pages_bootmem(page, order);
> +	} else
> +		free_pages((unsigned long)page_address(page), order);
> +
> +	/*
> +	 * SECTION_INFO pages and MIX_SECTION_INFO pages
> +	 * are all allocated by bootmem.
> +	 */
> +	if (bootmem) {
> +		zone = page_zone(page);
> +		zone_span_writelock(zone);
> +		zone->present_pages += nr_pages;
> +		zone_span_writeunlock(zone);
> +		totalram_pages += nr_pages;
> +	}
> +}
> +
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +	pte_t *pte;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		pte = pte_start + i;
> +		if (pte_val(*pte))
> +			return;
> +	}
> +
> +	/* free a pte talbe */
> +	free_pagetable(pmd_page(*pmd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pmd_clear(pmd);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +	pmd_t *pmd;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		pmd = pmd_start + i;
> +		if (pmd_val(*pmd))
> +			return;
> +	}
> +
> +	/* free a pmd talbe */
> +	free_pagetable(pud_page(*pud), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pud_clear(pud);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +/* Return true if pgd is changed, otherwise return false. */
> +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
> +{
> +	pud_t *pud;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		pud = pud_start + i;
> +		if (pud_val(*pud))
> +			return false;
> +	}
> +
> +	/* free a pud table */
> +	free_pagetable(pgd_page(*pgd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pgd_clear(pgd);
> +	spin_unlock(&init_mm.page_table_lock);
> +
> +	return true;
> +}
> +
> +static void __meminit
> +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long next, pages = 0;
> +	pte_t *pte;
> +	void *page_addr;
> +	phys_addr_t phys_addr;
> +
> +	pte = pte_start + pte_index(addr);
> +	for (; addr < end; addr = next, pte++) {
> +		next = (addr + PAGE_SIZE) & PAGE_MASK;
> +		if (next > end)
> +			next = end;
> +
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		/*
> +		 * We mapped [0,1G) memory as identity mapping when
> +		 * initializing, in arch/x86/kernel/head_64.S. These
> +		 * pagetables cannot be removed.
> +		 */
> +		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
> +		if (phys_addr < (phys_addr_t)0x40000000)
> +			return;
> +
> +		if (IS_ALIGNED(addr, PAGE_SIZE) &&
> +		    IS_ALIGNED(next, PAGE_SIZE)) {
> +			if (!direct) {
> +				free_pagetable(pte_page(*pte), 0);
> +				pages++;
> +			}
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pte_clear(&init_mm, addr, pte);
> +			spin_unlock(&init_mm.page_table_lock);
> +		} else {
> +			/*
> +			 * If we are not removing the whole page, it means
> +			 * other ptes in this page are being used and we canot
> +			 * remove them. So fill the unused ptes with 0xFD, and
> +			 * remove the page when it is wholly filled with 0xFD.
> +			 */
> +			memset((void *)addr, PAGE_INUSE, next - addr);
> +			page_addr = page_address(pte_page(*pte));
> +
> +			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
> +				free_pagetable(pte_page(*pte), 0);
> +				pages++;
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pte_clear(&init_mm, addr, pte);
> +				spin_unlock(&init_mm.page_table_lock);
> +			}
> +		}
> +	}
> +
> +	/* Call free_pte_table() in remove_pmd_table(). */
> +	flush_tlb_all();
> +	if (direct)
> +		update_page_count(PG_LEVEL_4K, -pages);
> +}
> +
> +static void __meminit
> +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long pte_phys, next, pages = 0;
> +	pte_t *pte_base;
> +	pmd_t *pmd;
> +
> +	pmd = pmd_start + pmd_index(addr);
> +	for (; addr < end; addr = next, pmd++) {
> +		next = pmd_addr_end(addr, end);
> +
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_large(*pmd)) {
> +			if (IS_ALIGNED(addr, PMD_SIZE) &&
> +			    IS_ALIGNED(next, PMD_SIZE)) {
> +				if (!direct) {
> +					free_pagetable(pmd_page(*pmd),
> +						       get_order(PMD_SIZE));
> +					pages++;
> +				}
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pmd_clear(pmd);
> +				spin_unlock(&init_mm.page_table_lock);
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 2M page, but we need to remove part of them,
> +			 * so split 2M page to 4K page.
> +			 */
> +			pte_base = (pte_t *)alloc_low_page(&pte_phys);
> +			BUG_ON(!pte_base);
> +			__split_large_page((pte_t *)pmd, addr,
> +					   (pte_t *)pte_base);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +
> +			flush_tlb_all();
> +		}
> +
> +		pte_base = (pte_t *)map_low_page((pte_t *)pmd_page_vaddr(*pmd));
> +		remove_pte_table(pte_base, addr, next, direct);
> +		free_pte_table(pte_base, pmd);
> +		unmap_low_page(pte_base);
> +	}
> +
> +	/* Call free_pmd_table() in remove_pud_table(). */
> +	if (direct)
> +		update_page_count(PG_LEVEL_2M, -pages);
> +}
> +
> +static void __meminit
> +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long pmd_phys, next, pages = 0;
> +	pmd_t *pmd_base;
> +	pud_t *pud;
> +
> +	pud = pud_start + pud_index(addr);
> +	for (; addr < end; addr = next, pud++) {
> +		next = pud_addr_end(addr, end);
> +
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_large(*pud)) {
> +			if (IS_ALIGNED(addr, PUD_SIZE) &&
> +			    IS_ALIGNED(next, PUD_SIZE)) {
> +				if (!direct) {
> +					free_pagetable(pud_page(*pud),
> +						       get_order(PUD_SIZE));
> +					pages++;
> +				}
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pud_clear(pud);
> +				spin_unlock(&init_mm.page_table_lock);
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 1G page, but we need to remove part of them,
> +			 * so split 1G page to 2M page.
> +			 */
> +			pmd_base = (pmd_t *)alloc_low_page(&pmd_phys);
> +			BUG_ON(!pmd_base);
> +			__split_large_page((pte_t *)pud, addr,
> +					   (pte_t *)pmd_base);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pud_populate(&init_mm, pud, __va(pmd_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +
> +			flush_tlb_all();
> +		}
> +
> +		pmd_base = (pmd_t *)map_low_page((pmd_t *)pud_page_vaddr(*pud));
> +		remove_pmd_table(pmd_base, addr, next, direct);
> +		free_pmd_table(pmd_base, pud);
> +		unmap_low_page(pmd_base);
> +	}
> +
> +	if (direct)
> +		update_page_count(PG_LEVEL_1G, -pages);
> +}
> +
> +/* start and end are both virtual address. */
> +static void __meminit
> +remove_pagetable(unsigned long start, unsigned long end, bool direct)
> +{
> +	unsigned long next;
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	bool pgd_changed = false;
> +
> +	for (; start < end; start = next) {
> +		pgd = pgd_offset_k(start);
> +		if (!pgd_present(*pgd))
> +			continue;
> +
> +		next = pgd_addr_end(start, end);
> +
> +		pud = (pud_t *)map_low_page((pud_t *)pgd_page_vaddr(*pgd));
> +		remove_pud_table(pud, start, next, direct);
> +		if (free_pud_table(pud, pgd))
> +			pgd_changed = true;
> +		unmap_low_page(pud);
> +	}
> +
> +	if (pgd_changed)
> +		sync_global_pgds(start, end - 1);
> +
> +	flush_tlb_all();
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  int __ref arch_remove_memory(u64 start, u64 size)
>  {
> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index a718e0d..7dcb6f9 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -501,21 +501,13 @@ out_unlock:
>  	return do_split;
>  }
>  
> -static int split_large_page(pte_t *kpte, unsigned long address)
> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
>  {
>  	unsigned long pfn, pfninc = 1;
>  	unsigned int i, level;
> -	pte_t *pbase, *tmp;
> +	pte_t *tmp;
>  	pgprot_t ref_prot;
> -	struct page *base;
> -
> -	if (!debug_pagealloc)
> -		spin_unlock(&cpa_lock);
> -	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> -	if (!debug_pagealloc)
> -		spin_lock(&cpa_lock);
> -	if (!base)
> -		return -ENOMEM;
> +	struct page *base = virt_to_page(pbase);
>  
>  	spin_lock(&pgd_lock);
>  	/*
> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
>  	 * up for us already:
>  	 */
>  	tmp = lookup_address(address, &level);
> -	if (tmp != kpte)
> -		goto out_unlock;
> +	if (tmp != kpte) {
> +		spin_unlock(&pgd_lock);
> +		return 1;
> +	}
>  
> -	pbase = (pte_t *)page_address(base);
>  	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>  	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
>  	/*
> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
>  	 * going on.
>  	 */
>  	__flush_tlb_all();
> +	spin_unlock(&pgd_lock);
>  
> -	base = NULL;
> +	return 0;
> +}
>  
> -out_unlock:
> -	/*
> -	 * If we dropped out via the lookup_address check under
> -	 * pgd_lock then stick the page back into the pool:
> -	 */
> -	if (base)
> +static int split_large_page(pte_t *kpte, unsigned long address)
> +{
> +	pte_t *pbase;
> +	struct page *base;
> +
> +	if (!debug_pagealloc)
> +		spin_unlock(&cpa_lock);
> +	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> +	if (!debug_pagealloc)
> +		spin_lock(&cpa_lock);
> +	if (!base)
> +		return -ENOMEM;
> +
> +	pbase = (pte_t *)page_address(base);
> +	if (__split_large_page(kpte, address, pbase))
>  		__free_page(base);
> -	spin_unlock(&pgd_lock);
>  
>  	return 0;
>  }
> diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
> index 3f778c2..190ff06 100644
> --- a/include/linux/bootmem.h
> +++ b/include/linux/bootmem.h
> @@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
>  			      unsigned long size);
>  extern void free_bootmem(unsigned long physaddr, unsigned long size);
>  extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
> +extern void __free_pages_bootmem(struct page *page, unsigned int order);
>  
>  /*
>   * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,

^ permalink raw reply

* Re: [PATCH v6 00/15] memory-hotplug: hot-remove physical memory
From: Simon Jeons @ 2013-01-29 12:52 UTC (permalink / raw)
  To: Tang Chen
  Cc: linux-ia64, linux-sh, linux-mm, paulus, hpa, sparclinux, cl,
	linux-s390, x86, linux-acpi, isimatu.yasuaki, linfeng, mgorman,
	kosaki.motohiro, rientjes, len.brown, wency, cmetcalf, glommer,
	wujianguo, yinghai, laijs, linux-kernel, minchan.kim, akpm,
	linuxppc-dev
In-Reply-To: <1357723959-5416-1-git-send-email-tangchen@cn.fujitsu.com>

Hi Tang,

On Wed, 2013-01-09 at 17:32 +0800, Tang Chen wrote:
> Here is the physical memory hot-remove patch-set based on 3.8rc-2.

I have some questions for you. They are not related to this patchset, but
they are about memory hotplug in general.

1. In function node_states_check_changes_online:

comments:
* If we don't have HIGHMEM nor movable node,
* node_states[N_NORMAL_MEMORY] contains nodes which have zones of
* 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.

How should I understand this? Why, when we have neither HIGHMEM nor a
movable node, does node_states[N_NORMAL_MEMORY] contain nodes with zones
0...ZONE_MOVABLE? IIUC, N_NORMAL_MEMORY only means the node has regular memory.

* If we don't have movable node, node_states[N_NORMAL_MEMORY]
* contains nodes which have zones of 0...ZONE_MOVABLE,
* set zone_last to ZONE_MOVABLE.

How to understand?

2. In function move_pfn_range_left, why is end <= z2->zone_start_pfn not
correct? The comment says the range must include/overlap; why?

3. In function online_pages, in the normal case (w/o online_kernel,
online_movable), why is there no check for whether the new zone overlaps
with adjacent zones?

4. Could you summarize the implementation differences between hot-add and
logic-add, and between hot-remove and logic-remove?


> 
> This patch-set aims to implement physical memory hot-removing.
> 
> The patches can free/remove the following things:
> 
>   - /sys/firmware/memmap/X/{end, start, type} : [PATCH 4/15]
>   - memmap of sparse-vmemmap                  : [PATCH 6,7,8,10/15]
>   - page table of removed memory              : [RFC PATCH 7,8,10/15]
>   - node and related sysfs files              : [RFC PATCH 13-15/15]
> 
> 
> Existing problem:
> If CONFIG_MEMCG is selected, we will allocate memory to store page cgroup
> when we online pages.
> 
> For example: there is a memory device on node 1. The address range
> is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
> and memory11 under the directory /sys/devices/system/memory/.
> 
> If CONFIG_MEMCG is selected, when we online memory8, the memory stored page
> cgroup is not provided by this memory device. But when we online memory9, the
> memory stored page cgroup may be provided by memory8. So we can't offline
> memory8 now. We should offline the memory in the reversed order.
> 
> When the memory device is hotremoved, we will auto offline memory provided
> by this memory device. But we don't know which memory is onlined first, so
> offlining memory may fail.
> 
> In patch1, we provide a solution which is not good enough:
> Iterate twice to offline the memory.
> 1st iterate: offline every non primary memory block.
> 2nd iterate: offline primary (i.e. first added) memory block.
> 
> And a new idea from Wen Congyang <wency@cn.fujitsu.com> is:
> allocate the memory from the memory block they are describing.
> 
> But we are not sure if it is OK to do so because there is not existing API
> to do so, and we need to move page_cgroup memory allocation from MEM_GOING_ONLINE
> to MEM_ONLINE. And also, it may interfere the hugepage.
> 
> 
> 
> How to test this patchset?
> 1. apply this patchset and build the kernel. MEMORY_HOTPLUG, MEMORY_HOTREMOVE,
>    ACPI_HOTPLUG_MEMORY must be selected.
> 2. load the module acpi_memhotplug
> 3. hotplug the memory device(it depends on your hardware)
>    You will see the memory device under the directory /sys/bus/acpi/devices/.
>    Its name is PNP0C80:XX.
> 4. online/offline pages provided by this memory device
>    You can write online/offline to /sys/devices/system/memory/memoryX/state to
>    online/offline pages provided by this memory device
> 5. hotremove the memory device
>    You can hotremove the memory device by the hardware, or writing 1 to
>    /sys/bus/acpi/devices/PNP0C80:XX/eject.

Is there a similar sysfs knob for hot-adding the memory device?

> 
> 
> Note: if the memory provided by the memory device is used by the kernel, it
> can't be offlined. It is not a bug.
> 
> 
> Changelogs from v5 to v6:
>  Patch3: Add some more comments to explain memory hot-remove.
>  Patch4: Remove bootmem member in struct firmware_map_entry.
>  Patch6: Repeatedly register bootmem pages when using hugepage.
>  Patch8: Repeatedly free bootmem pages when using hugepage.
>  Patch14: Don't free pgdat when offlining a node, just reset it to 0.
>  Patch15: New patch, pgdat is not freed in patch14, so don't allocate a new
>           one when online a node.
> 
> Changelogs from v4 to v5:
>  Patch7: new patch, move pgdat_resize_lock into sparse_remove_one_section() to
>          avoid disabling irq because we need flush tlb when free pagetables.
>  Patch8: new patch, pick up some common APIs that are used to free direct mapping
>          and vmemmap pagetables.
>  Patch9: free direct mapping pagetables on x86_64 arch.
>  Patch10: free vmemmap pagetables.
>  Patch11: since freeing memmap with vmemmap has been implemented, the config
>           macro CONFIG_SPARSEMEM_VMEMMAP when defining __remove_section() is
>           no longer needed.
>  Patch13: no need to modify acpi_memory_disable_device() since it was removed,
>           and add nid parameter when calling remove_memory().
> 
> Changelogs from v3 to v4:
>  Patch7: remove unused codes.
>  Patch8: fix nr_pages that is passed to free_map_bootmem()
> 
> Changelogs from v2 to v3:
>  Patch9: call sync_global_pgds() if pgd is changed
>  Patch10: fix a problem in the patch
> 
> Changelogs from v1 to v2:
>  Patch1: new patch, offline memory twice. 1st iterate: offline every non primary
>          memory block. 2nd iterate: offline primary (i.e. first added) memory
>          block.
> 
>  Patch3: new patch, no logical change, just remove redundant code.
> 
>  Patch9: merge the patch from wujianguo into this patch. flush tlb on all cpu
>          after the pagetable is changed.
> 
>  Patch12: new patch, free node_data when a node is offlined.
> 
> 
> Tang Chen (6):
>   memory-hotplug: move pgdat_resize_lock into
>     sparse_remove_one_section()
>   memory-hotplug: remove page table of x86_64 architecture
>   memory-hotplug: remove memmap of sparse-vmemmap
>   memory-hotplug: Integrated __remove_section() of
>     CONFIG_SPARSEMEM_VMEMMAP.
>   memory-hotplug: remove sysfs file of node
>   memory-hotplug: Do not allocate pdgat if it was not freed when
>     offline.
> 
> Wen Congyang (5):
>   memory-hotplug: try to offline the memory twice to avoid dependence
>   memory-hotplug: remove redundant codes
>   memory-hotplug: introduce new function arch_remove_memory() for
>     removing page table depends on architecture
>   memory-hotplug: Common APIs to support page tables hot-remove
>   memory-hotplug: free node_data when a node is offlined
> 
> Yasuaki Ishimatsu (4):
>   memory-hotplug: check whether all memory blocks are offlined or not
>     when removing memory
>   memory-hotplug: remove /sys/firmware/memmap/X sysfs
>   memory-hotplug: implement register_page_bootmem_info_section of
>     sparse-vmemmap
>   memory-hotplug: memory_hotplug: clear zone when removing the memory
> 
>  arch/arm64/mm/mmu.c                  |    3 +
>  arch/ia64/mm/discontig.c             |   10 +
>  arch/ia64/mm/init.c                  |   18 ++
>  arch/powerpc/mm/init_64.c            |   10 +
>  arch/powerpc/mm/mem.c                |   12 +
>  arch/s390/mm/init.c                  |   12 +
>  arch/s390/mm/vmem.c                  |   10 +
>  arch/sh/mm/init.c                    |   17 ++
>  arch/sparc/mm/init_64.c              |   10 +
>  arch/tile/mm/init.c                  |    8 +
>  arch/x86/include/asm/pgtable_types.h |    1 +
>  arch/x86/mm/init_32.c                |   12 +
>  arch/x86/mm/init_64.c                |  390 +++++++++++++++++++++++++++++
>  arch/x86/mm/pageattr.c               |   47 ++--
>  drivers/acpi/acpi_memhotplug.c       |    8 +-
>  drivers/base/memory.c                |    6 +
>  drivers/firmware/memmap.c            |   96 +++++++-
>  include/linux/bootmem.h              |    1 +
>  include/linux/firmware-map.h         |    6 +
>  include/linux/memory_hotplug.h       |   15 +-
>  include/linux/mm.h                   |    4 +-
>  mm/memory_hotplug.c                  |  459 +++++++++++++++++++++++++++++++---
>  mm/sparse.c                          |    8 +-
>  23 files changed, 1094 insertions(+), 69 deletions(-)
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v5 04/45] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks
From: Namhyung Kim @ 2013-01-29 11:12 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: linux-doc, peterz, fweisbec, linux-kernel, walken, mingo,
	linux-arch, linux, xiaoguangrong, wangyun, paulmck, nikunj,
	linux-pm, rusty, rostedt, rjw, tglx, linux-arm-kernel, netdev,
	oleg, sbw, Tejun Heo, akpm, linuxppc-dev
In-Reply-To: <5100B8CC.4080406@linux.vnet.ibm.com>

On Thu, 24 Jan 2013 10:00:04 +0530, Srivatsa S. Bhat wrote:
> On 01/24/2013 01:27 AM, Tejun Heo wrote:
>> On Thu, Jan 24, 2013 at 01:03:52AM +0530, Srivatsa S. Bhat wrote:
>>> CPU 0                          CPU 1
>>>
>>> read_lock(&rwlock)
>>>
>>>                               write_lock(&rwlock) //spins, because CPU 0
>>>                               //has acquired the lock for read
>>>
>>> read_lock(&rwlock)
>>>    ^^^^^
>>> What happens here? Does CPU 0 start spinning (and hence deadlock) or will
>>> it continue realizing that it already holds the rwlock for read?
>> 
>> I don't think rwlock allows nesting write lock inside read lock.
>> read_lock(); write_lock() will always deadlock.
>> 
>
> Sure, I understand that :-) My question was, what happens when *two* CPUs
> are involved, as in, the read_lock() is invoked only on CPU 0 whereas the
> write_lock() is invoked on CPU 1.
>
> For example, the same scenario shown above, but with slightly different
> timing, will NOT result in a deadlock:
>
> Scenario 2:
>   CPU 0                                CPU 1
>
> read_lock(&rwlock)
>
>
> read_lock(&rwlock) //doesn't spin
>
>                                     write_lock(&rwlock) //spins, because CPU 0
>                                     //has acquired the lock for read
>
>
> So I was wondering whether the "fairness" logic of rwlocks would cause
> the second read_lock() to spin (in the first scenario shown above) because
> a writer is already waiting (and hence new readers should spin) and thus
> cause a deadlock.

In my understanding, current x86 rwlock does basically this (of course,
in an atomic fashion):


#define RW_LOCK_BIAS 0x10000

rwlock_init(rwlock)
{
	rwlock->lock = RW_LOCK_BIAS;
}

arch_read_lock(rwlock)
{
retry:
	if (--rwlock->lock >= 0)
		return;

        rwlock->lock++;
        while (rwlock->lock < 1)
        	continue;

        goto retry;
}

arch_write_lock(rwlock)
{
retry:
	if ((rwlock->lock -= RW_LOCK_BIAS) == 0)
        	return;

        rwlock->lock += RW_LOCK_BIAS;
	while (rwlock->lock != RW_LOCK_BIAS)
		continue;

        goto retry;
}


So I can't find where the 'fairness' logic comes from..

Thanks,
Namhyung

^ permalink raw reply

* Re: [PATCH 2/2] pseries/iommu: remove DDW on kexec
From: Michael Ellerman @ 2013-01-29 10:58 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: miltonm, paulus, anton, nfont, linuxppc-dev
In-Reply-To: <20130129020357.GB12156@linux.vnet.ibm.com>

On Mon, 2013-01-28 at 18:03 -0800, Nishanth Aravamudan wrote:
> pseries/iommu: remove DDW on kexec
>  ...
>     
> I believe the simplest, easiest-to-maintain fix is to just change our
> initcall to, rather than detecting and updating the new kernel's DDW
> knowledge, just remove all DDW configurations. When the drivers
> re-initialize, we will set everything back up as it was before.

I don't know this code at all, but this sounds like it will also work
for kdump, right? ie. when the original kernel has crashed the 2nd
kernel will tear the DDW down and set it back up.

cheers

^ permalink raw reply

* Re: [PATCH v5 07/45] CPU hotplug: Provide APIs to prevent CPU offline from atomic context
From: Namhyung Kim @ 2013-01-29 10:21 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, paulmck, nikunj, linux-pm, rusty,
	rostedt, rjw, tglx, linux-arm-kernel, netdev, oleg, sbw, tj, akpm,
	linuxppc-dev
In-Reply-To: <20130122073446.13822.39253.stgit@srivatsabhat.in.ibm.com>

Hi Srivatsa,

On Tue, 22 Jan 2013 13:04:54 +0530, Srivatsa S. Bhat wrote:
> @@ -246,15 +291,21 @@ struct take_cpu_down_param {
>  static int __ref take_cpu_down(void *_param)
>  {
>  	struct take_cpu_down_param *param = _param;
> -	int err;
> +	unsigned long flags;
> +	int err = 0;

It seems there is no need to set 'err' to 0.

Thanks,
Namhyung

> +
> +	percpu_write_lock_irqsave(&hotplug_pcpu_rwlock, &flags);
>  
>  	/* Ensure this CPU doesn't handle any more interrupts. */
>  	err = __cpu_disable();
>  	if (err < 0)
> -		return err;
> +		goto out;
>  
>  	cpu_notify(CPU_DYING | param->mod, param->hcpu);
> -	return 0;
> +
> +out:
> +	percpu_write_unlock_irqrestore(&hotplug_pcpu_rwlock, &flags);
> +	return err;
>  }
>  
>  /* Requires cpu_add_remove_lock to be held */
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] powerpc/512x: initialize clocks before bus probing
From: Anatolij Gustschin @ 2013-01-29  8:09 UTC (permalink / raw)
  To: linuxppc-dev

Early driver probing can fail due to unavailable clocks
(clk_get() fails) since the clk API init has not taken place yet.
Move the clock init before bus probing.

Signed-off-by: Anatolij Gustschin <agust@denx.de>
---
 arch/powerpc/platforms/512x/mpc512x_shared.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index 37fc7a3..34bad13 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -428,8 +428,8 @@ void __init mpc512x_psc_fifo_init(void)
 
 void __init mpc512x_init(void)
 {
-	mpc512x_declare_of_platform_devices();
 	mpc5121_clk_init();
+	mpc512x_declare_of_platform_devices();
 	mpc512x_restart_init();
 	mpc512x_psc_fifo_init();
 }
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH] powerpc/p1023/config: enable hugetlbfs support
From: Shaohui Xie @ 2013-01-29  6:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Shaohui Xie

Hugetlbfs is missing from the current p1023rds_defconfig; enable it by default.

Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
---
 arch/powerpc/configs/85xx/p1023rds_defconfig |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/configs/85xx/p1023rds_defconfig b/arch/powerpc/configs/85xx/p1023rds_defconfig
index b80bcc6..98125b0 100644
--- a/arch/powerpc/configs/85xx/p1023rds_defconfig
+++ b/arch/powerpc/configs/85xx/p1023rds_defconfig
@@ -1,4 +1,5 @@
 CONFIG_PPC_85xx=y
+CONFIG_PHYS_64BIT=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=2
 CONFIG_EXPERIMENTAL=y
@@ -138,6 +139,7 @@ CONFIG_VFAT_FS=y
 CONFIG_NTFS_FS=y
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
 CONFIG_ADFS_FS=m
 CONFIG_AFFS_FS=m
 CONFIG_HFS_FS=m
-- 
1.6.4

^ permalink raw reply related

* Re: [git pull] Please pull powerpc.git merge branch
From: Linus Torvalds @ 2013-01-29  2:42 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linuxppc-dev list, Andrew Morton, Linux Kernel list
In-Reply-To: <1359418161.18955.14.camel@pasglop>

[-- Attachment #1: Type: text/plain, Size: 400 bytes --]

On Mon, Jan 28, 2013 at 4:09 PM, Benjamin Herrenschmidt <
benh@kernel.crashing.org> wrote:
>
> It wasn't meant to be "snarky", sorry about that...

I'm kidding, I'm kidding, you can be as snarky as you want. It's not like
I'm some fragile flower.

> My usual problem with git request-pull when the mirrors haven't caught up
> yet. Branch is "merge".

Ok, I see it. Pulled and pushed out,

      Linus

[-- Attachment #2: Type: text/html, Size: 632 bytes --]

^ permalink raw reply

* [PATCH 2/2] pseries/iommu: remove DDW on kexec
From: Nishanth Aravamudan @ 2013-01-29  2:03 UTC (permalink / raw)
  To: benh; +Cc: nfont, paulus, linuxppc-dev, miltonm, anton
In-Reply-To: <20130129020245.GA12156@linux.vnet.ibm.com>

pseries/iommu: remove DDW on kexec
    
We currently insert a property in the device-tree when we successfully
configure DDW for a given slot. This was meant to be an optimization to
speed up kexec/kdump, so that we don't need to make the RTAS calls again
to re-configure DDW in the new kernel.
    
However, we end up tripping a plpar_tce_stuff failure on kexec/kdump
because we unconditionally parse the ibm,dma-window property for the
node at bus/dev setup time. This property contains the 32-bit DMA window
LIOBN, which is distinct from the DDW window's. We pass that LIOBN (via
iommu_table_init -> iommu_table_clear -> tce_free ->
tce_freemulti_pSeriesLP) to plpar_tce_stuff, which fails because that
32-bit window is no longer present after
25ebc45b93452d0bc60271f178237123c4b26808 ("powerpc/pseries/iommu: remove
default window before attempting DDW manipulation").
    
I believe the simplest, easiest-to-maintain fix is to just change our
initcall to, rather than detecting and updating the new kernel's DDW
knowledge, just remove all DDW configurations. When the drivers
re-initialize, we will set everything back up as it was before.
    
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index a8e99f9..1b2a174 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -787,33 +787,68 @@ static u64 find_existing_ddw(struct device_node *pdn)
 	return dma_addr;
 }
 
+static void __restore_default_window(struct eeh_dev *edev,
+						u32 ddw_restore_token)
+{
+	u32 cfg_addr;
+	u64 buid;
+	int ret;
+
+	/*
+	 * Get the config address and phb buid of the PE window.
+	 * Rely on eeh to retrieve this for us.
+	 * Retrieve them from the pci device, not the node with the
+	 * dma-window property
+	 */
+	cfg_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		cfg_addr = edev->pe_config_addr;
+	buid = edev->phb->buid;
+
+	do {
+		ret = rtas_call(ddw_restore_token, 3, 1, NULL, cfg_addr,
+					BUID_HI(buid), BUID_LO(buid));
+	} while (rtas_busy_delay(ret));
+	pr_info("ibm,reset-pe-dma-windows(%x) %x %x %x returned %d\n",
+		 ddw_restore_token, cfg_addr, BUID_HI(buid), BUID_LO(buid), ret);
+}
+
 static int find_existing_ddw_windows(void)
 {
-	int len;
 	struct device_node *pdn;
-	struct direct_window *window;
 	const struct dynamic_dma_window_prop *direct64;
+	const u32 *ddw_extensions;
 
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		return 0;
 
 	for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-		direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+		direct64 = of_get_property(pdn, DIRECT64_PROPNAME, NULL);
 		if (!direct64)
 			continue;
 
-		window = kzalloc(sizeof(*window), GFP_KERNEL);
-		if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
-			kfree(window);
-			remove_ddw(pdn);
-			continue;
-		}
+		/*
+		 * We need to ensure the IOMMU table is active when we
+		 * return from the IOMMU setup so that the common code
+		 * can clear the table or find the holes. To that end,
+		 * first, remove any existing DDW configuration.
+		 */
+		remove_ddw(pdn);
 
-		window->device = pdn;
-		window->prop = direct64;
-		spin_lock(&direct_window_list_lock);
-		list_add(&window->list, &direct_window_list);
-		spin_unlock(&direct_window_list_lock);
+		/*
+		 * Second, if we are running on a new enough level of
+		 * firmware where the restore API is present, use it to
+		 * restore the 32-bit window, which was removed in
+		 * create_ddw.
+		 * If the API is not present, then create_ddw couldn't
+		 * have removed the 32-bit window in the first place, so
+		 * removing the DDW configuration should be sufficient.
+		 */
+		ddw_extensions = of_get_property(pdn, "ibm,ddw-extensions",
+									NULL);
+		if (ddw_extensions && ddw_extensions[0] > 0)
+			__restore_default_window(of_node_to_eeh_dev(pdn),
+							ddw_extensions[1]);
 	}
 
 	return 0;
@@ -886,30 +921,7 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
 static void restore_default_window(struct pci_dev *dev,
 					u32 ddw_restore_token)
 {
-	struct eeh_dev *edev;
-	u32 cfg_addr;
-	u64 buid;
-	int ret;
-
-	/*
-	 * Get the config address and phb buid of the PE window.
-	 * Rely on eeh to retrieve this for us.
-	 * Retrieve them from the pci device, not the node with the
-	 * dma-window property
-	 */
-	edev = pci_dev_to_eeh_dev(dev);
-	cfg_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		cfg_addr = edev->pe_config_addr;
-	buid = edev->phb->buid;
-
-	do {
-		ret = rtas_call(ddw_restore_token, 3, 1, NULL, cfg_addr,
-					BUID_HI(buid), BUID_LO(buid));
-	} while (rtas_busy_delay(ret));
-	dev_info(&dev->dev,
-		"ibm,reset-pe-dma-windows(%x) %x %x %x returned %d\n",
-		 ddw_restore_token, cfg_addr, BUID_HI(buid), BUID_LO(buid), ret);
+	__restore_default_window(pci_dev_to_eeh_dev(dev), ddw_restore_token);
 }
 
 /*

^ permalink raw reply related

* [PATCH 1/2] pseries/iommu: restore_default_window does not use liobn parameter
From: Nishanth Aravamudan @ 2013-01-29  2:02 UTC (permalink / raw)
  To: benh; +Cc: nfont, paulus, linuxppc-dev, miltonm, anton

The parameter is unused, and complicates a following fix. Just remove
it.
    
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index b4bb9e1..a8e99f9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -884,7 +884,7 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
 }
 
 static void restore_default_window(struct pci_dev *dev,
-				u32 ddw_restore_token, unsigned long liobn)
+					u32 ddw_restore_token)
 {
 	struct eeh_dev *edev;
 	u32 cfg_addr;
@@ -1100,7 +1100,7 @@ out_free_prop:
 
 out_restore_window:
 	if (ddw_restore_token)
-		restore_default_window(dev, ddw_restore_token, liobn);
+		restore_default_window(dev, ddw_restore_token);
 
 out_unlock:
 	mutex_unlock(&direct_window_init_mutex);

^ permalink raw reply related

* Re: [PATCH v5 31/45] blackfin/smp: Use get/put_online_cpus_atomic() to prevent CPU offline
From: Srivatsa S. Bhat @ 2013-01-29  1:14 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, paulmck, Bob Liu, nikunj, linux-pm,
	rusty, rostedt, rjw, namhyung, tglx, linux-arm-kernel, netdev,
	oleg, sbw, akpm, linuxppc-dev
In-Reply-To: <CAOS58YN_r-ahRvPekOo_84bM7z3YcFw1EA7DcTf0NVyEhv7ssA@mail.gmail.com>

On 01/29/2013 06:06 AM, Tejun Heo wrote:
> Hello, Bob.
> 
> On Mon, Jan 28, 2013 at 1:09 AM, Bob Liu <lliubbo@gmail.com> wrote:
>> Thanks, will be applied to my blackfin arch tree.
> 
> I think we still have some work ahead of us to have this patchset
> ready for inclusion and even then it probably would be best to route
> these patches together, so probably not a very good idea to apply this
> to blackfin right now.
> 

Thanks Tejun for pointing that out! I'll address the review comments
soon and respin the patchset.
 
Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v2] ppc/iommu: use find_first_bit to look up entries in the iommu table
From: Benjamin Herrenschmidt @ 2013-01-29  0:35 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo; +Cc: paulus, linuxppc-dev, shangw, anton
In-Reply-To: <1357846419-13515-1-git-send-email-cascardo@linux.vnet.ibm.com>

On Thu, 2013-01-10 at 17:33 -0200, Thadeu Lima de Souza Cascardo wrote:
> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
> ---
> v2:
> Remove the unneeded extra variable i, which caused build failure.

I believe something equivalent is already in -next, can you dbl check ?

Cheers,
Ben.

> ---
>  arch/powerpc/kernel/iommu.c |    9 ++-------
>  1 files changed, 2 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 6d48ff8..0fc44d2 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -708,7 +708,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
>  
>  void iommu_free_table(struct iommu_table *tbl, const char *node_name)
>  {
> -	unsigned long bitmap_sz, i;
> +	unsigned long bitmap_sz;
>  	unsigned int order;
>  
>  	if (!tbl || !tbl->it_map) {
> @@ -725,14 +725,9 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
>  		clear_bit(0, tbl->it_map);
>  
>  	/* verify that table contains no entries */
> -	/* it_size is in entries, and we're examining 64 at a time */
> -	for (i = 0; i < (tbl->it_size/64); i++) {
> -		if (tbl->it_map[i] != 0) {
> +	if (find_first_bit(tbl->it_map, tbl->it_size) < tbl->it_size)
>  			printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
>  				__func__, node_name);
> -			break;
> -		}
> -	}
>  
>  	/* calculate bitmap size in bytes */
>  	bitmap_sz = (tbl->it_size + 7) / 8;

^ permalink raw reply

* Re: [git pull] Please pull powerpc.git merge branch
From: Benjamin Herrenschmidt @ 2013-01-29  0:09 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linuxppc-dev list, Andrew Morton, Linux Kernel list
In-Reply-To: <CA+55aFw=Gr_UYEjfQ6sjz-5YdoegzxC+A0x6e8GViOSuXtE6hA@mail.gmail.com>

On Mon, 2013-01-28 at 16:03 -0800, Linus Torvalds wrote:
> I'll have you know that I haven't quite even left for Au yet, and I
> have LCA before diving. So no snarky "in between dives" comments,
> please.

It wasn't meant to be "snarky", sorry about that...

> At least not for a few days.
> 
> >   git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git
> 
> Nothing there. Forgot to push? Or some unnamed branch/tag?

My usual problem with git request-pull when the mirrors haven't caught up
yet. Branch is "merge".

> (And I _am_ leaving for the airport soon, so I may not get to it for a
> while unless you reply asap)

Have a good trip !

Cheers,
Ben.

^ permalink raw reply

* Re: [git pull] Please pull powerpc.git merge branch
From: Linus Torvalds @ 2013-01-29  0:03 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linuxppc-dev list, Andrew Morton, Linux Kernel list
In-Reply-To: <1359416520.18955.12.camel@pasglop>

On Mon, Jan 28, 2013 at 3:42 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
>
> Whenever you have a chance between two dives, you might want to consider
> pulling my merge branch to pickup a few fixes for 3.8 that have been
> accumulating for the last couple of weeks (I was myself travelling
> then on vacation).

I'll have you know that I haven't quite even left for Au yet, and I
have LCA before diving. So no snarky "in between dives" comments,
please.

At least not for a few days.

>   git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git

Nothing there. Forgot to push? Or some unnamed branch/tag?

(And I _am_ leaving for the airport soon, so I may not get to it for a
while unless you reply asap)

          Linus

^ permalink raw reply

* [git pull] Please pull powerpc.git merge branch
From: Benjamin Herrenschmidt @ 2013-01-28 23:42 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linuxppc-dev list, Andrew Morton, Linux Kernel list

Hi Linus !

Whenever you have a chance between two dives, you might want to consider
pulling my merge branch to pickup a few fixes for 3.8 that have been
accumulating for the last couple of weeks (I was myself travelling
then on vacation). Nothing major, just a handful of powerpc bug fixes
that I consider worth getting in before 3.8 goes final.

Cheers,
Ben.

The following changes since commit 45e72af09faa7dad5d8539ebac0fe317ae88318b:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-3.0-fixes (2013-01-28 11:53:49 -0800)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git 

for you to fetch changes up to 689dfa894c57842a05bf6dc9f97e6bb71ec5f386:

  powerpc: Max next_tb to prevent from replaying timer interrupt (2013-01-29 10:18:16 +1100)

----------------------------------------------------------------
Carl E. Love (1):
      powerpc/oprofile: Fix error in oprofile power7_marked_instr_event() function

Cong Ding (1):
      powerpc: kernel/kgdb.c: Fix memory leakage

Li Zhong (1):
      powerpc: Fix MAX_STACK_TRACE_ENTRIES too low warning for ppc32

Steven Rostedt (1):
      powerpc/pasemi: Fix crash on reboot

Tiejun Chen (2):
      powerpc/book3e: Disable interrupt after preempt_schedule_irq
      powerpc: Max next_tb to prevent from replaying timer interrupt

 arch/powerpc/kernel/entry_32.S          |    2 ++
 arch/powerpc/kernel/entry_64.S          |   13 +++++++++++++
 arch/powerpc/kernel/kgdb.c              |    5 +++--
 arch/powerpc/kernel/time.c              |    9 +++++++--
 arch/powerpc/oprofile/op_model_power4.c |    2 +-
 arch/powerpc/platforms/pasemi/cpufreq.c |    7 +++++++
 6 files changed, 33 insertions(+), 5 deletions(-)

^ permalink raw reply

* Re: [RFC/PATCH 29/32] usb: gadget: pxa27x_udc: let udc-core manage gadget->dev
From: Robert Jarzmik @ 2013-01-28 20:18 UTC (permalink / raw)
  To: Felipe Balbi
  Cc: kgene.kim, eric.y.miao, kuninori.morimoto.gx, alexander.shishkin,
	gregkh, yoshihiro.shimoda.uh, Linux USB Mailing List,
	nicolas.ferre, linux-geode, haojian.zhuang, linux-samsung-soc,
	ben-linux, dahlmann.thomas, linux, Linux OMAP Mailing List,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <1359042370-4358-30-git-send-email-balbi@ti.com>

Felipe Balbi <balbi@ti.com> writes:

> By simply setting a flag, we can drop some
> boilerplate code.
>
> Signed-off-by: Felipe Balbi <balbi@ti.com>
> ---
>  drivers/usb/gadget/pxa27x_udc.c | 9 +--------
Acked-by: Robert Jarzmik <robert.jarzmik@free.fr>

And I tested also your patch and it works in my environment. For next patches
I'd like to be CCed for pxa27x_udc stuff as I'm maintaining that one since its
beginning (and yes, I know, I didn't put that in MAINTAINERS ...).

Cheers.

--
Robert

^ permalink raw reply

* Re: [PATCH v5 31/45] blackfin/smp: Use get/put_online_cpus_atomic() to prevent CPU offline
From: Tejun Heo @ 2013-01-28 19:06 UTC (permalink / raw)
  To: Bob Liu
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, paulmck, nikunj, linux-pm, rusty,
	rostedt, rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw,
	Srivatsa S. Bhat, akpm, linuxppc-dev
In-Reply-To: <CAA_GA1fSLNKSDTGSzHoA+j-mur85BvoSRkARq+DMfORuwrrX4Q@mail.gmail.com>

Hello, Bob.

On Mon, Jan 28, 2013 at 1:09 AM, Bob Liu <lliubbo@gmail.com> wrote:
> Thanks, will be applied to my blackfin arch tree.

I think we still have some work ahead of us to have this patchset
ready for inclusion and even then it probably would be best to route
these patches together, so probably not a very good idea to apply this
to blackfin right now.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH v5 31/45] blackfin/smp: Use get/put_online_cpus_atomic() to prevent CPU offline
From: Bob Liu @ 2013-01-28  9:09 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: linux-doc, peterz, fweisbec, linux-kernel, mingo, linux-arch,
	linux, xiaoguangrong, wangyun, paulmck, nikunj, linux-pm, rusty,
	rostedt, rjw, namhyung, tglx, linux-arm-kernel, netdev, oleg, sbw,
	tj, akpm, linuxppc-dev
In-Reply-To: <20130122074123.13822.39102.stgit@srivatsabhat.in.ibm.com>

On Tue, Jan 22, 2013 at 3:41 PM, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
> Once stop_machine() is gone from the CPU offline path, we won't be able to
> depend on preempt_disable() or local_irq_disable() to prevent CPUs from
> going offline from under us.
>
> Use the get/put_online_cpus_atomic() APIs to prevent CPUs from going offline,
> while invoking from atomic context.
>
> Cc: Mike Frysinger <vapier@gentoo.org>
> Cc: Bob Liu <lliubbo@gmail.com>
> Cc: Steven Miao <realmz6@gmail.com>
> Cc: uclinux-dist-devel@blackfin.uclinux.org
> Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>

Thanks, will be applied to my blackfin arch tree.

> ---
>
>  arch/blackfin/mach-common/smp.c |    6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
> index bb61ae4..6cc6d7a 100644
> --- a/arch/blackfin/mach-common/smp.c
> +++ b/arch/blackfin/mach-common/smp.c
> @@ -194,6 +194,7 @@ void send_ipi(const struct cpumask *cpumask, enum ipi_message_type msg)
>         struct ipi_data *bfin_ipi_data;
>         unsigned long flags;
>
> +       get_online_cpus_atomic();
>         local_irq_save(flags);
>         smp_mb();
>         for_each_cpu(cpu, cpumask) {
> @@ -205,6 +206,7 @@ void send_ipi(const struct cpumask *cpumask, enum ipi_message_type msg)
>         }
>
>         local_irq_restore(flags);
> +       put_online_cpus_atomic();
>  }
>
>  void arch_send_call_function_single_ipi(int cpu)
> @@ -238,13 +240,13 @@ void smp_send_stop(void)
>  {
>         cpumask_t callmap;
>
> -       preempt_disable();
> +       get_online_cpus_atomic();
>         cpumask_copy(&callmap, cpu_online_mask);
>         cpumask_clear_cpu(smp_processor_id(), &callmap);
>         if (!cpumask_empty(&callmap))
>                 send_ipi(&callmap, BFIN_IPI_CPU_STOP);
>
> -       preempt_enable();
> +       put_online_cpus_atomic();
>
>         return;
>  }
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Regards,
--Bob

^ permalink raw reply

* Re: [PATCH Bug fix 0/5] Bug fix for physical memory hot-remove.
From: Michal Hocko @ 2013-01-28  8:15 UTC (permalink / raw)
  To: Tang Chen
  Cc: len.brown, linux-mm, paulus, hpa, cl, sfr, x86, linux-acpi,
	isimatu.yasuaki, linfeng, mgorman, kosaki.motohiro, rientjes,
	Simon Jeons, jiang.liu, wency, julian.calaby, glommer, wujianguo,
	yinghai, laijs, linux-kernel, minchan.kim, akpm, linuxppc-dev
In-Reply-To: <5105D57D.3050900@cn.fujitsu.com>

On Mon 28-01-13 09:33:49, Tang Chen wrote:
> On 01/25/2013 09:17 PM, Michal Hocko wrote:
> >On Wed 23-01-13 06:29:31, Simon Jeons wrote:
> >>On Tue, 2013-01-22 at 19:42 +0800, Tang Chen wrote:
> >>>Here are some bug fix patches for physical memory hot-remove. All these
> >>>patches are based on the latest -mm tree.
> >>>git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git akpm
> >>>
> >>>And patch1 and patch3 are very important.
> >>>patch1: free compound pages when freeing memmap, otherwise the kernel
> >>>         will panic the next time memory is hot-added.
> >>>patch3: the old way of freeing pagetable pages was wrong. We should never
> >>>         split larger pages into small ones.
> >>>
> >>>
> >>
> >>Hi Tang,
> >>
> >>I remember your big physical memory hot-remove patchset has already
> >>been merged by Andrew, but where can I find it? Could you give me the
> >>git tree address?
> >
> >Andrew tree is also mirrored into a git tree.
> >http://git.kernel.org/?p=linux/kernel/git/mhocko/mm.git;a=summary
> >
> >It contains only Memory management patches on top of the last major
> >release (since-.X.Y branch).
> 
> Hi Michal,
> 
> I'm not sure I got your meaning. :)

Well, the mirror tree gets updated when Andrew releases mmotm and quite
often even when mmots is released.
All patches in the mm section are applied.

> In http://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git akpm,
> I can find the following commit.
> 
> commit deed0460e01b3968f2cf46fb94851936535b7e0d
> Author: Tang Chen <tangchen@cn.fujitsu.com>
> Date:   Sat Jan 19 11:07:13 2013 +1100
> 
>     memory-hotplug: do not allocate pgdat if it was not freed when
> offline.
> 
> 
> This is one of memory hot-remove patches. Please try to update the
> mirror tree,
> and try to find the above commit.

That one is in my mirror tree as f48bf999 (memory-hotplug: do not
allocate pdgat if it was not freed when offline.).
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 20/22] PCI, powerpc: Kill pci_root_buses in resources reservations
From: Yinghai Lu @ 2013-01-28  5:23 UTC (permalink / raw)
  To: Yijing Wang
  Cc: Toshi Kani, Jiang Liu, Greg Kroah-Hartman, Rafael J. Wysocki,
	linux-pci, Bjorn Helgaas, Taku Izumi, linuxppc-dev,
	Paul Mackerras
In-Reply-To: <5105F51A.1060600@huawei.com>

[-- Attachment #1: Type: text/plain, Size: 1334 bytes --]

On Sun, Jan 27, 2013 at 7:48 PM, Yijing Wang <wangyijing@huawei.com> wrote:
> On 2013/1/28 3:23, Yinghai Lu wrote:
>> diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
>> index 2cbe676..f848325 100644
>> --- a/arch/powerpc/kernel/pci_64.c
>> +++ b/arch/powerpc/kernel/pci_64.c
>> @@ -208,9 +208,9 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus,
>>                         unsigned long in_devfn)
>>  {
>>       struct pci_controller* hose;
>> -     struct list_head *ln;
>> -     struct pci_bus *bus = NULL;
>> +     struct pci_bus *bus;
>>       struct device_node *hose_node;
>> +     struct pci_host_bridge *host_bridge = NULL;
>>
>>       /* Argh ! Please forgive me for that hack, but that's the
>>        * simplest way to get existing XFree to not lockup on some
>> @@ -230,8 +230,8 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus,
>>        * used on pre-domains setup. We return the first match
>>        */
>>
>> -     for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) {
>> -             bus = pci_bus_b(ln);
>> +     for_each_pci_host_bridge(host_bridge) {
>> +             bus = host_bridge->bus;
>>               if (in_bus >= bus->number && in_bus <= bus->busn_res.end)
>
> Need put_device(&host_bridge->dev).
>

fixed in attached.

Thanks

Yinghai

[-- Attachment #2: kill_pci_root_buses_powerpc.patch --]
[-- Type: application/octet-stream, Size: 4361 bytes --]

Subject: [PATCH] PCI, powerpc: Kill pci_root_buses in resources reservations

Replace that with hotplug-safe version iteration.

-v2: add missing put_device found by Yijing Wang <wangyijing@huawei.com>
     also move the handling into the for_each_pci_host_bridge {} to
     make it really hotplug safe, also make the function more readable.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: linuxppc-dev@lists.ozlabs.org

---
 arch/powerpc/kernel/pci-common.c |   13 +++----
 arch/powerpc/kernel/pci_64.c     |   67 +++++++++++++++++++++++----------------
 2 files changed, 46 insertions(+), 34 deletions(-)

Index: linux-2.6/arch/powerpc/kernel/pci-common.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/pci-common.c
+++ linux-2.6/arch/powerpc/kernel/pci-common.c
@@ -1397,11 +1397,11 @@ static void __init pcibios_reserve_legac
 
 void __init pcibios_resource_survey(void)
 {
-	struct pci_bus *b;
+	struct pci_host_bridge *host_bridge = NULL;
 
 	/* Allocate and assign resources */
-	list_for_each_entry(b, &pci_root_buses, node)
-		pcibios_allocate_bus_resources(b);
+	for_each_pci_host_bridge(host_bridge)
+		pcibios_allocate_bus_resources(host_bridge->bus);
 	pcibios_allocate_resources(0);
 	pcibios_allocate_resources(1);
 
@@ -1409,10 +1409,9 @@ void __init pcibios_resource_survey(void
 	 * the low IO area and the VGA memory area if they intersect the
 	 * bus available resources to avoid allocating things on top of them
 	 */
-	if (!pci_has_flag(PCI_PROBE_ONLY)) {
-		list_for_each_entry(b, &pci_root_buses, node)
-			pcibios_reserve_legacy_regions(b);
-	}
+	if (!pci_has_flag(PCI_PROBE_ONLY))
+		for_each_pci_host_bridge(host_bridge)
+			pcibios_reserve_legacy_regions(host_bridge->bus);
 
 	/* Now, if the platform didn't decide to blindly trust the firmware,
 	 * we proceed to assigning things that were left unassigned
Index: linux-2.6/arch/powerpc/kernel/pci_64.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/pci_64.c
+++ linux-2.6/arch/powerpc/kernel/pci_64.c
@@ -207,10 +207,7 @@ void pcibios_setup_phb_io_space(struct p
 long sys_pciconfig_iobase(long which, unsigned long in_bus,
 			  unsigned long in_devfn)
 {
-	struct pci_controller* hose;
-	struct list_head *ln;
-	struct pci_bus *bus = NULL;
-	struct device_node *hose_node;
+	struct pci_host_bridge *host_bridge = NULL;
 
 	/* Argh ! Please forgive me for that hack, but that's the
 	 * simplest way to get existing XFree to not lockup on some
@@ -229,33 +226,49 @@ long sys_pciconfig_iobase(long which, un
 	/* That syscall isn't quite compatible with PCI domains, but it's
 	 * used on pre-domains setup. We return the first match
 	 */
-
-	for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) {
-		bus = pci_bus_b(ln);
-		if (in_bus >= bus->number && in_bus <= bus->busn_res.end)
+	for_each_pci_host_bridge(host_bridge) {
+		struct device_node *hose_node;
+		struct pci_controller* hose;
+		struct pci_bus *bus;
+		long ret;
+
+		bus = host_bridge->bus;
+		if (in_bus < bus->number || in_bus > bus->busn_res.end)
+			continue;
+
+		if (!bus->dev.of_node) {
+			put_device(&host_bridge->dev);
+			return -ENODEV;
+		}
+
+		hose_node = bus->dev.of_node;
+		hose = PCI_DN(hose_node)->phb;
+
+		switch (which) {
+		case IOBASE_BRIDGE_NUMBER:
+			ret = (long)hose->first_busno;
 			break;
-		bus = NULL;
-	}
-	if (bus == NULL || bus->dev.of_node == NULL)
-		return -ENODEV;
-
-	hose_node = bus->dev.of_node;
-	hose = PCI_DN(hose_node)->phb;
+		case IOBASE_MEMORY:
+			ret = (long)hose->pci_mem_offset;
+			break;
+		case IOBASE_IO:
+			ret = (long)hose->io_base_phys;
+			break;
+		case IOBASE_ISA_IO:
+			ret = (long)isa_io_base;
+			break;
+		case IOBASE_ISA_MEM:
+			ret = -EINVAL;
+			break;
+		default:
+			ret = -EOPNOTSUPP;
+		}
 
-	switch (which) {
-	case IOBASE_BRIDGE_NUMBER:
-		return (long)hose->first_busno;
-	case IOBASE_MEMORY:
-		return (long)hose->pci_mem_offset;
-	case IOBASE_IO:
-		return (long)hose->io_base_phys;
-	case IOBASE_ISA_IO:
-		return (long)isa_io_base;
-	case IOBASE_ISA_MEM:
-		return -EINVAL;
+		put_device(&host_bridge->dev);
+		return ret;
 	}
 
-	return -EOPNOTSUPP;
+	return -ENODEV;
 }
 
 #ifdef CONFIG_NUMA

^ permalink raw reply

* Re: [PATCH v3 20/22] PCI, powerpc: Kill pci_root_buses in resources reservations
From: Yijing Wang @ 2013-01-28  3:48 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Toshi Kani, Jiang Liu, Greg Kroah-Hartman, Rafael J. Wysocki,
	linux-pci, Bjorn Helgaas, Taku Izumi, linuxppc-dev,
	Paul Mackerras
In-Reply-To: <1359314629-18651-21-git-send-email-yinghai@kernel.org>

On 2013/1/28 3:23, Yinghai Lu wrote:
> Replace that with hotplug-safe version iteration.
> 
> Signed-off-by: Yinghai Lu <yinghai@kernel.org>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: linuxppc-dev@lists.ozlabs.org
> ---
>  arch/powerpc/kernel/pci-common.c |   13 ++++++-------
>  arch/powerpc/kernel/pci_64.c     |    8 ++++----
>  2 files changed, 10 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> index abc0d08..9ccecbe 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -1398,11 +1398,11 @@ static void __init pcibios_reserve_legacy_regions(struct pci_bus *bus)
>  
>  void __init pcibios_resource_survey(void)
>  {
> -	struct pci_bus *b;
> +	struct pci_host_bridge *host_bridge = NULL;
>  
>  	/* Allocate and assign resources */
> -	list_for_each_entry(b, &pci_root_buses, node)
> -		pcibios_allocate_bus_resources(b);
> +	for_each_pci_host_bridge(host_bridge)
> +		pcibios_allocate_bus_resources(host_bridge->bus);
>  	pcibios_allocate_resources(0);
>  	pcibios_allocate_resources(1);
>  
> @@ -1410,10 +1410,9 @@ void __init pcibios_resource_survey(void)
>  	 * the low IO area and the VGA memory area if they intersect the
>  	 * bus available resources to avoid allocating things on top of them
>  	 */
> -	if (!pci_has_flag(PCI_PROBE_ONLY)) {
> -		list_for_each_entry(b, &pci_root_buses, node)
> -			pcibios_reserve_legacy_regions(b);
> -	}
> +	if (!pci_has_flag(PCI_PROBE_ONLY))
> +		for_each_pci_host_bridge(host_bridge)
> +			pcibios_reserve_legacy_regions(host_bridge->bus);
>  
>  	/* Now, if the platform didn't decide to blindly trust the firmware,
>  	 * we proceed to assigning things that were left unassigned
> diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
> index 2cbe676..f848325 100644
> --- a/arch/powerpc/kernel/pci_64.c
> +++ b/arch/powerpc/kernel/pci_64.c
> @@ -208,9 +208,9 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus,
>  			  unsigned long in_devfn)
>  {
>  	struct pci_controller* hose;
> -	struct list_head *ln;
> -	struct pci_bus *bus = NULL;
> +	struct pci_bus *bus;
>  	struct device_node *hose_node;
> +	struct pci_host_bridge *host_bridge = NULL;
>  
>  	/* Argh ! Please forgive me for that hack, but that's the
>  	 * simplest way to get existing XFree to not lockup on some
> @@ -230,8 +230,8 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus,
>  	 * used on pre-domains setup. We return the first match
>  	 */
>  
> -	for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) {
> -		bus = pci_bus_b(ln);
> +	for_each_pci_host_bridge(host_bridge) {
> +		bus = host_bridge->bus;
>  		if (in_bus >= bus->number && in_bus <= bus->busn_res.end)

This needs a put_device(&host_bridge->dev) before the break.

>  			break;
>  		bus = NULL;
> 


-- 
Thanks!
Yijing

^ permalink raw reply

* Re: [PATCH Bug fix 0/5] Bug fix for physical memory hot-remove.
From: Tang Chen @ 2013-01-28  1:33 UTC (permalink / raw)
  To: Michal Hocko
  Cc: len.brown, linux-mm, paulus, hpa, cl, sfr, x86, linux-acpi,
	isimatu.yasuaki, linfeng, mgorman, kosaki.motohiro, rientjes,
	Simon Jeons, jiang.liu, wency, julian.calaby, glommer, wujianguo,
	yinghai, laijs, linux-kernel, minchan.kim, akpm, linuxppc-dev
In-Reply-To: <20130125131740.GA1615@dhcp22.suse.cz>

On 01/25/2013 09:17 PM, Michal Hocko wrote:
> On Wed 23-01-13 06:29:31, Simon Jeons wrote:
>> On Tue, 2013-01-22 at 19:42 +0800, Tang Chen wrote:
>>> Here are some bug fix patches for physical memory hot-remove. All these
>>> patches are based on the latest -mm tree.
>>> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git akpm
>>>
>>> And patch1 and patch3 are very important.
>>> patch1: free compound pages when freeing memmap, otherwise the kernel
>>>          will panic the next time memory is hot-added.
>>> patch3: the old way of freeing pagetable pages was wrong. We should never
>>>          split larger pages into small ones.
>>>
>>>
>>
>> Hi Tang,
>>
>> I remember your big physical memory hot-remove patchset has already
>> been merged by Andrew, but where can I find it? Could you give me the
>> git tree address?
>
> Andrew tree is also mirrored into a git tree.
> http://git.kernel.org/?p=linux/kernel/git/mhocko/mm.git;a=summary
>
> It contains only Memory management patches on top of the last major
> release (since-.X.Y branch).

Hi Michal,

I'm not sure I got your meaning. :)

In http://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git akpm,
I can find the following commit.

commit deed0460e01b3968f2cf46fb94851936535b7e0d
Author: Tang Chen <tangchen@cn.fujitsu.com>
Date:   Sat Jan 19 11:07:13 2013 +1100

     memory-hotplug: do not allocate pgdat if it was not freed when offline.


This is one of the memory hot-remove patches. Please try to update the
mirror tree, and then look for the above commit.

Thanks. :)

^ permalink raw reply

* Re: [PATCH Bug fix 0/5] Bug fix for physical memory hot-remove.
From: Tang Chen @ 2013-01-28  1:22 UTC (permalink / raw)
  To: Toshi Kani
  Cc: linux-mm, paulus, hpa, cl, sfr, x86, linux-acpi, isimatu.yasuaki,
	linfeng, mgorman, kosaki.motohiro, rientjes, len.brown, jiang.liu,
	wency, julian.calaby, glommer, wujianguo, yinghai, laijs,
	linux-kernel, minchan.kim, akpm, linuxppc-dev
In-Reply-To: <1359137977.14145.417.camel@misato.fc.hp.com>

On 01/26/2013 02:19 AM, Toshi Kani wrote:
> On Tue, 2013-01-22 at 19:42 +0800, Tang Chen wrote:
>> Here are some bug fix patches for physical memory hot-remove. All these
>> patches are based on the latest -mm tree.
>> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git akpm
>>
>> And patch1 and patch3 are very important.
>> patch1: free compound pages when freeing memmap, otherwise the kernel
>>          will panic the next time memory is hot-added.
>> patch3: the old way of freeing pagetable pages was wrong. We should never
>>          split larger pages into small ones.
>>
>>
>> Lai Jiangshan (1):
>>    Bug-fix: mempolicy: fix is_valid_nodemask()
>>
>> Tang Chen (3):
>>    Bug fix: Do not split pages when freeing pagetable pages.
>>    Bug fix: Fix section mismatch problem of
>>      release_firmware_map_entry().
>>    Bug fix: Fix the doc format in drivers/firmware/memmap.c
>>
>> Wen Congyang (1):
>>    Bug fix: consider compound pages when free memmap
>>
>>   arch/x86/mm/init_64.c     |  148 ++++++++++++++-------------------------------
>>   drivers/firmware/memmap.c |   16 +++---
>>   mm/mempolicy.c            |   36 +++++++----
>>   mm/sparse.c               |    2 +-
>>   4 files changed, 77 insertions(+), 125 deletions(-)
>
> This patchset fixed a blocker panic I was hitting in my memory hot-plug
> testing.  Memory hotplug works fine with this patchset (for testing my
> hotplug framework patchset :).  For the series:

Hi Toshi-san,

Thank you for testing. :)

>
> Tested-by: Toshi Kani<toshi.kani@hp.com>
>
> Thanks,
> -Toshi
>
>
>
>

^ permalink raw reply

* Re: [REGRESSION][3.8.-rc1][ INFO: possible circular locking dependency detected ]
From: Christian Kujau @ 2013-01-27 23:02 UTC (permalink / raw)
  To: Maciej Rutecki; +Cc: Cong Wang, linux-fbdev, linuxppc-dev, LKML
In-Reply-To: <alpine.DEB.2.01.1212231321560.7378@trent.utfs.org>

On Sun, 23 Dec 2012 at 13:34, Christian Kujau wrote:
> On Sat, 22 Dec 2012 at 16:28, Maciej Rutecki wrote:
> > Got during suspend to disk:
> 
> I got a similar message on a powerpc G4 system, right after bootup (no 
> suspend involved):
> 
>     http://nerdbynature.de/bits/3.8.0-rc1/

This is still present in 3.8-rc5, right after bootup. Any thoughts?

Thanks,
C.

> 
> [   97.803049] ======================================================
> [   97.803051] [ INFO: possible circular locking dependency detected ]
> [   97.803059] 3.8.0-rc1-dirty #2 Not tainted
> [   97.803060] -------------------------------------------------------
> [   97.803066] kworker/0:1/235 is trying to acquire lock:
> [   97.803097]  ((fb_notifier_list).rwsem){.+.+.+}, at: [<c00606a0>] __blocking_notifier_call_chain+0x44/0x88
> [   97.803099] 
> [   97.803099] but task is already holding lock:
> [   97.803110]  (console_lock){+.+.+.}, at: [<c03b9fd0>] console_callback+0x20/0x194
> [   97.803112] 
> [   97.803112] which lock already depends on the new lock.
> 
> ...and on it goes. Please see the URL above for the whole dmesg and 
> .config.
> 
> @Li Zhong: I have applied your fix for the "MAX_STACK_TRACE_ENTRIES too 
>            low" warning[0] to 3.8-rc1 (hence the -dirty flag), but in the 
>            backtrace "ret_from_kernel_thread" shows up again. FWIW, your
>            patch helped to make the "MAX_STACK_TRACE_ENTRIES too low" 
>            warning go away in 3.7.0-rc7 and it did not re-appear ever 
>            since.
> 
> Thanks,
> Christian.
> 
> [0] http://lkml.indiana.edu/hypermail/linux/kernel/1211.3/01917.html
> 
> > [  269.784867] [ INFO: possible circular locking dependency detected ]
> > [  269.784869] 3.8.0-rc1 #1 Not tainted
> > [  269.784870] -------------------------------------------------------
> > [  269.784871] kworker/u:3/56 is trying to acquire lock:
> > [  269.784878]  ((fb_notifier_list).rwsem){.+.+.+}, at: [<ffffffff81062a1d>] 
> > __blocking_notifier_call_chain+0x49/0x80
> > [  269.784879] 
> > [  269.784879] but task is already holding lock:
> > [  269.784884]  (console_lock){+.+.+.}, at: [<ffffffff812ee4ce>] 
> > i915_drm_freeze+0x9e/0xbb
> > [  269.784884] 
> > [  269.784884] which lock already depends on the new lock.
> > [  269.784884] 
> > [  269.784885] 
> > [  269.784885] the existing dependency chain (in reverse order) is:
> > [  269.784887] 
> > [  269.784887] -> #1 (console_lock){+.+.+.}:
> > [  269.784890]        [<ffffffff810890e4>] lock_acquire+0x95/0x105
> > [  269.784893]        [<ffffffff810405a1>] console_lock+0x59/0x5b
> > [  269.784897]        [<ffffffff812ba125>] register_con_driver+0x36/0x128
> > [  269.784899]        [<ffffffff812bb27e>] take_over_console+0x1e/0x45
> > [  269.784903]        [<ffffffff81257a04>] fbcon_takeover+0x56/0x98
> > [  269.784906]        [<ffffffff8125b857>] fbcon_event_notify+0x2c1/0x5ea
> > [  269.784909]        [<ffffffff8149a211>] notifier_call_chain+0x67/0x92
> > [  269.784911]        [<ffffffff81062a33>] __blocking_notifier_call_chain+0x5f/0x80
> > [  269.784912]        [<ffffffff81062a63>] blocking_notifier_call_chain+0xf/0x11
> > [  269.784915]        [<ffffffff8124e85e>] fb_notifier_call_chain+0x16/0x18
> > [  269.784917]        [<ffffffff812505d7>] register_framebuffer+0x20a/0x26e
> > [  269.784920]        [<ffffffff812d3ca0>] 
> > drm_fb_helper_single_fb_probe+0x1ce/0x297
> > [  269.784922]        [<ffffffff812d3f40>] drm_fb_helper_initial_config+0x1d7/0x1ef
> > [  269.784924]        [<ffffffff8132cee2>] intel_fbdev_init+0x6f/0x82
> > [  269.784927]        [<ffffffff812f22f6>] i915_driver_load+0xa9e/0xc78
> > [  269.784929]        [<ffffffff812e020c>] drm_get_pci_dev+0x165/0x26d
> > [  269.784931]        [<ffffffff812ee8da>] i915_pci_probe+0x60/0x69
> > [  269.784933]        [<ffffffff8123fe8e>] local_pci_probe+0x39/0x61
> > [  269.784935]        [<ffffffff812400f5>] pci_device_probe+0xba/0xe0
> > [  269.784938]        [<ffffffff8133d3b6>] driver_probe_device+0x99/0x1c4
> > [  269.784940]        [<ffffffff8133d52f>] __driver_attach+0x4e/0x6f
> > [  269.784942]        [<ffffffff8133bae1>] bus_for_each_dev+0x52/0x84
> > [  269.784944]        [<ffffffff8133cec6>] driver_attach+0x19/0x1b
> > [  269.784946]        [<ffffffff8133cb65>] bus_add_driver+0xdf/0x203
> > [  269.784948]        [<ffffffff8133dad3>] driver_register+0x8e/0x114
> > [  269.784952]        [<ffffffff8123f581>] __pci_register_driver+0x5d/0x62
> > [  269.784953]        [<ffffffff812e0395>] drm_pci_init+0x81/0xe6
> > [  269.784957]        [<ffffffff81af7612>] i915_init+0x66/0x68
> > [  269.784959]        [<ffffffff810020b4>] do_one_initcall+0x7a/0x136
> > [  269.784962]        [<ffffffff8147ceaa>] kernel_init+0x141/0x296
> > [  269.784964]        [<ffffffff8149c7bc>] ret_from_fork+0x7c/0xb0
> > [  269.784966] 
> > [  269.784966] -> #0 ((fb_notifier_list).rwsem){.+.+.+}:
> > [  269.784967]        [<ffffffff81088955>] __lock_acquire+0xa7e/0xddd
> > [  269.784969]        [<ffffffff810890e4>] lock_acquire+0x95/0x105
> > [  269.784971]        [<ffffffff81495092>] down_read+0x34/0x43
> > [  269.784973]        [<ffffffff81062a1d>] __blocking_notifier_call_chain+0x49/0x80
> > [  269.784975]        [<ffffffff81062a63>] blocking_notifier_call_chain+0xf/0x11
> > [  269.784977]        [<ffffffff8124e85e>] fb_notifier_call_chain+0x16/0x18
> > [  269.784979]        [<ffffffff8124ec47>] fb_set_suspend+0x22/0x4d
> > [  269.784981]        [<ffffffff8132cfe3>] intel_fbdev_set_suspend+0x20/0x22
> > [  269.784983]        [<ffffffff812ee4db>] i915_drm_freeze+0xab/0xbb
> > [  269.784985]        [<ffffffff812eea82>] i915_pm_freeze+0x3d/0x41
> > [  269.784987]        [<ffffffff8123f759>] pci_pm_freeze+0x65/0x8d
> > [  269.784990]        [<ffffffff81342f20>] dpm_run_callback.isra.3+0x27/0x56
> > [  269.784993]        [<ffffffff81343085>] __device_suspend+0x136/0x1b1
> > [  269.784995]        [<ffffffff8134311a>] async_suspend+0x1a/0x58
> > [  269.784997]        [<ffffffff81063a6b>] async_run_entry_fn+0xa4/0x17c
> > [  269.785000]        [<ffffffff81058df2>] process_one_work+0x1cf/0x38e
> > [  269.785002]        [<ffffffff81059290>] worker_thread+0x12e/0x1cc
> > [  269.785004]        [<ffffffff8105d416>] kthread+0xac/0xb4
> > [  269.785006]        [<ffffffff8149c7bc>] ret_from_fork+0x7c/0xb0
> > [  269.785006] 
> > [  269.785006] other info that might help us debug this:
> > [  269.785006] 
> > [  269.785007]  Possible unsafe locking scenario:
> > [  269.785007] 
> > [  269.785008]        CPU0                    CPU1
> > [  269.785008]        ----                    ----
> > [  269.785009]   lock(console_lock);
> > [  269.785010]                                lock((fb_notifier_list).rwsem);
> > [  269.785012]                                lock(console_lock);
> > [  269.785013]   lock((fb_notifier_list).rwsem);
> > [  269.785013] 
> > [  269.785013]  *** DEADLOCK ***
> > [  269.785013] 
> > [  269.785014] 4 locks held by kworker/u:3/56:
> > [  269.785018]  #0:  (events_unbound){.+.+.+}, at: [<ffffffff81058d77>] 
> > process_one_work+0x154/0x38e
> > [  269.785021]  #1:  ((&entry->work)){+.+.+.}, at: [<ffffffff81058d77>] 
> > process_one_work+0x154/0x38e
> > [  269.785024]  #2:  (&__lockdep_no_validate__){......}, at: [<ffffffff81342d85>] 
> > device_lock+0xf/0x11
> > [  269.785027]  #3:  (console_lock){+.+.+.}, at: [<ffffffff812ee4ce>] 
> > i915_drm_freeze+0x9e/0xbb
> > [  269.785028] 
> > [  269.785028] stack backtrace:
> > [  269.785029] Pid: 56, comm: kworker/u:3 Not tainted 3.8.0-rc1 #1
> > [  269.785030] Call Trace:
> > [  269.785035]  [<ffffffff8148fcb5>] print_circular_bug+0x1f8/0x209
> > [  269.785036]  [<ffffffff81088955>] __lock_acquire+0xa7e/0xddd
> > [  269.785038]  [<ffffffff810890e4>] lock_acquire+0x95/0x105
> > [  269.785040]  [<ffffffff81062a1d>] ? __blocking_notifier_call_chain+0x49/0x80
> > [  269.785042]  [<ffffffff81495092>] down_read+0x34/0x43
> > [  269.785044]  [<ffffffff81062a1d>] ? __blocking_notifier_call_chain+0x49/0x80
> > [  269.785046]  [<ffffffff81062a1d>] __blocking_notifier_call_chain+0x49/0x80
> > [  269.785047]  [<ffffffff81062a63>] blocking_notifier_call_chain+0xf/0x11
> > [  269.785050]  [<ffffffff8124e85e>] fb_notifier_call_chain+0x16/0x18
> > [  269.785052]  [<ffffffff8124ec47>] fb_set_suspend+0x22/0x4d
> > [  269.785054]  [<ffffffff8132cfe3>] intel_fbdev_set_suspend+0x20/0x22
> > [  269.785055]  [<ffffffff812ee4db>] i915_drm_freeze+0xab/0xbb
> > [  269.785057]  [<ffffffff812eea82>] i915_pm_freeze+0x3d/0x41
> > [  269.785060]  [<ffffffff8123f759>] pci_pm_freeze+0x65/0x8d
> > [  269.785062]  [<ffffffff8123f6f4>] ? pci_pm_poweroff+0x9c/0x9c
> > [  269.785064]  [<ffffffff81342f20>] dpm_run_callback.isra.3+0x27/0x56
> > [  269.785066]  [<ffffffff81343085>] __device_suspend+0x136/0x1b1
> > [  269.785068]  [<ffffffff81089563>] ? trace_hardirqs_on_caller+0x117/0x173
> > [  269.785070]  [<ffffffff8134311a>] async_suspend+0x1a/0x58
> > [  269.785072]  [<ffffffff81063a6b>] async_run_entry_fn+0xa4/0x17c
> > [  269.785074]  [<ffffffff81058df2>] process_one_work+0x1cf/0x38e
> > [  269.785076]  [<ffffffff81058d77>] ? process_one_work+0x154/0x38e
> > [  269.785078]  [<ffffffff810639c7>] ? async_schedule+0x12/0x12
> > [  269.785080]  [<ffffffff8105679f>] ? spin_lock_irq+0x9/0xb
> > [  269.785082]  [<ffffffff81059290>] worker_thread+0x12e/0x1cc
> > [  269.785084]  [<ffffffff81059162>] ? rescuer_thread+0x187/0x187
> > [  269.785085]  [<ffffffff8105d416>] kthread+0xac/0xb4
> > [  269.785088]  [<ffffffff8105d36a>] ? __kthread_parkme+0x60/0x60
> > [  269.785090]  [<ffffffff8149c7bc>] ret_from_fork+0x7c/0xb0
> > [  269.785091]  [<ffffffff8105d36a>] ? __kthread_parkme+0x60/0x60
> > 
> > 
> > Config:
> > http://mrutecki.pl/download/kernel/3.8.0-rc1/s2disk/config-3.8.0-rc1
> > 
> > dmesg:
> > http://mrutecki.pl/download/kernel/3.8.0-rc1/s2disk/dmesg-3.8.0-rc1.txt
> > 
> > 
> > Found similar report:
> > http://marc.info/?l=linux-kernel&m=135546308908700&w=2
> > 
> > Regards
> > 
> > -- 
> > Maciej Rutecki
> > http://www.mrutecki.pl
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> > 
> 
> -- 
> BOFH excuse #435:
> 
> Internet shut down due to maintenance
> 

-- 
BOFH excuse #238:

You did wha... oh _dear_....

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox