All of lore.kernel.org
 help / color / mirror / Atom feed
From: wujianguo <wujianguo106@gmail.com>
To: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: x86@kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org, linux-acpi@vger.kernel.org,
	linux-s390@vger.kernel.org, linux-sh@vger.kernel.org,
	linux-ia64@vger.kernel.org, cmetcalf@tilera.com,
	sparclinux@vger.kernel.org, rientjes@google.com,
	liuj97@gmail.com, len.brown@intel.com, cl@linux.com,
	minchan.kim@gmail.com, akpm@linux-foundation.org,
	kosaki.motohiro@jp.fujitsu.com, wency@cn.fujitsu.com,
	wujianguo@huawei.com, qiuxishi@huawei.com, jiang.liu@huawei.com
Subject: Re: [PATCH 8/10] memory-hotplug : remove page table of x86_64 architecture
Date: Tue, 09 Oct 2012 16:26:40 +0800	[thread overview]
Message-ID: <5073DFC0.3010400@gmail.com> (raw)
In-Reply-To: <506E4799.30407@jp.fujitsu.com>

Hi Congyang,
	I think we should also free pages which are used by page tables after removing
page tables of the memory.

From: Jianguo Wu <wujianguo@huawei.com>

Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
---
 arch/x86/mm/init_64.c |  110 +++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 89 insertions(+), 21 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5596dfa..81f9c3b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -675,6 +675,74 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);

+static inline void free_pagetable(struct page *page)
+{
+	struct zone *zone;
+
+	__ClearPageReserved(page);
+	__free_page(page);
+
+	zone = page_zone(page);
+	zone_span_writelock(zone);
+	zone->present_pages++;
+	zone_span_writeunlock(zone);
+	totalram_pages++;
+}
+
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (pte_val(*pte))
+			break;
+	}
+
+	/* free a pte table */
+	if (i == PTRS_PER_PTE) {
+		free_pagetable(pmd_page(*pmd));
+		pmd_clear(pmd);
+	}
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (pmd_val(*pmd))
+			break;
+	}
+
+	/* free a pmd table */
+	if (i == PTRS_PER_PMD) {
+		free_pagetable(pud_page(*pud));
+		pud_clear(pud);
+	}
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (pud_val(*pud))
+			break;
+	}
+
+	/* free a pud table */
+	if (i == PTRS_PER_PUD) {
+		free_pagetable(pgd_page(*pgd));
+		pgd_clear(pgd);
+	}
+}
+
 static void __meminit
 phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
 {
@@ -704,21 +772,19 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
 	unsigned long pages = 0, next;
 	int i = pmd_index(addr);

-	for (; i < PTRS_PER_PMD; i++, addr = next) {
+	for (; i < PTRS_PER_PMD && addr < end; i++, addr = next) {
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(addr);
 		pte_t *pte;

-		if (addr >= end)
-			break;
-
-		next = (addr & PMD_MASK) + PMD_SIZE;
+		next = pmd_addr_end(addr, end);

 		if (!pmd_present(*pmd))
 			continue;

 		if (pmd_large(*pmd)) {
-			if ((addr & ~PMD_MASK) == 0 && next <= end) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
 				set_pmd(pmd, __pmd(0));
 				pages++;
 				continue;
@@ -729,7 +795,8 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
 			 * so split 2M page to 4K page.
 			 */
 			pte = alloc_low_page(&pte_phys);
-			__split_large_page((pte_t *)pmd, addr, pte);
+			__split_large_page((pte_t *)pmd,
+					   (unsigned long)__va(addr), pte);

 			spin_lock(&init_mm.page_table_lock);
 			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
@@ -738,7 +805,8 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)

 		spin_lock(&init_mm.page_table_lock);
 		pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
-		phys_pte_remove(pte, addr, end);
+		phys_pte_remove(pte, addr, next);
+		free_pte_table(pte, pmd);
 		unmap_low_page(pte);
 		spin_unlock(&init_mm.page_table_lock);
 	}
@@ -751,21 +819,19 @@ phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
 	unsigned long pages = 0, next;
 	int i = pud_index(addr);

-	for (; i < PTRS_PER_PUD; i++, addr = next) {
+	for (; i < PTRS_PER_PUD && addr < end; i++, addr = next) {
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;

-		if (addr >= end)
-			break;
-
-		next = (addr & PUD_MASK) + PUD_SIZE;
+		next = pud_addr_end(addr, end);

 		if (!pud_present(*pud))
 			continue;

 		if (pud_large(*pud)) {
-			if ((addr & ~PUD_MASK) == 0 && next <= end) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE)) {
 				set_pud(pud, __pud(0));
 				pages++;
 				continue;
@@ -776,15 +842,18 @@ phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
 			 * so split 1G page to 2M page.
 			 */
 			pmd = alloc_low_page(&pmd_phys);
-			__split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
+			__split_large_page((pte_t *)pud,
+					   (unsigned long)__va(addr),
+					   (pte_t *)pmd);

 			spin_lock(&init_mm.page_table_lock);
 			pud_populate(&init_mm, pud, __va(pmd_phys));
 			spin_unlock(&init_mm.page_table_lock);
 		}

-		pmd = map_low_page(pmd_offset(pud, 0));
-		phys_pmd_remove(pmd, addr, end);
+		pmd = map_low_page((pmd_t *)pud_page_vaddr(*pud));
+		phys_pmd_remove(pmd, addr, next);
+		free_pmd_table(pmd, pud);
 		unmap_low_page(pmd);
 		__flush_tlb_all();
 	}
@@ -805,15 +874,14 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 		pgd_t *pgd = pgd_offset_k(start);
 		pud_t *pud;

-		next = (start + PGDIR_SIZE) & PGDIR_MASK;
-		if (next > end)
-			next = end;
+		next = pgd_addr_end(start, end);

 		if (!pgd_present(*pgd))
 			continue;

 		pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
-		phys_pud_remove(pud, __pa(start), __pa(end));
+		phys_pud_remove(pud, __pa(start), __pa(next));
+		free_pud_table(pud, pgd);
 		unmap_low_page(pud);
 	}

-- 1.7.6.1 .


On 2012-10-5 10:36, Yasuaki Ishimatsu wrote:
> From: Wen Congyang <wency@cn.fujitsu.com>
> 
> For hot removing memory, we should remove page table about the memory.
> So the patch searches a page table about the removed memory, and clear
> page table.
> 
> CC: David Rientjes <rientjes@google.com>
> CC: Jiang Liu <liuj97@gmail.com>
> CC: Len Brown <len.brown@intel.com>
> CC: Christoph Lameter <cl@linux.com>
> Cc: Minchan Kim <minchan.kim@gmail.com>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/pgtable_types.h |    1 
>  arch/x86/mm/init_64.c                |  147 +++++++++++++++++++++++++++++++++++
>  arch/x86/mm/pageattr.c               |   47 +++++------
>  3 files changed, 173 insertions(+), 22 deletions(-)
> 
> Index: linux-3.6/arch/x86/mm/init_64.c
> ===================================================================
> --- linux-3.6.orig/arch/x86/mm/init_64.c	2012-10-04 18:30:21.171698416 +0900
> +++ linux-3.6/arch/x86/mm/init_64.c	2012-10-04 18:30:27.317704652 +0900
> @@ -675,6 +675,151 @@ int arch_add_memory(int nid, u64 start, 
>  }
>  EXPORT_SYMBOL_GPL(arch_add_memory);
>  
> +static void __meminit
> +phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned pages = 0;
> +	int i = pte_index(addr);
> +
> +	pte_t *pte = pte_page + pte_index(addr);
> +
> +	for (; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
> +
> +		if (addr >= end)
> +			break;
> +
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		pages++;
> +		set_pte(pte, __pte(0));
> +	}
> +
> +	update_page_count(PG_LEVEL_4K, -pages);
> +}
> +
> +static void __meminit
> +phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned long pages = 0, next;
> +	int i = pmd_index(addr);
> +
> +	for (; i < PTRS_PER_PMD; i++, addr = next) {
> +		unsigned long pte_phys;
> +		pmd_t *pmd = pmd_page + pmd_index(addr);
> +		pte_t *pte;
> +
> +		if (addr >= end)
> +			break;
> +
> +		next = (addr & PMD_MASK) + PMD_SIZE;
> +
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_large(*pmd)) {
> +			if ((addr & ~PMD_MASK) == 0 && next <= end) {
> +				set_pmd(pmd, __pmd(0));
> +				pages++;
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 2M page, but we need to remove part of them,
> +			 * so split 2M page to 4K page.
> +			 */
> +			pte = alloc_low_page(&pte_phys);
> +			__split_large_page((pte_t *)pmd, addr, pte);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +		}
> +
> +		spin_lock(&init_mm.page_table_lock);
> +		pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
> +		phys_pte_remove(pte, addr, end);
> +		unmap_low_page(pte);
> +		spin_unlock(&init_mm.page_table_lock);
> +	}
> +	update_page_count(PG_LEVEL_2M, -pages);
> +}
> +
> +static void __meminit
> +phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned long pages = 0, next;
> +	int i = pud_index(addr);
> +
> +	for (; i < PTRS_PER_PUD; i++, addr = next) {
> +		unsigned long pmd_phys;
> +		pud_t *pud = pud_page + pud_index(addr);
> +		pmd_t *pmd;
> +
> +		if (addr >= end)
> +			break;
> +
> +		next = (addr & PUD_MASK) + PUD_SIZE;
> +
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_large(*pud)) {
> +			if ((addr & ~PUD_MASK) == 0 && next <= end) {
> +				set_pud(pud, __pud(0));
> +				pages++;
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 1G page, but we need to remove part of them,
> +			 * so split 1G page to 2M page.
> +			 */
> +			pmd = alloc_low_page(&pmd_phys);
> +			__split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pud_populate(&init_mm, pud, __va(pmd_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +		}
> +
> +		pmd = map_low_page(pmd_offset(pud, 0));
> +		phys_pmd_remove(pmd, addr, end);
> +		unmap_low_page(pmd);
> +		__flush_tlb_all();
> +	}
> +	__flush_tlb_all();
> +
> +	update_page_count(PG_LEVEL_1G, -pages);
> +}
> +
> +void __meminit
> +kernel_physical_mapping_remove(unsigned long start, unsigned long end)
> +{
> +	unsigned long next;
> +
> +	start = (unsigned long)__va(start);
> +	end = (unsigned long)__va(end);
> +
> +	for (; start < end; start = next) {
> +		pgd_t *pgd = pgd_offset_k(start);
> +		pud_t *pud;
> +
> +		next = (start + PGDIR_SIZE) & PGDIR_MASK;
> +		if (next > end)
> +			next = end;
> +
> +		if (!pgd_present(*pgd))
> +			continue;
> +
> +		pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
> +		phys_pud_remove(pud, __pa(start), __pa(end));
> +		unmap_low_page(pud);
> +	}
> +
> +	__flush_tlb_all();
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  int __ref arch_remove_memory(u64 start, u64 size)
>  {
> @@ -687,6 +832,8 @@ int __ref arch_remove_memory(u64 start, 
>  	ret = __remove_pages(zone, start_pfn, nr_pages);
>  	WARN_ON_ONCE(ret);
>  
> +	kernel_physical_mapping_remove(start, start + size);
> +
>  	return ret;
>  }
>  #endif
> Index: linux-3.6/arch/x86/include/asm/pgtable_types.h
> ===================================================================
> --- linux-3.6.orig/arch/x86/include/asm/pgtable_types.h	2012-10-04 18:26:51.925486954 +0900
> +++ linux-3.6/arch/x86/include/asm/pgtable_types.h	2012-10-04 18:30:27.322704656 +0900
> @@ -334,6 +334,7 @@ static inline void update_page_count(int
>   * as a pte too.
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> Index: linux-3.6/arch/x86/mm/pageattr.c
> ===================================================================
> --- linux-3.6.orig/arch/x86/mm/pageattr.c	2012-10-04 18:26:51.923486952 +0900
> +++ linux-3.6/arch/x86/mm/pageattr.c	2012-10-04 18:30:27.328704662 +0900
> @@ -501,21 +501,13 @@ out_unlock:
>  	return do_split;
>  }
>  
> -static int split_large_page(pte_t *kpte, unsigned long address)
> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
>  {
>  	unsigned long pfn, pfninc = 1;
>  	unsigned int i, level;
> -	pte_t *pbase, *tmp;
> +	pte_t *tmp;
>  	pgprot_t ref_prot;
> -	struct page *base;
> -
> -	if (!debug_pagealloc)
> -		spin_unlock(&cpa_lock);
> -	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> -	if (!debug_pagealloc)
> -		spin_lock(&cpa_lock);
> -	if (!base)
> -		return -ENOMEM;
> +	struct page *base = virt_to_page(pbase);
>  
>  	spin_lock(&pgd_lock);
>  	/*
> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte,
>  	 * up for us already:
>  	 */
>  	tmp = lookup_address(address, &level);
> -	if (tmp != kpte)
> -		goto out_unlock;
> +	if (tmp != kpte) {
> +		spin_unlock(&pgd_lock);
> +		return 1;
> +	}
>  
> -	pbase = (pte_t *)page_address(base);
>  	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>  	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
>  	/*
> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte,
>  	 * going on.
>  	 */
>  	__flush_tlb_all();
> +	spin_unlock(&pgd_lock);
>  
> -	base = NULL;
> +	return 0;
> +}
>  
> -out_unlock:
> -	/*
> -	 * If we dropped out via the lookup_address check under
> -	 * pgd_lock then stick the page back into the pool:
> -	 */
> -	if (base)
> +static int split_large_page(pte_t *kpte, unsigned long address)
> +{
> +	pte_t *pbase;
> +	struct page *base;
> +
> +	if (!debug_pagealloc)
> +		spin_unlock(&cpa_lock);
> +	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> +	if (!debug_pagealloc)
> +		spin_lock(&cpa_lock);
> +	if (!base)
> +		return -ENOMEM;
> +
> +	pbase = (pte_t *)page_address(base);
> +	if (__split_large_page(kpte, address, pbase))
>  		__free_page(base);
> -	spin_unlock(&pgd_lock);
>  
>  	return 0;
>  }
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 

WARNING: multiple messages have this Message-ID (diff)
From: wujianguo <wujianguo106@gmail.com>
To: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: x86@kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org, linux-acpi@vger.kernel.org,
	linux-s390@vger.kernel.org, linux-sh@vger.kernel.org,
	linux-ia64@vger.kernel.org, cmetcalf@tilera.com,
	sparclinux@vger.kernel.org, rientjes@google.com,
	liuj97@gmail.com, len.brown@intel.com, cl@linux.com,
	minchan.kim@gmail.com, akpm@linux-foundation.org,
	kosaki.motohiro@jp.fujitsu.com, wency@cn.fujitsu.com,
	wujianguo@huawei.com, qiuxishi@huawei.com, jiang.liu@huawei.com
Subject: Re: [PATCH 8/10] memory-hotplug : remove page table of x86_64 architecture
Date: Tue, 09 Oct 2012 08:26:40 +0000	[thread overview]
Message-ID: <5073DFC0.3010400@gmail.com> (raw)
In-Reply-To: <506E4799.30407@jp.fujitsu.com>

Hi Congyang,
	I think we should also free pages which are used by page tables after removing
page tables of the memory.

From: Jianguo Wu <wujianguo@huawei.com>

Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
---
 arch/x86/mm/init_64.c |  110 +++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 89 insertions(+), 21 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5596dfa..81f9c3b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -675,6 +675,74 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);

+static inline void free_pagetable(struct page *page)
+{
+	struct zone *zone;
+
+	__ClearPageReserved(page);
+	__free_page(page);
+
+	zone = page_zone(page);
+	zone_span_writelock(zone);
+	zone->present_pages++;
+	zone_span_writeunlock(zone);
+	totalram_pages++;
+}
+
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (pte_val(*pte))
+			break;
+	}
+
+	/* free a pte table */
+	if (i == PTRS_PER_PTE) {
+		free_pagetable(pmd_page(*pmd));
+		pmd_clear(pmd);
+	}
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (pmd_val(*pmd))
+			break;
+	}
+
+	/* free a pmd table */
+	if (i == PTRS_PER_PMD) {
+		free_pagetable(pud_page(*pud));
+		pud_clear(pud);
+	}
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (pud_val(*pud))
+			break;
+	}
+
+	/* free a pud table */
+	if (i == PTRS_PER_PUD) {
+		free_pagetable(pgd_page(*pgd));
+		pgd_clear(pgd);
+	}
+}
+
 static void __meminit
 phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
 {
@@ -704,21 +772,19 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
 	unsigned long pages = 0, next;
 	int i = pmd_index(addr);

-	for (; i < PTRS_PER_PMD; i++, addr = next) {
+	for (; i < PTRS_PER_PMD && addr < end; i++, addr = next) {
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(addr);
 		pte_t *pte;

-		if (addr >= end)
-			break;
-
-		next = (addr & PMD_MASK) + PMD_SIZE;
+		next = pmd_addr_end(addr, end);

 		if (!pmd_present(*pmd))
 			continue;

 		if (pmd_large(*pmd)) {
-			if ((addr & ~PMD_MASK) == 0 && next <= end) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
 				set_pmd(pmd, __pmd(0));
 				pages++;
 				continue;
@@ -729,7 +795,8 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
 			 * so split 2M page to 4K page.
 			 */
 			pte = alloc_low_page(&pte_phys);
-			__split_large_page((pte_t *)pmd, addr, pte);
+			__split_large_page((pte_t *)pmd,
+					   (unsigned long)__va(addr), pte);

 			spin_lock(&init_mm.page_table_lock);
 			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
@@ -738,7 +805,8 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)

 		spin_lock(&init_mm.page_table_lock);
 		pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
-		phys_pte_remove(pte, addr, end);
+		phys_pte_remove(pte, addr, next);
+		free_pte_table(pte, pmd);
 		unmap_low_page(pte);
 		spin_unlock(&init_mm.page_table_lock);
 	}
@@ -751,21 +819,19 @@ phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
 	unsigned long pages = 0, next;
 	int i = pud_index(addr);

-	for (; i < PTRS_PER_PUD; i++, addr = next) {
+	for (; i < PTRS_PER_PUD && addr < end; i++, addr = next) {
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;

-		if (addr >= end)
-			break;
-
-		next = (addr & PUD_MASK) + PUD_SIZE;
+		next = pud_addr_end(addr, end);

 		if (!pud_present(*pud))
 			continue;

 		if (pud_large(*pud)) {
-			if ((addr & ~PUD_MASK) == 0 && next <= end) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE)) {
 				set_pud(pud, __pud(0));
 				pages++;
 				continue;
@@ -776,15 +842,18 @@ phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
 			 * so split 1G page to 2M page.
 			 */
 			pmd = alloc_low_page(&pmd_phys);
-			__split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
+			__split_large_page((pte_t *)pud,
+					   (unsigned long)__va(addr),
+					   (pte_t *)pmd);

 			spin_lock(&init_mm.page_table_lock);
 			pud_populate(&init_mm, pud, __va(pmd_phys));
 			spin_unlock(&init_mm.page_table_lock);
 		}

-		pmd = map_low_page(pmd_offset(pud, 0));
-		phys_pmd_remove(pmd, addr, end);
+		pmd = map_low_page((pmd_t *)pud_page_vaddr(*pud));
+		phys_pmd_remove(pmd, addr, next);
+		free_pmd_table(pmd, pud);
 		unmap_low_page(pmd);
 		__flush_tlb_all();
 	}
@@ -805,15 +874,14 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 		pgd_t *pgd = pgd_offset_k(start);
 		pud_t *pud;

-		next = (start + PGDIR_SIZE) & PGDIR_MASK;
-		if (next > end)
-			next = end;
+		next = pgd_addr_end(start, end);

 		if (!pgd_present(*pgd))
 			continue;

 		pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
-		phys_pud_remove(pud, __pa(start), __pa(end));
+		phys_pud_remove(pud, __pa(start), __pa(next));
+		free_pud_table(pud, pgd);
 		unmap_low_page(pud);
 	}

-- 1.7.6.1 .


On 2012-10-5 10:36, Yasuaki Ishimatsu wrote:
> From: Wen Congyang <wency@cn.fujitsu.com>
> 
> For hot removing memory, we should remove page table about the memory.
> So the patch searches a page table about the removed memory, and clear
> page table.
> 
> CC: David Rientjes <rientjes@google.com>
> CC: Jiang Liu <liuj97@gmail.com>
> CC: Len Brown <len.brown@intel.com>
> CC: Christoph Lameter <cl@linux.com>
> Cc: Minchan Kim <minchan.kim@gmail.com>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/pgtable_types.h |    1 
>  arch/x86/mm/init_64.c                |  147 +++++++++++++++++++++++++++++++++++
>  arch/x86/mm/pageattr.c               |   47 +++++------
>  3 files changed, 173 insertions(+), 22 deletions(-)
> 
> Index: linux-3.6/arch/x86/mm/init_64.c
> ===================================================================
> --- linux-3.6.orig/arch/x86/mm/init_64.c	2012-10-04 18:30:21.171698416 +0900
> +++ linux-3.6/arch/x86/mm/init_64.c	2012-10-04 18:30:27.317704652 +0900
> @@ -675,6 +675,151 @@ int arch_add_memory(int nid, u64 start, 
>  }
>  EXPORT_SYMBOL_GPL(arch_add_memory);
>  
> +static void __meminit
> +phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned pages = 0;
> +	int i = pte_index(addr);
> +
> +	pte_t *pte = pte_page + pte_index(addr);
> +
> +	for (; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
> +
> +		if (addr >= end)
> +			break;
> +
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		pages++;
> +		set_pte(pte, __pte(0));
> +	}
> +
> +	update_page_count(PG_LEVEL_4K, -pages);
> +}
> +
> +static void __meminit
> +phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned long pages = 0, next;
> +	int i = pmd_index(addr);
> +
> +	for (; i < PTRS_PER_PMD; i++, addr = next) {
> +		unsigned long pte_phys;
> +		pmd_t *pmd = pmd_page + pmd_index(addr);
> +		pte_t *pte;
> +
> +		if (addr >= end)
> +			break;
> +
> +		next = (addr & PMD_MASK) + PMD_SIZE;
> +
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_large(*pmd)) {
> +			if ((addr & ~PMD_MASK) == 0 && next <= end) {
> +				set_pmd(pmd, __pmd(0));
> +				pages++;
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 2M page, but we need to remove part of them,
> +			 * so split 2M page to 4K page.
> +			 */
> +			pte = alloc_low_page(&pte_phys);
> +			__split_large_page((pte_t *)pmd, addr, pte);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +		}
> +
> +		spin_lock(&init_mm.page_table_lock);
> +		pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
> +		phys_pte_remove(pte, addr, end);
> +		unmap_low_page(pte);
> +		spin_unlock(&init_mm.page_table_lock);
> +	}
> +	update_page_count(PG_LEVEL_2M, -pages);
> +}
> +
> +static void __meminit
> +phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned long pages = 0, next;
> +	int i = pud_index(addr);
> +
> +	for (; i < PTRS_PER_PUD; i++, addr = next) {
> +		unsigned long pmd_phys;
> +		pud_t *pud = pud_page + pud_index(addr);
> +		pmd_t *pmd;
> +
> +		if (addr >= end)
> +			break;
> +
> +		next = (addr & PUD_MASK) + PUD_SIZE;
> +
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_large(*pud)) {
> +			if ((addr & ~PUD_MASK) == 0 && next <= end) {
> +				set_pud(pud, __pud(0));
> +				pages++;
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 1G page, but we need to remove part of them,
> +			 * so split 1G page to 2M page.
> +			 */
> +			pmd = alloc_low_page(&pmd_phys);
> +			__split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pud_populate(&init_mm, pud, __va(pmd_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +		}
> +
> +		pmd = map_low_page(pmd_offset(pud, 0));
> +		phys_pmd_remove(pmd, addr, end);
> +		unmap_low_page(pmd);
> +		__flush_tlb_all();
> +	}
> +	__flush_tlb_all();
> +
> +	update_page_count(PG_LEVEL_1G, -pages);
> +}
> +
> +void __meminit
> +kernel_physical_mapping_remove(unsigned long start, unsigned long end)
> +{
> +	unsigned long next;
> +
> +	start = (unsigned long)__va(start);
> +	end = (unsigned long)__va(end);
> +
> +	for (; start < end; start = next) {
> +		pgd_t *pgd = pgd_offset_k(start);
> +		pud_t *pud;
> +
> +		next = (start + PGDIR_SIZE) & PGDIR_MASK;
> +		if (next > end)
> +			next = end;
> +
> +		if (!pgd_present(*pgd))
> +			continue;
> +
> +		pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
> +		phys_pud_remove(pud, __pa(start), __pa(end));
> +		unmap_low_page(pud);
> +	}
> +
> +	__flush_tlb_all();
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  int __ref arch_remove_memory(u64 start, u64 size)
>  {
> @@ -687,6 +832,8 @@ int __ref arch_remove_memory(u64 start, 
>  	ret = __remove_pages(zone, start_pfn, nr_pages);
>  	WARN_ON_ONCE(ret);
>  
> +	kernel_physical_mapping_remove(start, start + size);
> +
>  	return ret;
>  }
>  #endif
> Index: linux-3.6/arch/x86/include/asm/pgtable_types.h
> ===================================================================
> --- linux-3.6.orig/arch/x86/include/asm/pgtable_types.h	2012-10-04 18:26:51.925486954 +0900
> +++ linux-3.6/arch/x86/include/asm/pgtable_types.h	2012-10-04 18:30:27.322704656 +0900
> @@ -334,6 +334,7 @@ static inline void update_page_count(int
>   * as a pte too.
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> Index: linux-3.6/arch/x86/mm/pageattr.c
> ===================================================================
> --- linux-3.6.orig/arch/x86/mm/pageattr.c	2012-10-04 18:26:51.923486952 +0900
> +++ linux-3.6/arch/x86/mm/pageattr.c	2012-10-04 18:30:27.328704662 +0900
> @@ -501,21 +501,13 @@ out_unlock:
>  	return do_split;
>  }
>  
> -static int split_large_page(pte_t *kpte, unsigned long address)
> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
>  {
>  	unsigned long pfn, pfninc = 1;
>  	unsigned int i, level;
> -	pte_t *pbase, *tmp;
> +	pte_t *tmp;
>  	pgprot_t ref_prot;
> -	struct page *base;
> -
> -	if (!debug_pagealloc)
> -		spin_unlock(&cpa_lock);
> -	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> -	if (!debug_pagealloc)
> -		spin_lock(&cpa_lock);
> -	if (!base)
> -		return -ENOMEM;
> +	struct page *base = virt_to_page(pbase);
>  
>  	spin_lock(&pgd_lock);
>  	/*
> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte,
>  	 * up for us already:
>  	 */
>  	tmp = lookup_address(address, &level);
> -	if (tmp != kpte)
> -		goto out_unlock;
> +	if (tmp != kpte) {
> +		spin_unlock(&pgd_lock);
> +		return 1;
> +	}
>  
> -	pbase = (pte_t *)page_address(base);
>  	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>  	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
>  	/*
> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte,
>  	 * going on.
>  	 */
>  	__flush_tlb_all();
> +	spin_unlock(&pgd_lock);
>  
> -	base = NULL;
> +	return 0;
> +}
>  
> -out_unlock:
> -	/*
> -	 * If we dropped out via the lookup_address check under
> -	 * pgd_lock then stick the page back into the pool:
> -	 */
> -	if (base)
> +static int split_large_page(pte_t *kpte, unsigned long address)
> +{
> +	pte_t *pbase;
> +	struct page *base;
> +
> +	if (!debug_pagealloc)
> +		spin_unlock(&cpa_lock);
> +	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> +	if (!debug_pagealloc)
> +		spin_lock(&cpa_lock);
> +	if (!base)
> +		return -ENOMEM;
> +
> +	pbase = (pte_t *)page_address(base);
> +	if (__split_large_page(kpte, address, pbase))
>  		__free_page(base);
> -	spin_unlock(&pgd_lock);
>  
>  	return 0;
>  }
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 


WARNING: multiple messages have this Message-ID (diff)
From: wujianguo <wujianguo106@gmail.com>
To: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: linux-s390@vger.kernel.org, jiang.liu@huawei.com,
	linux-ia64@vger.kernel.org, wency@cn.fujitsu.com,
	len.brown@intel.com, linux-acpi@vger.kernel.org,
	linux-sh@vger.kernel.org, wujianguo@huawei.com, x86@kernel.org,
	linux-kernel@vger.kernel.org, cmetcalf@tilera.com,
	linux-mm@kvack.org, minchan.kim@gmail.com,
	kosaki.motohiro@jp.fujitsu.com, rientjes@google.com,
	sparclinux@vger.kernel.org, qiuxishi@huawei.com, cl@linux.com,
	linuxppc-dev@lists.ozlabs.org, akpm@linux-foundation.org,
	liuj97@gmail.com
Subject: Re: [PATCH 8/10] memory-hotplug : remove page table of x86_64 architecture
Date: Tue, 09 Oct 2012 16:26:40 +0800	[thread overview]
Message-ID: <5073DFC0.3010400@gmail.com> (raw)
In-Reply-To: <506E4799.30407@jp.fujitsu.com>

Hi Congyang,
	I think we should also free pages which are used by page tables after removing
page tables of the memory.

From: Jianguo Wu <wujianguo@huawei.com>

Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
---
 arch/x86/mm/init_64.c |  110 +++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 89 insertions(+), 21 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5596dfa..81f9c3b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -675,6 +675,74 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);

+static inline void free_pagetable(struct page *page)
+{
+	struct zone *zone;
+
+	__ClearPageReserved(page);
+	__free_page(page);
+
+	zone = page_zone(page);
+	zone_span_writelock(zone);
+	zone->present_pages++;
+	zone_span_writeunlock(zone);
+	totalram_pages++;
+}
+
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (pte_val(*pte))
+			break;
+	}
+
+	/* free a pte table */
+	if (i == PTRS_PER_PTE) {
+		free_pagetable(pmd_page(*pmd));
+		pmd_clear(pmd);
+	}
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (pmd_val(*pmd))
+			break;
+	}
+
+	/* free a pmd table */
+	if (i == PTRS_PER_PMD) {
+		free_pagetable(pud_page(*pud));
+		pud_clear(pud);
+	}
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (pud_val(*pud))
+			break;
+	}
+
+	/* free a pud table */
+	if (i == PTRS_PER_PUD) {
+		free_pagetable(pgd_page(*pgd));
+		pgd_clear(pgd);
+	}
+}
+
 static void __meminit
 phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
 {
@@ -704,21 +772,19 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
 	unsigned long pages = 0, next;
 	int i = pmd_index(addr);

-	for (; i < PTRS_PER_PMD; i++, addr = next) {
+	for (; i < PTRS_PER_PMD && addr < end; i++, addr = next) {
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(addr);
 		pte_t *pte;

-		if (addr >= end)
-			break;
-
-		next = (addr & PMD_MASK) + PMD_SIZE;
+		next = pmd_addr_end(addr, end);

 		if (!pmd_present(*pmd))
 			continue;

 		if (pmd_large(*pmd)) {
-			if ((addr & ~PMD_MASK) == 0 && next <= end) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
 				set_pmd(pmd, __pmd(0));
 				pages++;
 				continue;
@@ -729,7 +795,8 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
 			 * so split 2M page to 4K page.
 			 */
 			pte = alloc_low_page(&pte_phys);
-			__split_large_page((pte_t *)pmd, addr, pte);
+			__split_large_page((pte_t *)pmd,
+					   (unsigned long)__va(addr), pte);

 			spin_lock(&init_mm.page_table_lock);
 			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
@@ -738,7 +805,8 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)

 		spin_lock(&init_mm.page_table_lock);
 		pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
-		phys_pte_remove(pte, addr, end);
+		phys_pte_remove(pte, addr, next);
+		free_pte_table(pte, pmd);
 		unmap_low_page(pte);
 		spin_unlock(&init_mm.page_table_lock);
 	}
@@ -751,21 +819,19 @@ phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
 	unsigned long pages = 0, next;
 	int i = pud_index(addr);

-	for (; i < PTRS_PER_PUD; i++, addr = next) {
+	for (; i < PTRS_PER_PUD && addr < end; i++, addr = next) {
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;

-		if (addr >= end)
-			break;
-
-		next = (addr & PUD_MASK) + PUD_SIZE;
+		next = pud_addr_end(addr, end);

 		if (!pud_present(*pud))
 			continue;

 		if (pud_large(*pud)) {
-			if ((addr & ~PUD_MASK) == 0 && next <= end) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE)) {
 				set_pud(pud, __pud(0));
 				pages++;
 				continue;
@@ -776,15 +842,18 @@ phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
 			 * so split 1G page to 2M page.
 			 */
 			pmd = alloc_low_page(&pmd_phys);
-			__split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
+			__split_large_page((pte_t *)pud,
+					   (unsigned long)__va(addr),
+					   (pte_t *)pmd);

 			spin_lock(&init_mm.page_table_lock);
 			pud_populate(&init_mm, pud, __va(pmd_phys));
 			spin_unlock(&init_mm.page_table_lock);
 		}

-		pmd = map_low_page(pmd_offset(pud, 0));
-		phys_pmd_remove(pmd, addr, end);
+		pmd = map_low_page((pmd_t *)pud_page_vaddr(*pud));
+		phys_pmd_remove(pmd, addr, next);
+		free_pmd_table(pmd, pud);
 		unmap_low_page(pmd);
 		__flush_tlb_all();
 	}
@@ -805,15 +874,14 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 		pgd_t *pgd = pgd_offset_k(start);
 		pud_t *pud;

-		next = (start + PGDIR_SIZE) & PGDIR_MASK;
-		if (next > end)
-			next = end;
+		next = pgd_addr_end(start, end);

 		if (!pgd_present(*pgd))
 			continue;

 		pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
-		phys_pud_remove(pud, __pa(start), __pa(end));
+		phys_pud_remove(pud, __pa(start), __pa(next));
+		free_pud_table(pud, pgd);
 		unmap_low_page(pud);
 	}

-- 1.7.6.1 .


On 2012-10-5 10:36, Yasuaki Ishimatsu wrote:
> From: Wen Congyang <wency@cn.fujitsu.com>
> 
> For hot removing memory, we should remove the page tables that map the memory.
> So the patch searches for the page tables that map the removed memory and
> clears them.
> 
> CC: David Rientjes <rientjes@google.com>
> CC: Jiang Liu <liuj97@gmail.com>
> CC: Len Brown <len.brown@intel.com>
> CC: Christoph Lameter <cl@linux.com>
> Cc: Minchan Kim <minchan.kim@gmail.com>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/pgtable_types.h |    1 
>  arch/x86/mm/init_64.c                |  147 +++++++++++++++++++++++++++++++++++
>  arch/x86/mm/pageattr.c               |   47 +++++------
>  3 files changed, 173 insertions(+), 22 deletions(-)
> 
> Index: linux-3.6/arch/x86/mm/init_64.c
> ===================================================================
> --- linux-3.6.orig/arch/x86/mm/init_64.c	2012-10-04 18:30:21.171698416 +0900
> +++ linux-3.6/arch/x86/mm/init_64.c	2012-10-04 18:30:27.317704652 +0900
> @@ -675,6 +675,151 @@ int arch_add_memory(int nid, u64 start, 
>  }
>  EXPORT_SYMBOL_GPL(arch_add_memory);
>  
> +static void __meminit
> +phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned pages = 0;
> +	int i = pte_index(addr);
> +
> +	pte_t *pte = pte_page + pte_index(addr);
> +
> +	for (; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
> +
> +		if (addr >= end)
> +			break;
> +
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		pages++;
> +		set_pte(pte, __pte(0));
> +	}
> +
> +	update_page_count(PG_LEVEL_4K, -pages);
> +}
> +
> +static void __meminit
> +phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned long pages = 0, next;
> +	int i = pmd_index(addr);
> +
> +	for (; i < PTRS_PER_PMD; i++, addr = next) {
> +		unsigned long pte_phys;
> +		pmd_t *pmd = pmd_page + pmd_index(addr);
> +		pte_t *pte;
> +
> +		if (addr >= end)
> +			break;
> +
> +		next = (addr & PMD_MASK) + PMD_SIZE;
> +
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_large(*pmd)) {
> +			if ((addr & ~PMD_MASK) == 0 && next <= end) {
> +				set_pmd(pmd, __pmd(0));
> +				pages++;
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 2M page, but we need to remove part of them,
> +			 * so split 2M page to 4K page.
> +			 */
> +			pte = alloc_low_page(&pte_phys);
> +			__split_large_page((pte_t *)pmd, addr, pte);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +		}
> +
> +		spin_lock(&init_mm.page_table_lock);
> +		pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
> +		phys_pte_remove(pte, addr, end);
> +		unmap_low_page(pte);
> +		spin_unlock(&init_mm.page_table_lock);
> +	}
> +	update_page_count(PG_LEVEL_2M, -pages);
> +}
> +
> +static void __meminit
> +phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned long pages = 0, next;
> +	int i = pud_index(addr);
> +
> +	for (; i < PTRS_PER_PUD; i++, addr = next) {
> +		unsigned long pmd_phys;
> +		pud_t *pud = pud_page + pud_index(addr);
> +		pmd_t *pmd;
> +
> +		if (addr >= end)
> +			break;
> +
> +		next = (addr & PUD_MASK) + PUD_SIZE;
> +
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_large(*pud)) {
> +			if ((addr & ~PUD_MASK) == 0 && next <= end) {
> +				set_pud(pud, __pud(0));
> +				pages++;
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 1G page, but we need to remove part of them,
> +			 * so split 1G page to 2M page.
> +			 */
> +			pmd = alloc_low_page(&pmd_phys);
> +			__split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pud_populate(&init_mm, pud, __va(pmd_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +		}
> +
> +		pmd = map_low_page(pmd_offset(pud, 0));
> +		phys_pmd_remove(pmd, addr, end);
> +		unmap_low_page(pmd);
> +		__flush_tlb_all();
> +	}
> +	__flush_tlb_all();
> +
> +	update_page_count(PG_LEVEL_1G, -pages);
> +}
> +
> +void __meminit
> +kernel_physical_mapping_remove(unsigned long start, unsigned long end)
> +{
> +	unsigned long next;
> +
> +	start = (unsigned long)__va(start);
> +	end = (unsigned long)__va(end);
> +
> +	for (; start < end; start = next) {
> +		pgd_t *pgd = pgd_offset_k(start);
> +		pud_t *pud;
> +
> +		next = (start + PGDIR_SIZE) & PGDIR_MASK;
> +		if (next > end)
> +			next = end;
> +
> +		if (!pgd_present(*pgd))
> +			continue;
> +
> +		pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
> +		phys_pud_remove(pud, __pa(start), __pa(end));
> +		unmap_low_page(pud);
> +	}
> +
> +	__flush_tlb_all();
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  int __ref arch_remove_memory(u64 start, u64 size)
>  {
> @@ -687,6 +832,8 @@ int __ref arch_remove_memory(u64 start, 
>  	ret = __remove_pages(zone, start_pfn, nr_pages);
>  	WARN_ON_ONCE(ret);
>  
> +	kernel_physical_mapping_remove(start, start + size);
> +
>  	return ret;
>  }
>  #endif
> Index: linux-3.6/arch/x86/include/asm/pgtable_types.h
> ===================================================================
> --- linux-3.6.orig/arch/x86/include/asm/pgtable_types.h	2012-10-04 18:26:51.925486954 +0900
> +++ linux-3.6/arch/x86/include/asm/pgtable_types.h	2012-10-04 18:30:27.322704656 +0900
> @@ -334,6 +334,7 @@ static inline void update_page_count(int
>   * as a pte too.
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> Index: linux-3.6/arch/x86/mm/pageattr.c
> ===================================================================
> --- linux-3.6.orig/arch/x86/mm/pageattr.c	2012-10-04 18:26:51.923486952 +0900
> +++ linux-3.6/arch/x86/mm/pageattr.c	2012-10-04 18:30:27.328704662 +0900
> @@ -501,21 +501,13 @@ out_unlock:
>  	return do_split;
>  }
>  
> -static int split_large_page(pte_t *kpte, unsigned long address)
> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
>  {
>  	unsigned long pfn, pfninc = 1;
>  	unsigned int i, level;
> -	pte_t *pbase, *tmp;
> +	pte_t *tmp;
>  	pgprot_t ref_prot;
> -	struct page *base;
> -
> -	if (!debug_pagealloc)
> -		spin_unlock(&cpa_lock);
> -	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> -	if (!debug_pagealloc)
> -		spin_lock(&cpa_lock);
> -	if (!base)
> -		return -ENOMEM;
> +	struct page *base = virt_to_page(pbase);
>  
>  	spin_lock(&pgd_lock);
>  	/*
> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte,
>  	 * up for us already:
>  	 */
>  	tmp = lookup_address(address, &level);
> -	if (tmp != kpte)
> -		goto out_unlock;
> +	if (tmp != kpte) {
> +		spin_unlock(&pgd_lock);
> +		return 1;
> +	}
>  
> -	pbase = (pte_t *)page_address(base);
>  	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>  	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
>  	/*
> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte,
>  	 * going on.
>  	 */
>  	__flush_tlb_all();
> +	spin_unlock(&pgd_lock);
>  
> -	base = NULL;
> +	return 0;
> +}
>  
> -out_unlock:
> -	/*
> -	 * If we dropped out via the lookup_address check under
> -	 * pgd_lock then stick the page back into the pool:
> -	 */
> -	if (base)
> +static int split_large_page(pte_t *kpte, unsigned long address)
> +{
> +	pte_t *pbase;
> +	struct page *base;
> +
> +	if (!debug_pagealloc)
> +		spin_unlock(&cpa_lock);
> +	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> +	if (!debug_pagealloc)
> +		spin_lock(&cpa_lock);
> +	if (!base)
> +		return -ENOMEM;
> +
> +	pbase = (pte_t *)page_address(base);
> +	if (__split_large_page(kpte, address, pbase))
>  		__free_page(base);
> -	spin_unlock(&pgd_lock);
>  
>  	return 0;
>  }
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 

WARNING: multiple messages have this Message-ID (diff)
From: wujianguo <wujianguo106@gmail.com>
To: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: x86@kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org, linux-acpi@vger.kernel.org,
	linux-s390@vger.kernel.org, linux-sh@vger.kernel.org,
	linux-ia64@vger.kernel.org, cmetcalf@tilera.com,
	sparclinux@vger.kernel.org, rientjes@google.com,
	liuj97@gmail.com, len.brown@intel.com, cl@linux.com,
	minchan.kim@gmail.com, akpm@linux-foundation.org,
	kosaki.motohiro@jp.fujitsu.com, wency@cn.fujitsu.com,
	wujianguo@huawei.com, qiuxishi@huawei.com, jiang.liu@huawei.com
Subject: Re: [PATCH 8/10] memory-hotplug : remove page table of x86_64 architecture
Date: Tue, 09 Oct 2012 16:26:40 +0800	[thread overview]
Message-ID: <5073DFC0.3010400@gmail.com> (raw)
In-Reply-To: <506E4799.30407@jp.fujitsu.com>

Hi Congyang,
	I think we should also free pages which are used by page tables after removing
page tables of the memory.

From: Jianguo Wu <wujianguo@huawei.com>

Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
---
 arch/x86/mm/init_64.c |  110 +++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 89 insertions(+), 21 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5596dfa..81f9c3b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -675,6 +675,74 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);

+static inline void free_pagetable(struct page *page)
+{
+	struct zone *zone;
+
+	__ClearPageReserved(page);
+	__free_page(page);
+
+	zone = page_zone(page);
+	zone_span_writelock(zone);
+	zone->present_pages++;
+	zone_span_writeunlock(zone);
+	totalram_pages++;
+}
+
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (pte_val(*pte))
+			break;
+	}
+
+	/* free a pte table */
+	if (i == PTRS_PER_PTE) {
+		free_pagetable(pmd_page(*pmd));
+		pmd_clear(pmd);
+	}
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (pmd_val(*pmd))
+			break;
+	}
+
+	/* free a pmd table */
+	if (i == PTRS_PER_PMD) {
+		free_pagetable(pud_page(*pud));
+		pud_clear(pud);
+	}
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (pud_val(*pud))
+			break;
+	}
+
+	/* free a pud table */
+	if (i == PTRS_PER_PUD) {
+		free_pagetable(pgd_page(*pgd));
+		pgd_clear(pgd);
+	}
+}
+
 static void __meminit
 phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
 {
@@ -704,21 +772,19 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
 	unsigned long pages = 0, next;
 	int i = pmd_index(addr);

-	for (; i < PTRS_PER_PMD; i++, addr = next) {
+	for (; i < PTRS_PER_PMD && addr < end; i++, addr = next) {
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(addr);
 		pte_t *pte;

-		if (addr >= end)
-			break;
-
-		next = (addr & PMD_MASK) + PMD_SIZE;
+		next = pmd_addr_end(addr, end);

 		if (!pmd_present(*pmd))
 			continue;

 		if (pmd_large(*pmd)) {
-			if ((addr & ~PMD_MASK) == 0 && next <= end) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
 				set_pmd(pmd, __pmd(0));
 				pages++;
 				continue;
@@ -729,7 +795,8 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
 			 * so split 2M page to 4K page.
 			 */
 			pte = alloc_low_page(&pte_phys);
-			__split_large_page((pte_t *)pmd, addr, pte);
+			__split_large_page((pte_t *)pmd,
+					   (unsigned long)__va(addr), pte);

 			spin_lock(&init_mm.page_table_lock);
 			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
@@ -738,7 +805,8 @@ phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)

 		spin_lock(&init_mm.page_table_lock);
 		pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
-		phys_pte_remove(pte, addr, end);
+		phys_pte_remove(pte, addr, next);
+		free_pte_table(pte, pmd);
 		unmap_low_page(pte);
 		spin_unlock(&init_mm.page_table_lock);
 	}
@@ -751,21 +819,19 @@ phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
 	unsigned long pages = 0, next;
 	int i = pud_index(addr);

-	for (; i < PTRS_PER_PUD; i++, addr = next) {
+	for (; i < PTRS_PER_PUD && addr < end; i++, addr = next) {
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;

-		if (addr >= end)
-			break;
-
-		next = (addr & PUD_MASK) + PUD_SIZE;
+		next = pud_addr_end(addr, end);

 		if (!pud_present(*pud))
 			continue;

 		if (pud_large(*pud)) {
-			if ((addr & ~PUD_MASK) == 0 && next <= end) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE)) {
 				set_pud(pud, __pud(0));
 				pages++;
 				continue;
@@ -776,15 +842,18 @@ phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
 			 * so split 1G page to 2M page.
 			 */
 			pmd = alloc_low_page(&pmd_phys);
-			__split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
+			__split_large_page((pte_t *)pud,
+					   (unsigned long)__va(addr),
+					   (pte_t *)pmd);

 			spin_lock(&init_mm.page_table_lock);
 			pud_populate(&init_mm, pud, __va(pmd_phys));
 			spin_unlock(&init_mm.page_table_lock);
 		}

-		pmd = map_low_page(pmd_offset(pud, 0));
-		phys_pmd_remove(pmd, addr, end);
+		pmd = map_low_page((pmd_t *)pud_page_vaddr(*pud));
+		phys_pmd_remove(pmd, addr, next);
+		free_pmd_table(pmd, pud);
 		unmap_low_page(pmd);
 		__flush_tlb_all();
 	}
@@ -805,15 +874,14 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 		pgd_t *pgd = pgd_offset_k(start);
 		pud_t *pud;

-		next = (start + PGDIR_SIZE) & PGDIR_MASK;
-		if (next > end)
-			next = end;
+		next = pgd_addr_end(start, end);

 		if (!pgd_present(*pgd))
 			continue;

 		pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
-		phys_pud_remove(pud, __pa(start), __pa(end));
+		phys_pud_remove(pud, __pa(start), __pa(next));
+		free_pud_table(pud, pgd);
 		unmap_low_page(pud);
 	}

-- 1.7.6.1 .


On 2012-10-5 10:36, Yasuaki Ishimatsu wrote:
> From: Wen Congyang <wency@cn.fujitsu.com>
> 
> For hot removing memory, we should remove the page tables that map the memory.
> So the patch searches for the page tables that map the removed memory and
> clears them.
> 
> CC: David Rientjes <rientjes@google.com>
> CC: Jiang Liu <liuj97@gmail.com>
> CC: Len Brown <len.brown@intel.com>
> CC: Christoph Lameter <cl@linux.com>
> Cc: Minchan Kim <minchan.kim@gmail.com>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/pgtable_types.h |    1 
>  arch/x86/mm/init_64.c                |  147 +++++++++++++++++++++++++++++++++++
>  arch/x86/mm/pageattr.c               |   47 +++++------
>  3 files changed, 173 insertions(+), 22 deletions(-)
> 
> Index: linux-3.6/arch/x86/mm/init_64.c
> ===================================================================
> --- linux-3.6.orig/arch/x86/mm/init_64.c	2012-10-04 18:30:21.171698416 +0900
> +++ linux-3.6/arch/x86/mm/init_64.c	2012-10-04 18:30:27.317704652 +0900
> @@ -675,6 +675,151 @@ int arch_add_memory(int nid, u64 start, 
>  }
>  EXPORT_SYMBOL_GPL(arch_add_memory);
>  
> +static void __meminit
> +phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned pages = 0;
> +	int i = pte_index(addr);
> +
> +	pte_t *pte = pte_page + pte_index(addr);
> +
> +	for (; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
> +
> +		if (addr >= end)
> +			break;
> +
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		pages++;
> +		set_pte(pte, __pte(0));
> +	}
> +
> +	update_page_count(PG_LEVEL_4K, -pages);
> +}
> +
> +static void __meminit
> +phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned long pages = 0, next;
> +	int i = pmd_index(addr);
> +
> +	for (; i < PTRS_PER_PMD; i++, addr = next) {
> +		unsigned long pte_phys;
> +		pmd_t *pmd = pmd_page + pmd_index(addr);
> +		pte_t *pte;
> +
> +		if (addr >= end)
> +			break;
> +
> +		next = (addr & PMD_MASK) + PMD_SIZE;
> +
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_large(*pmd)) {
> +			if ((addr & ~PMD_MASK) == 0 && next <= end) {
> +				set_pmd(pmd, __pmd(0));
> +				pages++;
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 2M page, but we need to remove part of them,
> +			 * so split 2M page to 4K page.
> +			 */
> +			pte = alloc_low_page(&pte_phys);
> +			__split_large_page((pte_t *)pmd, addr, pte);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +		}
> +
> +		spin_lock(&init_mm.page_table_lock);
> +		pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
> +		phys_pte_remove(pte, addr, end);
> +		unmap_low_page(pte);
> +		spin_unlock(&init_mm.page_table_lock);
> +	}
> +	update_page_count(PG_LEVEL_2M, -pages);
> +}
> +
> +static void __meminit
> +phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
> +{
> +	unsigned long pages = 0, next;
> +	int i = pud_index(addr);
> +
> +	for (; i < PTRS_PER_PUD; i++, addr = next) {
> +		unsigned long pmd_phys;
> +		pud_t *pud = pud_page + pud_index(addr);
> +		pmd_t *pmd;
> +
> +		if (addr >= end)
> +			break;
> +
> +		next = (addr & PUD_MASK) + PUD_SIZE;
> +
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_large(*pud)) {
> +			if ((addr & ~PUD_MASK) == 0 && next <= end) {
> +				set_pud(pud, __pud(0));
> +				pages++;
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 1G page, but we need to remove part of them,
> +			 * so split 1G page to 2M page.
> +			 */
> +			pmd = alloc_low_page(&pmd_phys);
> +			__split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pud_populate(&init_mm, pud, __va(pmd_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +		}
> +
> +		pmd = map_low_page(pmd_offset(pud, 0));
> +		phys_pmd_remove(pmd, addr, end);
> +		unmap_low_page(pmd);
> +		__flush_tlb_all();
> +	}
> +	__flush_tlb_all();
> +
> +	update_page_count(PG_LEVEL_1G, -pages);
> +}
> +
> +void __meminit
> +kernel_physical_mapping_remove(unsigned long start, unsigned long end)
> +{
> +	unsigned long next;
> +
> +	start = (unsigned long)__va(start);
> +	end = (unsigned long)__va(end);
> +
> +	for (; start < end; start = next) {
> +		pgd_t *pgd = pgd_offset_k(start);
> +		pud_t *pud;
> +
> +		next = (start + PGDIR_SIZE) & PGDIR_MASK;
> +		if (next > end)
> +			next = end;
> +
> +		if (!pgd_present(*pgd))
> +			continue;
> +
> +		pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
> +		phys_pud_remove(pud, __pa(start), __pa(end));
> +		unmap_low_page(pud);
> +	}
> +
> +	__flush_tlb_all();
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  int __ref arch_remove_memory(u64 start, u64 size)
>  {
> @@ -687,6 +832,8 @@ int __ref arch_remove_memory(u64 start, 
>  	ret = __remove_pages(zone, start_pfn, nr_pages);
>  	WARN_ON_ONCE(ret);
>  
> +	kernel_physical_mapping_remove(start, start + size);
> +
>  	return ret;
>  }
>  #endif
> Index: linux-3.6/arch/x86/include/asm/pgtable_types.h
> ===================================================================
> --- linux-3.6.orig/arch/x86/include/asm/pgtable_types.h	2012-10-04 18:26:51.925486954 +0900
> +++ linux-3.6/arch/x86/include/asm/pgtable_types.h	2012-10-04 18:30:27.322704656 +0900
> @@ -334,6 +334,7 @@ static inline void update_page_count(int
>   * as a pte too.
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> Index: linux-3.6/arch/x86/mm/pageattr.c
> ===================================================================
> --- linux-3.6.orig/arch/x86/mm/pageattr.c	2012-10-04 18:26:51.923486952 +0900
> +++ linux-3.6/arch/x86/mm/pageattr.c	2012-10-04 18:30:27.328704662 +0900
> @@ -501,21 +501,13 @@ out_unlock:
>  	return do_split;
>  }
>  
> -static int split_large_page(pte_t *kpte, unsigned long address)
> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
>  {
>  	unsigned long pfn, pfninc = 1;
>  	unsigned int i, level;
> -	pte_t *pbase, *tmp;
> +	pte_t *tmp;
>  	pgprot_t ref_prot;
> -	struct page *base;
> -
> -	if (!debug_pagealloc)
> -		spin_unlock(&cpa_lock);
> -	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> -	if (!debug_pagealloc)
> -		spin_lock(&cpa_lock);
> -	if (!base)
> -		return -ENOMEM;
> +	struct page *base = virt_to_page(pbase);
>  
>  	spin_lock(&pgd_lock);
>  	/*
> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte,
>  	 * up for us already:
>  	 */
>  	tmp = lookup_address(address, &level);
> -	if (tmp != kpte)
> -		goto out_unlock;
> +	if (tmp != kpte) {
> +		spin_unlock(&pgd_lock);
> +		return 1;
> +	}
>  
> -	pbase = (pte_t *)page_address(base);
>  	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>  	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
>  	/*
> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte,
>  	 * going on.
>  	 */
>  	__flush_tlb_all();
> +	spin_unlock(&pgd_lock);
>  
> -	base = NULL;
> +	return 0;
> +}
>  
> -out_unlock:
> -	/*
> -	 * If we dropped out via the lookup_address check under
> -	 * pgd_lock then stick the page back into the pool:
> -	 */
> -	if (base)
> +static int split_large_page(pte_t *kpte, unsigned long address)
> +{
> +	pte_t *pbase;
> +	struct page *base;
> +
> +	if (!debug_pagealloc)
> +		spin_unlock(&cpa_lock);
> +	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> +	if (!debug_pagealloc)
> +		spin_lock(&cpa_lock);
> +	if (!base)
> +		return -ENOMEM;
> +
> +	pbase = (pte_t *)page_address(base);
> +	if (__split_large_page(kpte, address, pbase))
>  		__free_page(base);
> -	spin_unlock(&pgd_lock);
>  
>  	return 0;
>  }
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2012-10-09  8:26 UTC|newest]

Thread overview: 129+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-10-05  2:20 [PATCH 0/10] memory-hotplug: hot-remove physical memory Yasuaki Ishimatsu
2012-10-05  2:20 ` Yasuaki Ishimatsu
2012-10-05  2:20 ` Yasuaki Ishimatsu
2012-10-05  2:20 ` Yasuaki Ishimatsu
2012-10-05  2:20 ` Yasuaki Ishimatsu
2012-10-05  2:25 ` [PATCH 1/10] memory-hotplug : check whether memory is offline or not when removing memory Yasuaki Ishimatsu
2012-10-05  2:25   ` Yasuaki Ishimatsu
2012-10-05  2:25   ` Yasuaki Ishimatsu
2012-10-05  2:25   ` Yasuaki Ishimatsu
2012-10-05 19:27   ` KOSAKI Motohiro
2012-10-05 19:27     ` KOSAKI Motohiro
2012-10-05 19:27     ` KOSAKI Motohiro
2012-10-05 19:27     ` KOSAKI Motohiro
2012-10-19 10:44     ` Wen Congyang
2012-10-19 10:44       ` Wen Congyang
2012-10-19 10:44       ` Wen Congyang
2012-10-19 10:44       ` Wen Congyang
2012-10-19 14:15       ` Wen Congyang
2012-10-19 14:15         ` Wen Congyang
2012-10-19 14:15         ` Wen Congyang
2012-10-19 14:15         ` Wen Congyang
2012-10-19 18:33         ` KOSAKI Motohiro
2012-10-19 18:33           ` KOSAKI Motohiro
2012-10-19 18:33           ` KOSAKI Motohiro
2012-10-19 18:33           ` KOSAKI Motohiro
2012-10-20  0:50           ` Wen Congyang
2012-10-20  0:50             ` Wen Congyang
2012-10-20  0:50             ` Wen Congyang
2012-10-20  0:50             ` Wen Congyang
2012-10-05  2:26 ` [PATCH 2/10] memory-hotplug : remove /sys/firmware/memmap/X sysfs Yasuaki Ishimatsu
2012-10-05  2:26   ` Yasuaki Ishimatsu
2012-10-05  2:26   ` Yasuaki Ishimatsu
2012-10-05  2:26   ` Yasuaki Ishimatsu
2012-10-05  2:26   ` Yasuaki Ishimatsu
2012-10-05 19:36   ` KOSAKI Motohiro
2012-10-05 19:36     ` KOSAKI Motohiro
2012-10-05 19:36     ` KOSAKI Motohiro
2012-10-05 19:36     ` KOSAKI Motohiro
2012-10-11  7:06     ` Yasuaki Ishimatsu
2012-10-11  7:06       ` Yasuaki Ishimatsu
2012-10-11  7:06       ` Yasuaki Ishimatsu
2012-10-11  7:06       ` Yasuaki Ishimatsu
2012-10-11  7:06       ` Yasuaki Ishimatsu
2012-10-05  2:29 ` [PATCH 3/10] memory-hotplug : introduce new function arch_remove_memory() for removing page table depends on architecture Yasuaki Ishimatsu
2012-10-05  2:29   ` Yasuaki Ishimatsu
2012-10-05  2:29   ` Yasuaki Ishimatsu
2012-10-05  2:29   ` Yasuaki Ishimatsu
2012-10-05  2:29   ` [PATCH 3/10] memory-hotplug : introduce new function arch_remove_memory() for removing page table de Yasuaki Ishimatsu
2012-10-05  2:31 ` [PATCH 4/10] memory-hotplug : unregister memory section on SPARSEMEM_VMEMMAP Yasuaki Ishimatsu
2012-10-05  2:31   ` Yasuaki Ishimatsu
2012-10-05  2:31   ` Yasuaki Ishimatsu
2012-10-05  2:31   ` Yasuaki Ishimatsu
2012-10-05  2:31   ` Yasuaki Ishimatsu
2012-10-05  2:32 ` [PATCH 5/10] memory-hotplug : memory-hotplug: check page type in get_page_bootmem Yasuaki Ishimatsu
2012-10-05  2:32   ` Yasuaki Ishimatsu
2012-10-05  2:32   ` Yasuaki Ishimatsu
2012-10-05  2:32   ` Yasuaki Ishimatsu
2012-10-05  2:32   ` Yasuaki Ishimatsu
2012-10-12 19:28   ` KOSAKI Motohiro
2012-10-12 19:28     ` KOSAKI Motohiro
2012-10-12 19:28     ` KOSAKI Motohiro
2012-10-12 19:28     ` KOSAKI Motohiro
2012-10-19  0:49     ` Yasuaki Ishimatsu
2012-10-19  0:49       ` Yasuaki Ishimatsu
2012-10-19  0:49       ` Yasuaki Ishimatsu
2012-10-19  0:49       ` Yasuaki Ishimatsu
2012-10-19  0:49       ` Yasuaki Ishimatsu
2012-10-19  1:55       ` Wen Congyang
2012-10-19  1:55         ` Wen Congyang
2012-10-19  1:55         ` Wen Congyang
2012-10-19  1:55         ` Wen Congyang
2012-10-05  2:33 ` [PATCH 6/10] memory-hotplug : implement register_page_bootmem_info_section of sparse-vmemmap Yasuaki Ishimatsu
2012-10-05  2:33   ` Yasuaki Ishimatsu
2012-10-05  2:33   ` Yasuaki Ishimatsu
2012-10-05  2:33   ` Yasuaki Ishimatsu
2012-10-05  2:34 ` [PATCH 7/10] memory-hotplug : remove memmap " Yasuaki Ishimatsu
2012-10-05  2:34   ` Yasuaki Ishimatsu
2012-10-05  2:34   ` Yasuaki Ishimatsu
2012-10-05  2:34   ` Yasuaki Ishimatsu
2012-10-05  2:34   ` Yasuaki Ishimatsu
2012-10-05  2:36 ` [PATCH 8/10] memory-hotplug : remove page table of x86_64 architecture Yasuaki Ishimatsu
2012-10-05  2:36   ` Yasuaki Ishimatsu
2012-10-05  2:36   ` Yasuaki Ishimatsu
2012-10-05  2:36   ` Yasuaki Ishimatsu
2012-10-08  4:37   ` Andi Kleen
2012-10-08  4:37     ` Andi Kleen
2012-10-08  4:37     ` Andi Kleen
2012-10-08  4:37     ` Andi Kleen
2012-10-08  5:23     ` Wen Congyang
2012-10-08  5:23       ` Wen Congyang
2012-10-08  5:23       ` Wen Congyang
2012-10-08  5:23       ` Wen Congyang
2012-10-11  0:35       ` Ni zhan Chen
2012-10-11  0:35         ` Ni zhan Chen
2012-10-11  0:35         ` Ni zhan Chen
2012-10-11  0:35         ` Ni zhan Chen
2012-10-09  8:26   ` wujianguo [this message]
2012-10-09  8:26     ` wujianguo
2012-10-09  8:26     ` wujianguo
2012-10-09  8:26     ` wujianguo
2012-10-22  7:11     ` Wen Congyang
2012-10-22  7:11       ` Wen Congyang
2012-10-22  7:11       ` Wen Congyang
2012-10-22  7:11       ` Wen Congyang
2012-10-23  7:09       ` wujianguo
2012-10-23  7:09         ` wujianguo
2012-10-23  7:09         ` wujianguo
2012-10-23  7:09         ` wujianguo
2012-10-23  7:41         ` Wen Congyang
2012-10-23  7:41           ` Wen Congyang
2012-10-23  7:41           ` Wen Congyang
2012-10-23  7:41           ` Wen Congyang
2012-10-05  2:37 ` [PATCH 9/10] memory-hotplug : memory_hotplug: clear zone when removing the memory Yasuaki Ishimatsu
2012-10-05  2:37   ` Yasuaki Ishimatsu
2012-10-05  2:37   ` Yasuaki Ishimatsu
2012-10-05  2:37   ` Yasuaki Ishimatsu
2012-10-05  2:38 ` [PATCH 10/10] memory-hotplug : remove sysfs file of node Yasuaki Ishimatsu
2012-10-05  2:38   ` Yasuaki Ishimatsu
2012-10-05  2:38   ` Yasuaki Ishimatsu
2012-10-05  2:38   ` Yasuaki Ishimatsu
2012-10-05  2:38   ` Yasuaki Ishimatsu
2012-10-05 19:06 ` [PATCH 0/10] memory-hotplug: hot-remove physical memory KOSAKI Motohiro
2012-10-05 19:06   ` KOSAKI Motohiro
2012-10-05 19:06   ` KOSAKI Motohiro
2012-10-05 19:06   ` KOSAKI Motohiro
2012-10-08  5:26   ` Wen Congyang
2012-10-08  5:26     ` Wen Congyang
2012-10-08  5:26     ` Wen Congyang
2012-10-08  5:26     ` Wen Congyang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5073DFC0.3010400@gmail.com \
    --to=wujianguo106@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=cl@linux.com \
    --cc=cmetcalf@tilera.com \
    --cc=isimatu.yasuaki@jp.fujitsu.com \
    --cc=jiang.liu@huawei.com \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=len.brown@intel.com \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-ia64@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=linux-sh@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=liuj97@gmail.com \
    --cc=minchan.kim@gmail.com \
    --cc=qiuxishi@huawei.com \
    --cc=rientjes@google.com \
    --cc=sparclinux@vger.kernel.org \
    --cc=wency@cn.fujitsu.com \
    --cc=wujianguo@huawei.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.