From: Daniel Phillips
To: paulmck@us.ibm.com
Cc: "Stephen C. Tweedie", Andrew Morton, Christoph Hellwig, linux-kernel, linux-mm
Subject: Re: Non-GPL export of invalidate_mmap_range
Date: Fri, 20 Feb 2004 00:07:25 -0500
Message-Id: <200402200007.25832.phillips@arcor.de>
In-Reply-To: <20040219194751.GN1269@us.ibm.com>
References: <20040216190927.GA2969@us.ibm.com> <200402192106.02086.phillips@arcor.de> <20040219194751.GN1269@us.ibm.com>

On Thursday 19 February 2004 14:47, Paul E. McKenney wrote:
> OK, I surrender.  I got some private email agreeing with this
> viewpoint.  Any dissenters, speak soon, or...

An implementation is going to look something like the patch below.
Unfortunately, I don't think there is a way around passing an extra
parameter all the way down the unmap call chain.  Doubly unfortunately,
this gives no benefit at all to anybody who doesn't use a clustered
filesystem (which is nearly everybody), while there is a marginal cost.
Do you know a better way?

Anyway, this is the price of correct MAP_PRIVATE semantics for
clustered filesystems.  At least I have quantified it, so we can decide
whether it's worth it.  (My opinion: correctness is always worth it.)
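To make the intended use concrete, here is roughly how I imagine a
cluster filesystem's revoke path calling the new interface when another
node claims a write lock on part of a file.  The function name and the
writeback step are invented for illustration; only the
invalidate_mmap_range call is real:

	#include <linux/fs.h>
	#include <linux/mm.h>

	/*
	 * Hypothetical caller, not part of the patch below: drop ptes
	 * mapping a byte range of a cluster-coherent file.  Passing
	 * zap = 0 tears down only ptes that point at page cache pages,
	 * so anonymous pages instantiated by MAP_PRIVATE copy-on-write
	 * faults survive, which is the point of the exercise.
	 */
	static void clusterfs_revoke_range(struct address_space *mapping,
					   loff_t offset, loff_t length)
	{
		invalidate_mmap_range(mapping, offset, length, 0);
		/* ...then write back and release the cached pages */
	}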
Regards,

Daniel

--- 2.6.3.clean/include/linux/mm.h	2004-02-17 22:57:13.000000000 -0500
+++ 2.6.3/include/linux/mm.h	2004-02-19 23:18:08.000000000 -0500
@@ -434,9 +434,7 @@
 			unsigned long size);
 int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted);
-void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
-			unsigned long address, unsigned long size);
+		unsigned long end_addr, unsigned long *nr_accounted, int zap);
 void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
@@ -444,8 +442,7 @@
 			unsigned long size, pgprot_t prot);
 
 extern void invalidate_mmap_range(struct address_space *mapping,
-				  loff_t const holebegin,
-				  loff_t const holelen);
+				  loff_t const holebegin, loff_t const holelen, int zap);
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
--- 2.6.3.clean/mm/memory.c	2004-02-17 22:57:47.000000000 -0500
+++ 2.6.3/mm/memory.c	2004-02-19 23:48:23.000000000 -0500
@@ -386,7 +386,7 @@
 
 static void zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd,
-		unsigned long address, unsigned long size)
+		unsigned long address, unsigned long size, int zap)
 {
 	unsigned long offset;
 	pte_t *ptep;
@@ -414,7 +414,7 @@
 			tlb_remove_tlb_entry(tlb, ptep, address+offset);
 			if (pfn_valid(pfn)) {
 				struct page *page = pfn_to_page(pfn);
-				if (!PageReserved(page)) {
+				if (!PageReserved(page) && (zap || (page->mapping && !PageSwapCache(page)))) {
 					if (pte_dirty(pte))
 						set_page_dirty(page);
 					if (page->mapping && pte_young(pte) &&
@@ -436,7 +436,7 @@
 
 static void zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir,
-		unsigned long address, unsigned long size)
+		unsigned long address, unsigned long size, int zap)
 {
 	pmd_t * pmd;
 	unsigned long end;
@@ -453,14 +453,14 @@
 	if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
 		end = ((address + PGDIR_SIZE) & PGDIR_MASK);
 	do {
-		zap_pte_range(tlb, pmd, address, end - address);
-		address = (address + PMD_SIZE) & PMD_MASK;
+		zap_pte_range(tlb, pmd, address, end - address, zap);
+		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address < end);
 }
 
-void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
-			unsigned long address, unsigned long end)
+static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			unsigned long address, unsigned long end, int zap)
 {
 	pgd_t * dir;
 
@@ -474,7 +474,7 @@
 	dir = pgd_offset(vma->vm_mm, address);
 	tlb_start_vma(tlb, vma);
 	do {
-		zap_pmd_range(tlb, dir, address, end - address);
+		zap_pmd_range(tlb, dir, address, end - address, zap);
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
@@ -524,7 +524,7 @@
  */
 int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted)
+		unsigned long end_addr, unsigned long *nr_accounted, int zap)
 {
 	unsigned long zap_bytes = ZAP_BLOCK_SIZE;
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
@@ -568,7 +568,7 @@
 				tlb_start_valid = 1;
 			}
 
-			unmap_page_range(*tlbp, vma, start, start + block);
+			unmap_page_range(*tlbp, vma, start, start + block, zap);
 			start += block;
 			zap_bytes -= block;
 			if ((long)zap_bytes > 0)
@@ -594,8 +594,8 @@
  * @address: starting address of pages to zap
  * @size: number of bytes to zap
  */
-void zap_page_range(struct vm_area_struct *vma,
-		unsigned long address, unsigned long size)
+void invalidate_page_range(struct vm_area_struct *vma,
+		unsigned long address, unsigned long size, int zap)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather *tlb;
@@ -612,11 +612,17 @@
 	lru_add_drain();
 	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
-	unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted);
+	unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, zap);
 	tlb_finish_mmu(tlb, address, end);
 	spin_unlock(&mm->page_table_lock);
 }
 
+void zap_page_range(struct vm_area_struct *vma,
+		unsigned long address, unsigned long size)
+{
+	invalidate_page_range(vma, address, size, 1);
+}
+
 /*
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
@@ -1095,9 +1101,9 @@
 			continue; /* Mapping disjoint from hole. */
 		zba = (hba <= vba) ? vba : hba;
 		zea = (vea <= hea) ? vea : hea;
-		zap_page_range(vp,
+		invalidate_page_range(vp,
 			((zba - vba) << PAGE_SHIFT) + vp->vm_start,
-			(zea - zba + 1) << PAGE_SHIFT);
+			(zea - zba + 1) << PAGE_SHIFT, 1);
 	}
 }
 
@@ -1116,7 +1122,7 @@
  * end of the file.
  */
 void invalidate_mmap_range(struct address_space *mapping,
-		loff_t const holebegin, loff_t const holelen)
+		loff_t const holebegin, loff_t const holelen, int zap)
 {
 	unsigned long hba = holebegin >> PAGE_SHIFT;
 	unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -1156,7 +1162,7 @@
 	if (inode->i_size < offset)
 		goto do_expand;
 	i_size_write(inode, offset);
-	invalidate_mmap_range(mapping, offset + PAGE_SIZE - 1, 0);
+	invalidate_mmap_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
 	truncate_inode_pages(mapping, offset);
 	goto out_truncate;
--- 2.6.3.clean/mm/mmap.c	2004-02-17 22:58:32.000000000 -0500
+++ 2.6.3/mm/mmap.c	2004-02-19 22:46:01.000000000 -0500
@@ -1134,7 +1134,7 @@
 
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
-	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
+	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, 1);
 	vm_unacct_memory(nr_accounted);
 
 	if (is_hugepage_only_range(start, end - start))
@@ -1436,7 +1436,7 @@
 	flush_cache_mm(mm);
 	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
 	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
-					~0UL, &nr_accounted);
+					~0UL, &nr_accounted, 1);
 	vm_unacct_memory(nr_accounted);
 	BUG_ON(mm->map_count);	/* This is just debugging */
 	clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
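
To spell out the semantics: zap_page_range becomes a wrapper that
passes zap = 1, so every existing caller (munmap, exit_mmap,
truncation) keeps the old unmap-everything behaviour, and only a
zap = 0 caller gets the selective invalidation.  The test added in
zap_pte_range amounts to the following predicate, restated here as a
standalone helper purely for illustration (it is not in the patch):

	#include <linux/mm.h>
	#include <linux/page-flags.h>

	/*
	 * Illustrative restatement of the new zap_pte_range test:
	 * should this page get the full teardown (dirty and accessed
	 * bookkeeping, rmap removal, release into the mmu_gather)?
	 * zap != 0 preserves the old behaviour and takes everything;
	 * zap == 0 takes only file-backed pages, i.e. pages with a
	 * mapping that are not swap cache, leaving anonymous
	 * copy-on-write pages in place.
	 */
	static inline int page_needs_zap(struct page *page, int zap)
	{
		if (PageReserved(page))
			return 0;
		return zap || (page->mapping && !PageSwapCache(page));
	}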