All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alistair Popple <apopple@nvidia.com>
To: Dan Williams <dan.j.williams@intel.com>
Cc: linux-mm@kvack.org, vishal.l.verma@intel.com,
	dave.jiang@intel.com, logang@deltatee.com, bhelgaas@google.com,
	jack@suse.cz, jgg@ziepe.ca, catalin.marinas@arm.com,
	will@kernel.org, mpe@ellerman.id.au, npiggin@gmail.com,
	dave.hansen@linux.intel.com, ira.weiny@intel.com,
	willy@infradead.org, djwong@kernel.org, tytso@mit.edu,
	linmiaohe@huawei.com, david@redhat.com, peterx@redhat.com,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-ext4@vger.kernel.org, linux-xfs@vger.kernel.org,
	jhubbard@nvidia.com, hch@lst.de, david@fromorbit.com
Subject: Re: [PATCH 07/12] huge_memory: Allow mappings of PMD sized pages
Date: Mon, 14 Oct 2024 17:53:31 +1100	[thread overview]
Message-ID: <87bjznnp6v.fsf@nvdebian.thelocal> (raw)
In-Reply-To: <66f61ce4da80_964f2294fb@dwillia2-xfh.jf.intel.com.notmuch>


Dan Williams <dan.j.williams@intel.com> writes:

> Alistair Popple wrote:
>> Currently DAX folio/page reference counts are managed differently to
>> normal pages. To allow these to be managed the same as normal pages
>> introduce dax_insert_pfn_pmd. This will map the entire PMD-sized folio
>> and take references as it would for a normally mapped page.
>> 
>> This is distinct from the current mechanism, vmf_insert_pfn_pmd, which
>> simply inserts a special devmap PMD entry into the page table without
>> holding a reference to the page for the mapping.
>
> It would be useful to mention the rationale for the locking changes and
> your understanding of the new "pgtable deposit" handling, because those
> things make this not a trivial conversion.

My intent was not to change the locking for the existing
vmf_insert_pfn_pmd() but just to move it up a level in the stack so
dax_insert_pfn_pmd() could do the metadata manipulation while holding
the lock. Looks like I didn't get that quite right though, so I will
review it for the next version.

>> 
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> ---
>>  include/linux/huge_mm.h |  1 +-
>>  mm/huge_memory.c        | 57 ++++++++++++++++++++++++++++++++++--------
>>  2 files changed, 48 insertions(+), 10 deletions(-)
>> 
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index d3a1872..eaf3f78 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -40,6 +40,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>  
>>  vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
>>  vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
>> +vm_fault_t dax_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
>>  vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
>>  
>>  enum transparent_hugepage_flag {
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index e8985a4..790041e 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1237,14 +1237,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>>  {
>>  	struct mm_struct *mm = vma->vm_mm;
>>  	pmd_t entry;
>> -	spinlock_t *ptl;
>>  
>> -	ptl = pmd_lock(mm, pmd);
>>  	if (!pmd_none(*pmd)) {
>>  		if (write) {
>>  			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
>>  				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
>> -				goto out_unlock;
>> +				return;
>>  			}
>>  			entry = pmd_mkyoung(*pmd);
>>  			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>> @@ -1252,7 +1250,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>>  				update_mmu_cache_pmd(vma, addr, pmd);
>>  		}
>>  
>> -		goto out_unlock;
>> +		return;
>>  	}
>>  
>>  	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
>> @@ -1271,11 +1269,6 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>>  
>>  	set_pmd_at(mm, addr, pmd, entry);
>>  	update_mmu_cache_pmd(vma, addr, pmd);
>> -
>> -out_unlock:
>> -	spin_unlock(ptl);
>> -	if (pgtable)
>> -		pte_free(mm, pgtable);
>>  }
>>  
>>  /**
>> @@ -1294,6 +1287,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	pgprot_t pgprot = vma->vm_page_prot;
>>  	pgtable_t pgtable = NULL;
>> +	spinlock_t *ptl;
>>  
>>  	/*
>>  	 * If we had pmd_special, we could avoid all these restrictions,
>> @@ -1316,12 +1310,55 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
>>  	}
>>  
>>  	track_pfn_insert(vma, &pgprot, pfn);
>> -
>> +	ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>  	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
>> +	spin_unlock(ptl);
>> +	if (pgtable)
>> +		pte_free(vma->vm_mm, pgtable);
>> +
>>  	return VM_FAULT_NOPAGE;
>>  }
>>  EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
>>  
>> +vm_fault_t dax_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	unsigned long addr = vmf->address & PMD_MASK;
>> +	struct mm_struct *mm = vma->vm_mm;
>> +	spinlock_t *ptl;
>> +	pgtable_t pgtable = NULL;
>> +	struct folio *folio;
>> +	struct page *page;
>> +
>> +	if (addr < vma->vm_start || addr >= vma->vm_end)
>> +		return VM_FAULT_SIGBUS;
>> +
>> +	if (arch_needs_pgtable_deposit()) {
>> +		pgtable = pte_alloc_one(vma->vm_mm);
>> +		if (!pgtable)
>> +			return VM_FAULT_OOM;
>> +	}
>> +
>> +	track_pfn_insert(vma, &vma->vm_page_prot, pfn);
>> +
>> +	ptl = pmd_lock(mm, vmf->pmd);
>> +	if (pmd_none(*vmf->pmd)) {
>> +		page = pfn_t_to_page(pfn);
>> +		folio = page_folio(page);
>> +		folio_get(folio);
>> +		folio_add_file_rmap_pmd(folio, page, vma);
>> +		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
>> +	}
>> +	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, vma->vm_page_prot,
>> +		write, pgtable);
>> +	spin_unlock(ptl);
>> +	if (pgtable)
>> +		pte_free(mm, pgtable);
>
> Are not the deposit rules that the extra page table stick around for the
> lifetime of the inserted pte? So would that not require this incremental
> change?

Yeah, thanks for catching this.

> ---
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index ea65c2db2bb1..5ef1e5d21a96 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1232,7 +1232,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>  
>  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>  			   pmd_t *pmd, unsigned long pfn, pgprot_t prot,
> -			   bool write, pgtable_t pgtable)
> +			   bool write, pgtable_t *pgtable)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	pmd_t entry;
> @@ -1258,10 +1258,10 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>  		entry = maybe_pmd_mkwrite(entry, vma);
>  	}
>  
> -	if (pgtable) {
> -		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +	if (*pgtable) {
> +		pgtable_trans_huge_deposit(mm, pmd, *pgtable);
>  		mm_inc_nr_ptes(mm);
> -		pgtable = NULL;
> +		*pgtable = NULL;
>  	}
>  
>  	set_pmd_at(mm, addr, pmd, entry);
> @@ -1306,7 +1306,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, bool writ
>  
>  	track_pfn_insert(vma, &pgprot, pfn);
>  	ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> -	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
> +	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, &pgtable);
>  	spin_unlock(ptl);
>  	if (pgtable)
>  		pte_free(vma->vm_mm, pgtable);
> @@ -1344,8 +1344,8 @@ vm_fault_t dax_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, bool writ
>  		folio_add_file_rmap_pmd(folio, page, vma);
>  		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
>  	}
> -	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, vma->vm_page_prot,
> -		write, pgtable);
> +	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, vma->vm_page_prot, write,
> +		       &pgtable);
>  	spin_unlock(ptl);
>  	if (pgtable)
>  		pte_free(mm, pgtable);
> ---
>
> Along these lines it would be lovely if someone from the PowerPC side
> could test these changes, or if someone has a canned qemu command line
> to test radix vs hash with pmem+dax that they can share?

Michael, Nick, do you know of a qemu command or anyone who might?

>> +
>> +	return VM_FAULT_NOPAGE;
>> +}
>> +EXPORT_SYMBOL_GPL(dax_insert_pfn_pmd);
>
> Like I mentioned before, lets make the exported function
> vmf_insert_folio() and move the pte, pmd, pud internal private / static
> details of the implementation. The "dax_" specific aspect of this was
> removed at the conversion of a dax_pfn to a folio.

Ok, let me try that. Note that vmf_insert_pfn{_pmd|_pud} will have to
stick around though.

  reply	other threads:[~2024-10-14  7:01 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-10  4:14 [PATCH 00/12] fs/dax: Fix FS DAX page reference counts Alistair Popple
2024-09-10  4:14 ` [PATCH 01/12] mm/gup.c: Remove redundant check for PCI P2PDMA page Alistair Popple
2024-09-22  1:00   ` Dan Williams
2024-09-10  4:14 ` [PATCH 02/12] pci/p2pdma: Don't initialise page refcount to one Alistair Popple
2024-09-10 13:47   ` Bjorn Helgaas
2024-09-11  1:07     ` Alistair Popple
2024-09-11 13:51       ` Bjorn Helgaas
2024-09-11  0:48   ` Logan Gunthorpe
2024-10-11  0:20     ` Alistair Popple
2024-09-22  1:00   ` Dan Williams
2024-10-11  0:17     ` Alistair Popple
2024-09-10  4:14 ` [PATCH 03/12] fs/dax: Refactor wait for dax idle page Alistair Popple
2024-09-22  1:01   ` Dan Williams
2024-09-10  4:14 ` [PATCH 04/12] mm: Allow compound zone device pages Alistair Popple
2024-09-10  4:47   ` Matthew Wilcox
2024-09-10  6:57     ` Alistair Popple
2024-09-10 13:41       ` Matthew Wilcox
2024-09-12 12:44   ` kernel test robot
2024-09-12 12:44   ` kernel test robot
2024-09-22  1:01   ` Dan Williams
2024-09-10  4:14 ` [PATCH 05/12] mm/memory: Add dax_insert_pfn Alistair Popple
2024-09-22  1:41   ` Dan Williams
2024-10-01 10:43     ` Gerald Schaefer
2024-09-10  4:14 ` [PATCH 06/12] huge_memory: Allow mappings of PUD sized pages Alistair Popple
2024-09-22  2:07   ` Dan Williams
2024-10-14  6:33     ` Alistair Popple
2024-09-10  4:14 ` [PATCH 07/12] huge_memory: Allow mappings of PMD " Alistair Popple
2024-09-27  2:48   ` Dan Williams
2024-10-14  6:53     ` Alistair Popple [this message]
2024-10-23 23:14       ` Alistair Popple
2024-10-23 23:38         ` Dan Williams
2024-09-10  4:14 ` [PATCH 08/12] gup: Don't allow FOLL_LONGTERM pinning of FS DAX pages Alistair Popple
2024-09-25  0:17   ` Dan Williams
2024-09-27  2:52     ` Dan Williams
2024-10-14  7:03       ` Alistair Popple
2024-09-10  4:14 ` [PATCH 09/12] mm: Update vm_normal_page() callers to accept " Alistair Popple
2024-09-27  7:15   ` Dan Williams
2024-10-14  7:16     ` Alistair Popple
2024-09-10  4:14 ` [PATCH 10/12] fs/dax: Properly refcount fs dax pages Alistair Popple
2024-09-27  7:59   ` Dan Williams
2024-10-24  7:52     ` Alistair Popple
2024-10-24 23:52       ` Dan Williams
2024-10-25  2:46         ` Alistair Popple
2024-10-25  4:35           ` Dan Williams
2024-10-28  4:24             ` Alistair Popple
2024-10-29  2:03               ` Dan Williams
2024-10-30  5:57                 ` Alistair Popple
2024-09-10  4:14 ` [PATCH 11/12] mm: Remove pXX_devmap callers Alistair Popple
2024-09-27 12:29   ` Alexander Gordeev
2024-10-14  7:14     ` Alistair Popple
2024-09-10  4:14 ` [PATCH 12/12] mm: Remove devmap related functions and page table bits Alistair Popple
2024-09-11  7:47   ` Chunyan Zhang
2024-09-12 12:55   ` kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87bjznnp6v.fsf@nvdebian.thelocal \
    --to=apopple@nvidia.com \
    --cc=bhelgaas@google.com \
    --cc=catalin.marinas@arm.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=dave.jiang@intel.com \
    --cc=david@fromorbit.com \
    --cc=david@redhat.com \
    --cc=djwong@kernel.org \
    --cc=hch@lst.de \
    --cc=ira.weiny@intel.com \
    --cc=jack@suse.cz \
    --cc=jgg@ziepe.ca \
    --cc=jhubbard@nvidia.com \
    --cc=linmiaohe@huawei.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=logang@deltatee.com \
    --cc=mpe@ellerman.id.au \
    --cc=npiggin@gmail.com \
    --cc=nvdimm@lists.linux.dev \
    --cc=peterx@redhat.com \
    --cc=tytso@mit.edu \
    --cc=vishal.l.verma@intel.com \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.