cgroups.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
       [not found] <20170703211415.11283-1-jglisse@redhat.com>
@ 2017-07-03 21:14 ` Jérôme Glisse
  2017-07-04 12:51   ` Michal Hocko
  2017-07-03 21:14 ` [PATCH 5/5] mm/memcontrol: support MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_PUBLIC Jérôme Glisse
  1 sibling, 1 reply; 15+ messages in thread
From: Jérôme Glisse @ 2017-07-03 21:14 UTC (permalink / raw)
  To: linux-kernel, linux-mm
  Cc: John Hubbard, David Nellans, Dan Williams, Balbir Singh,
	Jérôme Glisse, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, cgroups

HMM pages (private or public device pages) are ZONE_DEVICE pages and
thus you can not use the page->lru field of those pages. This patch
re-arranges the uncharge code to allow a single page to be uncharged
without modifying the lru field of the struct page.

There is no change to memcontrol logic, it is the same as it was
before this patch.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: cgroups@vger.kernel.org
---
 mm/memcontrol.c | 168 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 92 insertions(+), 76 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3df3c04d73ab..c709fdceac13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5509,48 +5509,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
 	cancel_charge(memcg, nr_pages);
 }
 
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-			   unsigned long nr_anon, unsigned long nr_file,
-			   unsigned long nr_kmem, unsigned long nr_huge,
-			   unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+	struct mem_cgroup *memcg;
+	unsigned long pgpgout;
+	unsigned long nr_anon;
+	unsigned long nr_file;
+	unsigned long nr_kmem;
+	unsigned long nr_huge;
+	unsigned long nr_shmem;
+	struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 {
-	unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+	memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
+{
+	unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
 	unsigned long flags;
 
-	if (!mem_cgroup_is_root(memcg)) {
-		page_counter_uncharge(&memcg->memory, nr_pages);
+	if (!mem_cgroup_is_root(ug->memcg)) {
+		page_counter_uncharge(&ug->memcg->memory, nr_pages);
 		if (do_memsw_account())
-			page_counter_uncharge(&memcg->memsw, nr_pages);
-		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-			page_counter_uncharge(&memcg->kmem, nr_kmem);
-		memcg_oom_recover(memcg);
+			page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+		memcg_oom_recover(ug->memcg);
 	}
 
 	local_irq_save(flags);
-	__this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-	__this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-	__this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-	__this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-	__this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-	memcg_check_events(memcg, dummy_page);
+	__this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+	__this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+	__this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+	__this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+	__this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+	__this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+	memcg_check_events(ug->memcg, ug->dummy_page);
 	local_irq_restore(flags);
 
-	if (!mem_cgroup_is_root(memcg))
-		css_put_many(&memcg->css, nr_pages);
+	if (!mem_cgroup_is_root(ug->memcg))
+		css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+	VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+	if (!page->mem_cgroup)
+		return;
+
+	/*
+	 * Nobody should be changing or seriously looking at
+	 * page->mem_cgroup at this point, we have fully
+	 * exclusive access to the page.
+	 */
+
+	if (ug->memcg != page->mem_cgroup) {
+		if (ug->memcg) {
+			uncharge_batch(ug);
+			uncharge_gather_clear(ug);
+		}
+		ug->memcg = page->mem_cgroup;
+	}
+
+	if (!PageKmemcg(page)) {
+		unsigned int nr_pages = 1;
+
+		if (PageTransHuge(page)) {
+			nr_pages <<= compound_order(page);
+			ug->nr_huge += nr_pages;
+		}
+		if (PageAnon(page))
+			ug->nr_anon += nr_pages;
+		else {
+			ug->nr_file += nr_pages;
+			if (PageSwapBacked(page))
+				ug->nr_shmem += nr_pages;
+		}
+		ug->pgpgout++;
+	} else {
+		ug->nr_kmem += 1 << compound_order(page);
+		__ClearPageKmemcg(page);
+	}
+
+	ug->dummy_page = page;
+	page->mem_cgroup = NULL;
 }
 
 static void uncharge_list(struct list_head *page_list)
 {
-	struct mem_cgroup *memcg = NULL;
-	unsigned long nr_shmem = 0;
-	unsigned long nr_anon = 0;
-	unsigned long nr_file = 0;
-	unsigned long nr_huge = 0;
-	unsigned long nr_kmem = 0;
-	unsigned long pgpgout = 0;
+	struct uncharge_gather ug;
 	struct list_head *next;
-	struct page *page;
+
+	uncharge_gather_clear(&ug);
 
 	/*
 	 * Note that the list can be a single page->lru; hence the
@@ -5558,57 +5612,16 @@ static void uncharge_list(struct list_head *page_list)
 	 */
 	next = page_list->next;
 	do {
+		struct page *page;
+
 		page = list_entry(next, struct page, lru);
 		next = page->lru.next;
 
-		VM_BUG_ON_PAGE(PageLRU(page), page);
-		VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
-
-		if (!page->mem_cgroup)
-			continue;
-
-		/*
-		 * Nobody should be changing or seriously looking at
-		 * page->mem_cgroup at this point, we have fully
-		 * exclusive access to the page.
-		 */
-
-		if (memcg != page->mem_cgroup) {
-			if (memcg) {
-				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-					       nr_kmem, nr_huge, nr_shmem, page);
-				pgpgout = nr_anon = nr_file = nr_kmem = 0;
-				nr_huge = nr_shmem = 0;
-			}
-			memcg = page->mem_cgroup;
-		}
-
-		if (!PageKmemcg(page)) {
-			unsigned int nr_pages = 1;
-
-			if (PageTransHuge(page)) {
-				nr_pages <<= compound_order(page);
-				nr_huge += nr_pages;
-			}
-			if (PageAnon(page))
-				nr_anon += nr_pages;
-			else {
-				nr_file += nr_pages;
-				if (PageSwapBacked(page))
-					nr_shmem += nr_pages;
-			}
-			pgpgout++;
-		} else {
-			nr_kmem += 1 << compound_order(page);
-			__ClearPageKmemcg(page);
-		}
-
-		page->mem_cgroup = NULL;
+		uncharge_page(page, &ug);
 	} while (next != page_list);
 
-	if (memcg)
-		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-			       nr_kmem, nr_huge, nr_shmem, page);
+	if (ug.memcg)
+		uncharge_batch(&ug);
 }
 
 /**
@@ -5620,6 +5633,8 @@ static void uncharge_list(struct list_head *page_list)
  */
 void mem_cgroup_uncharge(struct page *page)
 {
+	struct uncharge_gather ug;
+
 	if (mem_cgroup_disabled())
 		return;
 
@@ -5627,8 +5642,9 @@ void mem_cgroup_uncharge(struct page *page)
 	if (!page->mem_cgroup)
 		return;
 
-	INIT_LIST_HEAD(&page->lru);
-	uncharge_list(&page->lru);
+	uncharge_gather_clear(&ug);
+	uncharge_page(page, &ug);
+	uncharge_batch(&ug);
 }
 
 /**
-- 
2.13.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 5/5] mm/memcontrol: support MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_PUBLIC
       [not found] <20170703211415.11283-1-jglisse@redhat.com>
  2017-07-03 21:14 ` [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field Jérôme Glisse
@ 2017-07-03 21:14 ` Jérôme Glisse
  1 sibling, 0 replies; 15+ messages in thread
From: Jérôme Glisse @ 2017-07-03 21:14 UTC (permalink / raw)
  To: linux-kernel, linux-mm
  Cc: John Hubbard, David Nellans, Dan Williams, Balbir Singh,
	Jérôme Glisse, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, cgroups

HMM pages (private or public device pages) are ZONE_DEVICE pages and
thus need special handling when it comes to lru or refcount. This
patch makes sure that memcontrol properly handles those pages when it
faces them. Those pages are used like regular pages in a process
address space, either as anonymous pages or as file-backed pages. So
from the memcg point of view we want to handle them like regular
pages, for now at least.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: cgroups@vger.kernel.org
---
 kernel/memremap.c |  2 ++
 mm/memcontrol.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index da74775f2247..584984cf7d18 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -479,6 +479,8 @@ void put_zone_device_private_or_public_page(struct page *page)
 		__ClearPageActive(page);
 		__ClearPageWaiters(page);
 
+		mem_cgroup_uncharge(page);
+
 		page->pgmap->page_free(page, page->pgmap->data);
 	}
 	else if (!count)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c709fdceac13..04a88aedddbe 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4391,12 +4391,13 @@ enum mc_target_type {
 	MC_TARGET_NONE = 0,
 	MC_TARGET_PAGE,
 	MC_TARGET_SWAP,
+	MC_TARGET_DEVICE,
 };
 
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 						unsigned long addr, pte_t ptent)
 {
-	struct page *page = vm_normal_page(vma, addr, ptent);
+	struct page *page = _vm_normal_page(vma, addr, ptent, true);
 
 	if (!page || !page_mapped(page))
 		return NULL;
@@ -4407,13 +4408,20 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 		if (!(mc.flags & MOVE_FILE))
 			return NULL;
 	}
-	if (!get_page_unless_zero(page))
+	if (is_device_public_page(page)) {
+		/*
+		 * MEMORY_DEVICE_PUBLIC means ZONE_DEVICE page and which have a
+		 * refcount of 1 when free (unlike normal page)
+		 */
+		if (!page_ref_add_unless(page, 1, 1))
+			return NULL;
+	} else if (!get_page_unless_zero(page))
 		return NULL;
 
 	return page;
 }
 
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 			pte_t ptent, swp_entry_t *entry)
 {
@@ -4422,6 +4430,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 
 	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
 		return NULL;
+
+	/*
+	 * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
+	 * a device and because they are not accessible by CPU they are store
+	 * as special swap entry in the CPU page table.
+	 */
+	if (is_device_private_entry(ent)) {
+		page = device_private_entry_to_page(ent);
+		/*
+		 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
+		 * a refcount of 1 when free (unlike normal page)
+		 */
+		if (!page_ref_add_unless(page, 1, 1))
+			return NULL;
+		return page;
+	}
+
 	/*
 	 * Because lookup_swap_cache() updates some statistics counter,
 	 * we call find_get_page() with swapper_space directly.
@@ -4582,6 +4607,8 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PUBLIC
+ *     or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
  *
  * Called with pte lock held.
  */
@@ -4610,6 +4637,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 */
 		if (page->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
+			if (is_device_private_page(page) ||
+			    is_device_public_page(page))
+				ret = MC_TARGET_DEVICE;
 			if (target)
 				target->page = page;
 		}
@@ -4669,6 +4699,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
+		/*
+		 * Note their can not be MC_TARGET_DEVICE for now as we do not
+		 * support transparent huge page with MEMORY_DEVICE_PUBLIC or
+		 * MEMORY_DEVICE_PRIVATE but this might change.
+		 */
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
 		spin_unlock(ptl);
@@ -4884,6 +4919,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 				putback_lru_page(page);
 			}
 			put_page(page);
+		} else if (target_type == MC_TARGET_DEVICE) {
+			page = target.page;
+			if (!mem_cgroup_move_account(page, true,
+						     mc.from, mc.to)) {
+				mc.precharge -= HPAGE_PMD_NR;
+				mc.moved_charge += HPAGE_PMD_NR;
+			}
+			put_page(page);
 		}
 		spin_unlock(ptl);
 		return 0;
@@ -4895,12 +4938,16 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
 		pte_t ptent = *(pte++);
+		bool device = false;
 		swp_entry_t ent;
 
 		if (!mc.precharge)
 			break;
 
 		switch (get_mctgt_type(vma, addr, ptent, &target)) {
+		case MC_TARGET_DEVICE:
+			device = true;
+			/* fall through */
 		case MC_TARGET_PAGE:
 			page = target.page;
 			/*
@@ -4911,7 +4958,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 			 */
 			if (PageTransCompound(page))
 				goto put;
-			if (isolate_lru_page(page))
+			if (!device && isolate_lru_page(page))
 				goto put;
 			if (!mem_cgroup_move_account(page, false,
 						mc.from, mc.to)) {
@@ -4919,7 +4966,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
 			}
-			putback_lru_page(page);
+			if (!device)
+				putback_lru_page(page);
 put:			/* get_mctgt_type() gets the page */
 			put_page(page);
 			break;
-- 
2.13.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
  2017-07-03 21:14 ` [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field Jérôme Glisse
@ 2017-07-04 12:51   ` Michal Hocko
       [not found]     ` <20170704125113.GC14727-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
  2017-07-05 14:35     ` Jerome Glisse
  0 siblings, 2 replies; 15+ messages in thread
From: Michal Hocko @ 2017-07-04 12:51 UTC (permalink / raw)
  To: Jérôme Glisse
  Cc: linux-kernel, linux-mm, John Hubbard, David Nellans, Dan Williams,
	Balbir Singh, Johannes Weiner, Vladimir Davydov, cgroups

On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
> HMM pages (private or public device pages) are ZONE_DEVICE page and
> thus you can not use page->lru fields of those pages. This patch
> re-arrange the uncharge to allow single page to be uncharge without
> modifying the lru field of the struct page.
> 
> There is no change to memcontrol logic, it is the same as it was
> before this patch.

What is the memcg semantic of the memory? Why is it even charged? AFAIR
this is not reclaimable memory. If yes, how are we going to deal with
memory limits? What should happen if we go OOM? Does killing a process
actually help to release that memory? Isn't it pinned by a device?

For the patch itself. It is quite ugly but I haven't spotted anything
obviously wrong with it. It is the memcg semantic with this class of
memory which makes me worried.

> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> Cc: cgroups@vger.kernel.org
> ---
>  mm/memcontrol.c | 168 +++++++++++++++++++++++++++++++-------------------------
>  1 file changed, 92 insertions(+), 76 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 3df3c04d73ab..c709fdceac13 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5509,48 +5509,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
>  	cancel_charge(memcg, nr_pages);
>  }
>  
> -static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
> -			   unsigned long nr_anon, unsigned long nr_file,
> -			   unsigned long nr_kmem, unsigned long nr_huge,
> -			   unsigned long nr_shmem, struct page *dummy_page)
> +struct uncharge_gather {
> +	struct mem_cgroup *memcg;
> +	unsigned long pgpgout;
> +	unsigned long nr_anon;
> +	unsigned long nr_file;
> +	unsigned long nr_kmem;
> +	unsigned long nr_huge;
> +	unsigned long nr_shmem;
> +	struct page *dummy_page;
> +};
> +
> +static inline void uncharge_gather_clear(struct uncharge_gather *ug)
>  {
> -	unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
> +	memset(ug, 0, sizeof(*ug));
> +}
> +
> +static void uncharge_batch(const struct uncharge_gather *ug)
> +{
> +	unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
>  	unsigned long flags;
>  
> -	if (!mem_cgroup_is_root(memcg)) {
> -		page_counter_uncharge(&memcg->memory, nr_pages);
> +	if (!mem_cgroup_is_root(ug->memcg)) {
> +		page_counter_uncharge(&ug->memcg->memory, nr_pages);
>  		if (do_memsw_account())
> -			page_counter_uncharge(&memcg->memsw, nr_pages);
> -		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
> -			page_counter_uncharge(&memcg->kmem, nr_kmem);
> -		memcg_oom_recover(memcg);
> +			page_counter_uncharge(&ug->memcg->memsw, nr_pages);
> +		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
> +			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
> +		memcg_oom_recover(ug->memcg);
>  	}
>  
>  	local_irq_save(flags);
> -	__this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
> -	__this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
> -	__this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
> -	__this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
> -	__this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
> -	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
> -	memcg_check_events(memcg, dummy_page);
> +	__this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
> +	__this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
> +	__this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
> +	__this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
> +	__this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
> +	__this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
> +	memcg_check_events(ug->memcg, ug->dummy_page);
>  	local_irq_restore(flags);
>  
> -	if (!mem_cgroup_is_root(memcg))
> -		css_put_many(&memcg->css, nr_pages);
> +	if (!mem_cgroup_is_root(ug->memcg))
> +		css_put_many(&ug->memcg->css, nr_pages);
> +}
> +
> +static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> +{
> +	VM_BUG_ON_PAGE(PageLRU(page), page);
> +	VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
> +
> +	if (!page->mem_cgroup)
> +		return;
> +
> +	/*
> +	 * Nobody should be changing or seriously looking at
> +	 * page->mem_cgroup at this point, we have fully
> +	 * exclusive access to the page.
> +	 */
> +
> +	if (ug->memcg != page->mem_cgroup) {
> +		if (ug->memcg) {
> +			uncharge_batch(ug);
> +			uncharge_gather_clear(ug);
> +		}
> +		ug->memcg = page->mem_cgroup;
> +	}
> +
> +	if (!PageKmemcg(page)) {
> +		unsigned int nr_pages = 1;
> +
> +		if (PageTransHuge(page)) {
> +			nr_pages <<= compound_order(page);
> +			ug->nr_huge += nr_pages;
> +		}
> +		if (PageAnon(page))
> +			ug->nr_anon += nr_pages;
> +		else {
> +			ug->nr_file += nr_pages;
> +			if (PageSwapBacked(page))
> +				ug->nr_shmem += nr_pages;
> +		}
> +		ug->pgpgout++;
> +	} else {
> +		ug->nr_kmem += 1 << compound_order(page);
> +		__ClearPageKmemcg(page);
> +	}
> +
> +	ug->dummy_page = page;
> +	page->mem_cgroup = NULL;
>  }
>  
>  static void uncharge_list(struct list_head *page_list)
>  {
> -	struct mem_cgroup *memcg = NULL;
> -	unsigned long nr_shmem = 0;
> -	unsigned long nr_anon = 0;
> -	unsigned long nr_file = 0;
> -	unsigned long nr_huge = 0;
> -	unsigned long nr_kmem = 0;
> -	unsigned long pgpgout = 0;
> +	struct uncharge_gather ug;
>  	struct list_head *next;
> -	struct page *page;
> +
> +	uncharge_gather_clear(&ug);
>  
>  	/*
>  	 * Note that the list can be a single page->lru; hence the
> @@ -5558,57 +5612,16 @@ static void uncharge_list(struct list_head *page_list)
>  	 */
>  	next = page_list->next;
>  	do {
> +		struct page *page;
> +
>  		page = list_entry(next, struct page, lru);
>  		next = page->lru.next;
>  
> -		VM_BUG_ON_PAGE(PageLRU(page), page);
> -		VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
> -
> -		if (!page->mem_cgroup)
> -			continue;
> -
> -		/*
> -		 * Nobody should be changing or seriously looking at
> -		 * page->mem_cgroup at this point, we have fully
> -		 * exclusive access to the page.
> -		 */
> -
> -		if (memcg != page->mem_cgroup) {
> -			if (memcg) {
> -				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
> -					       nr_kmem, nr_huge, nr_shmem, page);
> -				pgpgout = nr_anon = nr_file = nr_kmem = 0;
> -				nr_huge = nr_shmem = 0;
> -			}
> -			memcg = page->mem_cgroup;
> -		}
> -
> -		if (!PageKmemcg(page)) {
> -			unsigned int nr_pages = 1;
> -
> -			if (PageTransHuge(page)) {
> -				nr_pages <<= compound_order(page);
> -				nr_huge += nr_pages;
> -			}
> -			if (PageAnon(page))
> -				nr_anon += nr_pages;
> -			else {
> -				nr_file += nr_pages;
> -				if (PageSwapBacked(page))
> -					nr_shmem += nr_pages;
> -			}
> -			pgpgout++;
> -		} else {
> -			nr_kmem += 1 << compound_order(page);
> -			__ClearPageKmemcg(page);
> -		}
> -
> -		page->mem_cgroup = NULL;
> +		uncharge_page(page, &ug);
>  	} while (next != page_list);
>  
> -	if (memcg)
> -		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
> -			       nr_kmem, nr_huge, nr_shmem, page);
> +	if (ug.memcg)
> +		uncharge_batch(&ug);
>  }
>  
>  /**
> @@ -5620,6 +5633,8 @@ static void uncharge_list(struct list_head *page_list)
>   */
>  void mem_cgroup_uncharge(struct page *page)
>  {
> +	struct uncharge_gather ug;
> +
>  	if (mem_cgroup_disabled())
>  		return;
>  
> @@ -5627,8 +5642,9 @@ void mem_cgroup_uncharge(struct page *page)
>  	if (!page->mem_cgroup)
>  		return;
>  
> -	INIT_LIST_HEAD(&page->lru);
> -	uncharge_list(&page->lru);
> +	uncharge_gather_clear(&ug);
> +	uncharge_page(page, &ug);
> +	uncharge_batch(&ug);
>  }
>  
>  /**
> -- 
> 2.13.0

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
       [not found]     ` <20170704125113.GC14727-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
@ 2017-07-05  3:18       ` Balbir Singh
  2017-07-05  6:38         ` Michal Hocko
  0 siblings, 1 reply; 15+ messages in thread
From: Balbir Singh @ 2017-07-05  3:18 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Jérôme Glisse,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, linux-mm,
	John Hubbard, David Nellans, Dan Williams, Johannes Weiner,
	Vladimir Davydov, cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On Tue, Jul 4, 2017 at 10:51 PM, Michal Hocko <mhocko-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
>> HMM pages (private or public device pages) are ZONE_DEVICE page and
>> thus you can not use page->lru fields of those pages. This patch
>> re-arrange the uncharge to allow single page to be uncharge without
>> modifying the lru field of the struct page.
>>
>> There is no change to memcontrol logic, it is the same as it was
>> before this patch.
>
> What is the memcg semantic of the memory? Why is it even charged? AFAIR
> this is not a reclaimable memory. If yes how are we going to deal with
> memory limits? What should happen if go OOM? Does killing an process
> actually help to release that memory? Isn't it pinned by a device?
>
> For the patch itself. It is quite ugly but I haven't spotted anything
> obviously wrong with it. It is the memcg semantic with this class of
> memory which makes me worried.
>

This is the HMM CDM case. Memory is normally malloc'd and then
migrated to ZONE_DEVICE or vice-versa. One of the things we did
discuss was seeing ZONE_DEVICE memory in user page tables.

Balbir Singh.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
  2017-07-05  3:18       ` Balbir Singh
@ 2017-07-05  6:38         ` Michal Hocko
       [not found]           ` <20170705063813.GB10354-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Michal Hocko @ 2017-07-05  6:38 UTC (permalink / raw)
  To: Balbir Singh
  Cc: Jérôme Glisse, linux-kernel@vger.kernel.org, linux-mm,
	John Hubbard, David Nellans, Dan Williams, Johannes Weiner,
	Vladimir Davydov, cgroups@vger.kernel.org

On Wed 05-07-17 13:18:18, Balbir Singh wrote:
> On Tue, Jul 4, 2017 at 10:51 PM, Michal Hocko <mhocko@kernel.org> wrote:
> > On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
> >> HMM pages (private or public device pages) are ZONE_DEVICE page and
> >> thus you can not use page->lru fields of those pages. This patch
> >> re-arrange the uncharge to allow single page to be uncharge without
> >> modifying the lru field of the struct page.
> >>
> >> There is no change to memcontrol logic, it is the same as it was
> >> before this patch.
> >
> > What is the memcg semantic of the memory? Why is it even charged? AFAIR
> > this is not a reclaimable memory. If yes how are we going to deal with
> > memory limits? What should happen if go OOM? Does killing an process
> > actually help to release that memory? Isn't it pinned by a device?
> >
> > For the patch itself. It is quite ugly but I haven't spotted anything
> > obviously wrong with it. It is the memcg semantic with this class of
> > memory which makes me worried.
> >
> 
> This is the HMM CDM case. Memory is normally malloc'd and then
> migrated to ZONE_DEVICE or vice-versa. One of the things we did
> discuss was seeing ZONE_DEVICE memory in user page tables.

This doesn't answer any of the above questions though.
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
       [not found]           ` <20170705063813.GB10354-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
@ 2017-07-05 10:22             ` Balbir Singh
  0 siblings, 0 replies; 15+ messages in thread
From: Balbir Singh @ 2017-07-05 10:22 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Jérôme Glisse,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, linux-mm,
	John Hubbard, David Nellans, Dan Williams, Johannes Weiner,
	Vladimir Davydov, cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On Wed, Jul 5, 2017 at 4:38 PM, Michal Hocko <mhocko-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> On Wed 05-07-17 13:18:18, Balbir Singh wrote:
>> On Tue, Jul 4, 2017 at 10:51 PM, Michal Hocko <mhocko-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
>> > On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
>> >> HMM pages (private or public device pages) are ZONE_DEVICE page and
>> >> thus you can not use page->lru fields of those pages. This patch
>> >> re-arrange the uncharge to allow single page to be uncharge without
>> >> modifying the lru field of the struct page.
>> >>
>> >> There is no change to memcontrol logic, it is the same as it was
>> >> before this patch.
>> >
>> > What is the memcg semantic of the memory? Why is it even charged? AFAIR
>> > this is not a reclaimable memory. If yes how are we going to deal with
>> > memory limits? What should happen if go OOM? Does killing an process
>> > actually help to release that memory? Isn't it pinned by a device?
>> >
>> > For the patch itself. It is quite ugly but I haven't spotted anything
>> > obviously wrong with it. It is the memcg semantic with this class of
>> > memory which makes me worried.
>> >
>>
>> This is the HMM CDM case. Memory is normally malloc'd and then
>> migrated to ZONE_DEVICE or vice-versa. One of the things we did
>> discuss was seeing ZONE_DEVICE memory in user page tables.
>
> This doesn't answer any of the above questions though.


Jerome is the expert and I am sure he has a better answer, but my understanding
is that this path gets called through release_pages() <-- zap_pte_range().
At first even I pondered about the same thing, but then came across this path.

Balbir Singh.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
  2017-07-04 12:51   ` Michal Hocko
       [not found]     ` <20170704125113.GC14727-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
@ 2017-07-05 14:35     ` Jerome Glisse
  2017-07-10  8:28       ` Michal Hocko
  1 sibling, 1 reply; 15+ messages in thread
From: Jerome Glisse @ 2017-07-05 14:35 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, John Hubbard, David Nellans, Dan Williams,
	Balbir Singh, Johannes Weiner, Vladimir Davydov, cgroups

On Tue, Jul 04, 2017 at 02:51:13PM +0200, Michal Hocko wrote:
> On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
> > HMM pages (private or public device pages) are ZONE_DEVICE page and
> > thus you can not use page->lru fields of those pages. This patch
> > re-arrange the uncharge to allow single page to be uncharge without
> > modifying the lru field of the struct page.
> > 
> > There is no change to memcontrol logic, it is the same as it was
> > before this patch.
> 
> What is the memcg semantic of the memory? Why is it even charged? AFAIR
> this is not a reclaimable memory. If yes how are we going to deal with
> memory limits? What should happen if go OOM? Does killing an process
> actually help to release that memory? Isn't it pinned by a device?
> 
> For the patch itself. It is quite ugly but I haven't spotted anything
> obviously wrong with it. It is the memcg semantic with this class of
> memory which makes me worried.

So I am facing 3 choices. The first one is to not account device memory
at all. The second one is to account device memory like any other memory
inside a process. The third one is to account device memory as something
entirely new.

I pick the second one for two reasons. First, because when migrating
back from device memory it means that the migration can not fail because
of a memory cgroup limit, which simplifies an already complex migration
code. Second, because I assume that device memory usage is a transient
state, ie once the device is done with its computation the most likely
outcome is that the memory is migrated back. From this assumption it
follows that you do not want to allow a process to overuse regular
memory while it is using un-accounted device memory. It sounds safer to
account device memory and to keep the process within its memcg
boundary.

Admittedly here I am making an assumption and I can be wrong. The thing
is we do not have enough real data on how this will be used and how
much of an impact device memory will have. That is why for now I
would rather restrict myself to either not accounting it or accounting
it as usual.

If you prefer not accounting it until we have more experience on how
it is used and how it impacts memory resource management, I am fine
with that too. It will make the migration code slightly more complex.

Cheers,
Jérôme

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
  2017-07-05 14:35     ` Jerome Glisse
@ 2017-07-10  8:28       ` Michal Hocko
       [not found]         ` <20170710082805.GD19185-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Michal Hocko @ 2017-07-10  8:28 UTC (permalink / raw)
  To: Jerome Glisse
  Cc: linux-kernel, linux-mm, John Hubbard, David Nellans, Dan Williams,
	Balbir Singh, Johannes Weiner, Vladimir Davydov, cgroups

On Wed 05-07-17 10:35:29, Jerome Glisse wrote:
> On Tue, Jul 04, 2017 at 02:51:13PM +0200, Michal Hocko wrote:
> > On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
> > > HMM pages (private or public device pages) are ZONE_DEVICE page and
> > > thus you can not use page->lru fields of those pages. This patch
> > > re-arrange the uncharge to allow single page to be uncharge without
> > > modifying the lru field of the struct page.
> > > 
> > > There is no change to memcontrol logic, it is the same as it was
> > > before this patch.
> > 
> > What is the memcg semantic of the memory? Why is it even charged? AFAIR
> > this is not a reclaimable memory. If yes how are we going to deal with
> > memory limits? What should happen if go OOM? Does killing an process
> > actually help to release that memory? Isn't it pinned by a device?
> > 
> > For the patch itself. It is quite ugly but I haven't spotted anything
> > obviously wrong with it. It is the memcg semantic with this class of
> > memory which makes me worried.
> 
> So i am facing 3 choices. First one not account device memory at all.
> Second one is account device memory like any other memory inside a
> process. Third one is account device memory as something entirely new.
> 
> I pick the second one for two reasons. First because when migrating
> back from device memory it means that migration can not fail because
> of memory cgroup limit, this simplify an already complex migration
> code. Second because i assume that device memory usage is a transient
> state ie once device is done with its computation the most likely
> outcome is memory is migrated back. From this assumption it means
> that you do not want to allow a process to overuse regular memory
> while it is using un-accounted device memory. It sounds safer to
> account device memory and to keep the process within its memcg
> boundary.
> 
> Admittedly here i am making an assumption and i can be wrong. Thing
> is we do not have enough real data of how this will be use and how
> much of an impact device memory will have. That is why for now i
> would rather restrict myself to either not account it or account it
> as usual.
> 
> If you prefer not accounting it until we have more experience on how
> it is use and how it impacts memory resource management i am fine with
> that too. It will make the migration code slightly more complex.

I can see why you want to do this but the semantics _have_ to be clear.
And as such make sure that the exiting task will simply unpin and
invalidate all the device memory (assuming this memory is not shared,
which I am not sure is even possible).
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
       [not found]         ` <20170710082805.GD19185-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
@ 2017-07-10 15:32           ` Jerome Glisse
  2017-07-10 16:04             ` Michal Hocko
  0 siblings, 1 reply; 15+ messages in thread
From: Jerome Glisse @ 2017-07-10 15:32 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, John Hubbard, David Nellans,
	Dan Williams, Balbir Singh, Johannes Weiner, Vladimir Davydov,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Mon, Jul 10, 2017 at 10:28:06AM +0200, Michal Hocko wrote:
> On Wed 05-07-17 10:35:29, Jerome Glisse wrote:
> > On Tue, Jul 04, 2017 at 02:51:13PM +0200, Michal Hocko wrote:
> > > On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
> > > > HMM pages (private or public device pages) are ZONE_DEVICE page and
> > > > thus you can not use page->lru fields of those pages. This patch
> > > > re-arrange the uncharge to allow single page to be uncharge without
> > > > modifying the lru field of the struct page.
> > > > 
> > > > There is no change to memcontrol logic, it is the same as it was
> > > > before this patch.
> > > 
> > > What is the memcg semantic of the memory? Why is it even charged? AFAIR
> > > this is not a reclaimable memory. If yes how are we going to deal with
> > > memory limits? What should happen if go OOM? Does killing an process
> > > actually help to release that memory? Isn't it pinned by a device?
> > > 
> > > For the patch itself. It is quite ugly but I haven't spotted anything
> > > obviously wrong with it. It is the memcg semantic with this class of
> > > memory which makes me worried.
> > 
> > So i am facing 3 choices. First one not account device memory at all.
> > Second one is account device memory like any other memory inside a
> > process. Third one is account device memory as something entirely new.
> > 
> > I pick the second one for two reasons. First because when migrating
> > back from device memory it means that migration can not fail because
> > of memory cgroup limit, this simplify an already complex migration
> > code. Second because i assume that device memory usage is a transient
> > state ie once device is done with its computation the most likely
> > outcome is memory is migrated back. From this assumption it means
> > that you do not want to allow a process to overuse regular memory
> > while it is using un-accounted device memory. It sounds safer to
> > account device memory and to keep the process within its memcg
> > boundary.
> > 
> > Admittedly here i am making an assumption and i can be wrong. Thing
> > is we do not have enough real data of how this will be use and how
> > much of an impact device memory will have. That is why for now i
> > would rather restrict myself to either not account it or account it
> > as usual.
> > 
> > If you prefer not accounting it until we have more experience on how
> > it is use and how it impacts memory resource management i am fine with
> > that too. It will make the migration code slightly more complex.
> 
> I can see why you want to do this but the semantic _has_ to be clear.
> And as such make sure that the exiting task will simply unpin and
> invalidate all the device memory (assuming this memory is not shared
> which I am not sure is even possible).

So there are 2 different paths out of device memory:
  - munmap/process exiting: memory will get uncharged from its memory
    cgroup just like regular memory
  - migration to non-device memory: the memory cgroup charge gets
    transferred to the new page just like for any other page

Do you want me to document all this in any specific place? I will
add a comment in memcontrol.c and in the HMM documentation for this,
but should I add it anywhere else?

Note that the device memory is not pinned. The whole point of HMM is to
do away with any pinning. Though, as device pages are not on the lru, they
are not reclaimed like any other page. However we expect that a device
driver might implement something akin to device memory reclaim to
make room for more important data based on statistics collected by the
device driver. If there is enough commonality across devices then
we might implement a more generic mechanism, but at this point I would
rather grow as we learn.

Cheers,
Jérôme

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
  2017-07-10 15:32           ` Jerome Glisse
@ 2017-07-10 16:04             ` Michal Hocko
       [not found]               ` <20170710160444.GB7071-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Michal Hocko @ 2017-07-10 16:04 UTC (permalink / raw)
  To: Jerome Glisse
  Cc: linux-kernel, linux-mm, John Hubbard, David Nellans, Dan Williams,
	Balbir Singh, Johannes Weiner, Vladimir Davydov, cgroups

On Mon 10-07-17 11:32:23, Jerome Glisse wrote:
> On Mon, Jul 10, 2017 at 10:28:06AM +0200, Michal Hocko wrote:
> > On Wed 05-07-17 10:35:29, Jerome Glisse wrote:
> > > On Tue, Jul 04, 2017 at 02:51:13PM +0200, Michal Hocko wrote:
> > > > On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
> > > > > HMM pages (private or public device pages) are ZONE_DEVICE page and
> > > > > thus you can not use page->lru fields of those pages. This patch
> > > > > re-arrange the uncharge to allow single page to be uncharge without
> > > > > modifying the lru field of the struct page.
> > > > > 
> > > > > There is no change to memcontrol logic, it is the same as it was
> > > > > before this patch.
> > > > 
> > > > What is the memcg semantic of the memory? Why is it even charged? AFAIR
> > > > this is not a reclaimable memory. If yes how are we going to deal with
> > > > memory limits? What should happen if go OOM? Does killing an process
> > > > actually help to release that memory? Isn't it pinned by a device?
> > > > 
> > > > For the patch itself. It is quite ugly but I haven't spotted anything
> > > > obviously wrong with it. It is the memcg semantic with this class of
> > > > memory which makes me worried.
> > > 
> > > So i am facing 3 choices. First one not account device memory at all.
> > > Second one is account device memory like any other memory inside a
> > > process. Third one is account device memory as something entirely new.
> > > 
> > > I pick the second one for two reasons. First because when migrating
> > > back from device memory it means that migration can not fail because
> > > of memory cgroup limit, this simplify an already complex migration
> > > code. Second because i assume that device memory usage is a transient
> > > state ie once device is done with its computation the most likely
> > > outcome is memory is migrated back. From this assumption it means
> > > that you do not want to allow a process to overuse regular memory
> > > while it is using un-accounted device memory. It sounds safer to
> > > account device memory and to keep the process within its memcg
> > > boundary.
> > > 
> > > Admittedly here i am making an assumption and i can be wrong. Thing
> > > is we do not have enough real data of how this will be use and how
> > > much of an impact device memory will have. That is why for now i
> > > would rather restrict myself to either not account it or account it
> > > as usual.
> > > 
> > > If you prefer not accounting it until we have more experience on how
> > > it is use and how it impacts memory resource management i am fine with
> > > that too. It will make the migration code slightly more complex.
> > 
> > I can see why you want to do this but the semantic _has_ to be clear.
> > And as such make sure that the exiting task will simply unpin and
> > invalidate all the device memory (assuming this memory is not shared
> > which I am not sure is even possible).
> 
> So there is 2 differents path out of device memory:
>   - munmap/process exiting: memory will get uncharge from its memory
>     cgroup just like regular memory

I might have missed that in your patch, I admit I only glanced through
it, but the memcg uncharge happens when the last reference to the page is
released. So if the device pins the page for some reason then the charge
will still be there even after the oom victim unmaps the memory.

>   - migration to non device memory, the memory cgroup charge get
>     transfer to the new page just like for any other page
> 
> Do you want me to document all this in any specific place ? I will
> add a comment in memory_control.c and in HMM documentations for this
> but should i add it anywhere else ?

hmm documentation is sufficient and the uncharge path if it needs any
special handling.

> Note that the device memory is not pin. The whole point of HMM is to
> do away with any pining. Thought as device page are not on lru they
> are not reclaim like any other page. However we expect that device
> driver might implement something akin to device memory reclaim to
> make room for more important data base on statistic collected by the
> device driver. If there is enough commonality accross devices then
> we might implement a more generic mechanisms but at this point i
> rather grow as we learn.

Do we have any guarantee that devices will _never_ pin those pages? If
no then we have to make sure we can forcefully tear them down.

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
       [not found]               ` <20170710160444.GB7071-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
@ 2017-07-10 16:25                 ` Jerome Glisse
       [not found]                   ` <20170710162542.GB4964-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Jerome Glisse @ 2017-07-10 16:25 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, John Hubbard, David Nellans,
	Dan Williams, Balbir Singh, Johannes Weiner, Vladimir Davydov,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Mon, Jul 10, 2017 at 06:04:46PM +0200, Michal Hocko wrote:
> On Mon 10-07-17 11:32:23, Jerome Glisse wrote:
> > On Mon, Jul 10, 2017 at 10:28:06AM +0200, Michal Hocko wrote:
> > > On Wed 05-07-17 10:35:29, Jerome Glisse wrote:
> > > > On Tue, Jul 04, 2017 at 02:51:13PM +0200, Michal Hocko wrote:
> > > > > On Mon 03-07-17 17:14:14, Jérôme Glisse wrote:
> > > > > > HMM pages (private or public device pages) are ZONE_DEVICE page and
> > > > > > thus you can not use page->lru fields of those pages. This patch
> > > > > > re-arrange the uncharge to allow single page to be uncharge without
> > > > > > modifying the lru field of the struct page.
> > > > > > 
> > > > > > There is no change to memcontrol logic, it is the same as it was
> > > > > > before this patch.
> > > > > 
> > > > > What is the memcg semantic of the memory? Why is it even charged? AFAIR
> > > > > this is not a reclaimable memory. If yes how are we going to deal with
> > > > > memory limits? What should happen if go OOM? Does killing an process
> > > > > actually help to release that memory? Isn't it pinned by a device?
> > > > > 
> > > > > For the patch itself. It is quite ugly but I haven't spotted anything
> > > > > obviously wrong with it. It is the memcg semantic with this class of
> > > > > memory which makes me worried.
> > > > 
> > > > So i am facing 3 choices. First one not account device memory at all.
> > > > Second one is account device memory like any other memory inside a
> > > > process. Third one is account device memory as something entirely new.
> > > > 
> > > > I pick the second one for two reasons. First because when migrating
> > > > back from device memory it means that migration can not fail because
> > > > of memory cgroup limit, this simplify an already complex migration
> > > > code. Second because i assume that device memory usage is a transient
> > > > state ie once device is done with its computation the most likely
> > > > outcome is memory is migrated back. From this assumption it means
> > > > that you do not want to allow a process to overuse regular memory
> > > > while it is using un-accounted device memory. It sounds safer to
> > > > account device memory and to keep the process within its memcg
> > > > boundary.
> > > > 
> > > > Admittedly here i am making an assumption and i can be wrong. Thing
> > > > is we do not have enough real data of how this will be use and how
> > > > much of an impact device memory will have. That is why for now i
> > > > would rather restrict myself to either not account it or account it
> > > > as usual.
> > > > 
> > > > If you prefer not accounting it until we have more experience on how
> > > > it is use and how it impacts memory resource management i am fine with
> > > > that too. It will make the migration code slightly more complex.
> > > 
> > > I can see why you want to do this but the semantic _has_ to be clear.
> > > And as such make sure that the exiting task will simply unpin and
> > > invalidate all the device memory (assuming this memory is not shared
> > > which I am not sure is even possible).
> > 
> > So there is 2 differents path out of device memory:
> >   - munmap/process exiting: memory will get uncharge from its memory
> >     cgroup just like regular memory
> 
> I might have missed that in your patch, I admit I only glanced through
> that, but the memcg uncharged when the last reference to the page is
> released. So if the device pins the page for some reason then the charge
> will be there even when the oom victim unmaps the memory.

A device can not pin memory; it is part of the "contract" when using HMM.
Device memory can never be pinned, neither by the device driver nor by any
other means, i.e. we want GUP to trigger a migration back to regular
memory. We will relax the GUP requirement at one point (especially for
direct I/O and other short-lived GUP).


> >   - migration to non device memory, the memory cgroup charge get
> >     transfer to the new page just like for any other page
> > 
> > Do you want me to document all this in any specific place ? I will
> > add a comment in memory_control.c and in HMM documentations for this
> > but should i add it anywhere else ?
> 
> hmm documentation is sufficient and the uncharge path if it needs any
> special handling.

Uncharge happens in the ZONE_DEVICE special handling of the page refcount,
i.e. a ZONE_DEVICE page is freed when its refcount reaches 1, not 0.

> 
> > Note that the device memory is not pin. The whole point of HMM is to
> > do away with any pining. Thought as device page are not on lru they
> > are not reclaim like any other page. However we expect that device
> > driver might implement something akin to device memory reclaim to
> > make room for more important data base on statistic collected by the
> > device driver. If there is enough commonality accross devices then
> > we might implement a more generic mechanisms but at this point i
> > rather grow as we learn.
> 
> Do we have any guarantee that devices will _never_ pin those pages? If
> no then we have to make sure we can forcefully tear them down.

Well yes we do, as long as I monitor how drivers use things :) Devices we
are targeting are like a CPU from the MMU point of view, i.e. you can tear
down a device page table entry without having the device freak out about
it. So there is no need for the device to pin anything; if we update its
page table to a non-present entry, any further access to the virtual
address will trigger a fault that is then handled by the device driver.

If the process is being killed then the GPU threads can be killed by the
device driver too. Otherwise the page fault is handled with the help
of HMM like any regular CPU page fault. If for some reason we can not
service the fault then the device driver is responsible for deciding how
to handle the various VM_FAULT_ERROR cases. The expectation is that it
kills the device threads and informs userspace through a device-specific
API. I think at one point down the road we will want to standardize a way
to communicate fatal error conditions that affect device threads.


I will review HMM documentation again to make sure this is all in
black and white. I am pretty sure that some of it is already there.

Bottom line is that we can always free and uncharge device memory
page just like any regular page.

Cheers,
Jérôme

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
       [not found]                   ` <20170710162542.GB4964-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2017-07-10 16:36                     ` Michal Hocko
  2017-07-10 16:54                       ` Jerome Glisse
  0 siblings, 1 reply; 15+ messages in thread
From: Michal Hocko @ 2017-07-10 16:36 UTC (permalink / raw)
  To: Jerome Glisse
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, John Hubbard, David Nellans,
	Dan Williams, Balbir Singh, Johannes Weiner, Vladimir Davydov,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Mon 10-07-17 12:25:42, Jerome Glisse wrote:
[...]
> Bottom line is that we can always free and uncharge device memory
> page just like any regular page.

OK, this answers my earlier question. Then it should be feasible to
charge this memory. There are still some things to handle. E.g. how do
we consider this memory during oom victim selection (this is not
accounted as an anonymous memory in get_mm_counter, right?), maybe others.
But the primary point is that nobody pins the memory outside of the
mapping.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
  2017-07-10 16:36                     ` Michal Hocko
@ 2017-07-10 16:54                       ` Jerome Glisse
  2017-07-10 17:48                         ` Michal Hocko
  0 siblings, 1 reply; 15+ messages in thread
From: Jerome Glisse @ 2017-07-10 16:54 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, John Hubbard, David Nellans, Dan Williams,
	Balbir Singh, Johannes Weiner, Vladimir Davydov, cgroups

On Mon, Jul 10, 2017 at 06:36:52PM +0200, Michal Hocko wrote:
> On Mon 10-07-17 12:25:42, Jerome Glisse wrote:
> [...]
> > Bottom line is that we can always free and uncharge device memory
> > page just like any regular page.
> 
> OK, this answers my earlier question. Then it should be feasible to
> charge this memory. There are still some things to handle. E.g. how do
> we consider this memory during oom victim selection (this is not
> accounted as an anonymous memory in get_mm_counter, right?), maybe others.
> But the primary point is that nobody pins the memory outside of the
> mapping.

At this point it is accounted as a regular page would be (anonymous, file
or shared memory). I wanted mm_counters to reflect memcg, but I can untie
that. Like I said, at this point we are unsure how usage of such memory
will impact things, so I wanted to keep everything as if it was regular
memory to avoid anything behaving too differently.

Jérôme

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
  2017-07-10 16:54                       ` Jerome Glisse
@ 2017-07-10 17:48                         ` Michal Hocko
       [not found]                           ` <20170710174857.GF7071-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Michal Hocko @ 2017-07-10 17:48 UTC (permalink / raw)
  To: Jerome Glisse
  Cc: linux-kernel, linux-mm, John Hubbard, David Nellans, Dan Williams,
	Balbir Singh, Johannes Weiner, Vladimir Davydov, cgroups

On Mon 10-07-17 12:54:21, Jerome Glisse wrote:
> On Mon, Jul 10, 2017 at 06:36:52PM +0200, Michal Hocko wrote:
> > On Mon 10-07-17 12:25:42, Jerome Glisse wrote:
> > [...]
> > > Bottom line is that we can always free and uncharge device memory
> > > page just like any regular page.
> > 
> > OK, this answers my earlier question. Then it should be feasible to
> > charge this memory. There are still some things to handle. E.g. how do
> > we consider this memory during oom victim selection (this is not
> > accounted as an anonymous memory in get_mm_counter, right?), maybe others.
> > But the primary point is that nobody pins the memory outside of the
> > mapping.
> 
> At this point it is accounted as a regular page would be (anonymous, file
> or share memory). I wanted mm_counters to reflect memcg but i can untie
> that.

I am not sure I understand. If the device memory is accounted to the
same mm counter as the original page then it is correct. I will try to
double check the implementation (hopefully soon).

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field
       [not found]                           ` <20170710174857.GF7071-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
@ 2017-07-10 18:10                             ` Jerome Glisse
  0 siblings, 0 replies; 15+ messages in thread
From: Jerome Glisse @ 2017-07-10 18:10 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, John Hubbard, David Nellans,
	Dan Williams, Balbir Singh, Johannes Weiner, Vladimir Davydov,
	cgroups-u79uwXL29TY76Z2rM5mHXA

On Mon, Jul 10, 2017 at 07:48:58PM +0200, Michal Hocko wrote:
> On Mon 10-07-17 12:54:21, Jerome Glisse wrote:
> > On Mon, Jul 10, 2017 at 06:36:52PM +0200, Michal Hocko wrote:
> > > On Mon 10-07-17 12:25:42, Jerome Glisse wrote:
> > > [...]
> > > > Bottom line is that we can always free and uncharge device memory
> > > > page just like any regular page.
> > > 
> > > OK, this answers my earlier question. Then it should be feasible to
> > > charge this memory. There are still some things to handle. E.g. how do
> > > we consider this memory during oom victim selection (this is not
> > > accounted as an anonymous memory in get_mm_counter, right?), maybe others.
> > > But the primary point is that nobody pins the memory outside of the
> > > mapping.
> > 
> > At this point it is accounted as a regular page would be (anonymous, file
> > or share memory). I wanted mm_counters to reflect memcg but i can untie
> > that.
> 
> I am not sure I understand. If the device memory is accounted to the
> same mm counter as the original page then it is correct. I will try to
> double check the implementation (hopefully soon).

It is accounted like the original page. By "same as memcg" I mean I made
the same kind of choice for the mm counters as I made for memcg. It is
all in the migration code (migrate.c), i.e. I don't touch any of the mm
counters when migrating a page.

Jérôme

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2017-07-10 18:10 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20170703211415.11283-1-jglisse@redhat.com>
2017-07-03 21:14 ` [PATCH 4/5] mm/memcontrol: allow to uncharge page without using page->lru field Jérôme Glisse
2017-07-04 12:51   ` Michal Hocko
     [not found]     ` <20170704125113.GC14727-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2017-07-05  3:18       ` Balbir Singh
2017-07-05  6:38         ` Michal Hocko
     [not found]           ` <20170705063813.GB10354-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2017-07-05 10:22             ` Balbir Singh
2017-07-05 14:35     ` Jerome Glisse
2017-07-10  8:28       ` Michal Hocko
     [not found]         ` <20170710082805.GD19185-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2017-07-10 15:32           ` Jerome Glisse
2017-07-10 16:04             ` Michal Hocko
     [not found]               ` <20170710160444.GB7071-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2017-07-10 16:25                 ` Jerome Glisse
     [not found]                   ` <20170710162542.GB4964-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2017-07-10 16:36                     ` Michal Hocko
2017-07-10 16:54                       ` Jerome Glisse
2017-07-10 17:48                         ` Michal Hocko
     [not found]                           ` <20170710174857.GF7071-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2017-07-10 18:10                             ` Jerome Glisse
2017-07-03 21:14 ` [PATCH 5/5] mm/memcontrol: support MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_PUBLIC Jérôme Glisse

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).