linux-acpi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mel Gorman <mgorman@techsingularity.net>
To: Jirka Hladky <jhladky@redhat.com>
Cc: Jakub Racek <jracek@redhat.com>,
	linux-kernel <linux-kernel@vger.kernel.org>,
	"Rafael J. Wysocki" <rjw@rjwysocki.net>,
	Len Brown <lenb@kernel.org>,
	linux-acpi@vger.kernel.org,
	"kkolakow@redhat.com" <kkolakow@redhat.com>
Subject: Re: [4.17 regression] Performance drop on kernel-4.17 visible on Stream, Linpack and NAS parallel benchmarks
Date: Wed, 27 Jun 2018 09:49:55 +0100	[thread overview]
Message-ID: <20180627084954.73ucqvac62v5gje4@techsingularity.net> (raw)
In-Reply-To: <CAE4VaGDHcZbnDpJ+FiQLfA1DRftY0j_GJSnh3FDRi34OztVH6Q@mail.gmail.com>

On Wed, Jun 27, 2018 at 12:18:37AM +0200, Jirka Hladky wrote:
> Hi Mel,
> 
> we have results for the "Fixes for sched/numa_balancing" series and overall
> it looks very promising.
> 
> We see improvements in the range 15-20% for the stream benchmark and
> upto 60% for the OpenMP NAS benchmark. While NAS results are noisy (have
> quite big variations)  we see improvements on a wide range of 2 and 4 NUMA
> systems.  More importantly, we don't see any regressions. I have posted
> screenshots of the median differences on 4 NUMA servers for NAS benchmark
> with
> 
> 4x Gold 6126 CPU @ 2.60GHz
> 4x E5-4627 v2 @ 3.30GHz
> 
> to illustrate the typical results.
> 
> How are things looking at your side?
> 

I saw similar results in that it was a general win. I'm also trialing
the following monolithic patch on top if you want to try it. The timing
of this will depend on when/if numa_blancing fixes get picked up and I'm
still waiting on test results to come through. Unfortunately, on Friday,
I'll be unavailable for two weeks so this may drag on a bit.  The expanded
set of patches is at

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git sched-numa-fast-crossnode-v1r12

If the results are positive, I'll update the series and post it as a
RFC. Monolithic patch on top of fixes for sched/numa_balancing is as follows.

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0dbe1d5bb936..eea5f82ca447 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -669,11 +669,6 @@ typedef struct pglist_data {
 	struct task_struct *kcompactd;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
-	/* Rate limiting time interval */
-	unsigned long numabalancing_migrate_next_window;
-
-	/* Number of pages migrated during the rate limiting time interval */
-	unsigned long numabalancing_migrate_nr_pages;
 	int active_node_migrate;
 #endif
 	/*
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 711372845945..de8c73f9abcf 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -71,32 +71,6 @@ TRACE_EVENT(mm_migrate_pages,
 		__print_symbolic(__entry->reason, MIGRATE_REASON))
 );
 
-TRACE_EVENT(mm_numa_migrate_ratelimit,
-
-	TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages),
-
-	TP_ARGS(p, dst_nid, nr_pages),
-
-	TP_STRUCT__entry(
-		__array(	char,		comm,	TASK_COMM_LEN)
-		__field(	pid_t,		pid)
-		__field(	int,		dst_nid)
-		__field(	unsigned long,	nr_pages)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->dst_nid	= dst_nid;
-		__entry->nr_pages	= nr_pages;
-	),
-
-	TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu",
-		__entry->comm,
-		__entry->pid,
-		__entry->dst_nid,
-		__entry->nr_pages)
-);
 #endif /* _TRACE_MIGRATE_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6ca3be059872..c020af2c58ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1394,6 +1394,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	int last_cpupid, this_cpupid;
 
 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+	/*
+	 * Allow first faults or private faults to migrate immediately early in
+	 * the lifetime of a task. The magic number 4 is based on waiting for
+	 * two full passes of the "multi-stage node selection" test that is
+	 * executed below.
+	 */
+	if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+		return true;
 
 	/*
 	 * Multi-stage node selection is used in conjunction with a periodic
@@ -1412,7 +1423,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	 * This quadric squishes small probabilities, making it less likely we
 	 * act on an unlikely task<->page relation.
 	 */
-	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
 	if (!cpupid_pid_unset(last_cpupid) &&
 				cpupid_to_nid(last_cpupid) != dst_nid)
 		return false;
@@ -6702,6 +6712,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unus
 	p->se.exec_start = 0;
 
 #ifdef CONFIG_NUMA_BALANCING
+	if (!static_branch_likely(&sched_numa_balancing))
+		return;
+
 	if (!p->mm || (p->flags & PF_EXITING))
 		return;
 
@@ -6709,8 +6722,26 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unus
 		int src_nid = cpu_to_node(task_cpu(p));
 		int dst_nid = cpu_to_node(new_cpu);
 
-		if (src_nid != dst_nid)
-			p->numa_scan_period = task_scan_start(p);
+		if (src_nid == dst_nid)
+			return;
+
+		/*
+		 * Allow resets if faults have been trapped before one scan
+		 * has completed. This is most likely due to a new task that
+		 * is pulled cross-node due to wakeups or load balancing.
+		 */
+		if (p->numa_scan_seq) {
+			/*
+			 * Avoid scan adjustments if moving to the preferred
+			 * node or if the task was not previously running on
+			 * the preferred node.
+			 */
+			if (dst_nid == p->numa_preferred_nid ||
+			    (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+				return;
+		}
+
+		p->numa_scan_period = task_scan_start(p);
 	}
 #endif
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index c7749902a160..f935f4781036 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1856,54 +1856,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 	return newpage;
 }
 
-/*
- * page migration rate limiting control.
- * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
- * window of time. Default here says do not migrate more than 1280M per second.
- */
-static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-
-/* Returns true if the node is migrate rate-limited after the update */
-static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
-					unsigned long nr_pages)
-{
-	unsigned long next_window, interval;
-
-	next_window = READ_ONCE(pgdat->numabalancing_migrate_next_window);
-	interval = msecs_to_jiffies(migrate_interval_millisecs);
-
-	/*
-	 * Rate-limit the amount of data that is being migrated to a node.
-	 * Optimal placement is no good if the memory bus is saturated and
-	 * all the time is being spent migrating!
-	 */
-	if (time_after(jiffies, next_window)) {
-		if (xchg(&pgdat->numabalancing_migrate_nr_pages, 0)) {
-			do {
-				next_window += interval;
-			} while (unlikely(time_after(jiffies, next_window)));
-
-			WRITE_ONCE(pgdat->numabalancing_migrate_next_window,
-							       next_window);
-		}
-	}
-	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
-		trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
-								nr_pages);
-		return true;
-	}
-
-	/*
-	 * This is an unlocked non-atomic update so errors are possible.
-	 * The consequences are failing to migrate when we potentiall should
-	 * have which is not severe enough to warrant locking. If it is ever
-	 * a problem, it can be converted to a per-cpu counter.
-	 */
-	pgdat->numabalancing_migrate_nr_pages += nr_pages;
-	return false;
-}
-
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
 	int page_lru;
@@ -1976,14 +1928,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	if (page_is_file_cache(page) && PageDirty(page))
 		goto out;
 
-	/*
-	 * Rate-limit the amount of data that is being migrated to a node.
-	 * Optimal placement is no good if the memory bus is saturated and
-	 * all the time is being spent migrating!
-	 */
-	if (numamigrate_update_ratelimit(pgdat, 1))
-		goto out;
-
 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated)
 		goto out;
@@ -2030,14 +1974,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	unsigned long mmun_start = address & HPAGE_PMD_MASK;
 	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
 
-	/*
-	 * Rate-limit the amount of data that is being migrated to a node.
-	 * Optimal placement is no good if the memory bus is saturated and
-	 * all the time is being spent migrating!
-	 */
-	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
-		goto out_dropref;
-
 	new_page = alloc_pages_node(node,
 		(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
 		HPAGE_PMD_ORDER);
@@ -2134,7 +2070,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-out_dropref:
 	ptl = pmd_lock(mm, pmd);
 	if (pmd_same(*pmd, entry)) {
 		entry = pmd_modify(entry, vma->vm_page_prot);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a4fc9b0798df..9049e7b26e92 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6211,11 +6211,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 	int nid = pgdat->node_id;
 
 	pgdat_resize_init(pgdat);
-#ifdef CONFIG_NUMA_BALANCING
-	pgdat->numabalancing_migrate_nr_pages = 0;
-	pgdat->active_node_migrate = 0;
-	pgdat->numabalancing_migrate_next_window = jiffies;
-#endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	spin_lock_init(&pgdat->split_queue_lock);
 	INIT_LIST_HEAD(&pgdat->split_queue);

-- 
Mel Gorman
SUSE Labs

  parent reply	other threads:[~2018-06-27  8:49 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CAE4VaGBRFBM-uZEE=DdUzQkcNmpnUHdjK-7hgEeywmG8bvOOgw@mail.gmail.com>
2018-06-11 14:11 ` [4.17 regression] Performance drop on kernel-4.17 visible on Stream, Linpack and NAS parallel benchmarks Mel Gorman
     [not found]   ` <CAE4VaGCMS2pXfPVSnMbudexv_m5wRCTuBKA5ijh2x==11uQg9g@mail.gmail.com>
2018-06-14  8:36     ` Mel Gorman
     [not found]       ` <CAE4VaGCzB99es_TpAaYvtjX8fqzFA=7HX-ezqgO6FaEB5if4zg@mail.gmail.com>
2018-06-15 11:25         ` Mel Gorman
     [not found]           ` <CAE4VaGBtasbDBoZ-c5R-AY++Y1BXgjrE7DwN0zOt113xmV95xw@mail.gmail.com>
2018-06-15 13:52             ` Mel Gorman
     [not found]               ` <CAE4VaGAdXNYXMUn4eQgMqQtLKfp6-YHMa1NUSpL-L078oX7C-w@mail.gmail.com>
     [not found]                 ` <CAE4VaGBeTpxd1phR4rVAjqOXuLgLWPtVMPoRSOcG3HXfWDF=8w@mail.gmail.com>
2018-06-19 15:18                   ` Mel Gorman
     [not found]                     ` <CAE4VaGAPOfy0RtQehKoe+443C1GRrJXCveBFgcAZ1nChVavp1g@mail.gmail.com>
     [not found]                       ` <CAE4VaGBMeL82SJK53gtcWkor-9eXeLX6VP9juw=FW=BOyp+hMA@mail.gmail.com>
2018-06-21  9:23                         ` Mel Gorman
     [not found]                           ` <CAE4VaGCQV+cS-vhdLyMwzftbB-xBHPt4Y4chg_0ykLHTE9cRfw@mail.gmail.com>
     [not found]                             ` <CAE4VaGDHcZbnDpJ+FiQLfA1DRftY0j_GJSnh3FDRi34OztVH6Q@mail.gmail.com>
2018-06-27  8:49                               ` Mel Gorman [this message]
     [not found]                                 ` <CAE4VaGA9KzX05rdfw2PhEATLisV-NVMc9rOyjzSg-rX1rug9Dw@mail.gmail.com>
     [not found]                                   ` <CABuKy6MUNX85PBVchz_hqXy+FxXU2x0U9ZEZB13rVSLGpWOWvQ@mail.gmail.com>
     [not found]                                     ` <CAE4VaGD12BLS_kk=pRwgTKL8YOU63Nowwa42cEdZObQ=P1MFnA@mail.gmail.com>
     [not found]                                       ` <CAE4VaGArxDYHzg8G203yKjgkuw3mULFSw8yCYbCcqvAUSUxy+A@mail.gmail.com>
2018-07-17 10:03                                         ` Mel Gorman
     [not found]                                           ` <CAE4VaGA_L1AEj+Un0oQEEqZp_jgaFLk+Z=vNoad08oXnU2T1nw@mail.gmail.com>
2018-09-03 15:07                                             ` Jirka Hladky
2018-09-04  9:00                                               ` Mel Gorman
2018-09-04 10:07                                                 ` Jirka Hladky
2018-09-06  8:16                                                   ` Jirka Hladky
2018-09-06 12:58                                                     ` Mel Gorman
2018-09-07  8:09                                                       ` Jirka Hladky
2018-09-14 16:50                                                         ` Jirka Hladky
2018-06-06 12:27 Jakub Racek
2018-06-06 12:34 ` Rafael J. Wysocki
2018-06-06 12:44   ` Rafael J. Wysocki
2018-06-07 11:07 ` Michal Hocko
2018-06-07 11:19   ` Jakub Raček
2018-06-07 12:39 ` Mel Gorman
     [not found]   ` <CAE4VaGBAZ0HCy-M2rC3ce9ePOBhE6H-LDVBuJDJMNFf40j70Aw@mail.gmail.com>
2018-06-08  7:40     ` Mel Gorman
     [not found]       ` <CAE4VaGAgC7vDwaa-9AzJYst9hdQ5KbnrBUnk_mfp=NeTEe5dAQ@mail.gmail.com>
2018-06-08  9:24         ` Mel Gorman
     [not found]           ` <CAE4VaGATk3_Hr_2Wh44BZvXDc06A=rxUZXRFj+D=Xwh2x1YOyg@mail.gmail.com>
2018-06-08 11:15             ` Mel Gorman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180627084954.73ucqvac62v5gje4@techsingularity.net \
    --to=mgorman@techsingularity.net \
    --cc=jhladky@redhat.com \
    --cc=jracek@redhat.com \
    --cc=kkolakow@redhat.com \
    --cc=lenb@kernel.org \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=rjw@rjwysocki.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).