All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jan Stancek <jstancek@redhat.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: alex shi <alex.shi@intel.com>, guz fnst <guz.fnst@cn.fujitsu.com>,
	mingo@redhat.com, jolsa@redhat.com, riel@redhat.com,
	linux-kernel@vger.kernel.org
Subject: Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
Date: Mon, 8 Feb 2016 14:40:48 +0100	[thread overview]
Message-ID: <56B89AE0.9090603@redhat.com> (raw)
In-Reply-To: <654964868.14006956.1454063625314.JavaMail.zimbra@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 2600 bytes --]

On 01/29/2016 11:33 AM, Jan Stancek wrote:
>>
>> Also note that I don't think failing this test is a bug per se.
>> Undesirable maybe, but within spec, since SIGALRM is process wide, so it
>> being delivered to the SCHED_OTHER task is accepted, and SCHED_OTHER has
>> no timeliness guarantees.
>>
>> That said; if I could reliably reproduce I'd have a go at fixing this, I
>> suspect there's a 'fun' problem at the bottom of this.
> 
> Thanks for trying, I'll see if I can find some more reliable way.

I think I have found a more reliable way, however it requires an older
stable kernel: 3.12.53 up to 4.1.17.

Consider the following scenario:
- all tasks on system have RT sched class
- main thread of reproducer becomes the only SCHED_OTHER task on system
- when alarm(2) expires, main thread is woken up on cpu that is occupied by
  busy looping RT thread (low_priority_thread)
- because main thread was sleeping for 2 seconds, its load has decayed to 0
- the only chance for main thread to run is if it gets balanced to idle CPU
- task_tick_fair() doesn't run, there is RT task running on this CPU
- main thread is on cfs run queue but its load stays 0
- load balancer never sees this CPU (group) as busy

Attached are a reproducer and a script that try to trigger the scenario above.
I can reproduce it with 4.1.17 on baremetal 4 CPU x86_64 with about 1:50 chance.

In this setup failure state persists for a long time, perhaps indefinitely.
I tried extending RUNTIME to 10 minutes, main thread still wouldn't run.

One more clue: I could work around this issue if I forced an update_entity_load_avg()
on sched_entities that have not been updated for some time, as part of
periodic rebalance_domains() call.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7c1d28..1b5fe80 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5264,6 +5264,7 @@ static void update_blocked_averages(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct cfs_rq *cfs_rq;
 	unsigned long flags;
+	struct rb_node *rb;

 	raw_spin_lock_irqsave(&rq->lock, flags);
 	update_rq_clock(rq);
@@ -5281,6 +5282,19 @@ static void update_blocked_averages(int cpu)
 	}

 	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	cfs_rq = &(cpu_rq(cpu)->cfs);
+	for (rb = rb_first_postorder(&cfs_rq->tasks_timeline); rb; rb = rb_next_postorder(rb)) {
+		struct sched_entity *se = rb_entry(rb, struct sched_entity, run_node);
+
+		// Task on rq has not been updated for 500ms :-(
+		if ((cfs_rq_clock_task(cfs_rq) - se->avg.last_runnable_update) > 500L * (1 << 20))
+			update_entity_load_avg(se, 1);
+	}
 }

 /*

Regards,
Jan


[-- Attachment #2: pthread_cond_wait_1_v3.c --]
[-- Type: text/plain, Size: 5201 bytes --]

/*
 * reproducer v3 for:
 * [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
 *
 * Based on LTP's pthread_cond_wait_1.c
 *
 */

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <inttypes.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

/* Prefix prepended to every error message printed by this program */
#define ERROR_PREFIX "unexpected error: "

/*
 * Scheduling parameters:
 *   HIGH_PRIORITY - sched_priority requested by hi_priority_thread()
 *   LOW_PRIORITY  - sched_priority requested by low_priority_thread()
 *   RUNTIME       - upper bound, in seconds, on the low priority busy loop
 *   POLICY        - scheduling policy both thread types request
 */
#define HIGH_PRIORITY 10
#define LOW_PRIORITY  5
#define RUNTIME       5
#define POLICY        SCHED_RR

/* Process exit codes: test passed, test failed, test could not be completed */
#define PTS_PASS 0
#define PTS_FAIL 1
#define PTS_UNRESOLVED 2

/*
 * Mutex and condition variable used by hi_priority_thread() to block in
 * pthread_cond_wait(); signal_handler() signals the condition variable.
 */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/*
 * Flags that the threads use to indicate events:
 *   woken_up - set to 1 by hi_priority_thread() after its wait returns,
 *              unless low_done is already 1; main() reads it to decide
 *              between PASS and FAIL
 *   low_done - set to 1 by low_priority_thread() when its busy loop ends
 */
volatile int woken_up = 0;
volatile int low_done = 0;

/* Signal handler that handle the ALRM and wakes up
 * the high priority thread
 */
void signal_handler(int sig)
{
	(void) sig;
	if (pthread_cond_signal(&cond) != 0) {
		printf(ERROR_PREFIX "pthread_cond_signal\n");
		exit(PTS_UNRESOLVED);
	}
}

/*
 * Utility function: elapsed time from t1 to t2, in seconds.
 *
 * The whole-second part is taken as a float and the nanosecond part is
 * added as a fraction of a second; the sum is returned as a float.
 */
float timediff(struct timespec t2, struct timespec t1)
{
	float whole_seconds = t2.tv_sec - t1.tv_sec;
	double fraction = (t2.tv_nsec - t1.tv_nsec) / 1000000000.0;

	return (float)(whole_seconds + fraction);
}

/*
 * High priority thread body.
 *
 * Switches itself to POLICY at HIGH_PRIORITY and verifies the change,
 * installs signal_handler() for SIGALRM, then takes the mutex, arms a
 * 2 second alarm and blocks in pthread_cond_wait(). Once the wait returns
 * it sets woken_up, unless a low priority thread has already set low_done.
 *
 * tmp is unused. Always returns NULL; any failure terminates the whole
 * process with PTS_UNRESOLVED.
 */
void *hi_priority_thread(void *tmp)
{
	struct sched_param param;
	int policy;
	int rc = 0;

	(void) tmp;
	param.sched_priority = HIGH_PRIORITY;

	rc = pthread_setschedparam(pthread_self(), POLICY, &param);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_setschedparam\n");
		exit(PTS_UNRESOLVED);
	}
	rc = pthread_getschedparam(pthread_self(), &policy, &param);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_getschedparam\n");
		exit(PTS_UNRESOLVED);
	}
	if ((policy != POLICY) || (param.sched_priority != HIGH_PRIORITY)) {
		printf("Error: the policy or priority not correct\n");
		exit(PTS_UNRESOLVED);
	}

	/*
	 * Install a signal handler for ALRM. signal() returns the previous
	 * disposition on success, which need not be a null pointer, so
	 * failure has to be detected by comparing against SIG_ERR.
	 */
	if (signal(SIGALRM, signal_handler) == SIG_ERR) {
		perror(ERROR_PREFIX "signal:");
		exit(PTS_UNRESOLVED);
	}

	/* acquire the mutex */
	rc = pthread_mutex_lock(&mutex);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_mutex_lock\n");
		exit(PTS_UNRESOLVED);
	}

	/* Setup an alarm to go off in 2 seconds */
	alarm(2);

	/* Block, to be woken up by the signal handler */
	rc = pthread_cond_wait(&cond, &mutex);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_cond_wait\n");
		exit(PTS_UNRESOLVED);
	}

	/* This variable is unprotected because the scheduling removes
	 * the contention
	 */
	if (low_done != 1)
		woken_up = 1;

	rc = pthread_mutex_unlock(&mutex);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_mutex_unlock\n");
		exit(PTS_UNRESOLVED);
	}
	return NULL;
}

/*
 * Low priority thread body.
 *
 * Switches itself to POLICY at LOW_PRIORITY and verifies the change, pins
 * itself to the CPU whose index is passed in tmp (an integer carried in
 * the pointer value), then busy loops until the high priority thread sets
 * woken_up, another low priority thread sets low_done, or RUNTIME seconds
 * have elapsed. Sets low_done before returning.
 *
 * Always returns NULL; any failure terminates the whole process with
 * PTS_UNRESOLVED.
 */
void *low_priority_thread(void *tmp)
{
	struct timespec start_time, current_time;
	struct sched_param param;
	int policy;
	cpu_set_t cpuset;
	int rc = 0;
	uintptr_t tnum = (uintptr_t)tmp;

	param.sched_priority = LOW_PRIORITY;

	rc = pthread_setschedparam(pthread_self(), POLICY, &param);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_setschedparam\n");
		exit(PTS_UNRESOLVED);
	}
	rc = pthread_getschedparam(pthread_self(), &policy, &param);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_getschedparam\n");
		exit(PTS_UNRESOLVED);
	}
	if ((policy != POLICY) || (param.sched_priority != LOW_PRIORITY)) {
		printf("Error: the policy or priority not correct\n");
		exit(PTS_UNRESOLVED);
	}

	CPU_ZERO(&cpuset);
	CPU_SET(tnum, &cpuset);

	rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_setaffinity_np\n");
		exit(PTS_UNRESOLVED);
	}

	/*
	 * Grab the start time and busy loop for at most RUNTIME seconds.
	 * CLOCK_MONOTONIC is used so that a wall-clock adjustment during
	 * the run cannot shorten or extend the measured interval.
	 */
	if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) {
		printf(ERROR_PREFIX "clock_gettime\n");
		exit(PTS_UNRESOLVED);
	}
	while (!woken_up && !low_done) {
		if (clock_gettime(CLOCK_MONOTONIC, &current_time) != 0) {
			printf(ERROR_PREFIX "clock_gettime\n");
			exit(PTS_UNRESOLVED);
		}
		if (timediff(current_time, start_time) > RUNTIME)
			break;
	}

	low_done = 1;
	return NULL;
}

/*
 * Test driver.
 *
 * Starts the high priority thread, then one low priority thread for each
 * online CPU except the last one (each is handed its CPU index), switches
 * the main thread to SCHED_OTHER with priority 0, and joins all threads.
 *
 * Exits with PTS_PASS if the high priority thread recorded its wakeup in
 * woken_up, PTS_FAIL if it did not, and PTS_UNRESOLVED on any setup
 * failure.
 */
int main(void)
{
	pthread_t high_id, *low_id;
	struct sched_param param;
	int rc = 0;
	long i, ncpus = sysconf(_SC_NPROCESSORS_ONLN);

	/* sysconf() returns -1 on failure; do not let that size the array */
	if (ncpus < 1) {
		printf(ERROR_PREFIX "sysconf\n");
		exit(PTS_UNRESOLVED);
	}

	low_id = calloc((size_t)ncpus, sizeof(*low_id));
	if (low_id == NULL) {
		printf(ERROR_PREFIX "calloc\n");
		exit(PTS_UNRESOLVED);
	}

	/* high prio thread */
	rc = pthread_create(&high_id, NULL, hi_priority_thread, NULL);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_create\n");
		exit(PTS_UNRESOLVED);
	}

	/* low prio thread on each cpu except last one */
	for (i = 0; i < ncpus - 1; i++) {
		uintptr_t tnum = (uintptr_t)i;
		rc = pthread_create(&low_id[i], NULL, low_priority_thread, (void *)tnum);
		if (rc != 0) {
			printf(ERROR_PREFIX "pthread_create\n");
			exit(PTS_UNRESOLVED);
		}
	}

	/* Demote the main thread to the default time-sharing policy */
	param.sched_priority = 0;
	rc = pthread_setschedparam(pthread_self(), SCHED_OTHER, &param);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_setschedparam\n");
		exit(PTS_UNRESOLVED);
	}

	/* Wait for the threads to exit */
	rc = pthread_join(high_id, NULL);
	if (rc != 0) {
		printf(ERROR_PREFIX "pthread_join\n");
		exit(PTS_UNRESOLVED);
	}

	for (i = 0; i < ncpus - 1; i++) {
		rc = pthread_join(low_id[i], NULL);
		if (rc != 0) {
			printf(ERROR_PREFIX "pthread_join\n");
			exit(PTS_UNRESOLVED);
		}
	}

	free(low_id);

	if (woken_up == 0) {
		printf("Test FAILED: high priority was not woken up\n");
		exit(PTS_FAIL);
	}

	printf("Test PASSED\n");
	exit(PTS_PASS);
}

[-- Attachment #3: reproduce_v3.sh --]
[-- Type: application/x-shellscript, Size: 260 bytes --]

      reply	other threads:[~2016-02-08 13:40 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-01-27 14:52 [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds Jan Stancek
2016-01-28 15:55 ` Jan Stancek
2016-01-28 17:49   ` Peter Zijlstra
2016-01-28 18:43     ` Jan Stancek
2016-01-29 10:15       ` Peter Zijlstra
2016-01-29 10:33         ` Jan Stancek
2016-02-08 13:40           ` Jan Stancek [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=56B89AE0.9090603@redhat.com \
    --to=jstancek@redhat.com \
    --cc=alex.shi@intel.com \
    --cc=guz.fnst@cn.fujitsu.com \
    --cc=jolsa@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.