Skip to content

Commit

Permalink
psi: cgroup support
Browse files Browse the repository at this point in the history
On a system that executes multiple cgrouped jobs and independent
workloads, we don't just care about the health of the overall system, but
also that of individual jobs, so that we can ensure individual job health,
fairness between jobs, or prioritize some jobs over others.

This patch implements pressure stall tracking for cgroups.  In kernels
with CONFIG_PSI=y, cgroup2 groups will have cpu.pressure, memory.pressure,
and io.pressure files that track aggregate pressure stall times for only
the tasks inside the cgroup.

Link: http://lkml.kernel.org/r/20180828172258.3185-10-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Daniel Drake <drake@endlessm.com>
Tested-by: Suren Baghdasaryan <surenb@google.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Weiner <jweiner@fb.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Enderborg <peter.enderborg@sony.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  • Loading branch information
hnaz authored and torvalds committed Oct 26, 2018
1 parent eb41468 commit 2ce7135
Show file tree
Hide file tree
Showing 8 changed files with 228 additions and 10 deletions.
9 changes: 9 additions & 0 deletions Documentation/accounting/psi.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,12 @@ well as medium and long term trends. The total absolute stall time is
tracked and exported as well, to allow detection of latency spikes
which wouldn't necessarily make a dent in the time averages, or to
average trends over custom time frames.

Cgroup2 interface
=================

In a system with a CONFIG_CGROUPS=y kernel and the cgroup2 filesystem
mounted, pressure stall information is also tracked for tasks grouped
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
cpu.pressure, memory.pressure, and io.pressure files; the format is
the same as the /proc/pressure/ files.
18 changes: 18 additions & 0 deletions Documentation/admin-guide/cgroup-v2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -966,6 +966,12 @@ All time durations are in microseconds.
$PERIOD duration. "max" for $MAX indicates no limit. If only
one number is written, $MAX is updated.

cpu.pressure
A read-only nested-key file which exists on non-root cgroups.

Shows pressure stall information for CPU. See
Documentation/accounting/psi.txt for details.


Memory
------
Expand Down Expand Up @@ -1271,6 +1277,12 @@ PAGE_SIZE multiple when read back.
higher than the limit for an extended period of time. This
reduces the impact on the workload and memory management.

memory.pressure
A read-only nested-key file which exists on non-root cgroups.

Shows pressure stall information for memory. See
Documentation/accounting/psi.txt for details.


Usage Guidelines
~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -1408,6 +1420,12 @@ IO Interface Files

8:16 rbps=2097152 wbps=max riops=max wiops=max

io.pressure
A read-only nested-key file which exists on non-root cgroups.

Shows pressure stall information for IO. See
Documentation/accounting/psi.txt for details.


Writeback
~~~~~~~~~
Expand Down
4 changes: 4 additions & 0 deletions include/linux/cgroup-defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup.h>
#include <linux/psi_types.h>

#ifdef CONFIG_CGROUPS

Expand Down Expand Up @@ -436,6 +437,9 @@ struct cgroup {
/* used to schedule release agent */
struct work_struct release_agent_work;

/* used to track pressure stalls */
struct psi_group psi;

/* used to store eBPF programs */
struct cgroup_bpf bpf;

Expand Down
15 changes: 15 additions & 0 deletions include/linux/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
}

/* Return a pointer to the pressure stall (psi) state embedded in @cgrp. */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
return &cgrp->psi;
}

static inline void cgroup_init_kthreadd(void)
{
/*
Expand Down Expand Up @@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
return NULL;
}

/*
 * Stub: always reports that @cgrp has no parent.
 * NOTE(review): presumably this sits in the !CONFIG_CGROUPS branch of the
 * header; the enclosing #if/#else is outside this view - confirm.
 */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
return NULL;
}

/*
 * Stub: there is no per-cgroup psi state to hand out, so return NULL.
 * NOTE(review): presumably the !CONFIG_CGROUPS counterpart of the real
 * accessor; the enclosing #if/#else is outside this view - confirm.
 */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
return NULL;
}

static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
struct cgroup *ancestor)
{
Expand Down
25 changes: 25 additions & 0 deletions include/linux/psi.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#include <linux/psi_types.h>
#include <linux/sched.h>

struct seq_file;
struct css_set;

#ifdef CONFIG_PSI

extern bool psi_disabled;
Expand All @@ -16,13 +19,35 @@ void psi_memstall_tick(struct task_struct *task, int cpu);
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);

int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);

#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
#endif

#else /* CONFIG_PSI */

static inline void psi_init(void) {}

static inline void psi_memstall_enter(unsigned long *flags) {}
static inline void psi_memstall_leave(unsigned long *flags) {}

#ifdef CONFIG_CGROUPS
/* !CONFIG_PSI stub: nothing to allocate for @cgrp, always reports success (0). */
static inline int psi_cgroup_alloc(struct cgroup *cgrp)
{
return 0;
}
/* !CONFIG_PSI stub: nothing was allocated for @cgrp, so nothing to free. */
static inline void psi_cgroup_free(struct cgroup *cgrp)
{
}
/*
 * !CONFIG_PSI variant: there is no stall state to migrate, so moving a
 * task only publishes the new css_set @to in p->cgroups.
 */
static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
rcu_assign_pointer(p->cgroups, to);
}
#endif

#endif /* CONFIG_PSI */

#endif /* _LINUX_PSI_H */
4 changes: 4 additions & 0 deletions init/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,10 @@ config PSI
the share of walltime in which some or all tasks in the system are
delayed due to contention of the respective resource.

In kernels with cgroup support, cgroups (cgroup2 only) will
have cpu.pressure, memory.pressure, and io.pressure files,
which aggregate pressure stalls for the grouped tasks only.

For more details see Documentation/accounting/psi.txt.

Say N if unsure.
Expand Down
45 changes: 43 additions & 2 deletions kernel/cgroup/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
Expand Down Expand Up @@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task,
*/
WARN_ON_ONCE(task->flags & PF_EXITING);

rcu_assign_pointer(task->cgroups, to_cset);
cgroup_move_task(task, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
}
Expand Down Expand Up @@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
return ret;
}

#ifdef CONFIG_PSI
/*
 * seq_show handler for the "io.pressure" cgroup file: print the PSI_IO
 * pressure of the cgroup that @seq belongs to. Returns psi_show()'s result.
 */
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
}
/*
 * seq_show handler for the "memory.pressure" cgroup file: print the PSI_MEM
 * pressure of the cgroup that @seq belongs to. Returns psi_show()'s result.
 */
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
}
/*
 * seq_show handler for the "cpu.pressure" cgroup file: print the PSI_CPU
 * pressure of the cgroup that @seq belongs to. Returns psi_show()'s result.
 */
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
}
#endif

static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
Expand Down Expand Up @@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
},
#ifdef CONFIG_PSI
{
.name = "io.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_io_pressure_show,
},
{
.name = "memory.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_memory_pressure_show,
},
{
.name = "cpu.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_cpu_pressure_show,
},
#endif
{ } /* terminate */
};

Expand Down Expand Up @@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
if (cgroup_on_dfl(cgrp))
cgroup_rstat_exit(cgrp);
kfree(cgrp);
Expand Down Expand Up @@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
ret = cgroup_bpf_inherit(cgrp);

ret = psi_cgroup_alloc(cgrp);
if (ret)
goto out_idr_free;

ret = cgroup_bpf_inherit(cgrp);
if (ret)
goto out_psi_free;

for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;

Expand Down Expand Up @@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)

return cgrp;

out_psi_free:
psi_cgroup_free(cgrp);
out_idr_free:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_stat_exit:
Expand Down
118 changes: 110 additions & 8 deletions kernel/sched/psi.c
Original file line number Diff line number Diff line change
Expand Up @@ -473,9 +473,35 @@ static void psi_group_change(struct psi_group *group, int cpu,
schedule_delayed_work(&group->clock_work, PSI_FREQ);
}

/*
 * iterate_groups - step through the psi groups a task is accounted in
 * @task: the task whose groups are walked
 * @iter: opaque cursor; must point to NULL before the first call
 *
 * With CONFIG_CGROUPS, the walk starts at task->cgroups->dfl_cgrp and
 * moves up one parent per call. Only cgroups that themselves have a
 * parent are reported; once the walk reaches a cgroup without one (or
 * runs out of cgroups), psi_system is returned instead. The call after
 * psi_system has been handed out returns NULL, ending the iteration.
 *
 * Without CONFIG_CGROUPS, psi_system is returned once, then NULL.
 */
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
{
	void *pos = *iter;
#ifdef CONFIG_CGROUPS
	struct cgroup *next;

	/* psi_system is always the last stop. */
	if (pos == &psi_system)
		return NULL;

	next = pos ? cgroup_parent(pos) : task->cgroups->dfl_cgrp;
	if (next && cgroup_parent(next)) {
		*iter = next;
		return cgroup_psi(next);
	}
#else
	if (pos)
		return NULL;
#endif
	*iter = &psi_system;
	return &psi_system;
}

void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
void *iter = NULL;

if (!task->pid)
return;
Expand All @@ -492,17 +518,23 @@ void psi_task_change(struct task_struct *task, int clear, int set)
task->psi_flags &= ~clear;
task->psi_flags |= set;

psi_group_change(&psi_system, cpu, clear, set);
while ((group = iterate_groups(task, &iter)))
psi_group_change(group, cpu, clear, set);
}

void psi_memstall_tick(struct task_struct *task, int cpu)
{
struct psi_group_cpu *groupc;
struct psi_group *group;
void *iter = NULL;

groupc = per_cpu_ptr(psi_system.pcpu, cpu);
write_seqcount_begin(&groupc->seq);
record_times(groupc, cpu, true);
write_seqcount_end(&groupc->seq);
while ((group = iterate_groups(task, &iter))) {
struct psi_group_cpu *groupc;

groupc = per_cpu_ptr(group->pcpu, cpu);
write_seqcount_begin(&groupc->seq);
record_times(groupc, cpu, true);
write_seqcount_end(&groupc->seq);
}
}

/**
Expand Down Expand Up @@ -565,8 +597,78 @@ void psi_memstall_leave(unsigned long *flags)
rq_unlock_irq(rq, &rf);
}

static int psi_show(struct seq_file *m, struct psi_group *group,
enum psi_res res)
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
if (psi_disabled)
return 0;

cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
if (!cgroup->psi.pcpu)
return -ENOMEM;
group_init(&cgroup->psi);
return 0;
}

void psi_cgroup_free(struct cgroup *cgroup)
{
if (psi_disabled)
return;

cancel_delayed_work_sync(&cgroup->psi.clock_work);
free_percpu(cgroup->psi.pcpu);
}

/**
 * cgroup_move_task - move task to a different cgroup
 * @task: the task
 * @to: the target css_set
 *
 * Move task to a new cgroup and safely migrate its associated stall
 * state between the different groups.
 *
 * This function acquires the task's rq lock to lock out concurrent
 * changes to the task's scheduling state and - in case the task is
 * running - concurrent changes to its stall state.
 *
 * When psi_disabled is set, the rq lock is not taken and no stall
 * state is touched; the function only publishes @to in task->cgroups.
 */
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
bool move_psi = !psi_disabled;
unsigned int task_flags = 0;
struct rq_flags rf;
struct rq *rq;

if (move_psi) {
rq = task_rq_lock(task, &rf);

/*
 * Work out which TSK_* states to migrate: a task queued on a
 * runqueue is TSK_RUNNING, otherwise it may be TSK_IOWAIT;
 * TSK_MEMSTALL is added on top when PF_MEMSTALL is set.
 */
if (task_on_rq_queued(task))
task_flags = TSK_RUNNING;
else if (task->in_iowait)
task_flags = TSK_IOWAIT;

if (task->flags & PF_MEMSTALL)
task_flags |= TSK_MEMSTALL;

/*
 * Clear those states while task->cgroups still points at the old
 * css_set, so they are dropped from the groups reached through it.
 */
if (task_flags)
psi_task_change(task, task_flags, 0);
}

/*
 * Lame to do this here, but the scheduler cannot be locked
 * from the outside, so we move cgroups from inside sched/.
 */
rcu_assign_pointer(task->cgroups, to);

if (move_psi) {
/*
 * Set the same states again; the group walk now starts from the
 * new css_set, so they are accounted to the destination groups.
 */
if (task_flags)
psi_task_change(task, 0, task_flags);

task_rq_unlock(rq, task, &rf);
}
}
#endif /* CONFIG_CGROUPS */

int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
int full;

Expand Down

0 comments on commit 2ce7135

Please sign in to comment.