OpenVZ Forum


Home » Mailing lists » Devel » [PATCH v2 0/3] cpuacct cgroup refactoring
[PATCH v2 0/3] cpuacct cgroup refactoring [message #44274] Mon, 28 November 2011 16:45 Go to next message
Glauber Costa is currently offline  Glauber Costa
Messages: 916
Registered: October 2011
Senior Member
From: *parallels.com
Hi,

This is the same series I've already sent before. Just sending again
after the conflict with the code that was in tip.git

Best regards,

Glauber Costa (3):
Change cpustat fields to an array.
Reuse cgroup's parent pointer
cpuacct.stat: re-use scheduler statistics for the root cgroup

arch/s390/appldata/appldata_os.c | 16 +-
arch/x86/include/asm/i387.h | 2 +-
drivers/cpufreq/cpufreq_conservative.c | 38 +++---
drivers/cpufreq/cpufreq_ondemand.c | 38 +++---
drivers/macintosh/rack-meter.c | 8 +-
fs/proc/stat.c | 63 ++++-----
fs/proc/uptime.c | 4 +-
include/linux/kernel_stat.h | 36 +++--
kernel/sched/core.c | 242 +++++++++++++++----------------
kernel/sched/sched.h | 33 ++++-
10 files changed, 248 insertions(+), 232 deletions(-)

--
1.7.6.4
[PATCH v2 2/3] Reuse cgroup's parent pointer [message #44275 is a reply to message #44274] Mon, 28 November 2011 16:45 Go to previous messageGo to next message
Glauber Costa is currently offline  Glauber Costa
Messages: 916
Registered: October 2011
Senior Member
From: *parallels.com
We already have a pointer to the cgroup parent (whose data is more likely
to be in the cache than this, anyway), so there is no need to have this one
in cpuacct.

This patch makes the underlying cgroup be used instead.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Paul Tuner <pjt@google.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched/core.c | 15 +++++++++------
1 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9e0c19..c36f926 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7840,7 +7840,6 @@ struct cpuacct {
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 __percpu *cpuusage;
struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
- struct cpuacct *parent;
};

struct cgroup_subsys cpuacct_subsys;
@@ -7859,6 +7858,13 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
struct cpuacct, css);
}

+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ if (!ca || !ca->css.cgroup->parent)
+ return NULL;
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
/* create a new cpu accounting group */
static struct cgroup_subsys_state *cpuacct_create(
struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -7877,9 +7883,6 @@ static struct cgroup_subsys_state *cpuacct_create(
if (percpu_counter_init(&ca->cpustat[i], 0))
goto out_free_counters;

- if (cgrp->parent)
- ca->parent = cgroup_ca(cgrp->parent);
-
return &ca->css;

out_free_counters:
@@ -8046,7 +8049,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)

ca = task_ca(tsk);

- for (; ca; ca = ca->parent) {
+ for (; ca; ca = parent_ca(ca)) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
@@ -8088,7 +8091,7 @@ void cpuacct_update_stats(struct task_struct *tsk,

do {
__percpu_counter_add(&ca->cpustat[idx], val, batch);
- ca = ca->parent;
+ ca = parent_ca(ca);
} while (ca);
rcu_read_unlock();
}
--
1.7.6.4
[PATCH v2 1/3] Change cpustat fields to an array. [message #44276 is a reply to message #44274] Mon, 28 November 2011 16:45 Go to previous messageGo to next message
Glauber Costa is currently offline  Glauber Costa
Messages: 916
Registered: October 2011
Senior Member
From: *parallels.com
This patch changes fields in cpustat from a structure, to an
u64 array. Math gets easier, and the code is more flexible.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Paul Tuner <pjt@google.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/s390/appldata/appldata_os.c | 16 +++---
arch/x86/include/asm/i387.h | 2 +-
drivers/cpufreq/cpufreq_conservative.c | 38 +++++++--------
drivers/cpufreq/cpufreq_ondemand.c | 38 +++++++--------
drivers/macintosh/rack-meter.c | 8 ++--
fs/proc/stat.c | 63 ++++++++++++--------------
fs/proc/uptime.c | 4 +-
include/linux/kernel_stat.h | 36 +++++++++------
kernel/sched/core.c | 78 ++++++++++++++++---------------
9 files changed, 142 insertions(+), 141 deletions(-)

diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 92f1cb7..4de031d 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data)
j = 0;
for_each_online_cpu(i) {
os_data->os_cpu[j].per_cpu_user =
- cputime_to_jiffies(kstat_cpu(i).cpustat.user);
+ cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]);
os_data->os_cpu[j].per_cpu_nice =
- cputime_to_jiffies(kstat_cpu(i).cpustat.nice);
+ cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]);
os_data->os_cpu[j].per_cpu_system =
- cputime_to_jiffies(kstat_cpu(i).cpustat.system);
+ cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]);
os_data->os_cpu[j].per_cpu_idle =
- cputime_to_jiffies(kstat_cpu(i).cpustat.idle);
+ cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]);
os_data->os_cpu[j].per_cpu_irq =
- cputime_to_jiffies(kstat_cpu(i).cpustat.irq);
+ cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]);
os_data->os_cpu[j].per_cpu_softirq =
- cputime_to_jiffies(kstat_cpu(i).cpustat.softirq);
+ cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]) ;
os_data->os_cpu[j].per_cpu_iowait =
- cputime_to_jiffies(kstat_cpu(i).cpustat.iowait);
+ cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]);
os_data->os_cpu[j].per_cpu_steal =
- cputime_to_jiffies(kstat_cpu(i).cpustat.steal);
+ cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]);
os_data->os_cpu[j].cpu_id = i;
j++;
}
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c9e09ea..6919e93 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
#ifdef CONFIG_SMP
#define safe_address (__per_cpu_offset[0])
#else
-#define safe_address (kstat_cpu(0).cpustat.user)
+#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
#endif

/*
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index c97b468..118bff7 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -95,27 +95,26 @@ static struct dbs_tuners {
.freq_step = 5,
};

-static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
- cputime64_t *wall)
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
{
- cputime64_t idle_time;
+ u64 idle_time;
cputime64_t cur_wall_time;
- cputime64_t busy_time;
+ u64 busy_time;

cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
- busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
- kstat_cpu(cpu).cpustat.system);
+ busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER] +
+ kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];

- busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
- busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
- busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
- busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
+ busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+ busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+ busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+ busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];

idle_time = cputime64_sub(cur_wall_time, busy_time);
if (wall)
- *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
+ *wall = jiffies_to_usecs(cur_wall_time);

- return (cputime64_t)jiffies_to_usecs(idle_time);
+ return jiffies_to_usecs(idle_time);
}

static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -272,7 +271,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->prev_cpu_wall);
if (dbs_tuners_ins.ignore_nice)
- dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+ dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
}
return count;
}
@@ -362,11 +361,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
j_dbs_info->prev_cpu_idle = cur_idle_time;

if (dbs_tuners_ins.ignore_nice) {
- cputime64_t cur_nice;
+ u64 cur_nice;
unsigned long cur_nice_jiffies;

- cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
- j_dbs_info->prev_cpu_nice);
+ cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
+ j_dbs_info->prev_cpu_nice;
/*
* Assumption: nice time between sampling periods will
* be less than 2^32 jiffies for 32 bit sys
@@ -374,7 +373,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
cur_nice_jiffies = (unsigned long)
cputime64_to_jiffies64(cur_nice);

- j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+ j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
idle_time += jiffies_to_usecs(cur_nice_jiffies);
}

@@ -501,10 +500,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,

j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
&j_dbs_info->prev_cpu_wall);
- if (dbs_tuners_ins.ignore_nice) {
+ if (dbs_tuners_ins.ignore_nice)
j_dbs_info->prev_cpu_nice =
- kstat_cpu(j).cpustat.nice;
- }
+ kcpustat_cpu(j).cpustat[CPUTIME_NICE];
}
this_dbs_info->down_skip = 0;
this_dbs_info->requested_freq = policy->cur;
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index fa8af4e..f3d327c 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -119,27 +119,26 @@ static struct dbs_tuners {
.powersave_bias = 0,
};

-static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
- cputime64_t *wall)
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
{
- cputime64_t idle_time;
+ u64 idle_time;
cputime64_t cur_wall_time;
- cputime64_t busy_time;
+ u64 busy_time;

cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
- busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
- kstat_cpu(cpu).cpustat.system);
+ busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER] +
+ kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];

- busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
- busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
- busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
- busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
+ busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+ busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+ busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+ busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];

idle_time = cputime64_sub(cur_wall_time, busy_time);
if (wall)
- *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
+ *wall = jiffies_to_usecs(cur_wall_time);

- return (cputime64_t)jiffies_to_usecs(idle_time);
+ return jiffies_to_usecs(idle_time);
}

static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -345,7 +344,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->prev_cpu_wall);
if (dbs_tuners_ins.ignore_nice)
- dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+ dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];

}
return count;
@@ -455,11 +454,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
j_dbs_info->prev_cpu_iowait = cur_iowait_time;

if (dbs_tuners_ins.ignore_nice) {
- cputime64_t cur_nice;
+ u64 cur_nice;
unsigned long cur_nice_jiffies;

- cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
- j_dbs_info->prev_cpu_nice);
+ cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
+ j_dbs_info->prev_cpu_nice;
/*
* Assumption: nice time between sampling periods will
* be less than 2^32 jiffies for 32 bit sys
@@ -467,7 +466,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
cur_nice_jiffies = (unsigned long)
cputime64_to_jiffies64(cur_nice);

- j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+ j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
idle_time += jiffies_to_usecs(cur_nice_jiffies);
}

@@ -646,10 +645,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,

j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
&j_dbs_info->prev_cpu_wall);
- if (dbs_tuners_ins.ignore_nice) {
+ if (dbs_tuners_ins.ignore_nice)
j_dbs_info->prev_cpu_nice =
- kstat_cpu(j).cpustat.nice;
- }
+ kcpustat_cpu(j).cpustat[CPUTIME_NICE];
}
this_dbs_info->cpu = cpu;
this_dbs_info->rate_mult = 1;
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index 2637c13..66d7f1c7 1
...

[PATCH v2 3/3] cpuacct.stat: re-use scheduler statistics for the root cgroup [message #44277 is a reply to message #44274] Mon, 28 November 2011 16:45 Go to previous messageGo to next message
Glauber Costa is currently offline  Glauber Costa
Messages: 916
Registered: October 2011
Senior Member
From: *parallels.com
Right now, after we collect tick statistics for user and system and store them
in a well known location, we keep the same statistics again for cpuacct.
Since cpuacct is hierarchical, the numbers for the root cgroup should be
absolutely equal to the system-wide numbers.

So it would be better to just use it: this patch changes cpuacct accounting
in a way that the cpustat statistics are kept in a struct kernel_cpustat percpu
array. In the root cgroup case, we just point it to the main array. The rest of
the hierarchy walk can be totally disabled later with a static branch - but I am
not doing it here.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Paul Tuner <pjt@google.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched/core.c | 165 +++++++++++++++++++++++---------------------------
kernel/sched/sched.h | 33 +++++++++-
2 files changed, 105 insertions(+), 93 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c36f926..fd73bea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2555,6 +2555,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}

+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p,
+ u64 tmp, int index)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+#endif
+ /*
+ * Since all updates are sure to touch the root cgroup, we
+ * get ourselves ahead and touch it first. If the root cgroup
+ * is the only cgroup, then nothing else should be necessary.
+ *
+ */
+ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca && (ca != &root_cpuacct)) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += tmp;
+ ca = parent_ca(ca);
+ }
+ rcu_read_unlock();
+#endif
+}
+
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
@@ -2579,7 +2615,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
cpustat[index] += tmp;

- cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
+ task_group_account_field(p, index, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
@@ -2635,7 +2671,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,

/* Add system time to cpustat. */
cpustat[index] += tmp;
- cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ task_group_account_field(p, index, cputime);

/* Account for system time used */
acct_update_integrals(p);
@@ -6772,8 +6808,15 @@ void __init sched_init(void)
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
+
#endif /* CONFIG_CGROUP_SCHED */

+#ifdef CONFIG_CGROUP_CPUACCT
+ root_cpuacct.cpustat = &kernel_cpustat;
+ root_cpuacct.cpuusage = alloc_percpu(u64);
+ /* Too early, not expected to fail */
+ BUG_ON(!root_cpuacct.cpuusage);
+#endif
for_each_possible_cpu(i) {
struct rq *rq;

@@ -7834,44 +7877,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
* (balbir@in.ibm.com).
*/

-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
- struct cgroup_subsys_state css;
- /* cpuusage holds pointer to a u64-type object on every cpu */
- u64 __percpu *cpuusage;
- struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
- return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
- return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
-{
- if (!ca || !ca->css.cgroup->parent)
- return NULL;
- return cgroup_ca(ca->css.cgroup->parent);
-}
-
/* create a new cpu accounting group */
static struct cgroup_subsys_state *cpuacct_create(
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
- struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- int i;
+ struct cpuacct *ca;

+ if (!cgrp->parent)
+ return &root_cpuacct.css;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto out;

@@ -7879,15 +7894,13 @@ static struct cgroup_subsys_state *cpuacct_create(
if (!ca->cpuusage)
goto out_free_ca;

- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- if (percpu_counter_init(&ca->cpustat[i], 0))
- goto out_free_counters;
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;

return &ca->css;

-out_free_counters:
- while (--i >= 0)
- percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
free_percpu(ca->cpuusage);
out_free_ca:
kfree(ca);
@@ -7900,10 +7913,8 @@ static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
- int i;

- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- percpu_counter_destroy(&ca->cpustat[i]);
+ free_percpu(ca->cpustat);
free_percpu(ca->cpuusage);
kfree(ca);
}
@@ -7996,16 +8007,31 @@ static const char *cpuacct_stat_desc[] = {
};

static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
+ struct cgroup_map_cb *cb)
{
struct cpuacct *ca = cgroup_ca(cgrp);
- int i;
+ int cpu;
+ s64 val = 0;

- for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
- s64 val = percpu_counter_read(&ca->cpustat[i]);
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[i], val);
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
}
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
return 0;
}

@@ -8057,45 +8083,6 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
rcu_read_unlock();
}

-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH \
- min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH 0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-void cpuacct_update_stats(struct task_struct *tsk,
- enum cpuacct_stat_index idx, cputime_t val)
-{
- struct cpuacct *ca;
- int batch = CPUACCT_BATCH;
-
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(tsk);
-
- do {
- __percpu_counter_add(&ca->cpustat[idx], val, batch);
- ca = parent_ca(ca);
- } while (ca);
- rcu_read_unlock();
-}
-
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.create = cpuacct_create,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2e7802..53f6f83 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -828,13 +828,38 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
extern void update_cpu_load(struct rq *this_rq);

#ifdef CONFIG_CGROUP_CPUACCT
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ if (!ca || !ca->css.cgroup->parent)
+ return NULL;
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_update_stats(struct task_struct *tsk,
- enum cpuacct_stat_index idx, cputime_t val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-static inline void cpuacct_update_stats(struct task_struct *tsk,
- enum cpuacct_stat_index idx, cputime_t val) {}
#endif

static inline void inc_nr_running(struct rq *rq)
--
1.7.6.4
...

Re: [PATCH v2 0/3] cpuacct cgroup refactoring [message #44337 is a reply to message #44274] Thu, 01 December 2011 10:56 Go to previous message
Peter Zijlstra is currently offline  Peter Zijlstra
Messages: 61
Registered: September 2006
Member
From: *parallels.com
On Mon, 2011-11-28 at 14:45 -0200, Glauber Costa wrote:
> Hi,
>
> This is the same series I've already sent before. Just sending again
> after the conflict with the code that was in tip.git

Utter lack of complaints here.. lets just merge it :-)

Should show up in:

git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.g it

soonish.
Previous Topic: [PATCH 1/5] NFS: handle blocklayout pipe PipeFS dentry by network namespace aware routines
Next Topic: [PATCH v7 00/10] Request for Inclusion: per-cgroup tcp memory pressure
Goto Forum:
  


Current Time: Tue Aug 20 04:04:24 GMT 2019