| Home » Mailing lists » Devel » [PATCH 0/4] cpuacct cleanup Goto Forum:
	| 
		
			| [PATCH 0/4] cpuacct cleanup [message #44206] | Fri, 25 November 2011 01:33  |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| Hi, 
 Leaving aside all the hierarchy walk discussion, I tried to come up
 with a series that concentrates only on the basic points of it all.
 I think we benefit from having it independently of the rest of the work:
 in general (comments on the specifics welcome) cpuacct is made more naturally
 integrated with the scheduler, and the statistics it collects are now exactly
 the same as the system-wide ones for the root cgroup case. I think something quite similar
 can be done with cpuusage by associating the root cgroup with the main runqueues,
 but this here is just me scratching my own itches - we can do it later.
 
 Please let me know if this is acceptable.
 
 Glauber Costa (4):
 Change cpustat fields to an array.
 Reuse cgroup's parent pointer
 Move part of cpuacct code
 cpuacct.stat: re-use scheduler statistics for the root cgroup
 
 arch/s390/appldata/appldata_os.c       |   16 +-
 arch/x86/include/asm/i387.h            |    2 +-
 drivers/cpufreq/cpufreq_conservative.c |   38 ++---
 drivers/cpufreq/cpufreq_ondemand.c     |   38 ++---
 drivers/macintosh/rack-meter.c         |    8 +-
 fs/proc/stat.c                         |   63 ++++----
 fs/proc/uptime.c                       |    4 +-
 include/linux/kernel_stat.h            |   36 +++--
 kernel/sched.c                         |  270 ++++++++++++++++++--------------
 9 files changed, 252 insertions(+), 223 deletions(-)
 
 --
 1.7.6.4
 |  
	|  |  |  
	| 
		
			| [PATCH 2/4] Reuse cgroup's parent pointer [message #44207 is a reply to message #44206] | Fri, 25 November 2011 01:33   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| We already have a pointer to the cgroup parent (whose data is more likely to be in the cache than this, anyway), so there is no need to have this one
 in cpuacct.
 
 This patch makes the underlying cgroup be used instead.
 
 Signed-off-by: Glauber Costa <glommer@parallels.com>
 CC: Paul Turner <pjt@google.com>
 CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
 ---
 kernel/sched.c |   15 +++++++++------
 1 files changed, 9 insertions(+), 6 deletions(-)
 
 diff --git a/kernel/sched.c b/kernel/sched.c
 index 2e57942..d504c7b 100644
 --- a/kernel/sched.c
 +++ b/kernel/sched.c
 @@ -9509,7 +9509,6 @@ struct cpuacct {
 /* cpuusage holds pointer to a u64-type object on every cpu */
 u64 __percpu *cpuusage;
 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
 -	struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
 @@ -9528,6 +9527,13 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
 struct cpuacct, css);
 }
 
 +static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 +{
 +	if (!ca || !ca->css.cgroup->parent)
 +		return NULL;
 +	return cgroup_ca(ca->css.cgroup->parent);
 +}
 +
 /* create a new cpu accounting group */
 static struct cgroup_subsys_state *cpuacct_create(
 struct cgroup_subsys *ss, struct cgroup *cgrp)
 @@ -9546,9 +9552,6 @@ static struct cgroup_subsys_state *cpuacct_create(
 if (percpu_counter_init(&ca->cpustat[i], 0))
 goto out_free_counters;
 
 -	if (cgrp->parent)
 -		ca->parent = cgroup_ca(cgrp->parent);
 -
 return &ca->css;
 
 out_free_counters:
 @@ -9715,7 +9718,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 
 ca = task_ca(tsk);
 
 -	for (; ca; ca = ca->parent) {
 +	for (; ca; ca = parent_ca(ca)) {
 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 *cpuusage += cputime;
 }
 @@ -9757,7 +9760,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
 
 do {
 __percpu_counter_add(&ca->cpustat[idx], val, batch);
 -		ca = ca->parent;
 +		ca = parent_ca(ca);
 } while (ca);
 rcu_read_unlock();
 }
 --
 1.7.6.4
 |  
	|  |  |  
	| 
		
			| [PATCH 1/4] Change cpustat fields to an array. [message #44208 is a reply to message #44206] | Fri, 25 November 2011 01:33   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| This patch changes the fields in cpustat from a structure to a u64 array. Math gets easier, and the code is more flexible.
 
 Signed-off-by: Glauber Costa <glommer@parallels.com>
 CC: Paul Turner <pjt@google.com>
 CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
 ---
 arch/s390/appldata/appldata_os.c       |   16 +++---
 arch/x86/include/asm/i387.h            |    2 +-
 drivers/cpufreq/cpufreq_conservative.c |   38 +++++++--------
 drivers/cpufreq/cpufreq_ondemand.c     |   38 +++++++--------
 drivers/macintosh/rack-meter.c         |    8 ++--
 fs/proc/stat.c                         |   63 ++++++++++++--------------
 fs/proc/uptime.c                       |    4 +-
 include/linux/kernel_stat.h            |   36 +++++++++------
 kernel/sched.c                         |   78 ++++++++++++++++---------------
 9 files changed, 142 insertions(+), 141 deletions(-)
 
 diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
 index 92f1cb7..4de031d 100644
 --- a/arch/s390/appldata/appldata_os.c
 +++ b/arch/s390/appldata/appldata_os.c
 @@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data)
 j = 0;
 for_each_online_cpu(i) {
 os_data->os_cpu[j].per_cpu_user =
 -			cputime_to_jiffies(kstat_cpu(i).cpustat.user);
 +			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]);
 os_data->os_cpu[j].per_cpu_nice =
 -			cputime_to_jiffies(kstat_cpu(i).cpustat.nice);
 +			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]);
 os_data->os_cpu[j].per_cpu_system =
 -			cputime_to_jiffies(kstat_cpu(i).cpustat.system);
 +			 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]);
 os_data->os_cpu[j].per_cpu_idle =
 -			cputime_to_jiffies(kstat_cpu(i).cpustat.idle);
 +			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]);
 os_data->os_cpu[j].per_cpu_irq =
 -			cputime_to_jiffies(kstat_cpu(i).cpustat.irq);
 +			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]);
 os_data->os_cpu[j].per_cpu_softirq =
 -			cputime_to_jiffies(kstat_cpu(i).cpustat.softirq);
 +			 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]) ;
 os_data->os_cpu[j].per_cpu_iowait =
 -			cputime_to_jiffies(kstat_cpu(i).cpustat.iowait);
 +			 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]);
 os_data->os_cpu[j].per_cpu_steal =
 -			cputime_to_jiffies(kstat_cpu(i).cpustat.steal);
 +			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]);
 os_data->os_cpu[j].cpu_id = i;
 j++;
 }
 diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
 index c9e09ea..6919e93 100644
 --- a/arch/x86/include/asm/i387.h
 +++ b/arch/x86/include/asm/i387.h
 @@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
 #ifdef CONFIG_SMP
 #define safe_address (__per_cpu_offset[0])
 #else
 -#define safe_address (kstat_cpu(0).cpustat.user)
 +#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
 #endif
 
 /*
 diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
 index c97b468..118bff7 100644
 --- a/drivers/cpufreq/cpufreq_conservative.c
 +++ b/drivers/cpufreq/cpufreq_conservative.c
 @@ -95,27 +95,26 @@ static struct dbs_tuners {
 .freq_step = 5,
 };
 
 -static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
 -							cputime64_t *wall)
 +static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
 {
 -	cputime64_t idle_time;
 +	u64 idle_time;
 cputime64_t cur_wall_time;
 -	cputime64_t busy_time;
 +	u64 busy_time;
 
 cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
 -	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
 -			kstat_cpu(cpu).cpustat.system);
 +	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER] +
 +		    kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
 
 -	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
 -	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
 -	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
 -	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
 +	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
 +	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
 +	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
 +	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
 
 idle_time = cputime64_sub(cur_wall_time, busy_time);
 if (wall)
 -		*wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
 +		*wall = jiffies_to_usecs(cur_wall_time);
 
 -	return (cputime64_t)jiffies_to_usecs(idle_time);
 +	return jiffies_to_usecs(idle_time);
 }
 
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
 @@ -272,7 +271,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
 dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 &dbs_info->prev_cpu_wall);
 if (dbs_tuners_ins.ignore_nice)
 -			dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
 +			dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 }
 return count;
 }
 @@ -362,11 +361,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 j_dbs_info->prev_cpu_idle = cur_idle_time;
 
 if (dbs_tuners_ins.ignore_nice) {
 -			cputime64_t cur_nice;
 +			u64 cur_nice;
 unsigned long cur_nice_jiffies;
 
 -			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
 -					 j_dbs_info->prev_cpu_nice);
 +			cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
 +					 j_dbs_info->prev_cpu_nice;
 /*
 * Assumption: nice time between sampling periods will
 * be less than 2^32 jiffies for 32 bit sys
 @@ -374,7 +373,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 cur_nice_jiffies = (unsigned long)
 cputime64_to_jiffies64(cur_nice);
 
 -			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
 +			j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 idle_time += jiffies_to_usecs(cur_nice_jiffies);
 }
 
 @@ -501,10 +500,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 
 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 &j_dbs_info->prev_cpu_wall);
 -			if (dbs_tuners_ins.ignore_nice) {
 +			if (dbs_tuners_ins.ignore_nice)
 j_dbs_info->prev_cpu_nice =
 -						kstat_cpu(j).cpustat.nice;
 -			}
 +						kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 }
 this_dbs_info->down_skip = 0;
 this_dbs_info->requested_freq = policy->cur;
 diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
 index fa8af4e..f3d327c 100644
 --- a/drivers/cpufreq/cpufreq_ondemand.c
 +++ b/drivers/cpufreq/cpufreq_ondemand.c
 @@ -119,27 +119,26 @@ static struct dbs_tuners {
 .powersave_bias = 0,
 };
 
 -static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
 -							cputime64_t *wall)
 +static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
 {
 -	cputime64_t idle_time;
 +	u64 idle_time;
 cputime64_t cur_wall_time;
 -	cputime64_t busy_time;
 +	u64 busy_time;
 
 cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
 -	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
 -			kstat_cpu(cpu).cpustat.system);
 +	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER] +
 +		    kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
 
 -	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
 -	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
 -	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
 -	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
 +	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
 +	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
 +	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
 +	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
 
 idle_time = cputime64_sub(cur_wall_time, busy_time);
 if (wall)
 -		*wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
 +		*wall = jiffies_to_usecs(cur_wall_time);
 
 -	return (cputime64_t)jiffies_to_usecs(idle_time);
 +	return jiffies_to_usecs(idle_time);
 }
 
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
 @@ -345,7 +344,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
 dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 &dbs_info->prev_cpu_wall);
 if (dbs_tuners_ins.ignore_nice)
 -			dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
 +			dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 
 }
 return count;
 @@ -455,11 +454,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 j_dbs_info->prev_cpu_iowait = cur_iowait_time;
 
 if (dbs_tuners_ins.ignore_nice) {
 -			cputime64_t cur_nice;
 +			u64 cur_nice;
 unsigned long cur_nice_jiffies;
 
 -			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
 -					 j_dbs_info->prev_cpu_nice);
 +			cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
 +					 j_dbs_info->prev_cpu_nice;
 /*
 * Assumption: nice time between sampling periods will
 * be less than 2^32 jiffies for 32 bit sys
 @@ -467,7 +466,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 cur_nice_jiffies = (unsigned long)
 cputime64_to_jiffies64(cur_nice);
 
 -			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
 +			j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 idle_time += jiffies_to_usecs(cur_nice_jiffies);
 }
 
 @@ -646,10 +645,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 
 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 &j_dbs_info->prev_cpu_wall);
 -			if (dbs_tuners_ins.ignore_nice) {
 +			if (dbs_tuners_ins.ignore_nice)
 j_dbs_info->prev_cpu_nice =
 -						kstat_cpu(j).cpustat.nice;
 -			}
 +						kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 }
 this_dbs_info->cpu = cpu;
 this_dbs_info->rate_mult = 1;
 diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
 index 2637c13..66d7f1c7 100644
 --- a/drivers/macintosh/rack-meter.c
 +++ b/drivers/macintosh/rack-me
...
 
 
 |  
	|  |  |  
	| 
		
			| [PATCH 3/4] Move part of cpuacct code [message #44209 is a reply to message #44206] | Fri, 25 November 2011 01:33   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| This patch is just a preparation patch for the next one in the series. It moves the cpuacct structure definition and some helper functions earlier
 in the file so we can access its members from here on.
 
 Signed-off-by: Glauber Costa <glommer@parallels.com>
 CC: Paul Turner <pjt@google.com>
 CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
 ---
 kernel/sched.c |   77 +++++++++++++++++++++++++++----------------------------
 1 files changed, 38 insertions(+), 39 deletions(-)
 
 diff --git a/kernel/sched.c b/kernel/sched.c
 index d504c7b..9b70305 100644
 --- a/kernel/sched.c
 +++ b/kernel/sched.c
 @@ -1596,6 +1596,44 @@ enum cpuacct_stat_index {
 };
 
 #ifdef CONFIG_CGROUP_CPUACCT
 +/*
 + * CPU accounting code for task groups.
 + *
 + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
 + * (balbir@in.ibm.com).
 + */
 +
 +/* track cpu usage of a group of tasks and its child groups */
 +struct cpuacct {
 +	struct cgroup_subsys_state css;
 +	/* cpuusage holds pointer to a u64-type object on every cpu */
 +	u64 __percpu *cpuusage;
 +	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
 +};
 +
 +struct cgroup_subsys cpuacct_subsys;
 +
 +/* return cpu accounting group corresponding to this container */
 +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
 +{
 +	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
 +			    struct cpuacct, css);
 +}
 +
 +/* return cpu accounting group to which this task belongs */
 +static inline struct cpuacct *task_ca(struct task_struct *tsk)
 +{
 +	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
 +			    struct cpuacct, css);
 +}
 +
 +static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 +{
 +	if (!ca || !ca->css.cgroup->parent)
 +		return NULL;
 +	return cgroup_ca(ca->css.cgroup->parent);
 +}
 +
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static void cpuacct_update_stats(struct task_struct *tsk,
 enum cpuacct_stat_index idx, cputime_t val);
 @@ -9495,45 +9533,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 #endif	/* CONFIG_CGROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_CPUACCT
 -
 -/*
 - * CPU accounting code for task groups.
 - *
 - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
 - * (balbir@in.ibm.com).
 - */
 -
 -/* track cpu usage of a group of tasks and its child groups */
 -struct cpuacct {
 -	struct cgroup_subsys_state css;
 -	/* cpuusage holds pointer to a u64-type object on every cpu */
 -	u64 __percpu *cpuusage;
 -	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
 -};
 -
 -struct cgroup_subsys cpuacct_subsys;
 -
 -/* return cpu accounting group corresponding to this container */
 -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
 -{
 -	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
 -			    struct cpuacct, css);
 -}
 -
 -/* return cpu accounting group to which this task belongs */
 -static inline struct cpuacct *task_ca(struct task_struct *tsk)
 -{
 -	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
 -			    struct cpuacct, css);
 -}
 -
 -static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 -{
 -	if (!ca || !ca->css.cgroup->parent)
 -		return NULL;
 -	return cgroup_ca(ca->css.cgroup->parent);
 -}
 -
 /* create a new cpu accounting group */
 static struct cgroup_subsys_state *cpuacct_create(
 struct cgroup_subsys *ss, struct cgroup *cgrp)
 --
 1.7.6.4
 |  
	|  |  |  
	| 
		
			| [PATCH 4/4] cpuacct.stat: re-use scheduler statistics for the root cgroup [message #44210 is a reply to message #44206] | Fri, 25 November 2011 01:33   |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| Right now, after we collect tick statistics for user and system and store them in a well-known location, we keep the same statistics again for cpuacct.
 Since cpuacct is hierarchical, the numbers for the root cgroup should be
 absolutely equal to the system-wide numbers.
 
 So it would be better to just use it: this patch changes cpuacct accounting
 in a way that the cpustat statistics are kept in a struct kernel_cpustat percpu
 array. In the root cgroup case, we just point it to the main array. The rest of
 the hierarchy walk can be totally disabled later with a static branch - but I am
 not doing it here.
 
 Signed-off-by: Glauber Costa <glommer@parallels.com>
 CC: Paul Turner <pjt@google.com>
 CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
 ---
 kernel/sched.c |  118 ++++++++++++++++++++++++++++++++++----------------------
 1 files changed, 72 insertions(+), 46 deletions(-)
 
 diff --git a/kernel/sched.c b/kernel/sched.c
 index 9b70305..9961817 100644
 --- a/kernel/sched.c
 +++ b/kernel/sched.c
 @@ -1608,10 +1608,11 @@ struct cpuacct {
 struct cgroup_subsys_state css;
 /* cpuusage holds pointer to a u64-type object on every cpu */
 u64 __percpu *cpuusage;
 -	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
 +	struct kernel_cpustat __percpu *cpustat;
 };
 
 struct cgroup_subsys cpuacct_subsys;
 +struct cpuacct root_cpuacct;
 
 /* return cpu accounting group corresponding to this container */
 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
 @@ -1635,14 +1636,40 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 }
 
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 -static void cpuacct_update_stats(struct task_struct *tsk,
 -		enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 -static inline void cpuacct_update_stats(struct task_struct *tsk,
 -		enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 +static inline void task_group_account_field(struct task_struct *p,
 +					     u64 tmp, int index)
 +{
 +#ifdef CONFIG_CGROUP_CPUACCT
 +	struct kernel_cpustat *kcpustat;
 +	struct cpuacct *ca;
 +#endif
 +	/*
 +	 * Since all updates are sure to touch the root cgroup, we
 +	 * get ourselves ahead and touch it first. If the root cgroup
 +	 * is the only cgroup, then nothing else should be necessary.
 +	 *
 +	 */
 +	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
 +
 +#ifdef CONFIG_CGROUP_CPUACCT
 +	if (unlikely(!cpuacct_subsys.active))
 +		return;
 +
 +	rcu_read_lock();
 +	ca = task_ca(p);
 +	while (ca && (ca != &root_cpuacct)) {
 +		kcpustat = this_cpu_ptr(ca->cpustat);
 +		kcpustat->cpustat[index] += tmp;
 +		ca = parent_ca(ca);
 +	}
 +	rcu_read_unlock();
 +#endif
 +}
 +
 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
 {
 update_load_add(&rq->load, load);
 @@ -3921,7 +3948,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 cpustat[index] += tmp;
 
 -	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
 +	task_group_account_field(p, index, cputime);
 /* Account for user time used */
 acct_update_integrals(p);
 }
 @@ -3977,7 +4004,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
 
 /* Add system time to cpustat. */
 cpustat[index] += tmp;
 -	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
 +	task_group_account_field(p, index, cputime);
 
 /* Account for system time used */
 acct_update_integrals(p);
 @@ -8275,8 +8302,15 @@ void __init sched_init(void)
 list_add(&root_task_group.list, &task_groups);
 INIT_LIST_HEAD(&root_task_group.children);
 autogroup_init(&init_task);
 +
 #endif /* CONFIG_CGROUP_SCHED */
 
 +#ifdef CONFIG_CGROUP_CPUACCT
 +	root_cpuacct.cpustat = &kernel_cpustat;
 +	root_cpuacct.cpuusage = alloc_percpu(u64);
 +	/* Too early, not expected to fail */
 +	BUG_ON(!root_cpuacct.cpuusage);
 +#endif
 for_each_possible_cpu(i) {
 struct rq *rq;
 
 @@ -9537,9 +9571,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 static struct cgroup_subsys_state *cpuacct_create(
 struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 -	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 -	int i;
 +	struct cpuacct *ca;
 +
 +	if (!cgrp->parent)
 +		return &root_cpuacct.css;
 
 +	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 if (!ca)
 goto out;
 
 @@ -9547,15 +9584,13 @@ static struct cgroup_subsys_state *cpuacct_create(
 if (!ca->cpuusage)
 goto out_free_ca;
 
 -	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
 -		if (percpu_counter_init(&ca->cpustat[i], 0))
 -			goto out_free_counters;
 +	ca->cpustat = alloc_percpu(struct kernel_cpustat);
 +	if (!ca->cpustat)
 +		goto out_free_cpuusage;
 
 return &ca->css;
 
 -out_free_counters:
 -	while (--i >= 0)
 -		percpu_counter_destroy(&ca->cpustat[i]);
 +out_free_cpuusage:
 free_percpu(ca->cpuusage);
 out_free_ca:
 kfree(ca);
 @@ -9568,10 +9603,8 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 struct cpuacct *ca = cgroup_ca(cgrp);
 -	int i;
 
 -	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
 -		percpu_counter_destroy(&ca->cpustat[i]);
 +	free_percpu(ca->cpustat);
 free_percpu(ca->cpuusage);
 kfree(ca);
 }
 @@ -9664,16 +9697,31 @@ static const char *cpuacct_stat_desc[] = {
 };
 
 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
 -		struct cgroup_map_cb *cb)
 +			      struct cgroup_map_cb *cb)
 {
 struct cpuacct *ca = cgroup_ca(cgrp);
 -	int i;
 +	int cpu;
 +	s64 val = 0;
 +
 +	for_each_online_cpu(cpu) {
 +		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
 +		val += kcpustat->cpustat[CPUTIME_USER];
 +		val += kcpustat->cpustat[CPUTIME_NICE];
 +	}
 +	val = cputime64_to_clock_t(val);
 +	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
 
 -	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
 -		s64 val = percpu_counter_read(&ca->cpustat[i]);
 -		val = cputime64_to_clock_t(val);
 -		cb->fill(cb, cpuacct_stat_desc[i], val);
 +	val = 0;
 +	for_each_online_cpu(cpu) {
 +		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
 +		val += kcpustat->cpustat[CPUTIME_SYSTEM];
 +		val += kcpustat->cpustat[CPUTIME_IRQ];
 +		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
 }
 +
 +	val = cputime64_to_clock_t(val);
 +	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
 +
 return 0;
 }
 
 @@ -9742,28 +9790,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 #define CPUACCT_BATCH	0
 #endif
 
 -/*
 - * Charge the system/user time to the task's accounting group.
 - */
 -static void cpuacct_update_stats(struct task_struct *tsk,
 -		enum cpuacct_stat_index idx, cputime_t val)
 -{
 -	struct cpuacct *ca;
 -	int batch = CPUACCT_BATCH;
 -
 -	if (unlikely(!cpuacct_subsys.active))
 -		return;
 -
 -	rcu_read_lock();
 -	ca = task_ca(tsk);
 -
 -	do {
 -		__percpu_counter_add(&ca->cpustat[idx], val, batch);
 -		ca = parent_ca(ca);
 -	} while (ca);
 -	rcu_read_unlock();
 -}
 -
 struct cgroup_subsys cpuacct_subsys = {
 .name = "cpuacct",
 .create = cpuacct_create,
 --
 1.7.6.4
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [PATCH 3/4] Move part of cpuacct code [message #44242 is a reply to message #44241] | Sat, 26 November 2011 20:17  |  
			| 
				
				
					|  Glauber Costa Messages: 916
 Registered: October 2011
 | Senior Member |  |  |  
	| On 11/26/2011 11:21 AM, Paul Turner wrote:
 > On Thu, Nov 24, 2011 at 5:33 PM, Glauber Costa<glommer@parallels.com>  wrote:
 >> This patch is just a preparation patch for the next one in the series.
 >> It moves the cpuacct structure definition and some helper functions early
 >> in the file so we can access its members from here on.
 >>
 >> Signed-off-by: Glauber Costa<glommer@parallels.com>
 >> CC: Paul Turner<pjt@google.com>
 >> CC: Peter Zijlstra<a.p.zijlstra@chello.nl>
 >> ---
 >>   kernel/sched.c |   77 +++++++++++++++++++++++++++----------------------------
 >>   1 files changed, 38 insertions(+), 39 deletions(-)
 >
 > Bad news -- You've run afoul of a massive file re-structuring conflict :(
 >
 > All of sched has been refactored under "kernel/sched/"; sched.c and
 > friends don't exist anymore.
 >
 > - Paul
 
 Can you tell me where this lives? This does not seem to be the case in
 any of my git trees here (just fetched, including your sched.git/master
 at kernel.org)
 |  
	|  |  | 
 
 
 Current Time: Sun Oct 26 17:28:22 GMT 2025 
 Total time taken to generate the page: 0.09837 seconds |