From: Nick Piggin This is a derivative of Rick's schedstats patch which includes as little bit more SMP and sched-domains centric stuff. It drops some features from Rick's codebase. I'd like to reconcile them in future, but I'd like to get this into -mm now so it can help with tracking down performance problems. kernel/sched.o size doesn't increase at all here when SCHEDSTATS is off (SMP .config). A userspace reporting application is at http://www.zip.com.au/~akpm/linux/patches/stuff/stats7.c --- 25-akpm/arch/i386/Kconfig | 13 +++ 25-akpm/arch/ia64/Kconfig | 13 +++ 25-akpm/arch/ppc/Kconfig | 13 +++ 25-akpm/arch/ppc64/Kconfig | 15 +++ 25-akpm/arch/x86_64/Kconfig | 13 +++ 25-akpm/fs/proc/proc_misc.c | 7 + 25-akpm/include/linux/sched.h | 38 ++++++++ 25-akpm/kernel/sched.c | 179 ++++++++++++++++++++++++++++++++++++++++-- 8 files changed, 283 insertions(+), 8 deletions(-) diff -puN arch/i386/Kconfig~schedstats arch/i386/Kconfig --- 25/arch/i386/Kconfig~schedstats 2004-04-30 00:02:35.887007016 -0700 +++ 25-akpm/arch/i386/Kconfig 2004-04-30 00:02:35.901004888 -0700 @@ -1478,6 +1478,19 @@ config 4KSTACKS on the VM subsystem for higher order allocations. This option will also use IRQ stacks to compensate for the reduced stackspace. +config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds. + config X86_FIND_SMP_CONFIG bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -puN arch/ia64/Kconfig~schedstats arch/ia64/Kconfig --- 25/arch/ia64/Kconfig~schedstats 2004-04-30 00:02:35.888006864 -0700 +++ 25-akpm/arch/ia64/Kconfig 2004-04-30 00:02:35.902004736 -0700 @@ -436,6 +436,19 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds. + config DEBUG_SLAB bool "Debug memory allocations" depends on DEBUG_KERNEL diff -puN arch/ppc64/Kconfig~schedstats arch/ppc64/Kconfig --- 25/arch/ppc64/Kconfig~schedstats 2004-04-30 00:02:35.890006560 -0700 +++ 25-akpm/arch/ppc64/Kconfig 2004-04-30 00:02:35.903004584 -0700 @@ -403,7 +403,20 @@ config DEBUG_INFO debugging info resulting in a larger kernel image. Say Y here only if you plan to use gdb to debug the kernel. If you don't debug the kernel, you can say N. - + +config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds. + endmenu source "security/Kconfig" diff -puN arch/ppc/Kconfig~schedstats arch/ppc/Kconfig --- 25/arch/ppc/Kconfig~schedstats 2004-04-30 00:02:35.891006408 -0700 +++ 25-akpm/arch/ppc/Kconfig 2004-04-30 00:02:35.904004432 -0700 @@ -1227,6 +1227,19 @@ config DEBUG_INFO debug the kernel. If you don't debug the kernel, you can say N. +config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds. + config BOOTX_TEXT bool "Support for early boot text console (BootX or OpenFirmware only)" depends PPC_OF diff -puN arch/x86_64/Kconfig~schedstats arch/x86_64/Kconfig --- 25/arch/x86_64/Kconfig~schedstats 2004-04-30 00:02:35.893006104 -0700 +++ 25-akpm/arch/x86_64/Kconfig 2004-04-30 00:02:35.904004432 -0700 @@ -453,6 +453,19 @@ config DEBUG_INFO Say Y here only if you plan to use gdb to debug the kernel. Please note that this option requires new binutils. If you don't debug the kernel, you can say N. + +config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds. config FRAME_POINTER bool "Compile the kernel with frame pointers" diff -puN fs/proc/proc_misc.c~schedstats fs/proc/proc_misc.c --- 25/fs/proc/proc_misc.c~schedstats 2004-04-30 00:02:35.894005952 -0700 +++ 25-akpm/fs/proc/proc_misc.c 2004-04-30 00:02:35.905004280 -0700 @@ -286,6 +286,10 @@ static struct file_operations proc_vmsta .release = seq_release, }; +#ifdef CONFIG_SCHEDSTATS +extern struct file_operations proc_schedstat_operations; +#endif + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -697,6 +701,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_SCHEDSTATS + create_seq_entry("schedstat", 0, &proc_schedstat_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { diff -puN include/linux/sched.h~schedstats include/linux/sched.h --- 25/include/linux/sched.h~schedstats 2004-04-30 00:02:35.896005648 -0700 +++ 25-akpm/include/linux/sched.h 2004-04-30 00:02:35.906004128 -0700 @@ -97,6 +97,14 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); +#ifdef CONFIG_SCHEDSTATS +#define schedstat_inc(s, field) ((s)->field++) +#define schedstat_add(s, field, amt) ((s)->field += amt) +#else +#define schedstat_inc(s, field) do { } while (0) +#define schedstat_add(d, field, amt) do { } while (0) +#endif + #include #include #include @@ -347,7 +355,6 @@ struct k_itimer { struct sigqueue *sigq; /* signal queue entry. */ }; - struct io_context; /* See blkdev.h */ void exit_io_context(void); @@ -589,6 +596,35 @@ struct sched_domain { unsigned long last_balance; /* init to jiffies. units in jiffies */ unsigned int balance_interval; /* initialise to 1. units in ms. */ unsigned int nr_balance_failed; /* initialise to 0 */ + +#ifdef CONFIG_SCHEDSTATS + unsigned long lb_cnt[3]; + unsigned long lb_balanced[3]; + unsigned long lb_failed[3]; + unsigned long lb_pulled[3]; + unsigned long lb_hot_pulled[3]; + unsigned long lb_imbalance[3]; + + /* Active load balancing */ + unsigned long alb_cnt; + unsigned long alb_failed; + unsigned long alb_pushed; + + /* Wakeups */ + unsigned long sched_wake_remote; + + /* Passive load balancing */ + unsigned long plb_pulled; + + /* Affine wakeups */ + unsigned long afw_pulled; + + /* SD_BALANCE_EXEC balances */ + unsigned long sbe_pushed; + + /* SD_BALANCE_CLONE balances */ + unsigned long sbc_pushed; +#endif }; /* Common values for SMT siblings */ diff -puN kernel/sched.c~schedstats kernel/sched.c --- 25/kernel/sched.c~schedstats 2004-04-30 00:02:35.897005496 -0700 +++ 25-akpm/kernel/sched.c 2004-04-30 00:02:35.910003520 -0700 @@ -42,6 +42,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -225,6 +227,7 @@ struct runqueue { unsigned long expired_timestamp, nr_uninterruptible; unsigned long long timestamp_last_tick; task_t *curr, *idle; + struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; int best_expired_prio; @@ -240,6 +243,22 @@ struct runqueue { task_t *migration_thread; struct list_head migration_queue; #endif +#ifdef CONFIG_SCHEDSTATS + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_cnt; + unsigned long sched_switch; + unsigned long sched_idle; + + /* wake stats */ + unsigned long sched_wake; + unsigned long sched_wake_local; +#endif }; static DEFINE_PER_CPU(struct runqueue, runqueues); @@ -293,6 +312,91 @@ static inline void task_rq_unlock(runque spin_unlock_irqrestore(&rq->lock, *flags); } + +#ifdef CONFIG_SCHEDSTATS + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 7 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_cpu(i) { + /* Include offline CPUs */ + runqueue_t *rq = cpu_rq(i); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int j = 0; +#endif + + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu", + i, + rq->yld_both_empty, rq->yld_act_empty, + rq->yld_exp_empty, rq->yld_cnt, + rq->sched_switch, rq->sched_cnt, + rq->sched_idle, rq->sched_wake, rq->sched_wake_local); +#ifdef CONFIG_SMP + for_each_domain(i, sd) { + char str[NR_CPUS]; + int k; + cpumask_scnprintf(str, NR_CPUS, sd->span); + seq_printf(seq, " domain%d %s", j++, str); + + for (k = 0; k < 3; k++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu", + sd->lb_cnt[k], sd->lb_balanced[k], + sd->lb_failed[k], sd->lb_pulled[k], + sd->lb_hot_pulled[k], sd->lb_imbalance[k]); + } + + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + sd->alb_cnt, sd->alb_failed, + sd->alb_pushed, sd->sched_wake_remote, + sd->plb_pulled, sd->afw_pulled, + sd->sbe_pushed, sd->sbc_pushed); + } +#endif + + seq_printf(seq, "\n"); + } + + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned size = 4096 * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * rq_lock - lock a given runqueue and disable interrupts. */ @@ -755,7 +859,24 @@ static int try_to_wake_up(task_t * p, un cpu = task_cpu(p); this_cpu = smp_processor_id(); + schedstat_inc(rq, sched_wake); +#ifndef CONFIG_SMP + schedstat_inc(rq, sched_wake_local); +#endif + #ifdef CONFIG_SMP +#ifdef CONFIG_SCHEDSTATS + if (cpu == this_cpu) + schedstat_inc(rq, sched_wake_local); + else { + for_each_domain(this_cpu, sd) + if (cpu_isset(cpu, sd->span)) + break; + if (sd) + schedstat_inc(sd, sched_wake_remote); + } +#endif + if (unlikely(task_running(rq, p))) goto out_activate; @@ -789,12 +910,22 @@ static int try_to_wake_up(task_t * p, un !task_hot(p, rq->timestamp_last_tick, sd)) || ((sd->flags & SD_WAKE_BALANCE) && imbalance*this_load <= 100*load) ) { + /* * Now sd has SD_WAKE_AFFINE and p is cache cold in sd * or sd has SD_WAKE_BALANCE and there is an imbalance */ - if (cpu_isset(cpu, sd->span)) + if (cpu_isset(cpu, sd->span)) { +#ifdef CONFIG_SCHEDSTATS + if ((sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, sd)) + schedstat_inc(sd, afw_pulled); + else if ((sd->flags & SD_WAKE_BALANCE) && + imbalance*this_load <= 100*load) + schedstat_inc(sd, plb_pulled); +#endif goto out_set_cpu; + } } } @@ -1280,6 +1411,7 @@ lock_again: rq->nr_running++; } } else { + schedstat_inc(sd, sbc_pushed); __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1353,6 +1485,7 @@ void sched_balance_exec(void) if (sd) { new_cpu = find_idlest_cpu(current, this_cpu, sd); if (new_cpu != this_cpu) { + schedstat_inc(sd, sbe_pushed); put_cpu(); sched_migrate_task(current, new_cpu); return; @@ -1425,6 +1558,13 @@ int can_migrate_task(task_t *p, runqueue return -1; } +#ifdef CONFIG_SCHEDSTATS + if (!task_hot(p, rq->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_pulled[idle]); + else + schedstat_inc(sd, lb_hot_pulled[idle]); +#endif + return 1; } @@ -1685,18 +1825,25 @@ static int load_balance(int this_cpu, ru int nr_moved; spin_lock(&this_rq->lock); + schedstat_inc(sd, lb_cnt[idle]); group = find_busiest_group(sd, this_cpu, &imbalance, idle); - if (!group) + if (!group) { + schedstat_inc(sd, lb_balanced[idle]); goto out_balanced; + } busiest = find_busiest_queue(group); - if (!busiest) + if (!busiest) { + schedstat_inc(sd, lb_balanced[idle]); goto out_balanced; + } + if (unlikely(busiest == this_rq)) { WARN_ON(1); goto out_balanced; } + schedstat_add(sd, lb_imbalance[idle], imbalance); /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); @@ -1706,6 +1853,7 @@ static int load_balance(int this_cpu, ru spin_unlock(&busiest->lock); if (!nr_moved) { + schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { @@ -1760,13 +1908,20 @@ static int load_balance_newidle(int this unsigned long imbalance; int nr_moved = 0; + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); - if (!group) + if (!group) { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); goto out; + } busiest = find_busiest_queue(group); - if (!busiest || busiest == this_rq) + if (!busiest || busiest == this_rq) { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); goto out; + } + + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); @@ -1811,6 +1966,7 @@ static void active_load_balance(runqueue struct sched_domain *sd; struct sched_group *group, *busy_group; int i; + int moved = 0; if (busiest->nr_running <= 1) return; @@ -1822,6 +1978,7 @@ static void active_load_balance(runqueue WARN_ON(1); return; } + schedstat_inc(sd, alb_cnt); group = sd->groups; while (!cpu_isset(busiest_cpu, group->cpumask)) @@ -1849,11 +2006,16 @@ static void active_load_balance(runqueue rq = cpu_rq(push_cpu); double_lock_balance(busiest, rq); - move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); + moved += move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); spin_unlock(&rq->lock); next_group: group = group->next; } while (group != sd->groups); + + if (moved) + schedstat_add(sd, alb_pushed, moved); + else + schedstat_inc(sd, alb_failed); } /* @@ -2186,6 +2348,7 @@ need_resched: rq = this_rq(); release_kernel_lock(prev); + schedstat_inc(rq, sched_cnt); now = sched_clock(); if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) run_time = now - prev->timestamp; @@ -2223,6 +2386,7 @@ need_resched: next = rq->idle; rq->expired_timestamp = 0; wake_sleeping_dependent(cpu, rq); + schedstat_inc(rq, sched_idle); goto switch_tasks; } } @@ -2232,6 +2396,7 @@ need_resched: /* * Switch the active and expired arrays. */ + schedstat_inc(rq, sched_switch); rq->active = rq->expired; rq->expired = array; array = rq->active; @@ -2975,6 +3140,7 @@ asmlinkage long sys_sched_yield(void) prio_array_t *array = current->array; prio_array_t *target = rq->expired; + schedstat_inc(rq, yld_cnt); /* * We implement yielding by moving the task into the expired * queue. @@ -3734,6 +3900,7 @@ static void __init arch_init_sched_domai struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); *cpu_sd = SD_CPU_INIT; + cpu_sd->cache_nice_tries = 2; cpu_sd->span = cpu_possible_map; cpu_sd->groups = &sched_group_cpus[i]; } _